In [1]:
from datasets import load_dataset, concatenate_datasets, Dataset
import numpy as np
import torch
import torchcodec
from datasets import ClassLabel, Audio as DatasetAudio
from IPython.display import display, Audio
# Class names in label-id order: id 0 is "other", id 1 is "drone".
labels = ["other", "drone"]



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the drone audio samples from the Hugging Face Hub, type the "label"
# column as a ClassLabel over `labels`, and cast the "audio" column to 16 kHz.
drone_audio_detection_from_hugging_face_dataset = (
    load_dataset("geronimobasso/drone-audio-detection-samples")
    .cast_column("label", ClassLabel(names=labels))
    .cast_column("audio", DatasetAudio(sampling_rate=16000))  # resample audio to 16 kHz
)


In [3]:
# Build the previously generated hibou dataset from its local audio folder,
# typing "label" as a ClassLabel over `labels` and casting "audio" to 16 kHz.
hibou_dataset = (
    load_dataset("audiofolder", data_dir="../data/raw/hibou_dataset")
    .cast_column("label", ClassLabel(names=labels))
    .cast_column("audio", DatasetAudio(sampling_rate=16000))  # resample audio to 16 kHz
    # Swap the two label ids (0 <-> 1).
    # NOTE(review): presumably the folder-derived ids come out in the opposite
    # order to `labels` -- confirm against the directory names.
    .map(lambda x: {"label": 1 - x["label"]})
)

Map: 100%|██████████| 7890/7890 [00:00<00:00, 58249.81 examples/s]


In [4]:
# Spot-check one example from each source dataset: print its label name and
# render an audio player for it.
def _preview_example(split, index):
    """Print the label name of ``split[index]`` and display its audio.

    Returns the split's "label" feature and the fetched example.
    """
    feature = split.features["label"]
    example = split[index]
    print(feature.int2str(example["label"]))
    clip = example["audio"]
    display(Audio(clip["array"], rate=clip["sampling_rate"]))
    return feature, example


label_feature, ad1 = _preview_example(
    drone_audio_detection_from_hugging_face_dataset["train"], 17000
)


label_feature, ad1 = _preview_example(hibou_dataset["train"], 1)

drone


drone


In [5]:
# Merge the "train" splits of the Hub dataset and the hibou dataset.
dataset = concatenate_datasets(
    [
        drone_audio_detection_from_hugging_face_dataset["train"],
        hibou_dataset["train"],
    ]
)

In [8]:
# Load the Kaggle-sourced dataset from its local audio folder, typing "label"
# as a ClassLabel over `labels` and casting "audio" to 16 kHz.
dataset2 = (
    load_dataset("audiofolder", data_dir="../data/raw/dataset2")
    .cast_column("label", ClassLabel(names=labels))
    .cast_column("audio", DatasetAudio(sampling_rate=16000))  # resample audio to 16 kHz
    # Swap the two label ids (0 <-> 1).
    # NOTE(review): presumably the folder-derived ids come out in the opposite
    # order to `labels` -- confirm against the directory names.
    .map(lambda x: {"label": 1 - x["label"]})
)

Map: 100%|██████████| 3560/3560 [00:00<00:00, 62130.08 examples/s]


In [9]:
# Append the "train" split of dataset2 to the combined dataset.
dataset = concatenate_datasets([dataset, dataset2["train"]])

# Free memory: drop the per-source references, then run a collection pass so
# objects that are only kept alive by reference cycles are reclaimed as well.
import gc
del drone_audio_detection_from_hugging_face_dataset
del hibou_dataset
del dataset2
gc.collect();  # trailing ';' keeps the returned object count out of the cell output

In [10]:
# Count how many examples carry each label id in the combined dataset.
from collections import Counter

label_counts = Counter(dataset["label"])
print(label_counts)
# Same counts again, keyed by class name ("drone" first, then "other").
print(" ".join(f"{labels[idx]}: {label_counts[idx]}" for idx in (1, 0)))

Counter({1: 169517, 0: 22253})
drone: 169517 other: 22253


In [11]:
def split_audio_into_chunks(audio_array, sampling_rate, chunk_duration=0.5):
    """Split audio into equal-length, non-overlapping chunks along the last axis.

    When the number of samples is not a multiple of the chunk length, the audio
    is right-padded with zeros so the final chunk has full length.

    Args:
        audio_array: ``torch.Tensor`` whose last dimension is the sample axis.
        sampling_rate: Number of samples per second.
        chunk_duration: Length of each chunk in seconds.

    Returns:
        A tensor of shape ``(..., n_chunks, samples_per_chunk)``. Zero-length
        input yields ``n_chunks == 0`` instead of an error.

    Raises:
        ValueError: If ``chunk_duration * sampling_rate`` is less than one
            sample, so no chunk length can be derived.
    """
    # Calculate the number of samples per chunk
    samples_per_chunk = int(chunk_duration * sampling_rate)
    if samples_per_chunk <= 0:
        raise ValueError(
            "chunk_duration * sampling_rate must be at least one sample, got "
            f"chunk_duration={chunk_duration!r}, sampling_rate={sampling_rate!r}"
        )

    n_samples = audio_array.shape[-1]
    if n_samples == 0:
        # unfold() rejects a window larger than the dimension, so an empty clip
        # is answered directly with zero chunks of the right width.
        return audio_array.new_zeros((*audio_array.shape[:-1], 0, samples_per_chunk))

    # Pad the audio if necessary to make it divisible by samples_per_chunk
    remainder = n_samples % samples_per_chunk
    if remainder != 0:
        padding = samples_per_chunk - remainder
        audio_array = torch.nn.functional.pad(audio_array, (0, padding))

    # Window size == step size, so the chunks tile the audio without overlap
    return audio_array.unfold(-1, samples_per_chunk, samples_per_chunk)

def chunk_audio_batch(examples, chunk_duration=0.5):
    """Explode a batch of labelled audio clips into fixed-length chunks.

    Works on a whole batch (a dict of columns) so it can be used with
    ``Dataset.map(..., batched=True)`` and multiprocessing. The returned batch
    has one row per chunk, so it usually has more rows than the input.

    Args:
        examples: Batch dict with an "audio" column (items exposing "array" and
            "sampling_rate") and a "label" column.
        chunk_duration: Chunk length in seconds, forwarded to
            ``split_audio_into_chunks``. Defaults to 0.5.

    Returns:
        Dict with "audio" (one numpy array per chunk), "sampling_rate" and
        "label" lists of equal length; every chunk carries the sampling rate
        and label of the clip it was cut from.
    """
    all_audios = []
    all_sampling_rates = []
    all_labels = []

    for audio, label in zip(examples["audio"], examples["label"]):
        waveform = torch.tensor(audio["array"]).float()
        sampling_rate = audio["sampling_rate"]
        chunks = split_audio_into_chunks(waveform, sampling_rate, chunk_duration)
        n_chunks = len(chunks)

        all_audios.extend(chunk.numpy() for chunk in chunks)
        all_sampling_rates.extend([sampling_rate] * n_chunks)
        all_labels.extend([label] * n_chunks)

    return {
        "audio": all_audios,
        "sampling_rate": all_sampling_rates,
        "label": all_labels,
    }

# Apply the chunking to the whole dataset. All original columns are removed,
# so the result holds only the columns returned by chunk_audio_batch.
chunked_dataset = dataset.map(
    chunk_audio_batch,
    remove_columns=dataset.column_names,
    batched=True,
    batch_size=16,  # examples passed to each chunk_audio_batch call
    num_proc=12,  # number of worker processes
)

Map (num_proc=12): 100%|██████████| 191770/191770 [01:09<00:00, 2771.01 examples/s] 


In [12]:
# Count chunks per label id and per sampling rate after chunking.
from collections import Counter

label_counts = Counter(chunked_dataset["label"])
print(label_counts)
# Same counts again, keyed by class name ("drone" first, then "other").
print(" ".join(f"{labels[idx]}: {label_counts[idx]}" for idx in (1, 0)))

# Tally of the sampling rates present across all chunks.
sampling_counts = Counter(chunked_dataset["sampling_rate"])
print(sampling_counts)

Counter({0: 255291, 1: 207410})
drone: 207410 other: 255291
Counter({16000: 462701})


In [24]:
import random

# Spot-check a randomly chosen chunk: print its index, its length in samples
# and its label name, then render an audio player for it.
i = random.randint(0, len(chunked_dataset) - 1)
ad1 = chunked_dataset[i]

# Resolve the label name through `labels` instead of a feature object left
# over from an earlier verification cell. Every source dataset was cast to
# ClassLabel(names=labels), so the label id indexes `labels` directly.
print(i, len(ad1["audio"]), labels[ad1["label"]])
display(Audio(ad1["audio"], rate=ad1["sampling_rate"]))


327442 8000 drone


In [19]:
# Write the chunked dataset to disk under the datasets directory.
output_dir = "../data/datasets/ds_462700"
chunked_dataset.save_to_disk(output_dir)

Saving the dataset (30/30 shards): 100%|██████████| 462701/462701 [00:16<00:00, 27379.42 examples/s]
