In [1]:
from datasets import load_dataset
from datasets import ClassLabel, Audio
import numpy as np
import librosa
import matplotlib.pyplot as plt
from IPython.display import display, Audio as DisplayAudio
import torch

# Class names; the position in this list is the integer id stored in the "label" column.
labels = ["other", "drone"]

# Reproducibility / parallelism
SEED = 42        # seed used when shuffling the dataset
NUM_PROC = 23    # value passed as `num_proc` to every Dataset.map call
BATCH_SIZE = 32  # examples per batch handed to the batched map functions

# Audio geometry
SAMPLING_RATE = 16_000  # samples per second
CHUNK_DURATION = 0.5    # chunk length; multiplied by SAMPLING_RATE to get samples per chunk

# A sample gets an augmented copy when a uniform draw in [0, 1) exceeds this value.
THRESHOLD_AUGMENTATION = 0.5

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the drone-audio dataset (all available splits) from the Hugging Face Hub
DS = load_dataset(path="geronimobasso/drone-audio-detection-samples")

In [3]:
# DS starts out as a dict of splits. Only pick "train" while it still is one, so that
# re-running this cell (when DS is already the train split) does not fail.
if isinstance(DS, dict):
    DS = DS["train"]
# Cast labels so that the integer ids map to the names in `labels`
DS = DS.cast_column("label", ClassLabel(names=labels))

In [6]:
# Keep a shuffled subset of the data: n is the fraction to keep (1 keeps every row)
n = 1
subset_size = int(n * len(DS))
dataset = DS.shuffle(seed=SEED).select(range(subset_size))

In [7]:
# Cast the audio column to the project-wide sampling rate. The chunking code sizes its
# chunks from SAMPLING_RATE, so both must come from the same constant.
dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))

In [8]:
def noise_injection(data, noise_factor):
    """Add scaled standard-normal noise to a signal.

    Args:
        data: Array-like signal (a NumPy array or a sequence of numbers).
        noise_factor: Multiplier applied to the standard-normal noise.

    Returns:
        np.ndarray with the same shape and dtype as ``np.asarray(data)``.
    """
    samples = np.asarray(data)
    # Draw noise with the full shape of the input so empty and multi-dimensional
    # signals work too.
    noise = np.random.standard_normal(samples.shape)
    # Cast back to the input dtype (taken from the array, not from its first element,
    # so an empty input does not raise).
    return (samples + noise_factor * noise).astype(samples.dtype)

def change_pitch(data, sampling_rate, pitch_factor):
    """Pitch-shift a signal with ``librosa.effects.pitch_shift``.

    Args:
        data: Audio signal handed to librosa unchanged.
        sampling_rate: Sampling rate of ``data`` (forwarded as ``sr``).
        pitch_factor: Size of the shift (forwarded as ``n_steps``).

    Returns:
        Whatever ``librosa.effects.pitch_shift`` returns for these arguments.
    """
    shifted = librosa.effects.pitch_shift(
        data,
        sr=sampling_rate,
        n_steps=pitch_factor,
    )
    return shifted

def time_stretch(data, stretch_factor):
    """Time-stretch a signal with ``librosa.effects.time_stretch``.

    Args:
        data: Audio signal handed to librosa unchanged.
        stretch_factor: Stretch rate (forwarded as ``rate``).

    Returns:
        Whatever ``librosa.effects.time_stretch`` returns for these arguments.
    """
    stretched = librosa.effects.time_stretch(data, rate=stretch_factor)
    return stretched

def frequency_mask(data, mask_width=10):
    """Zero a random band of adjacent frequency bins and resynthesise the signal.

    Args:
        data: 1-D audio signal.
        mask_width: Number of adjacent STFT frequency bins to zero. Values larger than
            the number of bins are clamped; values <= 0 leave the spectrum untouched.

    Returns:
        The signal rebuilt from the masked STFT, with the same number of samples
        as ``data``.
    """
    samples = np.asarray(data)
    stft = librosa.stft(samples)
    freq_bins = stft.shape[0]

    # Clamp the width so the random draw below always has a valid range.
    width = max(0, min(int(mask_width), freq_bins))
    if width > 0:
        # `randint` excludes its upper bound; the +1 lets the band reach the top bin.
        f0 = np.random.randint(0, freq_bins - width + 1)
        stft[f0:f0 + width, :] = 0

    # Without `length` the inverse STFT may return fewer samples than the input had.
    return librosa.istft(stft, length=len(samples))

def time_mask(data, mask_width=10):
    """Return a copy of the signal with one random run of samples set to zero.

    Args:
        data: Array-like signal; it is not modified.
        mask_width: Number of adjacent samples to zero. Values larger than the signal
            are clamped to its length; values <= 0 return an unchanged copy.

    Returns:
        np.ndarray copy of ``data`` with the masked run zeroed.
    """
    masked = np.array(data, copy=True)
    # Clamp the width so short (or empty) signals do not make the random draw fail.
    width = max(0, min(int(mask_width), len(masked)))
    if width == 0:
        return masked
    # `randint` excludes its upper bound; the +1 lets the mask reach the last sample.
    start = np.random.randint(0, len(masked) - width + 1)
    masked[start:start + width] = 0
    return masked

def dynamic_range_compression(data, threshold, ratio):
    """Compress the part of each sample's magnitude that exceeds ``threshold``.

    Samples with ``|x| <= threshold`` are unchanged. For the others the excess over
    the threshold is divided by ``ratio`` and the sample's sign is kept:
    ``sign(x) * (threshold + (|x| - threshold) / ratio)``.

    Args:
        data: Array-like signal; it is not modified.
        threshold: Magnitude above which compression applies.
        ratio: Compression ratio, must be > 0 (1 leaves the signal unchanged).

    Returns:
        np.ndarray copy of ``data`` with the compression applied.

    Raises:
        ValueError: If ``ratio`` is not positive.
    """
    if ratio <= 0:
        raise ValueError(f"ratio must be positive, got {ratio!r}")

    samples = np.asarray(data)
    magnitude = np.abs(samples)
    over = magnitude > threshold

    compressed = samples.copy()
    # Work on the magnitude and restore the sign afterwards; applying the formula to
    # the signed value would push negative peaks toward +threshold.
    compressed[over] = np.sign(samples[over]) * (
        threshold + (magnitude[over] - threshold) / ratio
    )
    return compressed


In [9]:
def plot_spectrogram(audio, title, sr=16000):
    """Show the dB-scaled STFT magnitude of a signal on a log-frequency axis.

    Args:
        audio: 1-D audio signal.
        title: Title of the figure.
        sr: Sampling rate of ``audio``, used to scale the axes.
    """
    # Import the plotting submodule explicitly; a bare `import librosa` may not load it
    # on every librosa version.
    import librosa.display

    # Take the magnitude first: amplitude_to_db expects real amplitudes, and librosa
    # warns about discarded phase when it is given the complex STFT.
    magnitude = np.abs(librosa.stft(audio))
    spec_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    fig, ax = plt.subplots(figsize=(10, 4))
    img = librosa.display.specshow(spec_db, y_axis='log', x_axis='time', sr=sr, ax=ax)
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    ax.set_title(title)
    plt.show()

def plot_waveform(audio, title, sr=16000):
    """Plot the amplitude of a signal against time in seconds.

    Args:
        audio: 1-D audio signal.
        title: Title of the figure.
        sr: Sampling rate of ``audio``; sample ``i`` is drawn at ``i / sr`` seconds.
    """
    samples = np.asarray(audio)
    # Sample i occurs at i / sr. A linspace that includes the end point would stretch
    # the axis so that the last sample lands on len(audio) / sr.
    time = np.arange(len(samples)) / sr

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(time, samples, alpha=0.7)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude")
    ax.set_title(title)
    ax.grid(True)
    plt.show()

In [11]:
# Report the number of rows in `dataset` (the shuffled subset selected above).
print(f"Size of dataset: {len(dataset)}")

Size of dataset: 1803


In [12]:
def split_audio_into_chunks(audio_array, chunk_duration=CHUNK_DURATION, sampling_rate=SAMPLING_RATE):
    """Split a signal into consecutive, equally sized chunks along its last axis.

    Only full chunks are returned; trailing samples that do not fill a whole chunk
    are dropped (no padding).

    Args:
        audio_array: Array or tensor whose last axis is time.
        chunk_duration: Chunk length; multiplied by ``sampling_rate`` to get the
            number of samples per chunk.
        sampling_rate: Sampling rate of ``audio_array``.

    Returns:
        list of chunks, each with ``int(chunk_duration * sampling_rate)`` samples on
        the last axis. Empty when the signal is shorter than one chunk.

    Raises:
        ValueError: If the resulting chunk size is not at least one sample.
    """
    samples_per_chunk = int(chunk_duration * sampling_rate)
    if samples_per_chunk <= 0:
        raise ValueError(
            f"chunk_duration * sampling_rate must be at least 1 sample, got {samples_per_chunk}"
        )

    num_chunks = audio_array.shape[-1] // samples_per_chunk

    # Slice the same (last) axis that was used to count the chunks.
    return [
        audio_array[..., start:start + samples_per_chunk]
        for start in range(0, num_chunks * samples_per_chunk, samples_per_chunk)
    ]

def chunk_audio_batch(batch):
    """Explode a batch of recordings into fixed-size chunks.

    Written for ``Dataset.map(batched=True)``: the returned lists may be longer (or
    shorter) than the input batch because every recording yields zero or more chunks.

    Args:
        batch: Mapping with an "audio" list (dicts holding "array" and
            "sampling_rate") and a parallel "label" list.

    Returns:
        dict with "audio" (float32 NumPy chunks) and "label" (the source recording's
        label repeated once per chunk).
    """
    all_audios = []
    all_labels = []

    for audio, label in zip(batch["audio"], batch["label"]):
        samples = np.asarray(audio["array"], dtype=np.float32)
        # Size the chunks from this recording's own sampling rate instead of assuming
        # the module-wide default.
        chunks = split_audio_into_chunks(samples, sampling_rate=audio["sampling_rate"])

        all_audios.extend(chunks)
        all_labels.extend([label] * len(chunks))

    return {
        "audio": all_audios,
        "label": all_labels,
    }

# Replace every recording by its fixed-size chunks. The input columns are dropped
# because the mapped function returns a different number of rows than it receives.
chunk_map_options = {
    "batched": True,
    "batch_size": BATCH_SIZE,
    "num_proc": NUM_PROC,
    "remove_columns": dataset.column_names,
}
chunked_dataset = dataset.map(chunk_audio_batch, **chunk_map_options)
len(chunked_dataset)

4235

In [13]:
def _fit_length(signal, size):
    """Trim or zero-pad ``signal`` at its end so it has exactly ``size`` samples."""
    signal = np.asarray(signal)
    if len(signal) >= size:
        return signal[:size]
    return np.pad(signal, (0, size - len(signal)))


def _random_augmentation(data, rng):
    """Apply one of the five augmentations to ``data``, each with probability 0.2."""
    r = rng.random()
    if r < 0.20:
        return noise_injection(data, noise_factor=0.05)
    if r < 0.40:
        return change_pitch(data, sampling_rate=SAMPLING_RATE, pitch_factor=2)
    if r < 0.60:
        return time_stretch(data, stretch_factor=1.1)
    if r < 0.80:
        # NOTE(review): with ratio=1.0 the compression formula returns its input
        # unchanged, so this branch yields an exact copy — confirm the intended ratio.
        return dynamic_range_compression(data, threshold=0.05, ratio=1.0)
    return frequency_mask(data, mask_width=20)


def augmente_batch(batch):
    """Return the batch with randomly augmented copies added after some samples.

    Every input sample is kept. When a uniform draw exceeds
    ``THRESHOLD_AUGMENTATION``, one randomly chosen augmentation is applied and the
    result is appended right after the original with the same label.

    Augmented copies are trimmed or zero-padded to the length of the original sample,
    because time stretching and the STFT round trip of the frequency mask can return
    a different number of samples.

    Args:
        batch: Mapping with parallel "audio" (sequences of samples) and "label"
            lists, as passed by ``Dataset.map(batched=True)``.

    Returns:
        dict with "audio" and "label" lists holding the originals and their
        augmented copies.
    """
    rng = np.random.default_rng()
    all_audios = []
    all_labels = []

    for audio, label in zip(batch["audio"], batch["label"]):
        all_audios.append(audio)
        all_labels.append(label)

        if rng.random() > THRESHOLD_AUGMENTATION:
            data = np.array(audio)
            augmented_audio = _random_augmentation(data, rng)

            all_audios.append(_fit_length(augmented_audio, len(data)))
            all_labels.append(label)

    return {
        "audio": all_audios,
        "label": all_labels,
    }

augmented_dataset = chunked_dataset.map(
    augmente_batch,
    batched=True,
    num_proc=NUM_PROC,
    batch_size=BATCH_SIZE,
    # Drop the columns of the dataset that is actually being mapped.
    remove_columns=chunked_dataset.column_names,
)
len(augmented_dataset)

Map (num_proc=23): 100%|██████████| 4235/4235 [00:08<00:00, 526.85 examples/s] 


6352

In [14]:
# Number of rows after augmentation (original chunks plus their augmented copies).
print(len(augmented_dataset))

6352


In [19]:
import librosa
import numpy as np

# -----------------------------
# Audio transforms for linear spectrogram
# -----------------------------
def convert_to_linear_spectrogram(batch):
    """Replace every waveform in the batch by its dB-scaled STFT magnitude.

    Args:
        batch: Mapping with parallel "audio" (sequences of samples) and "label" lists.

    Returns:
        dict whose "audio" entry holds one spectrogram per input waveform (in dB
        relative to that spectrogram's own maximum) and whose "label" entry holds
        the matching labels.
    """
    def _to_db(waveform):
        # STFT -> magnitude -> dB
        spectrum = librosa.stft(np.array(waveform), n_fft=2048, hop_length=512)
        return librosa.amplitude_to_db(np.abs(spectrum), ref=np.max)

    pairs = list(zip(batch["audio"], batch["label"]))
    return {
        "audio": [_to_db(waveform) for waveform, _ in pairs],
        "label": [label for _, label in pairs],
    }

# Convert every waveform of the augmented dataset into its dB spectrogram
spectrogram_map_options = {
    "batched": True,
    "batch_size": BATCH_SIZE,
    "num_proc": NUM_PROC,
    "remove_columns": augmented_dataset.column_names,
}
spectrogram_dataset = augmented_dataset.map(
    convert_to_linear_spectrogram, **spectrogram_map_options
)
len(spectrogram_dataset)


Map (num_proc=23): 100%|██████████| 6352/6352 [00:09<00:00, 682.83 examples/s] 


6352

In [25]:
# Persist the spectrogram dataset: the target path is the spectrogram one, and the
# raw augmented audio is not what was computed for it.
spectrogram_dataset.save_to_disk("../../data/datasets/1_augmented_spectogram_test")


Saving the dataset (1/1 shards): 100%|██████████| 6352/6352 [00:00<00:00, 40195.86 examples/s]


In [None]:
# import torchaudio
#
# # -----------------------------
# # Audio transforms
# # -----------------------------
# mel_transform = torchaudio.transforms.MelSpectrogram(
#     sample_rate=SAMPLING_RATE,
#     n_fft=2048,
#     hop_length=512,
#     n_mels=128,
# )
# db_transform = torchaudio.transforms.AmplitudeToDB()
#
# def convert_to_mel_spectogram(batch):
#     all_mel_db = []
#     all_labels = []
#
#     for audio, label in zip(batch["audio"], batch["label"]):
#         mel = mel_transform(audio)
#         mel_db = db_transform(mel)
#
#         all_mel_db.append(mel_db)
#         all_labels.append(label)
#
#
#     return {
#         "audio": all_mel_db,
#         "label": all_labels,
#     }
#
# spectogram_dataset = chunked_dataset.map(augmente_batch, batched=True, num_proc=NUM_PROC, batch_size=BATCH_SIZE, remove_columns=dataset.column_names)
# len(spectogram_dataset)

In [12]:
# NOTE(review): this writes `augmented_dataset` to a path named "mel_spectogram", but the
# only mel-spectrogram code in this notebook is commented out — confirm which dataset
# is meant to be saved here.
augmented_dataset.save_to_disk("../../data/datasets/1_augmented_mel_spectogram")

Saving the dataset (42/42 shards): 100%|██████████| 654521/654521 [00:26<00:00, 24417.11 examples/s]
