In [2]:
from datasets import load_dataset
from datasets import ClassLabel, Audio, DatasetDict
import numpy as np
import librosa
import matplotlib.pyplot as plt
from IPython.display import display, Audio as DisplayAudio
import torch
import librosa
import numpy as np

# Seed handed to shuffle() and train_test_split() so the splits are repeatable.
SEED = 42
# Worker-process count passed to Dataset.map(num_proc=...).
NUM_PROC = 24
# Rate (Hz) the "audio" column is cast to; also the default rate used when chunking.
SAMPLING_RATE = 16000
# Default chunk length; multiplied by the sampling rate to get samples per chunk
# (0.5 * 16000 = 8000), so presumably expressed in seconds.
CHUNK_DURATION = 0.5
# Rows per batch passed to Dataset.map(batch_size=...).
BATCH_SIZE = 32
# Only referenced by the commented-out augmentation cell further down.
THRESHOLD_AUGMENTATION = 1

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print("Loading DS")
# Fetch the "n1coc4cola/maotouying" dataset through datasets.load_dataset.
DS = load_dataset("n1coc4cola/maotouying")
# len() here counts the splits in the returned object (the recorded output is 1),
# not the number of examples.
print(len(DS))
# The cells below derive their own train/val/test splits from this "train" split.
DS_train = DS["train"]

Loading DS
1


In [4]:
print("Shuffling")
# `n` is the fraction (0.0-1.0) of the shuffled training split to keep;
# 1.0 keeps every example.
n = 1.0
subset_size = int(n * len(DS_train))
DS_train_shuffled = DS_train.shuffle(seed=SEED).select(range(subset_size))

Shuffling


In [5]:
# First carve off 20% of the shuffled data, then halve that hold-out so the
# final proportions are 80% train / 10% val / 10% test.
train_test = DS_train_shuffled.train_test_split(test_size=0.2, seed=SEED)
test_val = train_test["test"].train_test_split(test_size=0.5, seed=SEED)

dataset = DatasetDict()
dataset["train"] = train_test["train"]
dataset["val"] = test_val["train"]
dataset["test"] = test_val["test"]

In [6]:
# Re-declare the "audio" column of every split at SAMPLING_RATE (16 kHz).
target_audio = Audio(sampling_rate=SAMPLING_RATE)
for split in list(dataset):
    dataset[split] = dataset[split].cast_column("audio", target_audio)

In [7]:
def noise_injection(data, noise_factor):
    """Add scaled standard-normal noise to a signal.

    Args:
        data: Array-like of samples. Any shape is accepted, including empty.
        noise_factor: Multiplier applied to the standard-normal noise before
            it is added to ``data``.

    Returns:
        A new NumPy array with the same shape and dtype as ``data``; the input
        is not modified.
    """
    samples = np.asarray(data)
    # Draw noise with the full shape of the input rather than len(data) so
    # multi-dimensional signals broadcast correctly.
    noise = np.random.standard_normal(samples.shape)
    # Cast back using the array dtype. The previous type(data[0]) lookup
    # raised IndexError on empty input and failed for multi-dimensional input.
    return (samples + noise_factor * noise).astype(samples.dtype)

def change_pitch(data, sampling_rate, pitch_factor):
    """Pitch-shift ``data`` with ``librosa.effects.pitch_shift``.

    Args:
        data: Audio samples, passed to librosa unchanged.
        sampling_rate: Forwarded as librosa's ``sr`` argument.
        pitch_factor: Forwarded as librosa's ``n_steps`` argument.

    Returns:
        Whatever ``librosa.effects.pitch_shift`` returns for these arguments.
    """
    shifted = librosa.effects.pitch_shift(
        data,
        n_steps=pitch_factor,
        sr=sampling_rate,
    )
    return shifted

def time_stretch(data, stretch_factor):
    """Time-stretch ``data`` with ``librosa.effects.time_stretch``.

    Args:
        data: Audio samples, passed to librosa unchanged.
        stretch_factor: Forwarded as librosa's ``rate`` argument.

    Returns:
        Whatever ``librosa.effects.time_stretch`` returns for these arguments.
    """
    stretched = librosa.effects.time_stretch(data, rate=stretch_factor)
    return stretched

def frequency_mask(data, mask_width=10):
    """Zero a random band of STFT frequency bins and resynthesize the signal.

    Args:
        data: Audio samples as accepted by ``librosa.stft``.
        mask_width: Number of consecutive frequency bins to zero. Values
            outside ``[0, number_of_bins]`` are clamped into that range.

    Returns:
        The time-domain signal rebuilt from the masked STFT, with the same
        number of samples along the last axis as ``data``.
    """
    stft = librosa.stft(data)
    freq_bins = stft.shape[0]
    # Clamp so an oversized mask blanks every bin instead of making randint
    # raise on an empty range.
    width = max(0, min(mask_width, freq_bins))
    # Upper bound is exclusive, hence the +1: the highest band is selectable too.
    f0 = np.random.randint(0, freq_bins - width + 1)
    stft[f0:f0 + width, :] = 0
    # Pin the output length: without `length` the inverse STFT may return a
    # different number of samples than the input had.
    return librosa.istft(stft, length=np.shape(data)[-1])

def time_mask(data, mask_width=10):
    """Zero a random contiguous run of samples.

    Args:
        data: 1-D array-like of samples. The input is not modified.
        mask_width: Number of consecutive samples to zero. Values outside
            ``[0, len(data)]`` are clamped into that range.

    Returns:
        A copy of ``data`` as a NumPy array with ``mask_width`` consecutive
        samples (or all of them, if the signal is shorter) set to zero.
    """
    d = np.array(data)  # np.array copies, so the caller's buffer stays intact
    # Clamp so a signal shorter than the mask is fully blanked instead of
    # making randint raise on an empty range.
    width = max(0, min(mask_width, len(d)))
    # Upper bound is exclusive, hence the +1: the final window is selectable too.
    t0 = np.random.randint(0, len(d) - width + 1)
    d[t0:t0 + width] = 0
    return d

def dynamic_range_compression(data, threshold, ratio):
    """Compress samples whose magnitude exceeds ``threshold``.

    The part of each sample's magnitude that lies above ``threshold`` is
    divided by ``ratio``; the sample's sign is preserved, so positive and
    negative peaks are treated symmetrically.

    Args:
        data: Array-like of samples. The input is not modified.
        threshold: Magnitude above which compression is applied.
        ratio: Compression ratio; must be positive. ``1`` leaves the signal
            unchanged.

    Returns:
        A new NumPy array of the same shape as ``data``.

    Raises:
        ValueError: If ``ratio`` is not positive.
    """
    if ratio <= 0:
        raise ValueError(f"ratio must be positive, got {ratio!r}")

    samples = np.asarray(data)
    magnitude = np.abs(samples)
    loud = magnitude > threshold

    compressed = samples.copy()
    # Work on the magnitude and restore the sign afterwards. Applying the
    # formula to the signed value pushed negative peaks the wrong way.
    compressed[loud] = np.sign(samples[loud]) * (
        threshold + (magnitude[loud] - threshold) / ratio
    )
    return compressed

In [8]:
# Sanity check: len() of the DatasetDict counts its splits (train/val/test),
# while len() of each split counts that split's rows.
print(f"Size of dataset: {len(dataset)}")
print(f"Size of splits: train={len(dataset['train'])}, val={len(dataset['val'])}, test={len(dataset['test'])}")

Size of dataset: 3
Size of splits: train=168771, val=21096, test=21097


In [9]:
def split_audio_into_chunks(audio_array, chunk_duration=CHUNK_DURATION, sampling_rate=SAMPLING_RATE):
    """Cut a signal into consecutive, equally sized chunks.

    Only full chunks are produced: trailing samples that do not fill a whole
    chunk are dropped, and nothing is padded.

    Args:
        audio_array: Array or tensor whose last axis is the sample axis.
        chunk_duration: Chunk length; multiplied by ``sampling_rate`` to get
            the number of samples per chunk.
        sampling_rate: Samples per unit of ``chunk_duration``.

    Returns:
        A list of slices of ``audio_array``, each ``samples_per_chunk`` long
        along the last axis. Empty if the signal is shorter than one chunk.
    """
    samples_per_chunk = int(chunk_duration * sampling_rate)
    num_chunks = audio_array.shape[-1] // samples_per_chunk
    # Slice along the last axis, the same axis the chunk count is derived
    # from. Identical to plain slicing for 1-D input.
    return [
        audio_array[..., start:start + samples_per_chunk]
        for start in range(0, num_chunks * samples_per_chunk, samples_per_chunk)
    ]

def chunk_audio_batch(batch):
    """Expand a batch of clips into a batch of fixed-length chunks.

    Written for ``Dataset.map(batched=True)``: each input row may yield zero
    or more output rows, and every chunk inherits the label of its clip.

    Args:
        batch: Mapping with an ``"audio"`` column (each item exposing an
            ``"array"`` entry) and a parallel ``"label"`` column.

    Returns:
        Dict with ``"audio"`` (list of float32 NumPy chunks) and ``"label"``
        (one label per chunk). The sampling rate is not part of the output.
    """
    all_audios = []
    all_labels = []

    for audio, label in zip(batch["audio"], batch["label"]):
        waveform = torch.tensor(audio["array"]).float()
        chunks = split_audio_into_chunks(waveform)

        all_audios.extend(chunk.numpy() for chunk in chunks)
        # Repeat the clip label once per chunk so the two columns stay aligned.
        all_labels.extend([label] * len(chunks))

    return {
        "audio": all_audios,
        "label": all_labels,
    }


# Run the chunker over every split, replacing the original columns with the
# chunk-level "audio" / "label" columns.
chunked_dataset = DatasetDict()
for split in dataset:
    print(f"Chunking split: {split}, original length: {len(dataset[split])}")
    source_split = dataset[split]
    chunked_split = source_split.map(
        chunk_audio_batch,
        remove_columns=source_split.column_names,
        batched=True,
        batch_size=BATCH_SIZE,
        num_proc=NUM_PROC,
    )
    print(f"Generated split {split} length, {len(chunked_split)}")
    chunked_dataset[split] = chunked_split

Chunking split: train, original length: 168771
Generated split train length, 617500
Chunking split: val, original length: 21096
Generated split val length, 78726
Chunking split: test, original length: 21097
Generated split test length, 78483


In [10]:

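# NOTE: this augmentation pass is disabled and kept for reference only.
# Before re-enabling it, note that rng.random() returns values in [0, 1), so
# the `> THRESHOLD_AUGMENTATION` gate below never fires while the threshold is 1.
# def augmente_batch(batch):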
#     rng = np.random.default_rng()
#     all_audios = []
#     all_labels = []
#
#     for audio, label in zip(batch["audio"], batch["label"]):
#         all_audios.append(audio)
#         all_labels.append(label)
#
#         augmented_audio = None
#         if rng.random() > THRESHOLD_AUGMENTATION:
#             r = rng.random()
#             data = np.array(audio)
#             if r < 0.20:
#                 augmented_audio = noise_injection(data, noise_factor=0.05)
#             elif r < 0.40:
#                 augmented_audio = change_pitch(data, sampling_rate=SAMPLING_RATE, pitch_factor=2)
#             elif r < 0.60:
#                 augmented_audio = time_stretch(data, stretch_factor=1.1)
#             elif r < 0.80:
#                 augmented_audio = dynamic_range_compression(data, threshold=0.05, ratio=1.0)
#             else:
#                 augmented_audio = frequency_mask(data, mask_width=20)
#
#             all_audios.append(augmented_audio)
#             all_labels.append(label)
#
#
#     return {
#         "audio": all_audios,
#         "label": all_labels,
#     }
#
# augmented_dataset = DatasetDict()
# for split in chunked_dataset.keys():
#     print(f"Augmenting split: {split}, length before: {len(chunked_dataset[split])}")
#     augmented_split = chunked_dataset[split].map(
#         augmente_batch,
#         batched=True,
#         num_proc=NUM_PROC,
#         batch_size=BATCH_SIZE,
#         remove_columns=chunked_dataset[split].column_names,
#     )
#     print(f"Length after: {len(augmented_split)}")
#     augmented_dataset[split] = augmented_split


In [11]:
# # -----------------------------
# # Audio transforms for linear spectrogram
# # -----------------------------
# def convert_to_linear_spectrogram(batch):
#     all_linear_db = []
#     all_labels = []
#
#     for audio, label in zip(batch["audio"], batch["label"]):
#         data = np.array(audio)
#         # Compute STFT
#         stft = librosa.stft(data, n_fft=2048, hop_length=512)
#
#         # Compute magnitude
#         magnitude = np.abs(stft)
#
#         # Convert to dB
#         linear_db = librosa.amplitude_to_db(magnitude, ref=np.max)
#         all_linear_db.append(torch.tensor(linear_db))
#         all_labels.append(torch.tensor(label))
#
#     return {
#         # Convert to torch tensors
#         "audio": all_linear_db,
#         "label": all_labels,
#     }
#
# spectrogram_dataset = DatasetDict()
# for split in chunked_dataset.keys():
#     print(f"Converting to spectrogram split: {split}")
#     spectrogram_split = chunked_dataset[split].map(
#         convert_to_linear_spectrogram,
#         batched=True,
#         num_proc=NUM_PROC,
#         batch_size=BATCH_SIZE,
#         remove_columns=chunked_dataset[split].column_names,
#     )
#     spectrogram_dataset[split] = spectrogram_split


In [12]:
# Persist all chunked splits to a local directory.
chunked_output_dir = "./ds_3_raw_chunked.hf"
chunked_dataset.save_to_disk(chunked_output_dir)

Saving the dataset (40/40 shards): 100%|██████████| 617500/617500 [00:46<00:00, 13175.66 examples/s]
Saving the dataset (6/6 shards): 100%|██████████| 78726/78726 [00:03<00:00, 23421.35 examples/s]
Saving the dataset (6/6 shards): 100%|██████████| 78483/78483 [00:05<00:00, 15082.57 examples/s]
