In [18]:
import torch
import torchaudio
from datasets import load_dataset, DatasetDict
from tqdm import tqdm
import numpy as np

# Seed passed to every train_test_split call below so the splits are reproducible.
SEED = 42
# Sample rate in Hz given to the mel-spectrogram transform.
# NOTE(review): nothing in view resamples the audio, so the clips are assumed
# to already be at this rate — TODO confirm against the dataset.
SAMPLE_RATE = 16000

In [20]:
# Load dataset
# Only the "train" split of the loaded dataset is used; the validation and
# test sets are carved out of it below.
ds = load_dataset("Usernameeeeee/df_462700_2")
# First cut: 70% train / 30% held out (shuffled, seeded with SEED).
ds_split = ds["train"].train_test_split(test_size=0.3, seed=SEED, shuffle=True)
# Second cut: halve the held-out 30% -> 15% validation and 15% test overall.
test_and_valid = ds_split["test"].train_test_split(test_size=0.5, seed=SEED, shuffle=True)


# NOTE: `ds` is rebound here, from the raw loaded dataset to the final
# three-way train/valid/test split used by the cells below.
ds = DatasetDict({
    "train": ds_split["train"],
    "valid": test_and_valid["train"],
    "test": test_and_valid["test"],
})


In [22]:
# Define transforms
# Spectrogram settings are named so they can be tuned in one place instead of
# being buried as literals in the constructor call.
N_FFT = 2048      # FFT size in samples (128 ms at SAMPLE_RATE = 16000)
HOP_LENGTH = 256  # samples between successive frames (16 ms at SAMPLE_RATE = 16000)
N_MELS = 64       # number of mel filterbank bands

mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS,
)
# Left at its defaults: AmplitudeToDB interprets its input as a power
# spectrogram, which matches MelSpectrogram's default power=2.0 output.
db_transform = torchaudio.transforms.AmplitudeToDB()

def preprocess_split(split_data, split_name):
    """Convert every clip of one dataset split into a dB-scaled mel spectrogram.

    Args:
        split_data: Iterable of examples. Each example is indexed with
            ``"audio"`` (raw samples convertible to a tensor) and ``"label"``.
        split_name: Name of the split; used only in the progress-bar text.

    Returns:
        list[dict]: One dict per example with keys ``"mel_spectrogram"``
        (NumPy array from ``db_transform(mel_transform(waveform))``) and
        ``"label"`` (copied through unchanged).

    Notes:
        Clips too short for the STFT's reflect padding are right-padded with
        zeros to the minimum valid length instead of raising mid-run.
    """
    # mel_transform is built with the default centred STFT, which reflect-pads
    # n_fft // 2 samples on each side. Reflect padding needs strictly more
    # input samples than padding, so anything shorter would raise.
    min_samples = mel_transform.n_fft // 2 + 1
    processed = []

    for item in tqdm(split_data, desc=f"Processing {split_name}"):
        # as_tensor accepts lists, arrays and tensors alike and converts
        # straight to float32 without the extra copy of tensor(...).float().
        waveform = torch.as_tensor(item["audio"], dtype=torch.float32)

        # Convert to mono (assumes the leading axis is channels — TODO confirm)
        if waveform.ndim > 1:
            waveform = waveform.mean(dim=0)

        # Zero-pad very short clips so a single one cannot abort the whole pass.
        shortfall = min_samples - waveform.shape[-1]
        if shortfall > 0:
            waveform = torch.nn.functional.pad(waveform, (0, shortfall))

        # Compute mel spectrogram and move it to the dB scale
        mel = mel_transform(waveform)
        mel_db = db_transform(mel)

        processed.append({
            "mel_spectrogram": mel_db.numpy(),
            "label": item["label"]
        })

    return processed

# Run the preprocessing over each split, keyed by split name
print("Preprocessing dataset...")
processed_ds = {
    name: preprocess_split(ds[name], name)
    for name in ("train", "valid", "test")
}

Preprocessing dataset...


Processing train:   2%|▏         | 6285/323890 [00:20<17:32, 301.73it/s]


KeyboardInterrupt: 