In [4]:
import os
import random
import librosa
import numpy as np
import soundfile as sf
import pandas as pd
from tqdm import tqdm
import deeplake

# Load NSynth dataset
ds = deeplake.load("hub://activeloop/nsynth-train")

# Output folder
# NOTE(review): the recorded run of this cell ended with
# "OSError: [WinError 433] ... 'mixed_out_of_tune_samples'", presumably raised
# by the makedirs call below. The path is relative, so the notebook's working
# directory is the likely culprit (e.g. an unavailable drive) -- confirm.
OUTPUT_PATH = "mixed_out_of_tune_samples"
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Parameters
sample_rate = 16000                 # Hz, used for every written mixture
duration_sec = 4                    # every clip is padded/trimmed to this length
pitch_shift_range = [-2, -1, 1, 2]  # candidate detune amounts, in semitones
mix_size = 3                        # number of clips summed into one mixture
total_samples = 1000                # number of mixtures to generate

# Instrument mapping: human-readable names
# NOTE(review): the id -> name pairs are taken as given; verify them against
# the dataset's own label metadata before relying on the generated labels.
INSTRUMENT_MAP = {
    0: 'string_bass',
    1: 'bass_guitar',
    18: 'flute',
    20: 'trumpet',
    21: 'trombone',
    22: 'tuba',
    24: 'guitar_acoustic',
    26: 'guitar_electric',
    47: 'viola',
    48: 'violin',
    50: 'saxophone',
    54: 'oboe',
    55: 'bassoon'
}

target_instruments = set(INSTRUMENT_MAP)

# Read the whole 'instrument' column in a single call instead of indexing the
# remote dataset once per sample (and then a second time for every kept
# sample), which costs one round trip per element.
# Assumes each sample carries exactly one instrument id; the size check below
# fails loudly if that does not hold.
all_instrument_ids = np.asarray(ds['instrument'].numpy()).reshape(-1)
if all_instrument_ids.size != len(ds):
    raise ValueError(
        f"Expected one instrument id per sample, got {all_instrument_ids.size} "
        f"ids for {len(ds)} samples"
    )

# Filter dataset once to only include target instruments
filtered_indices = [
    i for i, instrument_id in enumerate(all_instrument_ids)
    if int(instrument_id) in target_instruments
]

# (dataset index, instrument id) pairs for fast access in the generation loop
filtered_samples = [(i, int(all_instrument_ids[i])) for i in filtered_indices]

# Pitch shift: time-stretch, then resample back to the original duration
def simple_pitch_shift(y, semitones, sr):
    """Shift the pitch of ``y`` by ``semitones`` while keeping its length.

    The signal is first time-stretched by ``1 / rate`` (which changes its
    duration but not its pitch) and then resampled from ``sr * rate`` to
    ``sr``, which restores the duration and moves the pitch by ``rate``.

    Args:
        y: Audio signal passed to ``librosa.effects.time_stretch``.
        semitones: Shift amount; positive raises the pitch, negative lowers it.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        The shifted signal, padded or trimmed to ``len(y)``.
    """
    rate = 2 ** (semitones / 12)
    y_stretched = librosa.effects.time_stretch(y, rate=1 / rate)
    # The source rate must differ from the target rate by ``rate``. Resampling
    # from ``sr`` to ``sr`` is a no-op, which would leave the pitch unchanged
    # and only alter the clip's duration.
    y_shifted = librosa.resample(y_stretched, orig_sr=sr * rate, target_sr=sr)
    # Resampling can be off by a sample or so; force the exact input length.
    return librosa.util.fix_length(y_shifted, size=len(y))

# Metadata storage: one dict per generated mixture, written to CSV at the end
metadata = []

# Target length of every clip and of the mixture, in samples
clip_length = sample_rate * duration_sec

# random.sample() would raise a less descriptive ValueError on the first
# iteration; fail early with a message that names the actual problem.
if len(filtered_samples) < mix_size:
    raise ValueError(
        f"Need at least {mix_size} candidate samples to build a mixture, "
        f"found {len(filtered_samples)}"
    )

# Sample generation loop
for i in tqdm(range(total_samples)):
    chosen = random.sample(filtered_samples, mix_size)
    out_of_tune_idx = random.randint(0, mix_size - 1)
    pitch_shift = random.choice(pitch_shift_range)

    mixture = np.zeros((clip_length,), dtype=np.float32)
    # Keys are created up front so every row has the same key order; otherwise
    # the CSV column order depends on which position was picked as detuned.
    label = {'out_of_tune': None, 'in_tune': []}
    all_instruments = []

    for idx, (sample_idx, instrument_id) in enumerate(chosen):
        instrument_name = INSTRUMENT_MAP.get(instrument_id, str(instrument_id))
        all_instruments.append(instrument_name)

        # Materialise the tensor with .numpy() (as done for the instrument
        # labels) rather than handing the lazy tensor object to np.array().
        y = np.asarray(ds[sample_idx]['audios'].numpy(), dtype=np.float32)
        if y.ndim > 1:
            y = y.flatten()
        y = librosa.util.fix_length(y, size=clip_length)

        if idx == out_of_tune_idx:
            # Exactly one clip per mixture is detuned.
            y = simple_pitch_shift(y, pitch_shift, sample_rate)
            label['out_of_tune'] = instrument_name
        else:
            label['in_tune'].append(instrument_name)

        mixture += y

    # Normalize audio to (just under) unit peak; the epsilon avoids a division
    # by zero when the mixture is all silence.
    mixture /= np.max(np.abs(mixture) + 1e-6)

    # Save file
    filename = f"mix_{i}.wav"
    sf.write(os.path.join(OUTPUT_PATH, filename), mixture, sample_rate)

    # Metadata
    label['filename'] = filename
    label['instruments_all'] = all_instruments
    metadata.append(label)

# Save metadata CSV (list-valued cells are written as their Python repr)
df = pd.DataFrame(metadata)
df.to_csv(os.path.join(OUTPUT_PATH, "labels.csv"), index=False)


/

Opening dataset in read-only mode as you don't have write permissions.


\

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/nsynth-train



|

hub://activeloop/nsynth-train loaded successfully.



 

OSError: [WinError 433] A device which does not exist was specified: 'mixed_out_of_tune_samples'