# In [None]:
import os
import random
import librosa
import librosa.display
import numpy as np
import pandas as pd
from tqdm import tqdm
import deeplake
import matplotlib.pyplot as plt

# Open the NSynth training split hosted on the Activeloop hub.
ds = deeplake.load("hub://activeloop/nsynth-train")

# Directory that receives the spectrogram images and labels.csv.
OUTPUT_PATH = "mixed_out_of_tune_samples"
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Generation settings.
sample_rate, duration_sec = 16_000, 4  # Hz, seconds per generated clip
mix_size, total_samples = 3, 10  # instruments per mix, mixes to generate
pitch_shift_range = [-2, -1, 1, 2]  # candidate detune amounts, in semitones

def simple_pitch_shift(y, semitones, sr):
    """Shift the pitch of ``y`` by ``semitones`` while keeping its length.

    The signal is time-stretched by the pitch ratio and then resampled back
    to the original duration, which scales every frequency by that ratio.
    This is the same two-step recipe ``librosa.effects.pitch_shift`` uses.

    Args:
        y: Audio signal as a NumPy array (the caller passes 1-D mono audio).
        semitones: Signed shift in semitones; positive raises the pitch.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        A new array with the same number of samples as ``y``.
    """
    if semitones == 0:
        # Nothing to shift: skip the phase vocoder (and its artifacts).
        return np.array(y, copy=True)

    rate = 2 ** (semitones / 12)
    # Step 1: make the clip `rate` times longer without touching the pitch.
    y_stretched = librosa.effects.time_stretch(y, rate=1 / rate)
    # Step 2: resample from sr * rate down to sr. This squeezes the clip back
    # to its original duration and multiplies every frequency by `rate`.
    # (Resampling from sr to sr, as before, would leave the pitch unchanged.)
    y_shifted = librosa.resample(y_stretched, orig_sr=sr * rate, target_sr=sr)
    # Resampling can be off by a sample or two; force the exact input length.
    return librosa.util.fix_length(y_shifted, size=len(y))

# One label dict per generated mix; written to labels.csv at the end.
metadata = []

# Both values are loop-invariant, so compute them once.
clip_length = sample_rate * duration_sec
dataset_size = len(ds)

for i in tqdm(range(total_samples)):
    # Draw mix_size distinct dataset indices and fetch only those samples.
    # Sampling indices (instead of materialising list(ds)) avoids walking the
    # entire dataset on every iteration.
    chosen_indices = random.sample(range(dataset_size), mix_size)
    chosen_samples = [ds[index] for index in chosen_indices]
    out_of_tune_idx = random.randint(0, mix_size - 1)
    pitch_shift = random.choice(pitch_shift_range)

    # Accumulator for the summed audio of all chosen instruments.
    mixture = np.zeros((clip_length,), dtype=np.float32)
    # Create both keys up front so the CSV column order does not depend on
    # which position happens to be the detuned one.
    label = {'out_of_tune': None, 'in_tune': []}

    for idx, sample in enumerate(chosen_samples):
        # Read the audio through .numpy(), the same accessor used for the
        # instrument tensor below, then collapse any extra axes to 1-D.
        y = np.asarray(sample["audios"].numpy(), dtype=np.float32)
        if y.ndim > 1:
            y = y.flatten()
        # Pad or trim so every clip adds cleanly into the mixture buffer.
        y = librosa.util.fix_length(y, size=clip_length)

        # Convert instrument value to a string (unwrap 0-d arrays first).
        instrument_val = sample["instrument"].numpy()
        if instrument_val.ndim == 0:
            instrument_val = instrument_val.item()
        instrument_name = str(instrument_val)

        # Exactly one instrument per mix is detuned; the rest stay as-is.
        if idx == out_of_tune_idx:
            y = simple_pitch_shift(y, pitch_shift, sample_rate)
            label['out_of_tune'] = instrument_name
        else:
            label['in_tune'].append(instrument_name)

        mixture += y

    # Peak-normalise; the epsilon keeps an all-silent mix from dividing by 0.
    mixture /= np.max(np.abs(mixture) + 1e-6)

    # Generate and save the mel-spectrogram image.
    filename_png = f"mix_{i}_spec.png"
    # `y` is passed by keyword: recent librosa releases make it keyword-only.
    S = librosa.feature.melspectrogram(y=mixture, sr=sample_rate, n_mels=128)
    S_dB = librosa.power_to_db(S, ref=np.max)

    fig = plt.figure(figsize=(10, 4))
    try:
        librosa.display.specshow(S_dB, sr=sample_rate, x_axis='time', y_axis='mel')
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel-frequency Spectrogram')
        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_PATH, filename_png))
    finally:
        # Always release the figure, even if rendering or saving fails.
        plt.close(fig)

    label['spectrogram_filename'] = filename_png
    metadata.append(label)

# Save metadata to CSV
df = pd.DataFrame(metadata)
df.to_csv(os.path.join(OUTPUT_PATH, "labels.csv"), index=False)
