In [9]:
import os, random
from gtts import gTTS
from pydub import AudioSegment, effects
from tqdm import tqdm
import numpy as np


In [7]:
cwd = os.getcwd()
print(cwd)

/Users/sethwright/Documents/audio-model


In [10]:
# === CONFIGURATION ===
OUTPUT_DIR = "/Users/sethwright/Documents/audio-model/data"
MUSAN_DIR = "/Users/sethwright/Downloads/musan"       # root folder of MUSAN dataset
N_POS = 300               # number of positive "Hey Chef" samples
N_NEG = 600               # number of negative samples (mix of hard + random)
SAMPLE_RATE = 16000

# make directories
os.makedirs(f"{OUTPUT_DIR}/positive", exist_ok=True)
os.makedirs(f"{OUTPUT_DIR}/negative", exist_ok=True)

# hard negatives
HARD_NEGATIVES = ["Hey chief", "Hey Steph", "Okay chef"]
# random filler negatives
RANDOM_PHRASES = [
    "How are you", "Good morning", "Turn on the light", "Hey there", 
    "Check this out", "Let's cook", "Hello", "Okay", "Hey Jeff", "What's up"
]

def random_speed_change(sound):
    speed = random.uniform(0.9, 1.2)
    sound = sound._spawn(sound.raw_data, overrides={
        "frame_rate": int(sound.frame_rate * speed)
    })
    return sound.set_frame_rate(SAMPLE_RATE)

def random_pitch_shift(sound):
    semitones = random.uniform(-1.5, 1.5)
    new_sr = int(sound.frame_rate * (2.0 ** (semitones / 12.0)))
    shifted = sound._spawn(sound.raw_data, overrides={'frame_rate': new_sr})
    return shifted.set_frame_rate(SAMPLE_RATE)

def get_random_musan_clip():
    """Pick a random short clip from MUSAN (speech, noise, or music)."""
    subfolders = ["speech", "noise", "music"]
    folder = random.choice(subfolders)
    full_dir = os.path.join(MUSAN_DIR, folder)
    if not os.path.exists(full_dir):
        return None
    candidates = []
    for root, _, files in os.walk(full_dir):
        for f in files:
            if f.endswith((".wav", ".mp3")):
                candidates.append(os.path.join(root, f))
    if not candidates:
        return None
    return random.choice(candidates)

def add_noise(sound):
    noise_path = get_random_musan_clip()
    if not noise_path:
        return sound
    noise = AudioSegment.from_file(noise_path).set_frame_rate(SAMPLE_RATE).set_channels(1)
    if len(noise) > len(sound):
        start = random.randint(0, len(noise) - len(sound))
        noise = noise[start:start + len(sound)]
    else:
        noise = noise * (len(sound) // len(noise) + 1)
    noise = noise[:len(sound)]
    noise = noise - random.randint(8, 20)  # reduce noise volume
    return sound.overlay(noise)

def synthesize_tts(text, out_path):
    tts = gTTS(text=text, lang="en")
    temp_path = out_path.replace(".wav", ".mp3")
    tts.save(temp_path)
    sound = AudioSegment.from_mp3(temp_path).set_channels(1).set_frame_rate(SAMPLE_RATE)
    os.remove(temp_path)
    # augmentations
    if random.random() < 0.7: sound = random_speed_change(sound)
    if random.random() < 0.7: sound = random_pitch_shift(sound)
    if random.random() < 0.7: sound = add_noise(sound)
    if random.random() < 0.3: sound = effects.normalize(sound)
    sound.export(out_path, format="wav")

# === Generate positives ===
print(f"Generating {N_POS} positive 'Hey Chef' samples...")
for i in tqdm(range(N_POS)):
    synthesize_tts("Hey Chef", f"{OUTPUT_DIR}/positive/heychef_{i}.wav")

# === Generate negatives ===
print(f"Generating {N_NEG} negative samples...")
for i in tqdm(range(N_NEG)):
    if random.random() < 0.3:
        text = random.choice(HARD_NEGATIVES)
    else:
        text = random.choice(RANDOM_PHRASES)
    synthesize_tts(text, f"{OUTPUT_DIR}/negative/neg_{i}.wav")

print("✅ Done! Dataset ready in:", os.path.abspath(OUTPUT_DIR))
print(f" - Positive samples: {N_POS}")
print(f" - Negative samples: {N_NEG}")

Generating 300 positive 'Hey Chef' samples...


100%|█████████████████████████████████████████| 300/300 [01:23<00:00,  3.61it/s]


Generating 600 negative samples...


100%|█████████████████████████████████████████| 600/600 [02:45<00:00,  3.62it/s]

✅ Done! Dataset ready in: /Users/sethwright/Documents/audio-model/data
 - Positive samples: 300
 - Negative samples: 600



