# Augment Audio — Folder Batch

Applies **training augmentations** to every WAV file under a folder (searched recursively):
- Background noise mixing (at random SNR)
- Music overlay
- Competing speech overlay
- Room impulse response (reverb)
- Pitch shifting
- Speed perturbation

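Each augmentation is applied independently with a configurable probability (`AUG_PROB`).
Noise, music, competing-speech, and RIR WAV files are read from separate folders; leaving a folder unset disables that augmentation.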

---

> No GPU required. Uses `librosa`, `soundfile`, `numpy`, and `tqdm`.

In [None]:
import os

# ════════════════════════════════════════════
# ⚙️  CONFIGURATION
# ════════════════════════════════════════════
# Default value for every setting. `setdefault` only fills in what is missing,
# so a variable that is already exported in the environment always wins.
_DEFAULTS = (
    ("INPUT_DIR",     "/data/audio_in"),
    ("OUTPUT_DIR",    "/data/audio_augmented"),
    ("BG_NOISE_DIR",  ""),        # Background noise WAVs
    ("MUSIC_DIR",     ""),        # Music WAVs
    ("BG_SPEECH_DIR", ""),        # Competing speaker WAVs
    ("RIR_DIR",       ""),        # Room impulse response WAVs
    ("SNR_MIN",       "5"),       # dB
    ("SNR_MAX",       "20"),      # dB
    ("PITCH_MIN",     "-2.0"),    # semitones
    ("PITCH_MAX",     "2.0"),
    ("SPEED_MIN",     "0.9"),
    ("SPEED_MAX",     "1.1"),
    ("AUG_PROB",      "0.7"),     # Per-augmentation probability
    ("TARGET_SR",     "16000"),   # All audio normalized to this SR
)
for _name, _default in _DEFAULTS:
    os.environ.setdefault(_name, _default)

# Typed views of the settings used by the processing cell.
_env = os.environ
INPUT_DIR, OUTPUT_DIR = _env["INPUT_DIR"], _env["OUTPUT_DIR"]
SNR_MIN, SNR_MAX = float(_env["SNR_MIN"]), float(_env["SNR_MAX"])
PITCH_MIN, PITCH_MAX = float(_env["PITCH_MIN"]), float(_env["PITCH_MAX"])
SPEED_MIN, SPEED_MAX = float(_env["SPEED_MIN"]), float(_env["SPEED_MAX"])
AUG_PROB = float(_env["AUG_PROB"])
SR = int(_env["TARGET_SR"])

# Echo the effective configuration.
for _line in (
    f"Input:  {INPUT_DIR}",
    f"Output: {OUTPUT_DIR}",
    f"SNR:    {SNR_MIN}–{SNR_MAX} dB | Pitch: {PITCH_MIN}–{PITCH_MAX} st",
    f"Speed:  {SPEED_MIN}–{SPEED_MAX} | Prob: {AUG_PROB}",
):
    print(_line)

In [None]:
!pip install --quiet --break-system-packages librosa soundfile numpy tqdm

In [None]:
import os, random
import numpy as np
import librosa
import soundfile as sf
from pathlib import Path
from tqdm import tqdm


def collect_wavs(d):
    """Return the sorted paths matching ``*.wav`` anywhere under folder *d*.

    Leading and trailing whitespace in *d* is ignored. If the remaining
    string is empty or does not name an existing directory, an empty list is
    returned. Otherwise the directory is searched recursively and the matches
    are returned as a sorted list of ``pathlib.Path`` objects.
    """
    folder = d.strip()
    if folder and os.path.isdir(folder):
        matches = list(Path(folder).rglob("*.wav"))
        matches.sort()
        return matches
    return []


def load_mono(path, sr):
    """Load an audio file as a mono waveform at sample rate *sr*.

    Args:
        path: File to read; converted with ``str()`` before loading.
        sr: Target sample rate handed to ``librosa.load``.

    Returns:
        The waveform returned by ``librosa.load(..., mono=True)``, or ``None``
        if loading raised any exception. A ``RuntimeWarning`` naming the file
        and the error is emitted in that case.
    """
    try:
        y, _ = librosa.load(str(path), sr=sr, mono=True)
    except Exception as exc:
        # Loading is best-effort by design: callers treat None as "skip this
        # file". Report the reason instead of swallowing it so unreadable or
        # corrupt files can be tracked down after a batch run.
        import warnings

        warnings.warn(f"Could not load {path}: {exc}", RuntimeWarning, stacklevel=2)
        return None
    return y


def mix_at_snr(signal, noise, snr_db):
    """Mix noise into signal at given SNR.

    A random window of *noise* with the same length as *signal* is chosen
    (noise shorter than the signal is looped first). The window is scaled so
    that the ratio of mean signal power to mean scaled-noise power equals
    ``10 ** (snr_db / 10)`` and is then added to the signal.

    Args:
        signal: 1-D NumPy array of samples.
        noise: 1-D NumPy array of noise samples; any length.
        snr_db: Target signal-to-noise ratio in dB.

    Returns:
        ``signal + scale * noise_window``, the same length as *signal*. If
        either input is empty, *signal* is returned unchanged.
    """
    sig_len = len(signal)
    if sig_len == 0 or len(noise) == 0:
        # Nothing to mix. Without this guard an empty noise clip raises
        # ZeroDivisionError below and an empty signal yields NaN power.
        return signal

    # Loop short noise so a full-length window can always be cut from it.
    if len(noise) < sig_len:
        reps = (sig_len // len(noise)) + 1
        noise = np.tile(noise, reps)
    start = random.randint(0, max(0, len(noise) - sig_len))
    noise_seg = noise[start : start + sig_len]

    # The small epsilon keeps the ratio finite for silent inputs.
    sig_power = np.mean(signal ** 2) + 1e-10
    noise_power = np.mean(noise_seg ** 2) + 1e-10
    scale = np.sqrt(sig_power / (noise_power * 10 ** (snr_db / 10)))
    return signal + scale * noise_seg


def apply_reverb(signal, rir):
    """Convolve signal with room impulse response.

    The convolution is computed in the frequency domain, truncated to the
    length of *signal*, and rescaled so that its peak absolute value matches
    the peak of the dry signal.

    Args:
        signal: 1-D NumPy array of real samples.
        rir: 1-D NumPy array holding the impulse response.

    Returns:
        The reverberated signal, the same length as *signal*. *signal* is
        returned unchanged when either input is empty or when the convolved
        result is silent (for example an all-zero impulse response), since
        rescaling would otherwise replace the audio with zeros.
    """
    if len(signal) == 0 or len(rir) == 0:
        return signal

    # Full linear-convolution length; zero-padding both inputs to n avoids
    # circular wrap-around. Inputs are real, so the real FFT pair suffices.
    n = len(signal) + len(rir) - 1
    spectrum = np.fft.rfft(signal, n) * np.fft.rfft(rir, n)
    result = np.fft.irfft(spectrum, n)[:len(signal)]

    peak = np.max(np.abs(result))
    if peak <= 1e-10:
        # Silent output: keep the dry signal rather than emitting zeros.
        return signal
    return result * (np.max(np.abs(signal)) / peak)


def pitch_shift(y, sr, n_steps):
    """Return *y* pitch-shifted by *n_steps* using ``librosa.effects.pitch_shift``.

    *sr* is the sample rate of *y*; both *sr* and *n_steps* are forwarded to
    librosa as keyword arguments.
    """
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    return shifted


def speed_perturb(y, factor):
    """Return *y* time-stretched with ``librosa.effects.time_stretch``.

    *factor* is forwarded to librosa as the ``rate`` keyword argument.
    """
    stretched = librosa.effects.time_stretch(y, rate=factor)
    return stretched


# Collect augmentation sources. An unset or missing folder yields an empty
# pool, which disables the corresponding augmentation below.
noise_files  = collect_wavs(os.environ.get("BG_NOISE_DIR", ""))
music_files  = collect_wavs(os.environ.get("MUSIC_DIR", ""))
speech_files = collect_wavs(os.environ.get("BG_SPEECH_DIR", ""))
rir_files    = collect_wavs(os.environ.get("RIR_DIR", ""))

print(f"Noise: {len(noise_files)} | Music: {len(music_files)} | "
      f"Speech: {len(speech_files)} | RIR: {len(rir_files)}")

os.makedirs(OUTPUT_DIR, exist_ok=True)
files = sorted(Path(INPUT_DIR).rglob("*.wav"))
print(f"Found {len(files)} source files")

done, skipped, failed = 0, 0, 0
for src in tqdm(files, desc="Augmenting", unit="file"):
    # Mirror the input folder structure under OUTPUT_DIR.
    dst = Path(OUTPUT_DIR) / src.relative_to(INPUT_DIR)

    # Resume support: an output that already exists is never regenerated.
    if dst.exists():
        skipped += 1
        continue

    y = load_mono(src, SR)
    if y is None:
        # Report load failures the same way as processing failures so the
        # "failed" count in the summary can be traced back to files.
        tqdm.write(f"  FAIL {src.name}: could not load audio")
        failed += 1
        continue

    # The result is written to a sibling ".part" file and renamed only after
    # the write succeeds. Otherwise an interrupted write would leave a
    # truncated file at dst that the resume check above skips on every
    # later run.
    tmp = dst.with_name(dst.name + ".part")
    try:
        # One SNR is drawn per file and shared by all additive overlays.
        snr = random.uniform(SNR_MIN, SNR_MAX)

        # Additive noise augmentations
        for pool in [noise_files, music_files, speech_files]:
            if pool and random.random() < AUG_PROB:
                n = load_mono(random.choice(pool), SR)
                if n is not None:
                    y = mix_at_snr(y, n, snr)

        # Reverb
        if rir_files and random.random() < AUG_PROB:
            rir = load_mono(random.choice(rir_files), SR)
            if rir is not None:
                y = apply_reverb(y, rir)

        # Pitch
        if random.random() < AUG_PROB:
            steps = random.uniform(PITCH_MIN, PITCH_MAX)
            y = pitch_shift(y, SR, steps)

        # Speed
        if random.random() < AUG_PROB:
            factor = random.uniform(SPEED_MIN, SPEED_MAX)
            y = speed_perturb(y, factor)

        # Clip & save
        y = np.clip(y, -1.0, 1.0)
        dst.parent.mkdir(parents=True, exist_ok=True)
        # The ".part" suffix hides the extension, so name the format explicitly.
        sf.write(str(tmp), y, SR, format="WAV")
        os.replace(tmp, dst)
        done += 1
    except Exception as e:
        tqdm.write(f"  FAIL {src.name}: {e}")
        failed += 1
    finally:
        # Remove a leftover partial file. After a successful rename (or if
        # nothing was written) the file is absent and there is nothing to do.
        try:
            tmp.unlink()
        except OSError:
            pass

print(f"\nDone: {done} augmented, {skipped} skipped, {failed} failed")