# Filter by Words-Per-Minute — Folder Batch

Filters a TTS/ASR dataset by **speaking rate** (words per minute).
Removes outlier clips whose speech is too fast or too slow — common in crowd-sourced data — by keeping only clips whose rate lies within mean ± N standard deviations.

Requires a `metadata.csv` with `filename|transcript` (LJSpeech-style, pipe-separated).

---

> No GPU required. Uses `soundfile`, `pandas`, and `tqdm`.

In [None]:
import os

# ════════════════════════════════════════════
# ⚙️  CONFIGURATION
# ════════════════════════════════════════════
# Each setting comes from the environment when already defined there;
# otherwise the default below is stored in the environment and used.
# setdefault() returns whichever value ends up in os.environ.
INPUT_DIR = os.environ.setdefault("INPUT_DIR", "/data/audio_in")  # WAV folder
METADATA_CSV = os.environ.setdefault(
    "METADATA_CSV", "/data/metadata.csv"
)  # pipe-separated: filename|text
OUTPUT_DIR = os.environ.setdefault("OUTPUT_DIR", "/data/audio_filtered")
CSV_SEP = os.environ.setdefault("CSV_SEP", "|")
# Keep within mean ± N std devs; stored as text in the environment.
STD_THRESHOLD = float(os.environ.setdefault("STD_THRESHOLD", "2.0"))

for _line in (
    f"Input:     {INPUT_DIR}",
    f"Metadata:  {METADATA_CSV}",
    f"Output:    {OUTPUT_DIR}",
    f"Threshold: mean ± {STD_THRESHOLD} std",
):
    print(_line)

In [None]:
!pip install --quiet --break-system-packages pandas soundfile tqdm

In [None]:
import os, re, shutil
import pandas as pd
import soundfile as sf
from pathlib import Path
from tqdm import tqdm

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load metadata as a headerless, CSV_SEP-delimited table.
#   usecols=[0, 1]       - read only the first two fields; with extra trailing
#                          columns (e.g. LJSpeech's normalized-text column) and
#                          only two names, pandas would otherwise move the
#                          first column into the index and mislabel the rest.
#   dtype=str            - keep file names such as "0001" verbatim instead of
#                          parsing them as the integer 1.
#   keep_default_na=False- keep transcripts like "NA" or "null" as text; empty
#                          fields become "" rather than NaN.
try:
    df = pd.read_csv(
        METADATA_CSV,
        sep=CSV_SEP,
        names=["file_name", "transcript"],
        header=None,
        usecols=[0, 1],
        dtype=str,
        keep_default_na=False,
    )
except Exception as exc:
    # Best-effort fallback: re-read with pandas defaults (comma-separated,
    # first row is the header). Announced so the switch is not silent.
    print(f"Could not parse metadata with separator {CSV_SEP!r} ({exc}); "
          "retrying with pandas defaults")
    df = pd.read_csv(METADATA_CSV, dtype=str, keep_default_na=False)

# The rest of the notebook indexes rows by these two column names; fail here
# with a clear message rather than with a KeyError in a later step.
missing_cols = {"file_name", "transcript"} - set(df.columns)
if missing_cols:
    raise ValueError(
        f"{METADATA_CSV} is missing required column(s): {sorted(missing_cols)}"
    )

print(f"Loaded {len(df)} rows from metadata")

# Calculate WPM for each row using header-only reads
def get_wpm(row):
    """Return the speaking rate of one metadata row in words per minute.

    Args:
        row: Mapping-like row providing "file_name" and "transcript"
            (a DataFrame row when called via ``df.apply(get_wpm, axis=1)``).

    Returns:
        Words per minute as a float, or 0.0 when the row cannot be scored:
        missing or empty transcript, audio file not found under INPUT_DIR,
        zero-length audio, or any error while opening the audio file.
    """
    transcript = row["transcript"]
    # A missing cell arrives as NaN/None; str() would turn it into the
    # one-word text "nan"/"None" and produce a bogus rate.
    if pd.isna(transcript):
        return 0.0

    # Strip punctuation so stand-alone symbols are not counted as words.
    text = re.sub(r"[^\w\s]", "", str(transcript)).strip()
    word_count = len(text.split())
    if word_count == 0:
        return 0.0

    fname = str(row["file_name"])
    # Case-insensitive check so "clip.WAV" is not turned into "clip.WAV.wav".
    if not fname.lower().endswith(".wav"):
        fname += ".wav"
    fpath = os.path.join(INPUT_DIR, fname)
    if not os.path.exists(fpath):
        return 0.0

    try:
        # Only the frame count and sample rate are queried; no samples are read.
        with sf.SoundFile(fpath) as f:
            dur_min = (len(f) / f.samplerate) / 60.0
    except Exception:
        # Deliberate best-effort: an unreadable file is scored 0.0 and is
        # excluded by the caller's "wpm > 0" filter instead of aborting the run.
        return 0.0
    return word_count / dur_min if dur_min > 0 else 0.0

print("Calculating WPM...")
# Built row by row instead of df.apply(axis=1): for an empty frame, apply may
# hand back a DataFrame rather than a Series, which cannot be stored in a
# single column.
df["wpm"] = pd.Series(
    [get_wpm(row) for _, row in df.iterrows()], index=df.index, dtype=float
)
# get_wpm returns 0.0 for rows it cannot score; drop those before computing
# statistics so they do not drag the mean down.
valid = df[df["wpm"] > 0]
if valid.empty:
    print("Warning: no row produced a usable WPM value; nothing will be kept")

mean_wpm = valid["wpm"].mean()
std_wpm  = valid["wpm"].std()
# The sample standard deviation is NaN with fewer than two values; treat that
# as "no spread" so a single usable file is not compared against NaN bounds.
if pd.isna(std_wpm):
    std_wpm = 0.0
lo = mean_wpm - STD_THRESHOLD * std_wpm
hi = mean_wpm + STD_THRESHOLD * std_wpm

print(f"WPM stats: mean={mean_wpm:.1f}, std={std_wpm:.1f}, range=[{lo:.1f}, {hi:.1f}]")

if std_wpm > 0:
    # Inclusive on both ends, i.e. lo <= wpm <= hi.
    filtered = valid[valid["wpm"].between(lo, hi)]
else:
    # With zero spread there are no outliers; comparing against lo == hi could
    # drop every row through floating-point rounding of the mean.
    filtered = valid
print(f"Keeping {len(filtered)}/{len(df)} files ({len(df) - len(filtered)} removed)")

# Copy filtered files + write new metadata
# Each kept row is copied from INPUT_DIR to OUTPUT_DIR under the same relative
# name, and one "<file name><CSV_SEP><transcript>" line is recorded for it.
new_meta = []
for _, row in tqdm(filtered.iterrows(), total=len(filtered), desc="Copying"):
    fname = str(row["file_name"])
    # Case-insensitive check so "clip.WAV" is not turned into "clip.WAV.wav".
    if not fname.lower().endswith(".wav"):
        fname += ".wav"
    src = os.path.join(INPUT_DIR, fname)
    dst = os.path.join(OUTPUT_DIR, fname)
    if not os.path.exists(src):
        # Source vanished since it was scored; skip it and its metadata line.
        continue
    # File names may contain sub-directories; make sure the target one exists.
    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
    shutil.copy2(src, dst)
    new_meta.append(f"{fname}{CSV_SEP}{row['transcript']}")

meta_out = os.path.join(OUTPUT_DIR, "metadata.csv")
with open(meta_out, "w", encoding="utf-8") as f:
    # One line per copied file; an empty result yields an empty file rather
    # than a file holding a single blank line.
    f.writelines(f"{line}\n" for line in new_meta)

print(f"\nDone. Metadata saved to {meta_out}")