In [12]:
import os, sys, pathlib, pandas as pd

# Input ground truth and output locations for the SpeechBrain run.
GT_CSV = "../data/refined_dataset.csv"
SB_SUMMARY = "../results/speechbrain_summary.csv"
SB_PRED_DIR = pathlib.Path("../results/speechbrain_predictions")
SB_PRED_DIR.mkdir(parents=True, exist_ok=True)  # created up front so later cells can write into it

# Environment sanity checks: interpreter, working directory (the paths above
# are relative to it) and whether the ground-truth CSV is reachable.
print("PY:", sys.version)
print("CWD:", os.getcwd())
print("GT_CSV exists:", pathlib.Path(GT_CSV).exists())

# Ground-truth table; later cells iterate over its "audio" column.
df = pd.read_csv(GT_CSV)
print("GT rows:", df.shape[0])
# Preview the two columns of interest for the first two rows.
df[["audio", "speaker_count"]].head(2)

PY: 3.9.23 (main, Jun  3 2025, 18:47:52) 
[Clang 16.0.0 (clang-1600.0.26.6)]
CWD: /Users/s.n.h/Voice-AI/Audio-AI/notebooks
GT_CSV exists: True
GT rows: 12


Unnamed: 0,audio,speaker_count
0,../audios-wav/12-audios-ar-en/6-audios-ar/1_sp...,1.0
1,../audios-wav/12-audios-ar-en/6-audios-ar/1_sp...,1.0


  import pkg_resources
  from .autonotebook import tqdm as notebook_tqdm
  from speechbrain.pretrained import EncoderClassifier


In [7]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def windowed_embeddings(wav, sr, win_s=1.0, hop_s=0.5):
    """Encode fixed-length sliding windows of `wav` with the global encoder.

    Relies on the notebook globals `sb_enc` (object exposing `encode_batch`)
    and `sb_device`, which are defined outside this cell.

    Args:
        wav: 1-D waveform tensor (it is sliced along its only axis and given a
            leading batch axis with `unsqueeze(0)`).
        sr: Sample rate in Hz.
        win_s: Window length in seconds.
        hop_s: Hop between consecutive window starts in seconds.

    Returns:
        (embs, times): `embs` is `np.array` of one embedding per window and
        `times` is a list of `(start_s, end_s)` tuples measured from the
        beginning of `wav`.

    Notes:
        Only full windows are encoded: an input shorter than `win_s` yields no
        windows at all, and trailing samples that do not fill a window are
        ignored.
    """
    # NOTE(review): this function is defined again in a later cell of this
    # notebook; when the cells run in order the later definition shadows this one.
    win_len = int(sr * win_s)
    hop_len = int(sr * hop_s)
    window_starts = range(0, len(wav) - win_len + 1, hop_len)

    times = [(begin / sr, (begin + win_len) / sr) for begin in window_starts]

    embs = []
    with torch.no_grad():  # inference only; no autograd graph needed
        for begin in window_starts:
            batch = wav[begin:begin + win_len].unsqueeze(0).to(sb_device)
            encoded = sb_enc.encode_batch(batch)
            # assumes encode_batch returns (1, 1, dim) so two squeezes leave (dim,) — TODO confirm
            embs.append(encoded.squeeze(0).squeeze(0).cpu().numpy())
    return np.array(embs), times

def cluster_auto(embs, k_min=1, k_max=4, min_silhouette=-1.0):
    """Cluster embeddings, choosing the number of clusters by silhouette score.

    Ward-linkage agglomerative clustering is run for every candidate `k` and
    the labelling with the highest silhouette score is kept.

    Args:
        embs: Array-like of shape (n_windows, dim).
        k_min: Smallest cluster count to try. Values below 2 are treated as 2
            because a single cluster has no silhouette score; one cluster is
            the fallback result instead.
        k_max: Largest cluster count to try.
        min_silhouette: Score a multi-cluster solution must exceed to be
            preferred over a single cluster. The default of -1.0 keeps the
            historical behaviour, where practically any split beats the
            single-cluster fallback; raise it (e.g. to a small positive value)
            to stop single-speaker audio from being split.

    Returns:
        (labels, k): integer label per embedding and the chosen cluster count.
        Falls back to all-zero labels and k=1 when no candidate qualifies.
    """
    # NOTE(review): this function is defined again in a later cell of this
    # notebook; when the cells run in order the later definition shadows this one.
    n_samples = len(embs)
    best_k, best_score, best_labels = 1, min_silhouette, np.zeros(n_samples, dtype=int)

    # silhouette_score is only defined for 2 <= k <= n_samples - 1, so other
    # candidates are skipped instead of being fitted and then discarded.
    first_k = max(k_min, 2)
    last_k = min(k_max, n_samples - 1)
    for k in range(first_k, last_k + 1):
        try:
            lab = AgglomerativeClustering(n_clusters=k, linkage="ward").fit_predict(embs)
            score = silhouette_score(embs, lab)
        except ValueError:
            # Best effort: a candidate k that scikit-learn rejects for this
            # data is skipped; any other error type is a real bug and surfaces.
            continue
        if score > best_score:
            best_k, best_score, best_labels = k, score, lab
    return best_labels, best_k

def windows_to_segments(times, labels, min_seg=0.30, gap_merge=0.25):
    """Collapse labelled windows into speaker segments.

    Windows are ordered by start time. Consecutive windows that share a label
    and start no later than `gap_merge` seconds after the current run's end
    are merged into one run; any other window closes the run and opens a new
    one at that window's own start. Runs shorter than `min_seg` seconds are
    discarded.

    Args:
        times: List of `(start_s, end_s)` pairs, one per window.
        labels: Cluster label per window, aligned with `times`.
        min_seg: Minimum duration in seconds for a run to be emitted.
        gap_merge: Largest gap in seconds bridged between same-label windows.

    Returns:
        List of dicts `{"start": float, "end": float, "labels": ["Speaker N"]}`
        where N is the cluster label plus one. Because a new run begins at its
        first window's start, segments can overlap when the windows overlap.
    """
    # NOTE(review): this function is defined again in a later cell of this
    # notebook; when the cells run in order the later definition shadows this one.
    if not times:
        return []

    segments = []

    def _emit(seg_start, seg_end, seg_label):
        # Keep the run only if it is long enough.
        if seg_end - seg_start >= min_seg:
            segments.append({
                "start": float(seg_start),
                "end": float(seg_end),
                "labels": [f"Speaker {int(seg_label)+1}"],
            })

    pairs = list(zip(times, labels))
    pairs.sort(key=lambda pair: pair[0][0])

    first_span, run_label = pairs[0][0], pairs[0][1]
    run_start, run_end = first_span[0], first_span[1]
    for (win_start, win_end), win_label in pairs[1:]:
        continues_run = win_label == run_label and win_start - run_end <= gap_merge
        if continues_run:
            run_end = max(run_end, win_end)
            continue
        _emit(run_start, run_end, run_label)
        run_start, run_end, run_label = win_start, win_end, win_label
    _emit(run_start, run_end, run_label)
    return segments

In [13]:
import torchaudio, torch, os

def read_wav(path, target_sr=16000):
    """Load an audio file as a mono, 1-D waveform at `target_sr`.

    Args:
        path: Path of the audio file to read.
        target_sr: Sample rate in Hz the returned waveform is resampled to.

    Returns:
        (wav, sr): 1-D tensor of samples and its sample rate (`target_sr`
        whenever resampling was needed, otherwise the file's own rate, which
        is then equal to `target_sr`).

    Raises:
        FileNotFoundError: If `path` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Error loading audio file: not found {path}")
    wav, sr = torchaudio.load(path)
    if wav.dim() > 1:
        # Average over the leading (channel) axis to get a single 1-D signal.
        wav = wav.mean(dim=0)
    if sr != target_sr:
        wav = torchaudio.functional.resample(wav, sr, target_sr)
        sr = target_sr
    # The waveform is already 1-D here. The previous trailing squeeze(0) was a
    # no-op except for a one-sample signal, which it collapsed to a 0-d tensor
    # that downstream len(wav) calls cannot handle.
    return wav, sr

# webrtcvad: frames must be exactly 10, 20, or 30 ms; sr must be 8000/16000/32000/48000
def frames_vad(wav, sr, frame_ms=30, vad_aggr=2, min_speech_s=0.20):
    """Detect speech regions in a waveform with WebRTC VAD.

    The signal is cut into back-to-back frames of `frame_ms` milliseconds (the
    last one zero-padded to full length), each frame is classified, the
    per-frame decisions are smoothed with a 5-frame median filter, and runs of
    speech frames are turned into time segments.

    Args:
        wav: 1-D float waveform tensor; values are clamped to [-1, 1] before
            conversion to 16-bit PCM.
        sr: Sample rate in Hz, passed straight to webrtcvad.
        frame_ms: Frame duration; must be 10, 20 or 30.
        vad_aggr: Aggressiveness value handed to `webrtcvad.Vad`.
        min_speech_s: Segments shorter than this many seconds are dropped.

    Returns:
        (segments, frame_s): list of `(start_s, end_s)` tuples and the frame
        duration in seconds. Segment ends never exceed the audio duration.

    Raises:
        AssertionError: If `frame_ms` is not 10, 20 or 30.
    """
    assert frame_ms in (10, 20, 30), "webrtcvad requires 10/20/30 ms frames"

    frame_len = int(sr * frame_ms / 1000)
    frame_s = frame_len / sr
    n_samples = len(wav)
    if n_samples == 0:
        # Nothing to classify; avoids handing an empty array to the median filter.
        return [], frame_s

    # Imported lazily so the cell can be defined without these packages installed.
    import webrtcvad
    from scipy.signal import medfilt

    vad = webrtcvad.Vad(vad_aggr)

    speech = []
    for start in range(0, n_samples, frame_len):
        frm = wav[start:start + frame_len]
        if len(frm) < frame_len:
            # webrtcvad needs full-length frames, so the final one is zero-padded.
            frm = torch.nn.functional.pad(frm, (0, frame_len - len(frm)))
        pcm16 = (frm.clamp(-1, 1) * 32767.0).to(torch.int16).cpu().numpy().tobytes()
        speech.append(1 if vad.is_speech(pcm16, sr) else 0)

    # Median smoothing removes isolated single-frame flips in the decisions.
    speech = medfilt(torch.tensor(speech, dtype=torch.int32).numpy(), kernel_size=5)

    segs_sec = []
    n_frames = len(speech)
    run_start = None
    # One extra iteration acts as a sentinel that closes a run reaching the end.
    for idx in range(n_frames + 1):
        is_speech = idx < n_frames and speech[idx] == 1
        if is_speech:
            if run_start is None:
                run_start = idx
            continue
        if run_start is None:
            continue
        start = run_start * frame_len / sr
        # Clamp to the real sample count: the padded last frame would otherwise
        # push the reported end past the end of the audio.
        end = min(idx * frame_len, n_samples) / sr
        if end - start >= min_speech_s:
            segs_sec.append((start, end))
        run_start = None
    return segs_sec, frame_s

In [14]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def windowed_embeddings(wav, sr, win_s=1.0, hop_s=0.5):
    """Encode fixed-length sliding windows of `wav` with the global encoder.

    Relies on the notebook globals `sb_enc` (object exposing `encode_batch`)
    and `sb_device`, which are defined outside this cell.

    Args:
        wav: 1-D waveform tensor (it is sliced along its only axis and given a
            leading batch axis with `unsqueeze(0)`).
        sr: Sample rate in Hz.
        win_s: Window length in seconds.
        hop_s: Hop between consecutive window starts in seconds.

    Returns:
        (embs, times): `embs` is `np.array` of one embedding per window and
        `times` is a list of `(start_s, end_s)` tuples measured from the
        beginning of `wav`.

    Notes:
        Only full windows are encoded: an input shorter than `win_s` yields no
        windows at all, and trailing samples that do not fill a window are
        ignored.
    """
    win_len = int(sr * win_s)
    hop_len = int(sr * hop_s)
    window_starts = range(0, len(wav) - win_len + 1, hop_len)

    times = [(begin / sr, (begin + win_len) / sr) for begin in window_starts]

    embs = []
    with torch.no_grad():  # inference only; no autograd graph needed
        for begin in window_starts:
            batch = wav[begin:begin + win_len].unsqueeze(0).to(sb_device)
            encoded = sb_enc.encode_batch(batch)
            # assumes encode_batch returns (1, 1, dim) so two squeezes leave (dim,) — TODO confirm
            embs.append(encoded.squeeze(0).squeeze(0).cpu().numpy())
    return np.array(embs), times

def cluster_auto(embs, k_min=1, k_max=4, min_silhouette=-1.0):
    """Cluster embeddings, choosing the number of clusters by silhouette score.

    Ward-linkage agglomerative clustering is run for every candidate `k` and
    the labelling with the highest silhouette score is kept.

    Args:
        embs: Array-like of shape (n_windows, dim).
        k_min: Smallest cluster count to try. Values below 2 are treated as 2
            because a single cluster has no silhouette score; one cluster is
            the fallback result instead.
        k_max: Largest cluster count to try.
        min_silhouette: Score a multi-cluster solution must exceed to be
            preferred over a single cluster. The default of -1.0 keeps the
            historical behaviour, where practically any split beats the
            single-cluster fallback; raise it (e.g. to a small positive value)
            to stop single-speaker audio from being split.

    Returns:
        (labels, k): integer label per embedding and the chosen cluster count.
        Falls back to all-zero labels and k=1 when no candidate qualifies.
    """
    n_samples = len(embs)
    best_k, best_score, best_labels = 1, min_silhouette, np.zeros(n_samples, dtype=int)

    # silhouette_score is only defined for 2 <= k <= n_samples - 1, so other
    # candidates are skipped instead of being fitted and then discarded.
    first_k = max(k_min, 2)
    last_k = min(k_max, n_samples - 1)
    for k in range(first_k, last_k + 1):
        try:
            lab = AgglomerativeClustering(n_clusters=k, linkage="ward").fit_predict(embs)
            score = silhouette_score(embs, lab)
        except ValueError:
            # Best effort: a candidate k that scikit-learn rejects for this
            # data is skipped; any other error type is a real bug and surfaces.
            continue
        if score > best_score:
            best_k, best_score, best_labels = k, score, lab
    return best_labels, best_k

def windows_to_segments(times, labels, min_seg=0.30, gap_merge=0.25):
    """Collapse labelled windows into speaker segments.

    Windows are ordered by start time. Consecutive windows that share a label
    and start no later than `gap_merge` seconds after the current run's end
    are merged into one run; any other window closes the run and opens a new
    one at that window's own start. Runs shorter than `min_seg` seconds are
    discarded.

    Args:
        times: List of `(start_s, end_s)` pairs, one per window.
        labels: Cluster label per window, aligned with `times`.
        min_seg: Minimum duration in seconds for a run to be emitted.
        gap_merge: Largest gap in seconds bridged between same-label windows.

    Returns:
        List of dicts `{"start": float, "end": float, "labels": ["Speaker N"]}`
        where N is the cluster label plus one. Because a new run begins at its
        first window's start, segments can overlap when the windows overlap.
    """
    if not times:
        return []

    segments = []

    def _emit(seg_start, seg_end, seg_label):
        # Keep the run only if it is long enough.
        if seg_end - seg_start >= min_seg:
            segments.append({
                "start": float(seg_start),
                "end": float(seg_end),
                "labels": [f"Speaker {int(seg_label)+1}"],
            })

    pairs = list(zip(times, labels))
    pairs.sort(key=lambda pair: pair[0][0])

    first_span, run_label = pairs[0][0], pairs[0][1]
    run_start, run_end = first_span[0], first_span[1]
    for (win_start, win_end), win_label in pairs[1:]:
        continues_run = win_label == run_label and win_start - run_end <= gap_merge
        if continues_run:
            run_end = max(run_end, win_end)
            continue
        _emit(run_start, run_end, run_label)
        run_start, run_end, run_label = win_start, win_end, win_label
    _emit(run_start, run_end, run_label)
    return segments

In [15]:
import time, json, pathlib, numpy as np, pandas as pd

# Batch driver: for every row of the ground-truth table `df`, run
# VAD -> windowed embeddings -> clustering -> segment merging, write the
# predicted segments to one JSON file per audio, and collect a per-file
# summary that is saved as CSV.

# Reuse the locations configured in the setup cell so each path is defined once.
OUT_DIR = SB_PRED_DIR
OUT_DIR.mkdir(parents=True, exist_ok=True)
SUMMARY_CSV = SB_SUMMARY

results = []
n_files = len(df)
try:
    # enumerate gives a true 1-based position even if df's index is not 0..n-1.
    for pos, (_idx, row) in enumerate(df.iterrows(), start=1):
        audio = row["audio"]
        stem = pathlib.Path(audio).stem
        print(f"[SB] ({pos}/{n_files}) {stem}")
        t0 = time.time()
        try:
            wav, sr = read_wav(audio, 16000)
            vad_segs, _ = frames_vad(wav, sr, frame_ms=30, vad_aggr=2)

            all_embs, all_times = [], []
            for (s, e) in vad_segs:
                seg = wav[int(s*sr):int(e*sr)]
                embs, times = windowed_embeddings(seg, sr, win_s=1.0, hop_s=0.5)
                if len(embs):
                    all_embs.append(embs)
                    # Window times are relative to the VAD segment; shift them
                    # to absolute positions in the file.
                    all_times.extend((s + a, s + b) for (a, b) in times)

            if all_embs:
                labels, _k = cluster_auto(np.vstack(all_embs), k_min=1, k_max=4)
                preds = windows_to_segments(all_times, labels, min_seg=0.30, gap_merge=0.25)
            else:
                # No speech window was embedded for this file.
                preds = []

            out_path = OUT_DIR / f"{stem}_speechbrain.json"
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(preds, f, indent=2)

            dur = time.time() - t0
            results.append({"audio": audio, "n_segments": len(preds), "runtime_sec": dur, "output_file": str(out_path)})
            print(f"  -> {len(preds)} segs, {dur:.2f}s")
        except Exception as e:
            # Record the failure and continue with the next file.
            results.append({"audio": audio, "error": str(e)})
            print(f"  !! ERROR: {e}")
finally:
    # Written even when the run is interrupted (e.g. KeyboardInterrupt), so the
    # files processed so far are not lost; the interrupt itself still propagates.
    pd.DataFrame(results).to_csv(SUMMARY_CSV, index=False)
    if len(results) == n_files:
        print(f"[SB] Done -> {SUMMARY_CSV}")
    else:
        print(f"[SB] Stopped after {len(results)}/{n_files} files; partial summary -> {SUMMARY_CSV}")

[SB] (1/12) solo10_ar


KeyboardInterrupt: 