In [1]:
import pandas as pd
from pathlib import Path

# Resolve the project root: one level up when the kernel was started inside
# notebooks/, otherwise the current working directory itself.
_cwd = Path.cwd()
ROOT = _cwd.parents[0] if _cwd.name == "notebooks" else _cwd
DATASET_CSV = ROOT / "data" / "cleaned_dataset.csv"

# Load the cleaned dataset index, report its size and preview the first rows.
df = pd.read_csv(DATASET_CSV)
print("Rows:", len(df))
display(df.head())


Rows: 60


Unnamed: 0,audio,speaker_count,language
0,../audios-wav/audios-ar/3_speakers_ar/three_sp...,3 Speakers,ar
1,../audios-wav/audios-en/2_speakers_en/two_spea...,2 Speakers,en
2,../audios-wav/audios-ar/2_speakers_ar/two_spea...,2 Speakers,ar
3,../audios-wav/audios-ar/3_speakers_ar/three_sp...,3 Speakers,ar
4,../audios-wav/audios-en/2_speakers_en/two_spea...,2 Speakers,en


In [3]:
import torch, torchaudio
from pathlib import Path
from speechbrain.pretrained import SpeakerRecognition

# Load the pretrained ECAPA speaker model from the given source.
spkrec = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

# Smoke test on the first dataset row only.
row = df.iloc[0]
# A relative path in the CSV is resolved against the current working directory.
audio_path = Path(row["audio"]).resolve()
print(f"Testing: {audio_path.name} | language={row['language']} | true={row['speaker_count']}")

# 1) Load the waveform as [C, T] and collapse it to a mono [T] signal:
#    average the channels when there are several, otherwise drop the channel axis.
wav, sr = torchaudio.load(str(audio_path))
wav = wav.mean(dim=0) if wav.shape[0] > 1 else wav.squeeze(0)

# 2) Bring the signal to 16 kHz (ECAPA was trained at 16k).
target_sr = 16000
if sr != target_sr:
    resampler = torchaudio.transforms.Resample(sr, target_sr)
    wav = resampler(wav)
    sr = target_sr

# 3) Add the batch axis: [T] -> [B, T].
wav = wav[None, :]

# 4) Encode without autograd bookkeeping. The recorded run printed (1, 1, 192),
#    i.e. the result is [B, 1, D] rather than [B, D].
with torch.inference_mode():
    emb = spkrec.encode_batch(wav)
print("Embedding shape:", tuple(emb.shape))


Testing: three_speakers7_ar.wav | language=ar | true=3 Speakers
Embedding shape: (1, 1, 192)


In [4]:
import time, re
from pathlib import Path
import numpy as np
import pandas as pd
import torch, torchaudio
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Predictions are written to <project root>/results, where the project root is
# the parent directory when running from notebooks/ and the cwd otherwise.
_cwd = Path.cwd()
_project_root = _cwd.parents[0] if _cwd.name == "notebooks" else _cwd
RESULTS_DIR = _project_root / "results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)  # create on first run, no-op afterwards
OUT_CSV_SB = RESULTS_DIR / "speechbrain_predictions.csv"

def true_count(s):
    """Extract the first run of digits from a label such as "3 Speakers".

    The value is converted with ``str`` first, so non-string inputs are accepted.
    Returns the digits as an ``int``, or ``np.nan`` when the text has no digits.
    """
    found = re.search(r"\d+", str(s))
    if found is None:
        return np.nan
    return int(found.group(0))

# The functions below read the module-level `spkrec` encoder; stop early with a
# clear message if the cell that creates it has not been run in this kernel.
assert "spkrec" in globals(), "Run the SpeechBrain smoke test cell first."

def to_mono_16k(audio_path: Path):
    """Load an audio file and return it as a mono 1-D waveform at 16 kHz.

    Args:
        audio_path: Location of the audio file; passed to ``torchaudio.load`` as a string.

    Returns:
        A tensor of shape [T]. Multi-channel audio is averaged across channels,
        single-channel audio just loses its channel axis, and the signal is
        resampled only when the file's sample rate is not already 16000.
    """
    waveform, sample_rate = torchaudio.load(str(audio_path))  # [C, T]
    mono = waveform.mean(dim=0) if waveform.shape[0] > 1 else waveform.squeeze(0)  # [T]
    if sample_rate == 16000:
        return mono
    resampler = torchaudio.transforms.Resample(sample_rate, 16000)
    return resampler(mono)

def embed_chunks(wav_16k: torch.Tensor, win_s=1.5, hop_s=0.75, batch_size=32):
    """Split a mono 16 kHz waveform [T] into sliding windows and embed each one.

    Windows are encoded in mini-batches instead of one forward pass per window,
    which removes most of the per-call overhead while keeping memory bounded.
    Each window is still a separate batch item, so the embeddings are expected
    to match the one-window-at-a-time result up to floating-point rounding.

    Args:
        wav_16k: 1-D waveform tensor, assumed to be sampled at 16 kHz.
        win_s: Window length in seconds.
        hop_s: Hop between consecutive window starts in seconds.
        batch_size: Maximum number of windows passed to the encoder per call.

    Returns:
        ``np.ndarray`` of shape [N, D] with one embedding per window, or ``None``
        if no window was produced. A waveform shorter than one window is
        zero-padded on the right to exactly one window. Trailing samples that
        do not fill a whole window are not embedded.

    Raises:
        ValueError: If the window or hop is shorter than one sample.
    """
    sr = 16000
    win = int(win_s * sr)
    hop = int(hop_s * sr)
    if win <= 0 or hop <= 0:
        raise ValueError(f"win_s and hop_s must each cover at least one sample (win={win}, hop={hop})")
    batch_size = max(1, int(batch_size))

    T = wav_16k.numel()
    if T < win:
        wav_16k = torch.nn.functional.pad(wav_16k, (0, win - T))

    # [N, win] strided view; window i starts at i * hop, the same start offsets
    # as range(0, T - win + 1, hop).
    windows = wav_16k.unfold(0, win, hop)

    embs = []
    with torch.inference_mode():
        for first in range(0, windows.shape[0], batch_size):
            # unfold returns overlapping views; materialise the batch before encoding
            batch = windows[first:first + batch_size].contiguous()   # [b, win]
            e = spkrec.encode_batch(batch)                           # -> [b,1,D] for ECAPA
            embs.append(e.reshape(batch.shape[0], -1).cpu().numpy()) # -> [b,D]
    return np.concatenate(embs, axis=0) if embs else None            # [N,D] or None

def choose_k(embs: np.ndarray, ks=(1,2,3)):
    """Pick the cluster count whose Ward clustering has the best silhouette score.

    The silhouette score is undefined for a single cluster, so ``k == 1`` gets a
    sentinel score and is returned only as a fallback, i.e. when no other
    candidate can be scored or every other candidate degenerates to one label.

    ``silhouette_score`` requires ``2 <= n_labels <= n_samples - 1``. Candidates
    with ``k > len(embs) - 1`` are therefore skipped instead of being allowed to
    raise (e.g. k=2 with only two embeddings).

    Args:
        embs: Array of shape [N, D], one embedding per row.
        ks: Candidate cluster counts, tried in the given order.

    Returns:
        The chosen cluster count as ``int``; 1 when no candidate could be scored.
    """
    n_samples = len(embs)
    best_k, best_score = None, -np.inf
    for k in ks:
        if k == 1:
            # Sentinel: lower than any real silhouette value (which lies in [-1, 1]).
            score = -1e9
        elif k > n_samples - 1:
            # Too few embeddings for a valid silhouette at this k.
            continue
        else:
            model = AgglomerativeClustering(n_clusters=k, linkage="ward")
            labels = model.fit_predict(embs)
            if len(np.unique(labels)) < 2:
                score = -1e9
            else:
                score = float(silhouette_score(embs, labels))
        # Strict comparison keeps the earliest candidate on ties.
        if score > best_score:
            best_k, best_score = k, score
    if best_k is None:
        best_k = 1
    return int(best_k)

# Run load -> chunk embeddings -> cluster-count selection for every dataset row.
# One record is kept per file, including files that raised (pred = NaN).
rows, failures = [], 0
t0_all = time.time()

for idx, entry in df.iterrows():
    audio_path = Path(entry["audio"]).resolve()
    # Progress prefix; the status and timing are appended to the same line below.
    print(f"[{idx+1}/{len(df)}] {audio_path.name} ...", end=" ", flush=True)

    started = time.time()
    try:
        wav = to_mono_16k(audio_path)
        embs = embed_chunks(wav, win_s=1.5, hop_s=0.75)
        has_enough_chunks = embs is not None and len(embs) >= 2
        if has_enough_chunks:
            pred = choose_k(embs, ks=(1,2,3))
            status = f"✓ pred={pred}"
        else:
            # Fewer than two chunks cannot be clustered; treat as a single speaker.
            pred = 1
            status = "✓ short→pred=1"
    except Exception as err:
        # Best effort: record the failure and continue with the next file.
        failures += 1
        pred = np.nan
        status = f"✗ failed ({err})"
    elapsed = time.time() - started

    print(f"{status} | {elapsed:.1f}s")

    record = {
        "audio": str(audio_path),
        "language": entry["language"],
        "true_speakers": true_count(entry["speaker_count"]),
        "pred_speakers": pred,
        "runtime_sec": elapsed,
    }
    rows.append(record)

# Persist all records at once, then summarise the run.
sb_df = pd.DataFrame(rows)
sb_df.to_csv(OUT_CSV_SB, index=False)

print(f"\nSaved: {OUT_CSV_SB}")
print(f"Total rows: {len(sb_df)} | Failures: {failures} | Total runtime: {(time.time()-t0_all)/60:.1f} min")
sb_df.head(8)


[1/60] three_speakers7_ar.wav ... 

KeyboardInterrupt: 

In [None]:
import pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from pathlib import Path

# Locate the predictions CSV the same way the prediction cell does, so this
# cell can be run on its own against a previously saved file.
_cwd = Path.cwd()
RESULTS_DIR = (_cwd.parents[0] if _cwd.name == "notebooks" else _cwd) / "results"
OUT_CSV_SB = RESULTS_DIR / "speechbrain_predictions.csv"

pred_df = pd.read_csv(OUT_CSV_SB)

# Missing predictions (files that failed) are mapped to the pseudo-class -1,
# so they count as errors in every metric below.
y_true = pred_df["true_speakers"].astype(int)
y_pred = pred_df["pred_speakers"].fillna(-1).astype(int)

# Shared settings for the macro-averaged scores.
macro_kwargs = {"average": "macro", "zero_division": 0}
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, **macro_kwargs)
recall = recall_score(y_true, y_pred, **macro_kwargs)
f1 = f1_score(y_true, y_pred, **macro_kwargs)

print("=== SpeechBrain Evaluation ===")
print(f"Accuracy         : {accuracy:.2%}")
print(f"Precision (macro): {precision:.2%}")
print(f"Recall (macro)   : {recall:.2%}")
print(f"F1-score (macro) : {f1:.2%}\n")
print("Per-class report:")
print(classification_report(y_true, y_pred, digits=3, zero_division=0))

# Confusion matrix over the three speaker counts plus the failure pseudo-class.
class_labels = [1, 2, 3, -1]
tick_names = [1, 2, 3, "fail"]
cm = confusion_matrix(y_true, y_pred, labels=class_labels)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=tick_names, yticklabels=tick_names)
plt.title("Confusion Matrix - SpeechBrain Speaker Count")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()
