In [1]:
import os, pathlib

# Project root = the directory the kernel was started in.
BASE = pathlib.Path.cwd()

# Working folders used by the rest of the notebook, expressed as path parts
# relative to BASE. Creating them is idempotent.
_project_subdirs = (
    ("data", "suspects"),
    ("data", "probes"),
    ("models",),
    ("artifacts",),
)
for _parts in _project_subdirs:
    BASE.joinpath(*_parts).mkdir(parents=True, exist_ok=True)

print("Project root:", BASE)
print("Folders created:\n - data/suspects\n - data/probes\n - models\n - artifacts")


Project root: D:\ppaudio-surveillance
Folders created:
 - data/suspects
 - data/probes
 - models
 - artifacts


In [5]:
import sys, numpy as np, librosa, soundfile as sf
import speechbrain as sb
# Print the interpreter and key library versions so runs can be reproduced.
for _label, _version in (
    ("Python", sys.version),
    ("speechbrain", sb.__version__),
    ("librosa", librosa.__version__),
):
    print(_label, _version)


Python 3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:42:04) [MSC v.1943 64 bit (AMD64)]
speechbrain 1.0.3
librosa 0.11.0


In [6]:
# Embedding extractor (ECAPA from SpeechBrain):
# this cell loads the pretrained model and defines the audio/embedding helpers.

import torch
from speechbrain.pretrained import EncoderClassifier
import numpy as np
import librosa, soundfile as sf
from pathlib import Path

# Load the pretrained ECAPA-TDNN speaker encoder, placing it on the GPU when
# torch can see one and on the CPU otherwise.
# NOTE(review): SpeechBrain 1.x appears to expose EncoderClassifier under
# `speechbrain.inference`; the `speechbrain.pretrained` import path used by this
# cell may be deprecated — confirm against the installed version.
_run_device = "cuda" if torch.cuda.is_available() else "cpu"
encoder = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": _run_device},
)

def load_audio_16k(path, target_sr=16000):
    """Read an audio file as mono and return it at ``target_sr`` Hz.

    The file is decoded at its native sampling rate and is only resampled when
    that rate differs from ``target_sr``.

    Args:
        path: Audio file location, passed straight to ``librosa.load``.
        target_sr: Desired output sampling rate in Hz (default 16000).

    Returns:
        ``(samples, sample_rate)`` where ``sample_rate`` equals ``target_sr``.
    """
    samples, native_sr = librosa.load(path, sr=None, mono=True)
    if native_sr == target_sr:
        # Already at the requested rate: hand the decoded samples back untouched.
        return samples, native_sr
    resampled = librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr)
    return resampled, target_sr

def wav_to_embedding(path):
    """Compute the ECAPA speaker embedding for one audio file.

    Args:
        path: Audio file to embed; it is loaded through ``load_audio_16k``.

    Returns:
        1-D ``numpy.ndarray`` holding the embedding (192 values for this ECAPA
        model).

    Raises:
        ValueError: If the file decodes to zero samples.
    """
    y, _ = load_audio_16k(path)
    if len(y) == 0:
        # Fail early with a readable message instead of an opaque error raised
        # from inside the encoder's feature extraction.
        raise ValueError(f"{path}: audio contains no samples")
    # SpeechBrain expects a float tensor shaped [batch, time].
    signal = torch.as_tensor(y, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        # Drop the two leading singleton dimensions of the encoder output
        # before converting to NumPy.
        emb = encoder.encode_batch(signal).squeeze(0).squeeze(0).cpu().numpy()
    return emb


In [4]:
#sanity Check


import ipywidgets as widgets, tqdm, torch, torchaudio
# Confirm the UI/progress/audio dependencies import and report their versions.
print("ipywidgets:", widgets.__version__)
print("tqdm ok")  # no version printed for tqdm; a successful import is the check
for _label, _module in (("torch:", torch), ("torchaudio:", torchaudio)):
    print(_label, _module.__version__)


ipywidgets: 8.1.7
tqdm ok
torch: 2.8.0+cpu
torchaudio: 2.8.0+cpu


In [7]:
#quick test (optional) — point to one of your WAVs:
# Sort the recursive listing so the "first" file is the same on every run;
# directory traversal order is otherwise filesystem-dependent.
test_files = sorted((BASE / "data" / "suspects").rglob("*.wav"))
print("Found", len(test_files), "wav files")
if test_files:
    sample_emb = wav_to_embedding(test_files[0])
    print("Embedding shape:", sample_emb.shape, "example norm:", np.linalg.norm(sample_emb))


Found 0 wav files


In [9]:
#enroll suspects (compute average embedding per person)

import json
from collections import defaultdict

# Enrollment input folder and artifact output folder, both under the project root.
SUSPECTS_DIR = BASE.joinpath("data", "suspects")
ARTIFACTS = BASE.joinpath("artifacts")
ARTIFACTS.mkdir(exist_ok=True)  # no-op when the folder is already present

def enroll_suspects(suspects_dir=SUSPECTS_DIR):
    """Build one averaged, L2-normalised embedding per speaker folder.

    Every immediate sub-directory of ``suspects_dir`` is treated as one speaker;
    its ``*.wav`` files are embedded with ``wav_to_embedding``, averaged, and
    L2-normalised. The result is also written to
    ``ARTIFACTS / "suspects_embeddings.json"``.

    Args:
        suspects_dir: Folder containing one sub-folder per speaker. A ``str`` or
            a ``Path`` is accepted.

    Returns:
        dict mapping speaker folder name -> embedding as a list of floats.
        Speakers with no wavs, or whose wavs all failed to embed, are omitted.
    """
    suspects_dir = Path(suspects_dir)
    speaker_embs = {}
    for speaker_dir in sorted(p for p in suspects_dir.iterdir() if p.is_dir()):
        wavs = sorted(speaker_dir.glob("*.wav"))
        if not wavs:
            print(f"[WARN] No wavs in {speaker_dir.name}, skipping.")
            continue
        embs = []
        for w in wavs:
            # Best-effort per file: one unreadable recording must not abort the
            # whole enrollment run, so report it and move on.
            try:
                embs.append(wav_to_embedding(w))
            except Exception as e:
                print(f"[ERR] {w.name}: {e}")
        if not embs:
            # Make the drop visible instead of silently omitting the speaker.
            print(f"[WARN] All {len(wavs)} files failed for {speaker_dir.name}, not enrolled.")
            continue
        mean_emb = np.mean(np.stack(embs, axis=0), axis=0)
        # l2-normalize (helps cosine); epsilon guards an all-zero mean.
        mean_emb = mean_emb / (np.linalg.norm(mean_emb) + 1e-9)
        speaker_embs[speaker_dir.name] = mean_emb.tolist()
        print(f"[OK] {speaker_dir.name}: {len(embs)} files -> enrolled.")
    # save — the context manager guarantees the handle is flushed and closed.
    out = ARTIFACTS / "suspects_embeddings.json"
    with open(out, "w") as fh:
        json.dump(speaker_embs, fh)
    print(f"\nSaved {len(speaker_embs)} enrolled speakers to {out}")
    return speaker_embs

# Run enrollment over the default SUSPECTS_DIR and keep the returned
# speaker-name -> embedding mapping in `suspects`.
suspects = enroll_suspects()




Saved 0 enrolled speakers to D:\ppaudio-surveillance\artifacts\suspects_embeddings.json


In [10]:
#baseline identification (cosine similarity)
#we’ll start with cosine scoring (simple), then you can swap to PLDA later if time permits.

import json
from pathlib import Path

def cosine(a, b):
    """Return the cosine similarity of two vectors as a Python float.

    Each input is first scaled to (approximately) unit length; the small
    epsilon in the denominator avoids division by zero for all-zero vectors.
    """
    eps = 1e-9
    unit_a = a / (np.linalg.norm(a) + eps)
    unit_b = b / (np.linalg.norm(b) + eps)
    similarity = np.dot(unit_a, unit_b)
    return float(similarity)

# Load the enrolled embeddings from the artifacts folder.
# A context manager is used so the file handle is closed as soon as parsing ends.
enrolled_path = ARTIFACTS / "suspects_embeddings.json"
with open(enrolled_path) as _enrolled_file:
    enrolled = json.load(_enrolled_file)

def identify_probe(wav_path, threshold=0.55):
    """Score one probe recording against every enrolled speaker.

    Args:
        wav_path: Probe audio file, embedded via ``wav_to_embedding``.
        threshold: Minimum cosine score for the best candidate to count as a
            match.

    Returns:
        ``(best_name, best_score, verdict)``. ``verdict`` is ``best_name`` when
        ``best_score >= threshold`` and ``"NO MATCH"`` otherwise. With nothing
        enrolled and the default threshold the result is
        ``(None, -1.0, "NO MATCH")``.
    """
    raw_emb = wav_to_embedding(wav_path)
    probe_emb = raw_emb / (np.linalg.norm(raw_emb) + 1e-9)

    best_name = None
    best_score = -1.0
    for spk in enrolled:
        candidate_score = cosine(probe_emb, np.array(enrolled[spk]))
        # Only a strictly higher score replaces the current best, so the first
        # speaker wins ties.
        if not (candidate_score > best_score):
            continue
        best_name, best_score = spk, candidate_score

    if best_score >= threshold:
        verdict = best_name
    else:
        verdict = "NO MATCH"
    return best_name, best_score, verdict

# Smoke-test the identifier on the first probe recording, if one exists.
probe_files = sorted((BASE / "data" / "probes").glob("*.wav"))
if not probe_files:
    print("Put a probe wav into data/probes and re-run.")
else:
    name, score, verdict = identify_probe(probe_files[0])
    print("Best:", name, "score:", round(score, 3), "⇒", verdict)


Put a probe wav into data/probes and re-run.
