In [1]:
!pip install -q openai-whisper speechbrain torchaudio librosa scikit-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m864.1/864.1 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m788.2/788.2 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


In [2]:
import os
import librosa
import torch
import torchaudio
import numpy as np
from collections import defaultdict
import whisper
from speechbrain.pretrained import EncoderClassifier
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score

# Sampling rate (Hz) used both when loading audio and when building the pipeline.
SAMPLE_RATE = 16000
# Length, in seconds, of each chunk streamed into the pipeline.
ASR_CHUNK_SEC = 10.0

# Same chunk length expressed in samples.
CHUNK_SAMPLES = int(ASR_CHUNK_SEC * SAMPLE_RATE)

# Prefer the GPU when torch can see one.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print("Device:", DEVICE)
print("Chunk samples:", CHUNK_SAMPLES)

  available_backends = torchaudio.list_audio_backends()
  from speechbrain.pretrained import EncoderClassifier


Device: cuda
Chunk samples: 160000


In [3]:
class TalkNotePipeline:
    def __init__(self, session_id: str, run_dir: str, sample_rate: int = 16000, speaker_estimator=None):
        print("[LOG] Initializing TalkNotePipeline")

        self.session_id = session_id
        self.sample_rate = sample_rate
        self.run_dir = run_dir
        os.makedirs(self.run_dir, exist_ok=True)

        self.total_audio_sec = 0.0
        self.audio_offset_sec = 0.0

        self.audio = None
        self.asr_segments = []
        self.embeddings = []

        self.aligned_segments = []
        self.merged_segments = []
        self.final_transcript = []

        self.memory_bank = {}
        self.speaker_groups = {}

        self.user_transcript = []

        self.speaker_estimator = speaker_estimator

        print(f"[LOG] Initialization complete | session={self.session_id}")
        
    def load_audio(self, audio_chunk: np.ndarray):
        if audio_chunk is None or len(audio_chunk) == 0:
            self.audio = None
            return
    
        if audio_chunk.dtype != np.float32:
            audio_chunk = audio_chunk.astype(np.float32)
    
        self.audio = audio_chunk
        self.audio_offset_sec = self.total_audio_sec
    
        chunk_duration = len(audio_chunk) / self.sample_rate
        self.total_audio_sec += chunk_duration
    
        print(
            f"[LOG] Audio chunk loaded | "
            f"samples={len(audio_chunk)} | "
            f"offset={self.audio_offset_sec:.2f}s"
        )

    def run_asr(self, whisper_model):
        print("[LOG] Entering run_asr")
    
        if self.audio is None or len(self.audio) < self.sample_rate:
            self.asr_segments = []
            print("[LOG] ASR skipped (audio too short)")
            return
    
        result = whisper_model.transcribe(
            self.audio,
            beam_size=1,
            verbose=False
        )
    
        self.asr_segments = [
            {
                "start": seg["start"] + self.audio_offset_sec,
                "end": seg["end"] + self.audio_offset_sec,
                "text": seg["text"].strip()
            }
            for seg in result.get("segments", [])
        ]
    
        print(f"[LOG] ASR batch completed | segments={len(self.asr_segments)}")

    def extract_embeddings(self, embedder):
        print("[LOG] Entering extract_embeddings")
    
        if self.audio is None:
            return
    
        window_len = int(self.sample_rate * 10)
        stride = int(self.sample_rate * 5)
    
        audio_tensor = torch.from_numpy(self.audio).to(embedder.device)
    
        batch_embeddings = []

        for start in range(0, max(1, len(self.audio) - window_len + 1), stride):
            window = audio_tensor[start:start + window_len]
            if window.shape[0] < window_len:
                continue
    
            window = window.unsqueeze(0)
    
            with torch.no_grad():
                emb = embedder.encode_batch(window)[0].squeeze(0).cpu().numpy()
    
            emb = emb / np.linalg.norm(emb)
    
            batch_embeddings.append({
                "start": self.audio_offset_sec + start / self.sample_rate,
                "end": self.audio_offset_sec + (start + window_len) / self.sample_rate,
                "embedding": emb,
            })
    
        if hasattr(self, "speaker_model_frozen") and self.speaker_model_frozen:
            for emb in batch_embeddings:
                self.assign_embedding_to_speaker(emb)
    
        self.embeddings.extend(batch_embeddings)
        self.embeddings.sort(key=lambda x: x["start"])
    
        print(f"[LOG] Embeddings appended | total={len(self.embeddings)}")

    def cosine(self, a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    def cluster_speakers(self, min_embeddings: int = 16):
        print("[LOG] Entering cluster_speakers (warm-up mode)")
    
        if self.speaker_estimator is None:
            raise RuntimeError(
                "speaker_estimator not provided to TalkNotePipeline"
            )
    
        if hasattr(self, "speaker_model_frozen") and self.speaker_model_frozen:
            print("[LOG] Speaker model frozen — skipping clustering")
            return
    
        if len(self.embeddings) < min_embeddings:
            print("[LOG] Insufficient embeddings for initial clustering")
            return
    
        X = np.stack([e["embedding"] for e in self.embeddings])
    
        n_speakers = self.speaker_estimator(X)
        print(f"[LOG] Initial speaker count estimated: {n_speakers}")
    
        labels = SpectralClustering(
            n_clusters=n_speakers,
            affinity="nearest_neighbors",
            assign_labels="kmeans",
            random_state=42,
        ).fit_predict(X)
    
        for emb, label in zip(self.embeddings, labels):
            emb["cluster"] = int(label)
    
        for emb in self.embeddings:
            spk_id = f"SPEAKER_{emb['cluster']}"
            if spk_id not in self.memory_bank:
                self.memory_bank[spk_id] = {
                    "centroid": emb["embedding"],
                    "count": 1,
                    "sim_history": [],
                }
    
        self.speaker_model_frozen = True
        print("[LOG] Speaker clustering frozen")

    def align_segments(self):
        print("[LOG] Entering align_segments")

        self.aligned_segments = []

        if not self.asr_segments:
            print("[LOG] No ASR segments to align")
            return
    
        if not self.embeddings:
            print("[LOG] No embeddings available for alignment")
            return
    
        if not self.memory_bank:
            print("[LOG] Memory bank empty — cannot assign speakers")
            return
    
        dropped_segments = 0
    
        for seg in self.asr_segments:
            best_score = -1.0
            best_speaker = None
    
            seg_start = seg["start"]
            seg_end = seg["end"]
    
            for emb in self.embeddings:
                overlap = min(seg_end, emb["end"]) - max(seg_start, emb["start"])
                if overlap <= 0:
                    continue
    
                if "cluster" not in emb:
                    continue
    
                spk_id = f"SPEAKER_{emb['cluster']}"
    
                if spk_id not in self.memory_bank:
                    continue
    
                centroid = self.memory_bank[spk_id]["centroid"]
    
                sim = float(np.dot(centroid, emb["embedding"]))
    
                score = overlap * sim
    
                if score > best_score:
                    best_score = score
                    best_speaker = spk_id
    
            if best_speaker is None:
                dropped_segments += 1
                continue
    
            self.aligned_segments.append({
                "start": seg_start,
                "end": seg_end,
                "speaker": best_speaker,
                "text": seg["text"],
            })
    
        print(
            f"[LOG] Alignment completed | "
            f"aligned={len(self.aligned_segments)} | "
            f"dropped={dropped_segments}"
        )

    def update_memory_bank(self):
        print("[LOG] Entering update_memory_bank")

        if not hasattr(self, "speaker_model_frozen") or not self.speaker_model_frozen:
            print("[LOG] Speaker model not frozen — skipping memory update")
            return
    
        if not hasattr(self, "last_memory_update_idx"):
            self.last_memory_update_idx = 0

        new_embeddings = self.embeddings[self.last_memory_update_idx :]
    
        for emb in new_embeddings:
            if "cluster" not in emb:
                continue
            
            speaker_id = f"SPEAKER_{emb['cluster']}"
            vec = emb["embedding"]
    
            if speaker_id not in self.memory_bank:
                self.memory_bank[speaker_id] = {
                    "centroid": vec,
                    "count": 1,
                    "sim_history": []
                }
                continue
    
            old = self.memory_bank[speaker_id]["centroid"]
            sim = float(np.dot(old, vec))
    
            alpha = 0.1 + 0.4 * max(sim, 0)
            new = (1 - alpha) * old + alpha * vec
            new /= np.linalg.norm(new)
    
            self.memory_bank[speaker_id]["centroid"] = new
            self.memory_bank[speaker_id]["count"] += 1
            self.memory_bank[speaker_id]["sim_history"].append(sim)
    
        self.last_memory_update_idx = len(self.embeddings)
    
        print("[LOG] Memory bank updated | speakers:", len(self.memory_bank))

    def deduplicate_text(self, text):
        """Collapse runs of identical consecutive words in ``text``.

        Words are whitespace-separated and compared exactly (case-sensitive);
        the result is re-joined with single spaces.
        """
        tokens = text.split()
        kept = [
            token
            for pos, token in enumerate(tokens)
            if pos == 0 or tokens[pos - 1] != token
        ]
        return " ".join(kept)

    def merge_segments(self, finalize_before_sec: float):
        print("[LOG] Entering merge_segments")
    
        if not self.aligned_segments:
            return
    
        self.aligned_segments.sort(key=lambda x: x["start"])
    
        merged = []
        cur = self.aligned_segments[0]
    
        for seg in self.aligned_segments[1:]:
            gap = seg["start"] - cur["end"]
    
            if seg["speaker"] == cur["speaker"] and gap < 1.0:
                cur["end"] = max(cur["end"], seg["end"])
                cur["text"] += " " + seg["text"]
            else:
                merged.append(cur)
                cur = seg
    
        merged.append(cur)
    
        finalized = []
        remaining = []
    
        for seg in merged:
            if seg["end"] <= finalize_before_sec:
                seg["text"] = self.deduplicate_text(seg["text"])
                finalized.append(seg)
            else:
                remaining.append(seg)
    
        self.final_transcript.extend(finalized)
        self.aligned_segments = remaining
    
        if not hasattr(self, "finalized_until_sec"):
            self.finalized_until_sec = 0.0
    
        self.finalized_until_sec = max(self.finalized_until_sec, finalize_before_sec)
    
        print(
            f"[LOG] Segments finalized={len(finalized)} | "
            f"remaining={len(remaining)}"
        )

    def _collect_speaker_stats(self):
        stats = {}
    
        for seg in self.final_transcript:
            spk = seg["speaker"]
            stats.setdefault(spk, {
                "total_time": 0.0,
                "segments": []
            })
            stats[spk]["total_time"] += seg["end"] - seg["start"]
            stats[spk]["segments"].append(seg)
    
        return stats

    def _should_merge(self, spk_a, spk_b, stats):
        mem = self.memory_bank
    
        dur_a = stats[spk_a]["total_time"]
        dur_b = stats[spk_b]["total_time"]
    
        if min(dur_a, dur_b) < 3.0:
            return False
    
        sim = self.cosine(mem[spk_a]["centroid"], mem[spk_b]["centroid"])
    
        if sim < 0.85:
            return False
    
        for seg_a in stats[spk_a]["segments"]:
            for seg_b in stats[spk_b]["segments"]:
                if max(seg_a["start"], seg_b["start"]) < min(seg_a["end"], seg_b["end"]):
                    return False
    
        dominance_ratio = min(dur_a, dur_b) / max(dur_a, dur_b)
        if dominance_ratio > 0.6:
            return False
    
        return True

    def collapse_speakers(self, min_interval_sec: float = 30.0):
        print("[LOG] Entering collapse_speakers")

        if not self.final_transcript:
            return
    
        if not hasattr(self, "last_collapse_time_sec"):
            self.last_collapse_time_sec = 0.0
    
        if self.total_audio_sec - self.last_collapse_time_sec < min_interval_sec:
            print("[LOG] Skipping collapse (interval gate)")
            return
    
        if not self.final_transcript:
            return
    
        exposed_speakers = {
            seg["speaker"]
            for seg in self.user_transcript
            if "speaker" in seg
        }
    
        stats = self._collect_speaker_stats()
        speakers = list(stats.keys())
        parent = {s: s for s in speakers}
    
        def find(x):
            while parent[x] != x:
                parent[x] = parent[parent[x]]
                x = parent[x]
            return x
    
        def union(a, b):
            ra, rb = find(a), find(b)
            if ra != rb:
                parent[rb] = ra
    
        for i in range(len(speakers)):
            for j in range(i + 1, len(speakers)):
                a, b = speakers[i], speakers[j]
    
                if a in exposed_speakers or b in exposed_speakers:
                    continue
    
                if self._should_merge(a, b, stats):
                    union(a, b)
    
        collapse_map = {}
        for s in speakers:
            root = find(s)
            collapse_map.setdefault(root, []).append(s)
    
        self.speaker_groups = collapse_map
        self.last_collapse_time_sec = self.total_audio_sec
    
        print(f"[LOG] Speaker groups stabilized: {self.speaker_groups}")

    def apply_speaker_collapse(self):
        if not self.speaker_groups:
            return
    
        if not hasattr(self, "speaker_to_person"):
            self.speaker_to_person = {}
            self.next_person_id = 0
    
        for seg in self.user_transcript:
            spk = seg.get("speaker")
            pid = seg.get("person")
            if spk is not None and pid is not None:
                self.speaker_to_person[spk] = pid
                try:
                    self.next_person_id = max(
                        self.next_person_id,
                        int(pid.split("_")[-1]) + 1
                    )
                except Exception:
                    pass
    
        for group in self.speaker_groups.values():
            existing_pid = None
    
            for spk in group:
                if spk in self.speaker_to_person:
                    existing_pid = self.speaker_to_person[spk]
                    break
    
            if existing_pid is None:
                existing_pid = f"PERSON_{self.next_person_id}"
                self.next_person_id += 1
    
            for spk in group:
                self.speaker_to_person[spk] = existing_pid
    
        for seg in self.final_transcript:
            seg["person"] = self.speaker_to_person.get(seg["speaker"])

    def assign_embedding_to_speaker(self, emb):
        best_spk = None
        best_sim = -1.0
    
        for spk_id, mem in self.memory_bank.items():
            sim = float(np.dot(mem["centroid"], emb["embedding"]))
            if sim > best_sim:
                best_sim = sim
                best_spk = spk_id
    
        emb["cluster"] = int(best_spk.split("_")[-1])

    def build_user_transcript(self, force_rebuild: bool = False):
        if force_rebuild:
            print("[LOG] Rebuilding user transcript from scratch")
            self.user_transcript = []
            self.last_emitted_end_sec = -1.0
    
        if not hasattr(self, "last_emitted_end_sec"):
            self.last_emitted_end_sec = -1.0
    
        if not self.final_transcript:
            return
    
        new_segments = []
    
        for seg in self.final_transcript:
            if not force_rebuild and seg["end"] <= self.last_emitted_end_sec:
                continue
    
            person = seg.get("person")
            if person is None:
                continue
    
            new_segments.append({
                "person": person,
                "start": seg["start"],
                "end": seg["end"],
                "text": self.deduplicate_text(seg["text"]),
            })
    
        if not new_segments:
            return
    
        if self.user_transcript:
            prev = self.user_transcript[-1]
            first = new_segments[0]
    
            if (
                prev["person"] == first["person"]
                and first["start"] - prev["end"] < 1.2
            ):
                prev["end"] = first["end"]
                prev["text"] += " " + first["text"]
                new_segments = new_segments[1:]
    
        self.user_transcript.extend(new_segments)
        self.last_emitted_end_sec = self.user_transcript[-1]["end"]
    
        print(
            f"[LOG] User transcript updated | "
            f"segments_added={len(new_segments)} | "
            f"total={len(self.user_transcript)}"
        )

    def finalize_remaining_segments(self):
        print("[LOG] Finalizing remaining segments")
    
        if self.aligned_segments:
            for seg in self.aligned_segments:
                seg["text"] = self.deduplicate_text(seg["text"])
                self.final_transcript.append(seg)
    
            self.aligned_segments = []
    
        self.build_user_transcript(force_rebuild=True)
    
        print(f"[LOG] Final transcript turns={len(self.final_transcript)}")

    def reset_batch_state(self):
        self.audio = None
        self.asr_segments = []
        self.embeddings = []
    
        self.aligned_segments = []
        self.merged_segments = []
        self.final_transcript = []

    def reset_session(self):
        self.total_audio_sec = 0.0
    
        self.memory_bank = {}
        self.speaker_groups = {}
    
        self.user_transcript = []

        for attr in [
            "speaker_model_frozen",
            "last_memory_update_idx",
            "last_collapse_time_sec",
            "speaker_to_person",
            "next_person_id",
        ]:
            if hasattr(self, attr):
                delattr(self, attr)
    
        self.reset_batch_state()

    def run(self, whisper_model, embedder):
        print("[LOG] Starting pipeline step")
    
        if self.audio is None:
            return
    
        self.run_asr(whisper_model)
        self.extract_embeddings(embedder)
    
        if not hasattr(self, "speaker_model_frozen"):
            self.cluster_speakers()
    
        self.update_memory_bank()
        self.align_segments()
    
        finalize_cutoff = (
            self.total_audio_sec - 2.0
            if self.total_audio_sec > 2.0
            else None
        )
    
        if finalize_cutoff is not None:
            self.merge_segments(finalize_before_sec=finalize_cutoff)
    
        self.collapse_speakers()
        self.apply_speaker_collapse()
        self.build_user_transcript()
    
        print("[LOG] Pipeline step complete")

In [4]:
class TalkNoteHealthMonitor:
    def __init__(self, window_sec: float = 60.0):
        """Create a monitor.

        Args:
            window_sec: Default look-back window, in seconds, used by
                ``compute_ambiguity_coverage`` when no explicit window is
                passed to it.
        """
        print("[LOG] Initializing TalkNoteHealthMonitor")

        self.window_sec = window_sec

        print("[LOG] Health monitor ready")

    def detect_temporal_overlap_zones(self, final_transcript):
        zones = []

        if not final_transcript:
            return zones

        segs = sorted(final_transcript, key=lambda x: x["start"])

        for i in range(len(segs)):
            for j in range(i + 1, len(segs)):
                if segs[i]["speaker"] == segs[j]["speaker"]:
                    continue

                start = max(segs[i]["start"], segs[j]["start"])
                end = min(segs[i]["end"], segs[j]["end"])

                if start < end:
                    zones.append({
                        "start": start,
                        "end": end,
                        "source": "temporal_overlap",
                        "speakers": [segs[i]["speaker"], segs[j]["speaker"]],
                    })

        return zones

    def detect_boundary_ambiguity_zones(
        self,
        embeddings,
        bci_low: float = 0.4,
        bci_high: float = 0.75,
    ):
        zones = []

        if not embeddings or len(embeddings) < 2:
            return zones

        for i in range(len(embeddings) - 1):
            a = embeddings[i]
            b = embeddings[i + 1]

            bci = float(np.dot(a["embedding"], b["embedding"]))

            if not (bci_low <= bci <= bci_high):
                continue

            if b["start"] > a["end"]:
                zones.append({
                    "start": a["end"],
                    "end": b["start"],
                    "source": "boundary_ambiguity",
                    "bci": bci,
                })

        return zones

    def detect_speaker_switch_zones(
        self,
        final_transcript,
        delta: float = 1.0,
    ):
        zones = []

        if not final_transcript or len(final_transcript) < 2:
            return zones

        segs = sorted(final_transcript, key=lambda x: x["start"])

        for i in range(len(segs) - 1):
            a = segs[i]
            b = segs[i + 1]

            if a["speaker"] == b["speaker"]:
                continue

            gap = b["start"] - a["end"]

            if abs(gap) <= delta:
                zones.append({
                    "start": a["end"],
                    "end": b["start"],
                    "source": "speaker_switch_proximity",
                    "speakers": [a["speaker"], b["speaker"]],
                })

        return zones

    def collect_ambiguity_zones(
        self,
        final_transcript,
        embeddings,
        bci_low: float = 0.4,
        bci_high: float = 0.75,
        delta: float = 1.0,
    ):
        zones = []

        zones.extend(
            self.detect_temporal_overlap_zones(final_transcript)
        )

        zones.extend(
            self.detect_boundary_ambiguity_zones(
                embeddings,
                bci_low=bci_low,
                bci_high=bci_high,
            )
        )

        zones.extend(
            self.detect_speaker_switch_zones(
                final_transcript,
                delta=delta,
            )
        )

        return zones

    def compute_ambiguity_coverage(
        self,
        ambiguous_zones,
        total_audio_sec: float,
        window_sec: float | None = None,
    ):
        if not ambiguous_zones or total_audio_sec <= 0.0:
            return 0.0, {}

        window = window_sec if window_sec is not None else self.window_sec
        window_start = max(0.0, total_audio_sec - window)

        total_ambiguous = 0.0
        by_source = defaultdict(float)

        for z in ambiguous_zones:
            start = max(z["start"], window_start)
            end = min(z["end"], total_audio_sec)

            if end > start:
                dur = end - start
                total_ambiguous += dur
                by_source[z["source"]] += dur

        coverage = total_ambiguous / max(window, 1e-6)
        by_source_norm = {
            src: dur / max(window, 1e-6)
            for src, dur in by_source.items()
        }

        return coverage, by_source_norm

    def compute_memory_stability(self, memory_bank):
        sims = []

        for spk in memory_bank.values():
            sims.extend(spk.get("sim_history", []))

        if not sims:
            return 0.0

        return float(np.mean(sims))

    def compute_boundary_consistency(self, embeddings):
        if not embeddings or len(embeddings) < 2:
            return 0.0

        bci_vals = [
            float(np.dot(
                embeddings[i]["embedding"],
                embeddings[i + 1]["embedding"]
            ))
            for i in range(len(embeddings) - 1)
        ]

        if not bci_vals:
            return 0.0

        return float(np.mean(bci_vals))

    def compute_health_metrics(
        self,
        memory_bank,
        embeddings,
        ambiguity_coverage: float,
    ):
        memory_stability = self.compute_memory_stability(memory_bank)
        boundary_consistency = self.compute_boundary_consistency(embeddings)

        high_instability = (
            boundary_consistency < 0.6 and
            ambiguity_coverage > 0.05
        )

        return {
            "memory_stability": memory_stability,
            "boundary_consistency": boundary_consistency,
            "ambiguity_coverage": ambiguity_coverage,
            "high_instability": high_instability,
        }

    def evaluate(
        self,
        *,
        final_transcript,
        embeddings,
        memory_bank,
        total_audio_sec: float,
        bci_low: float = 0.4,
        bci_high: float = 0.75,
        delta: float = 1.0,
    ):
        ambiguous_zones = self.collect_ambiguity_zones(
            final_transcript=final_transcript,
            embeddings=embeddings,
            bci_low=bci_low,
            bci_high=bci_high,
            delta=delta,
        )

        ambiguity_coverage, coverage_by_source = (
            self.compute_ambiguity_coverage(
                ambiguous_zones=ambiguous_zones,
                total_audio_sec=total_audio_sec,
            )
        )

        metrics = self.compute_health_metrics(
            memory_bank=memory_bank,
            embeddings=embeddings,
            ambiguity_coverage=ambiguity_coverage,
        )

        return {
            "ambiguous_zones": ambiguous_zones,
            "ambiguity_coverage": ambiguity_coverage,
            "coverage_by_source": coverage_by_source,
            "metrics": metrics,
        }

In [5]:
# Human-readable descriptions of the entries returned by
# TalkNoteHealthMonitor.compute_health_metrics (keys match that dict).
METRIC_DOCS = {
    "memory_stability": "Mean cosine similarity observed during memory updates",
    "boundary_consistency": "Mean cosine similarity between adjacent embeddings",
    "ambiguity_coverage": "Fraction of recent audio marked ambiguous",
    "high_instability": "Flag for low BCI + high ambiguity"
}

In [6]:
def estimate_num_speakers(X, k_min=2, k_max=8):
    """Pick the cluster count with the best cosine silhouette score.

    Every ``k`` from ``k_min`` up to ``min(k_max, len(X) - 1)`` is tried with
    spectral clustering on a nearest-neighbour affinity graph.

    Args:
        X: Embedding matrix, one row per embedding.
        k_min: Smallest cluster count to try; also the fallback result.
        k_max: Largest cluster count to try.

    Returns:
        The best-scoring ``k``, or ``k_min`` when no candidate could be scored
        (for example when ``X`` has too few rows).
    """
    n_samples = len(X)
    best_k = k_min
    best_score = -1.0

    upper = min(k_max, n_samples - 1)

    for k in range(k_min, upper + 1):
        labels = SpectralClustering(
            n_clusters=k,
            affinity="nearest_neighbors",
            # The neighbour graph cannot use more neighbours than there are
            # samples; 10 is the library default and is kept whenever possible.
            n_neighbors=min(10, n_samples),
            assign_labels="kmeans",
            random_state=42,
        ).fit_predict(X)

        # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
        # and raises otherwise; a degenerate labelling is just not a candidate.
        n_labels = len(set(labels.tolist()))
        if n_labels < 2 or n_labels > n_samples - 1:
            continue

        score = silhouette_score(X, labels, metric="cosine")
        if score > best_score:
            best_score = score
            best_k = k

    return best_k

In [7]:
# ASR model: the Whisper "base" checkpoint, placed on DEVICE. It is later
# handed to TalkNotePipeline.run, which calls its `transcribe` method.
print("[LOG] Loading Whisper ASR model")
asr_model = whisper.load_model("base", device=DEVICE)

# Speaker-embedding model loaded from the "speechbrain/spkrec-ecapa-voxceleb"
# source, also on DEVICE. The pipeline calls its `encode_batch` on each
# audio window.
print("[LOG] Loading ECAPA-TDNN embedder")
embedder = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": DEVICE}
)

[LOG] Loading Whisper ASR model


100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 131MiB/s]


[LOG] Loading ECAPA-TDNN embedder


hyperparams.yaml: 0.00B [00:00, ?B/s]

  available_backends = torchaudio.list_audio_backends()
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt: 0.00B [00:00, ?B/s]

In [8]:
from glob import glob

# Directory scanned for input recordings (*.wav).
AUDIO_DIR = "/kaggle/input/voxconverse-dataset/voxconverse_dev_wav/audio"
# Upper bound on how many recordings are kept.
MAX_FILES = 3

# Sorted so the selection is the same on every run; an empty list simply
# means the later processing loop has nothing to do.
audio_files = sorted(glob(os.path.join(AUDIO_DIR, "*.wav")))[:MAX_FILES]

print(f"[LOG] Found {len(audio_files)} audio files")

[LOG] Found 3 audio files


In [9]:
# Streaming pipeline under test; estimate_num_speakers supplies the speaker
# count for the warm-up clustering.
print("[LOG] Creating TalkNotePipeline object")

pipeline = TalkNotePipeline(
    session_id="test_session",
    run_dir=".",
    sample_rate=SAMPLE_RATE,
    speaker_estimator=estimate_num_speakers
)

# Log message previously read "TalkNoteHeMonitor"; it now names the class
# that is actually instantiated.
print("[LOG] Creating TalkNoteHealthMonitor object")

monitor = TalkNoteHealthMonitor(window_sec=60.0)

[LOG] Creating TalkNotePipeline object
[LOG] Initializing TalkNotePipeline
[LOG] Initialization complete | session=test_session
[LOG] Creating TalkNoteHeMonitor object
[LOG] Initializing TalkNoteHealthMonitor
[LOG] Health monitor ready


In [10]:
# Offline validation: replay recordings through the pipeline chunk by chunk,
# as if they were streamed, then print transcript and health summaries.
print("[LOG] Starting offline validation run")

# Only the first discovered recording is processed.
TEST_FILES = audio_files[:1]

for idx, audio_path in enumerate(TEST_FILES):
    print(f"\n[LOG] File {idx+1}/{len(TEST_FILES)}: {os.path.basename(audio_path)}")

    # Start every recording from a clean pipeline state.
    pipeline.reset_session()

    # Load the whole recording as mono audio resampled to SAMPLE_RATE.
    audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)

    # Same formula as the CHUNK_SAMPLES constant in the configuration cell.
    CHUNK_SAMPLES = int(SAMPLE_RATE * ASR_CHUNK_SEC)

    # Feed the recording in fixed-size chunks; the last one may be shorter.
    # NOTE(review): cluster_speakers defaults to min_embeddings=16, so a
    # recording that yields fewer embeddings than that never leaves warm-up
    # and finishes with an empty transcript, as in the output recorded for
    # this cell (6 embeddings, 0 turns).
    for start in range(0, len(audio), CHUNK_SAMPLES):
        chunk = audio[start : start + CHUNK_SAMPLES]
        pipeline.load_audio(chunk)
        pipeline.run(whisper_model=asr_model, embedder=embedder)

    # Flush whatever is still pending and rebuild the user-facing transcript.
    pipeline.finalize_remaining_segments()

    # -------- Evaluation --------
    health = monitor.evaluate(
        final_transcript=pipeline.final_transcript,
        embeddings=pipeline.embeddings,
        memory_bank=pipeline.memory_bank,
        total_audio_sec=pipeline.total_audio_sec,
    )

    # -------- Inspection --------
    print("[RESULT] Turns:", len(pipeline.user_transcript))
    print("[RESULT] Speakers:", {
        seg["person"] for seg in pipeline.user_transcript
    })

    print("[HEALTH]")
    for k, v in health["metrics"].items():
        print(f"  {k}: {v}")

    print("[AMBIGUITY]")
    print("  zones:", len(health["ambiguous_zones"]))
    print("  coverage:", round(health["ambiguity_coverage"], 4))

print("FINISH")

[LOG] Starting offline validation run

[LOG] File 1/1: abjxc.wav
[LOG] Audio chunk loaded | samples=160000 | offset=0.00s
[LOG] Starting pipeline step
[LOG] Entering run_asr
Detected language: English


100%|██████████| 1000/1000 [00:01<00:00, 932.12frames/s]


[LOG] ASR batch completed | segments=2
[LOG] Entering extract_embeddings
[LOG] Embeddings appended | total=1
[LOG] Entering cluster_speakers (warm-up mode)
[LOG] Insufficient embeddings for initial clustering
[LOG] Entering update_memory_bank
[LOG] Speaker model not frozen — skipping memory update
[LOG] Entering align_segments
[LOG] Memory bank empty — cannot assign speakers
[LOG] Entering merge_segments
[LOG] Entering collapse_speakers
[LOG] Pipeline step complete
[LOG] Audio chunk loaded | samples=160000 | offset=10.00s
[LOG] Starting pipeline step
[LOG] Entering run_asr
Detected language: English


100%|██████████| 1000/1000 [00:00<00:00, 3100.20frames/s]


[LOG] ASR batch completed | segments=2
[LOG] Entering extract_embeddings
[LOG] Embeddings appended | total=2
[LOG] Entering cluster_speakers (warm-up mode)
[LOG] Insufficient embeddings for initial clustering
[LOG] Entering update_memory_bank
[LOG] Speaker model not frozen — skipping memory update
[LOG] Entering align_segments
[LOG] Memory bank empty — cannot assign speakers
[LOG] Entering merge_segments
[LOG] Entering collapse_speakers
[LOG] Pipeline step complete
[LOG] Audio chunk loaded | samples=160000 | offset=20.00s
[LOG] Starting pipeline step
[LOG] Entering run_asr
Detected language: English


100%|██████████| 1000/1000 [00:00<00:00, 3083.44frames/s]


[LOG] ASR batch completed | segments=3
[LOG] Entering extract_embeddings
[LOG] Embeddings appended | total=3
[LOG] Entering cluster_speakers (warm-up mode)
[LOG] Insufficient embeddings for initial clustering
[LOG] Entering update_memory_bank
[LOG] Speaker model not frozen — skipping memory update
[LOG] Entering align_segments
[LOG] Memory bank empty — cannot assign speakers
[LOG] Entering merge_segments
[LOG] Entering collapse_speakers
[LOG] Pipeline step complete
[LOG] Audio chunk loaded | samples=160000 | offset=30.00s
[LOG] Starting pipeline step
[LOG] Entering run_asr
Detected language: English


100%|██████████| 1000/1000 [00:00<00:00, 2738.20frames/s]


[LOG] ASR batch completed | segments=3
[LOG] Entering extract_embeddings
[LOG] Embeddings appended | total=4
[LOG] Entering cluster_speakers (warm-up mode)
[LOG] Insufficient embeddings for initial clustering
[LOG] Entering update_memory_bank
[LOG] Speaker model not frozen — skipping memory update
[LOG] Entering align_segments
[LOG] Memory bank empty — cannot assign speakers
[LOG] Entering merge_segments
[LOG] Entering collapse_speakers
[LOG] Pipeline step complete
[LOG] Audio chunk loaded | samples=160000 | offset=40.00s
[LOG] Starting pipeline step
[LOG] Entering run_asr
Detected language: English


100%|██████████| 1000/1000 [00:00<00:00, 2481.05frames/s]


[LOG] ASR batch completed | segments=5
[LOG] Entering extract_embeddings
[LOG] Embeddings appended | total=5
[LOG] Entering cluster_speakers (warm-up mode)
[LOG] Insufficient embeddings for initial clustering
[LOG] Entering update_memory_bank
[LOG] Speaker model not frozen — skipping memory update
[LOG] Entering align_segments
[LOG] Memory bank empty — cannot assign speakers
[LOG] Entering merge_segments
[LOG] Entering collapse_speakers
[LOG] Pipeline step complete
[LOG] Audio chunk loaded | samples=160000 | offset=50.00s
[LOG] Starting pipeline step
[LOG] Entering run_asr
Detected language: English


100%|██████████| 1000/1000 [00:00<00:00, 2648.69frames/s]


[LOG] ASR batch completed | segments=4
[LOG] Entering extract_embeddings
[LOG] Embeddings appended | total=6
[LOG] Entering cluster_speakers (warm-up mode)
[LOG] Insufficient embeddings for initial clustering
[LOG] Entering update_memory_bank
[LOG] Speaker model not frozen — skipping memory update
[LOG] Entering align_segments
[LOG] Memory bank empty — cannot assign speakers
[LOG] Entering merge_segments
[LOG] Entering collapse_speakers
[LOG] Pipeline step complete
[LOG] Audio chunk loaded | samples=134016 | offset=60.00s
[LOG] Starting pipeline step
[LOG] Entering run_asr
Detected language: English


100%|██████████| 837/837 [00:00<00:00, 2882.28frames/s]

[LOG] ASR batch completed | segments=2
[LOG] Entering extract_embeddings
[LOG] Embeddings appended | total=6
[LOG] Entering cluster_speakers (warm-up mode)
[LOG] Insufficient embeddings for initial clustering
[LOG] Entering update_memory_bank
[LOG] Speaker model not frozen — skipping memory update
[LOG] Entering align_segments
[LOG] Memory bank empty — cannot assign speakers
[LOG] Entering merge_segments
[LOG] Entering collapse_speakers
[LOG] Pipeline step complete
[LOG] Finalizing remaining segments
[LOG] Rebuilding user transcript from scratch
[LOG] Final transcript turns=0
[RESULT] Turns: 0
[RESULT] Speakers: set()
[HEALTH]
  memory_stability: 0.0
  boundary_consistency: 0.8562325716018677
  ambiguity_coverage: 0.0
  high_instability: False
[AMBIGUITY]
  zones: 0
  coverage: 0.0
FINISH



