In [2]:
import os
import json
import pandas as pd
import time

from pyannote.audio import Pipeline


In [10]:
# Read the refined annotation dataset from disk and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="../data/refined_dataset.csv")
df.head(n=2)


Unnamed: 0,audio,speaker,speaker_count
0,../audios-wav/12-audios-ar-en/6-audios-ar/1_sp...,"[{""start"":0.025320884681576335,""end"":11.079020...",1.0
1,../audios-wav/12-audios-ar-en/6-audios-ar/1_sp...,"[{""start"":0.1782457853618421,""end"":116.0241887...",1.0


In [11]:
import ast

# Decode every serialized annotation string in "speaker" into Python objects
# and keep the result in a new "segments" column.
df["segments"] = df["speaker"].apply(json.loads)

# Sanity check: show the path, ground-truth count and leading segments of row 0.
print("Audio file:", df.at[0, "audio"])
print("Speaker count (ground truth):", df.at[0, "speaker_count"])
print("First 2 segments:", df.at[0, "segments"][:2])


Audio file: ../audios-wav/12-audios-ar-en/6-audios-ar/1_speaker_ar/solo10_ar.wav
Speaker count (ground truth): 1.0
First 2 segments: [{'start': 0.025320884681576335, 'end': 11.079020083885514, 'channel': 0, 'labels': ['Speaker 1']}, {'start': 11.353187839068042, 'end': 28.60528196857287, 'channel': 0, 'labels': ['Speaker 1']}]


In [15]:
import os, time, json
from dotenv import load_dotenv
from pyannote.audio import Pipeline

# Load the Hugging Face access token from a local .env file.
load_dotenv()
hf_token = os.getenv("HUGGINGFACE_TOKEN")
# Raise explicitly instead of using `assert`: assertions are removed when Python
# runs with -O, and an empty token is just as unusable as a missing one.
if not hf_token:
    raise RuntimeError("HUGGINGFACE_TOKEN not found in .env")

# Load diarization pipeline.
# NOTE(review): the recorded output of this cell warns that this checkpoint was
# trained with much older pyannote.audio / torch releases than the installed
# ones -- confirm that this is the intended pipeline version.
print("[INFO] Loading pyannote pipeline...")
start_time = time.perf_counter()  # monotonic clock: safe for measuring durations
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)
# from_pretrained() reports a failed download of a gated/private pipeline by
# printing a hint and returning None; stop here with a clear error instead of
# failing later with "'NoneType' object is not callable".
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization'. Check that the token is "
        "valid and that the model's user conditions were accepted on Hugging Face."
    )
print(f"[INFO] Pipeline loaded in {time.perf_counter() - start_time:.2f} sec")

# Pick one audio for now (first row of the dataset) as a smoke test.
test_audio = df.loc[0, "audio"]
print(f"[INFO] Starting diarization for: {test_audio}")

file_start = time.perf_counter()
diarization = pipeline(test_audio)
file_end = time.perf_counter()

# Collect predictions as plain dicts with "start", "end" and "labels" keys,
# mirroring the keys used by the parsed ground-truth segments.
pred_segments = [
    {
        "start": float(turn.start),
        "end": float(turn.end),
        "labels": [speaker],
    }
    for turn, _, speaker in diarization.itertracks(yield_label=True)
]

print(f"[INFO] Finished diarization in {file_end - file_start:.2f} sec")
print(f"[INFO] Total segments detected: {len(pred_segments)}")
print("Preview (first 5):", json.dumps(pred_segments[:5], indent=2))


[INFO] Loading pyannote pipeline...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.2. Bad things might happen unless you revert torch to 1.x.
[INFO] Pipeline loaded in 2.19 sec
[INFO] Starting diarization for: ../audios-wav/12-audios-ar-en/6-audios-ar/1_speaker_ar/solo10_ar.wav


KeyboardInterrupt: 

In [16]:
import pathlib

# Run the diarization pipeline over every audio file listed in `df`, write one
# JSON file of predicted segments per audio, and record a per-file summary.
results = []
output_dir = "../results/pyannote_predictions"
summary_path = "../results/pyannote_summary.csv"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

n_files = len(df)
try:
    # enumerate() gives the true 1-based position; the DataFrame index label is
    # only equal to the position when the frame has a default RangeIndex.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        audio_path = row["audio"]
        # NOTE(review): outputs are keyed by file stem only, so two audios with
        # the same name in different folders would overwrite each other.
        audio_name = pathlib.Path(audio_path).stem
        print(f"\n[INFO] ({position}/{n_files}) Processing {audio_name} ...")

        start_time = time.perf_counter()  # monotonic clock for elapsed time
        try:
            diarization = pipeline(audio_path)

            pred_segments = [
                {
                    "start": float(turn.start),
                    "end": float(turn.end),
                    "labels": [speaker],
                }
                for turn, _, speaker in diarization.itertracks(yield_label=True)
            ]

            duration = time.perf_counter() - start_time
            print(f"[INFO] Finished {audio_name} in {duration:.2f} sec ({len(pred_segments)} segments)")

            # Save predictions to JSON
            out_file = f"{output_dir}/{audio_name}_pyannote.json"
            with open(out_file, "w", encoding="utf-8") as f:
                json.dump(pred_segments, f, indent=2)

            # Append summary to results
            results.append({
                "audio": audio_path,
                "n_segments": len(pred_segments),
                "runtime_sec": duration,
                "output_file": out_file
            })

        except Exception as e:
            # Best effort: record the failure and continue with the next file.
            print(f"[ERROR] Failed on {audio_name}: {e}")
            results.append({
                "audio": audio_path,
                "error": str(e)
            })
finally:
    # Save overall results as CSV. Doing this in `finally` keeps the summary of
    # already-processed files when the (long) run is interrupted or crashes.
    results_df = pd.DataFrame(results)
    results_df.to_csv(summary_path, index=False)
    if len(results) < n_files:
        print(f"\n[WARN] Stopped early: partial summary ({len(results)}/{n_files} files) saved to {summary_path}")

print(f"\n[INFO] All files processed. Summary saved to {summary_path}")
results_df.head()



[INFO] (1/12) Processing solo10_ar ...


KeyboardInterrupt: 