In [None]:
import torch
from pyannote.audio import Pipeline
import pandas as pd
import os
import torchaudio

# -----------------------------
# 1. Load VAD pipeline
# -----------------------------
pipeline = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token="YOUR_HUGGING_FACE_TOKEN"
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)

  from .autonotebook import tqdm as notebook_tqdm
/home/teaching/miniconda3/lib/python3.13/site-packages/pytorch_lightning/utilities/migration/migration.py:208: You have multiple `ModelCheckpoint` callback states in this checkpoint, but we found state keys that would end up colliding with each other after an upgrade, which means we can't differentiate which of your checkpoint callbacks needs which states. At least one of your `ModelCheckpoint` callbacks will not be able to reload the state.
Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e755bb7223f76666bba4/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.7.0+cu126. Bad things might happen unless you revert torch to 1.x.


<pyannote.audio.pipelines.voice_activity_detection.VoiceActivityDetection at 0x7075c2edf380>

In [2]:
# -----------------------------
# 2. VAD + Collect Segments
# -----------------------------
def run_vad_on_audio(audio_path):
    audio_name = os.path.basename(audio_path)
    audio_basename = os.path.splitext(audio_name)[0]

    # Load audio to get duration
    waveform, sample_rate = torchaudio.load(audio_path)
    duration = waveform.size(1) / sample_rate

    # Run VAD
    vad_result = pipeline(audio_path)
    timeline = vad_result.get_timeline().support()

    segments = []
    utt_id_counter = 0
    prev_end = 0.0

    for segment in timeline:
        # Non-speech before speech
        if segment.start > prev_end:
            segments.append({
                "audio_name": audio_name,
                "utt_id": f"{audio_basename}_{utt_id_counter}",
                "start": round(prev_end, 2),
                "end": round(segment.start, 2),
                "length": round(segment.start - prev_end, 2),
                "language_tag": "NON_SPEECH",
                "overlap_diff_lang": "NA",
                "dev_eval_status": "NA"
            })
            utt_id_counter += 1

        # Speech segment
        segments.append({
            "audio_name": audio_name,
            "utt_id": f"{audio_basename}_{utt_id_counter}",
            "start": round(segment.start, 2),
            "end": round(segment.end, 2),
            "length": round(segment.end - segment.start, 2),
            "language_tag": "language",
            "overlap_diff_lang": "NA",
            "dev_eval_status": "NA"
        })
        utt_id_counter += 1
        prev_end = segment.end

    # Final non-speech after last segment
    if prev_end < duration:
        segments.append({
            "audio_name": audio_name,
            "utt_id": f"{audio_basename}_{utt_id_counter}",
            "start": round(prev_end, 2),
            "end": round(duration, 2),
            "length": round(duration - prev_end, 2),
            "language_tag": "NON_SPEECH",
            "overlap_diff_lang": "NA",
            "dev_eval_status": "NA"
        })

    return segments



In [3]:
# -----------------------------
# 3. Process All Audio Files
# -----------------------------
audio_dir = "/home/teaching/Desktop/priyam/_audio"  # Change this
output_csv = "/home/teaching/Desktop/priyam/labels/vad_all_segments_all_audios.csv"

all_segments = []

for filename in os.listdir(audio_dir):
    if filename.endswith(".wav"):
        audio_path = os.path.join(audio_dir, filename)
        try:
            print(f"Processing: {filename}")
            segments = run_vad_on_audio(audio_path)
            all_segments.extend(segments)
        except Exception as e:
            print(f"Failed on {filename}: {e}")

# -----------------------------
# 4. Save to One CSV
# -----------------------------
df = pd.DataFrame(all_segments)
df.to_csv(output_csv, index=False)
print(f"\nSaved combined VAD results to: {output_csv}")


Processing: TTS_P85881TT_VCST_ECxxx_01_AO_53262906_v001_R007_CRR_MERLIon-CCS.wav


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



Processing: TTS_P32758TT_VCST_ECxxx_01_AO_67891948_v001_R004_CRR_MERLIon-CCS.wav
Processing: TTS_P75358TT_VCST_ECxxx_02_AO_45793475_v001_R007_CRR_MERLIon-CCS.wav
Processing: TTS_P26838TT_VCST_ECxxx_01_AO_76524185_v001_R007_CRR_MERLIon-CCS.wav
Processing: TTS_P70984TT_VCST_ECxxx_01_AO_30109736_v001_R003_CRR_MERLIon-CCS.wav
Processing: TTS_P42483TT_VCST_ECxxx_02_AO_46455754_v001_R011_CRR_MERLIon-CCS.wav
Processing: TTS_P91182TT_VCST_ECxxx_01_AO_48503281_v001_R004_CRR_MERLIon-CCS.wav
Processing: TTS_P36907TT_VCST_ECxxx_01_AO_78388818_v001_R011_CRR_MERLIon-CCS.wav
Processing: TTS_P11137TT_VCST_ECxxx_01_AO_22759572_v001_R011_CRR_MERLIon-CCS.wav
Processing: TTS_P64800TT_VCST_ECxxx_03_AO_16388055_v001_R003_CRR_MERLIon-CCS.wav
Processing: TTS_P99608TT_VCST_ECxxx_01_AO_93135689_v001_R004_CRR_MERLIon-CCS.wav
Processing: TTS_P66635TT_VCST_ECxxx_02_AO_60338760_v001_R003_CRR_MERLIon-CCS.wav
Processing: TTS_P58082TT_VCST_ECxxx_02_AO_37798398_v001_R011_CRR_MERLIon-CCS.wav
Processing: TTS_P46524TT_VCS