# Speaker diarization using WhisperX:
- To collect the start and end timestamps of each speaker segment together with the corresponding speaker IDs

In [None]:
# Mount Google Drive at /content/drive; the audio and CSV paths used in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install git+https://github.com/m-bain/whisperx.git

Collecting git+https://github.com/m-bain/whisperx.git
  Cloning https://github.com/m-bain/whisperx.git to /tmp/pip-req-build-lraljtuc
  Running command git clone --filter=blob:none --quiet https://github.com/m-bain/whisperx.git /tmp/pip-req-build-lraljtuc
  Resolved https://github.com/m-bain/whisperx.git to commit 58f00339af7dcc9705ef49d97a1f40764b7cf555
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faster-whisper==1.0.0 (from whisperx==3.1.1)
  Downloading faster_whisper-1.0.0-py3-none-any.whl.metadata (14 kB)
Collecting pyannote.audio==3.1.1 (from whisperx==3.1.1)
  Downloading pyannote.audio-3.1.1-py2.py3-none-any.whl.metadata (9.3 kB)
Collecting av==11.* (from faster-whisper==1.0.0->whisperx==3.1.1)
  Downloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper==1.0.0->whisperx==3.1.1)
  Downloading ctranslate2-4.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

In [None]:
import os
from getpass import getpass

# Never store the Hugging Face token in the notebook: read it from the
# HF_TOKEN environment variable, or prompt for it (input is not echoed and
# is not saved with the notebook).
# NOTE(review): any token that was previously committed in this cell should
# be revoked in the Hugging Face account settings.
YOUR_HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

In [None]:
import whisperx
import gc
import torch

# Use the GPU when one is attached; otherwise fall back to CPU so the cell
# still runs (slowly) instead of failing on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"
audio_file = "/content/drive/MyDrive/Jyothi Mam/second/modified_audio.wav"
batch_size = 16 # reduce if low on GPU mem
# float16 needs a GPU; int8 is the compute type to use on CPU
# (on GPU, switch to "int8" if low on GPU mem - may reduce accuracy)
compute_type = "float16" if device == "cuda" else "int8"

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("large-v2", device, compute_type=compute_type)
audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# 2. Align whisper output
# The alignment model is chosen from the language detected during transcription.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
# print(result["segments"]) # after alignment

# 3. Assign speaker labels
# The diarization pipeline is authenticated with the Hugging Face token
# defined in an earlier cell.
diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
diarize_segments = diarize_model(audio)
# `result` is replaced by the output of assign_word_speakers; the CSV export
# cell reads the "speaker" field of its segments.
result = whisperx.assign_word_speakers(diarize_segments, result)

In [None]:
import csv

# Destination CSV holding one row per diarized segment
csv_file_path = "/content/drive/MyDrive/Jyothi Mam/second/transcription.csv"

# Append mode: rows are added to an existing file, so re-running this cell
# adds the same segments again - delete the file first for a clean export.
# UTF-8 is set explicitly so non-ASCII transcript text is written the same
# way on every platform.
with open(csv_file_path, mode='a', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    if csv_file.tell() == 0:
        # Position 0 in append mode means the file is empty: write the header once
        csv_writer.writerow(["Text", "Start Time", "End Time", "Speaker"])
    for segment in result["segments"]:
        csv_writer.writerow([
            segment["text"],
            segment["start"],
            segment["end"],
            # A segment that received no speaker label has no "speaker" key;
            # label it instead of raising KeyError and losing the export.
            segment.get("speaker", "UNKNOWN"),
        ])


# Speaker Segmentation using Timestamps for Audio Analysis
Collects each speaker's audio according to the timestamps

In [None]:
# Mount Google Drive at /content/drive so the audio file and the
# transcription CSV read below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
from pydub import AudioSegment
import pandas as pd

# Source recording and the per-segment transcription CSV
# (columns shown by the preview: Text, Start Time, End Time, Speaker).
audio_file_path = "/content/drive/MyDrive/Jyothi Mam/second/modified_audio.wav"
csv_file_path = "/content/drive/MyDrive/Jyothi Mam/second/transcription.csv"

# `audio` is the whole recording as a pydub AudioSegment; `df` holds one row
# per transcribed segment. Both are consumed by the segmentation cell.
audio = AudioSegment.from_file(audio_file_path)
df = pd.read_csv(csv_file_path)
# Quick preview of the first rows to confirm the CSV loaded as expected
print(df.head())

                                                Text  Start Time  End Time  \
0                            Tell me about yourself.       0.472     1.452   
1                             Good afternoon, ma'am.       2.893     3.754   
2  I'm Thadala Veerabargiri from West Godavari an...       3.994    14.058   
3  I have completed my intermediate in Sasi Junio...      14.478    20.181   
4   Coming to my family, my family includes my fa...      24.585    36.711   

      Speaker  
0  SPEAKER_01  
1  SPEAKER_00  
2  SPEAKER_00  
3  SPEAKER_00  
4  SPEAKER_00  


In [None]:
import os

# Folder that receives one WAV file per speaker turn. It is created up front
# because exporting into a missing directory fails.
output_dir = "/content/drive/MyDrive/Jyothi Mam/second/temp"
os.makedirs(output_dir, exist_ok=True)


def _export_turn(turn_audio, speaker, turn_index):
    """Write one speaker turn to ``<output_dir>/<speaker>_<turn_index>.wav``.

    Prints the turn length (len() of the audio, reported in ms) and the
    destination path.
    """
    output_path = f"{output_dir}/{speaker}_{turn_index}.wav"
    print(f"Length of combined audio for {speaker}: {len(turn_audio)} ms")
    # export() hands back the file object it wrote to; close it so the data
    # is flushed and the handle is not leaked.
    turn_audio.export(output_path, format="wav").close()
    print(f"Speaker {speaker} segment exported to {output_path}")


# Walk the segments in CSV order, gluing consecutive segments of the same
# speaker into a single "turn" and exporting the turn when the speaker changes.
current_speaker = None
combined_audio = AudioSegment.empty()
file_index = 1  # running counter over all exported turns (shared by all speakers)
for index, row in df.iterrows():
    # CSV times are multiplied by 1000 to get the millisecond offsets used to
    # slice the recording
    start_time_ms = int(row["Start Time"] * 1000)
    end_time_ms = int(row["End Time"] * 1000)
    speaker_id = row["Speaker"]
    print(f"Processing segment: Start={start_time_ms}, End={end_time_ms}, Speaker={speaker_id}")
    speaker_segment = audio[start_time_ms:end_time_ms]
    if speaker_id == current_speaker:
        # Same speaker keeps talking: extend the current turn
        combined_audio += speaker_segment
    else:
        # Speaker changed: flush the finished turn (if any), then start a new one
        if current_speaker is not None:
            _export_turn(combined_audio, current_speaker, file_index)
            file_index += 1
        current_speaker = speaker_id
        combined_audio = speaker_segment

# Flush the last turn, which no speaker change follows
if current_speaker is not None:
    _export_turn(combined_audio, current_speaker, file_index)

print("All segments exported successfully!")


Processing segment: Start=472, End=1452, Speaker=SPEAKER_01
Processing segment: Start=2893, End=3754, Speaker=SPEAKER_00
Length of combined audio for SPEAKER_01: 980 ms
Speaker SPEAKER_01 segment exported to /content/drive/MyDrive/Jyothi Mam/second/temp/SPEAKER_01_1.wav
Processing segment: Start=3994, End=14058, Speaker=SPEAKER_00
Processing segment: Start=14478, End=20181, Speaker=SPEAKER_00
Processing segment: Start=24585, End=36711, Speaker=SPEAKER_00
Processing segment: Start=38352, End=43015, Speaker=SPEAKER_00
Processing segment: Start=43795, End=53120, Speaker=SPEAKER_00
Processing segment: Start=54193, End=55694, Speaker=SPEAKER_01
Length of combined audio for SPEAKER_00: 42742 ms
Speaker SPEAKER_00 segment exported to /content/drive/MyDrive/Jyothi Mam/second/temp/SPEAKER_00_2.wav
Processing segment: Start=57775, End=61337, Speaker=SPEAKER_00
Length of combined audio for SPEAKER_01: 1501 ms
Speaker SPEAKER_01 segment exported to /content/drive/MyDrive/Jyothi Mam/second/temp/SPE