**You can access TEST-1.mp3 audio here.**

---

[Audio Link](https://drive.google.com/file/d/1ODHCbW7LjuTUxd0njylHHHObwQxQVK6D/view?usp=drive_link)


[Video Link](https://drive.google.com/file/d/1NFifPUnU2w6WZKyEXLyQjcn1lEXnupRt/view?usp=drive_link)

# **INSTALLING MODULES**

In [None]:
!pip install speechbrain==0.5.16
!pip install faster_whisper
!pip install pyannote.audio
!pip install whisper

Collecting speechbrain==0.5.16
  Using cached speechbrain-0.5.16-py3-none-any.whl.metadata (23 kB)
Using cached speechbrain-0.5.16-py3-none-any.whl (630 kB)
Installing collected packages: speechbrain
  Attempting uninstall: speechbrain
    Found existing installation: speechbrain 1.0.0
    Uninstalling speechbrain-1.0.0:
      Successfully uninstalled speechbrain-1.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyannote-audio 3.3.1 requires speechbrain>=1.0.0, but you have speechbrain 0.5.16 which is incompatible.[0m[31m
[0mSuccessfully installed speechbrain-0.5.16


Collecting speechbrain>=1.0.0 (from pyannote.audio)
  Using cached speechbrain-1.0.0-py3-none-any.whl.metadata (23 kB)
Using cached speechbrain-1.0.0-py3-none-any.whl (760 kB)
Installing collected packages: speechbrain
  Attempting uninstall: speechbrain
    Found existing installation: speechbrain 0.5.16
    Uninstalling speechbrain-0.5.16:
      Successfully uninstalled speechbrain-0.5.16
Successfully installed speechbrain-1.0.0




# **IMPORT NECESSARY LIBRARIES**

In [None]:
import librosa
import traceback
from faster_whisper import WhisperModel
import torch
import whisper
import datetime
from pathlib import Path
import pandas as pd
import re
import time
import os
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import speechbrain
from scipy.spatial.distance import cdist


# **UPLOAD AUDIO**

In [None]:
# Location of the uploaded source recording inside the Colab runtime
audio_file_path = "/content/TEST-1.mp3"

# **.mp3 to .wav conversion**

*   Sample Rate - 16 kHz
*   Channel - 1 (mono)
*   Audio Codec - pcm_s16le





In [None]:
# prompt: convert mp3 to wav format

# !ffmpeg -i "{audio_file_path}" "{audio_file_path[:-4]}.wav"
!ffmpeg -i "{audio_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file_path[:-4]}.wav"



ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

# **SPEAKER DIARIZATION - USING WHISPER SEGMENTS AND AGGLOMERATIVE HIERARCHICAL CLUSTERING**

In [None]:


# Whisper model size names (this list is not referenced by the cells shown here)
whisper_models = [
    "tiny",
    "base",
    "small",
    "medium",
    "large-v1",
    "large-v2",
]

# Speaker-embedding model, placed on the GPU when CUDA is available, else on the CPU
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
)


def convert_time(secs):
    """Round `secs` to the nearest whole second and return it as a `datetime.timedelta`."""
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)

def _merge_speaker_turns(segments):
    """Collapse consecutive same-speaker segments into Start/End/Speaker columns."""
    turns = {
        'Start': [],
        'End': [],
        'Speaker': [],
    }
    for i, segment in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            if i != 0:
                # A speaker change closes the previous turn at the previous segment's end.
                turns['End'].append(str(convert_time(segments[i - 1]["end"])))
            turns['Start'].append(str(convert_time(segment["start"])))
            turns['Speaker'].append(segment["speaker"])
    if segments:
        # The final turn always contains the last segment, so it must end at
        # segments[-1]; using segments[i - 1] here cut the last turn short.
        turns['End'].append(str(convert_time(segments[-1]["end"])))
    return turns


def speech_to_text(audio_file, whisper_model, num_speakers=2, save_path=None):
    """Diarize an audio file using Whisper segments and agglomerative clustering.

    The audio is transcribed with faster-whisper to obtain segment boundaries,
    each segment is embedded with the module-level ``embedding_model``, the
    embeddings are clustered into speakers, and consecutive segments of the
    same speaker are merged into turns.

    Args:
        audio_file: Path to the audio file to process.
        whisper_model: Model size/name passed to ``WhisperModel``.
        num_speakers: Number of speaker clusters to form (default 2). It is
            capped at the number of segments; with fewer than two clusters
            every segment is labelled ``SPEAKER 1`` without clustering.
        save_path: Where to write the CSV. Defaults to ``audio_file`` with its
            extension replaced by ``.csv``.

    Returns:
        ``(df_results, save_path)``: a DataFrame with ``Start``, ``End`` and
        ``Speaker`` columns (times formatted as ``H:MM:SS`` strings) and the
        path of the CSV that was written.

    Raises:
        RuntimeError: If loading/transcribing the audio fails, or if embedding,
            clustering or saving fails. The original exception is chained.
    """
    model = WhisperModel(whisper_model, compute_type="int8")

    try:
        # Duration is used below to clamp segment ends to the audio length.
        audio_data, sample_rate = librosa.load(audio_file, mono=True, sr=16000)
        duration = len(audio_data) / sample_rate

        segments_raw, _ = model.transcribe(
            audio_file, task="transcribe", language='en', beam_size=5, best_of=5
        )
        # Keep only the timing of each segment, as plain dicts.
        segments = [{"start": chunk.start, "end": chunk.end} for chunk in segments_raw]

    except Exception as e:
        # This step loads and transcribes audio; no video conversion happens here.
        raise RuntimeError("Error loading or transcribing audio") from e

    try:
        # One cropper is enough for all segments.
        audio = Audio()

        def segment_embedding(segment):
            try:
                start = segment["start"]
                end = min(duration, segment["end"])
                waveform, _ = audio.crop(audio_file, Segment(start, end))
                return embedding_model(waveform[None])
            except Exception as e:
                traceback.print_exc()
                raise RuntimeError("Error during segment embedding", e) from e

        # Clustering needs at least two samples and no more clusters than samples.
        n_clusters = min(num_speakers, len(segments))
        if n_clusters >= 2:
            # Stacking the per-segment results avoids hard-coding the embedding size.
            embeddings = np.nan_to_num(
                np.vstack([segment_embedding(segment) for segment in segments])
            )
            labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_
        else:
            labels = [0] * len(segments)

        for segment, label in zip(segments, labels):
            segment["speaker"] = 'SPEAKER ' + str(label + 1)

        df_results = pd.DataFrame(_merge_speaker_turns(segments))

        if save_path is None:
            save_path = str(Path(audio_file).with_suffix(".csv"))
        df_results.to_csv(save_path)
        return df_results, save_path

    except Exception as e:
        # Print exception for debugging
        print("Exception occurred:", e)
        raise RuntimeError("Error Running inference with local model", e) from e


# WAV file produced by the conversion cell
audio_file = "/content/TEST-1.wav"

# Whisper model size to load
selected_whisper_model = "base"

# Run diarization; returns the results table and the CSV location
transcription_results, save_path = speech_to_text(
    audio_file=audio_file,
    whisper_model=selected_whisper_model,
)

# Show the speaker turns and where they were written
print(transcription_results)
print(f"Transcription results saved at: {save_path}")

     Start      End    Speaker
0  0:00:00  0:00:04  SPEAKER 2
1  0:00:04  0:00:27  SPEAKER 1
2  0:00:27  0:00:31  SPEAKER 2
3  0:00:31  0:00:54  SPEAKER 1
4  0:00:54  0:00:57  SPEAKER 2
5  0:00:57  0:01:21  SPEAKER 1
6  0:01:21  0:01:27  SPEAKER 2
7  0:01:27  0:01:58  SPEAKER 1
Transcription results saved at: /content/TEST-1.csv


In [None]:
!pip install moviepy pandas pillow



In [None]:
import pandas as pd
from moviepy.editor import VideoFileClip, ImageClip, CompositeVideoClip
from PIL import Image, ImageDraw, ImageFont



# Reuse the speaker-turn table returned by speech_to_text
df_results = transcription_results

# Open the source video that the speaker labels will be drawn onto
video_path = "/content/videoplayback_test1.mp4"  # Update with your video path
video = VideoFileClip(video_path)

# Function to create an image with text
def create_text_image(text, font_size=70, img_size=(640, 80), bg_color=(0, 0, 0), text_color=(255, 255, 255)):
    """Render `text` centred on a solid-colour RGB image.

    Args:
        text: The string to draw.
        font_size: Point size requested for "arial.ttf".
        img_size: (width, height) of the image in pixels.
        bg_color: RGB background colour.
        text_color: RGB text colour.

    Returns:
        A `PIL.Image.Image` of size `img_size` with the text drawn on it.
    """
    img = Image.new('RGB', img_size, color=bg_color)
    d = ImageDraw.Draw(img)
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except IOError:
        # Fallback when arial.ttf is not installed; load_default() is called
        # without a size, so `font_size` is not applied in this case.
        font = ImageFont.load_default()

    if hasattr(d, "textbbox"):
        # ImageDraw.textsize was removed in Pillow 10; textbbox is its replacement.
        left, top, right, bottom = d.textbbox((0, 0), text, font=font)
        text_width, text_height = right - left, bottom - top
    else:
        # Older Pillow releases that predate textbbox.
        left = top = 0
        text_width, text_height = d.textsize(text, font=font)

    # Subtract the bbox origin so the inked area, not the glyph origin, is centred.
    position = (
        (img_size[0] - text_width) / 2 - left,
        (img_size[1] - text_height) / 2 - top,
    )
    d.text(position, text, fill=text_color, font=font)
    return img

# Step 5: Overlay Speaker Labels
clips = [video]

for _, row in df_results.iterrows():
    start_time = pd.to_datetime(row['Start']).time()
    end_time = pd.to_datetime(row['End']).time()

    start_seconds = start_time.hour * 3600 + start_time.minute * 60 + start_time.second
    end_seconds = end_time.hour * 3600 + end_time.minute * 60 + end_time.second

    text_img = create_text_image(row['Speaker'])
    text_img_path = '/content/temp_text_img.png'
    text_img.save(text_img_path)

    txt_clip = (ImageClip(text_img_path)
                .set_position(('center', 'bottom'))
                .set_start(start_seconds)
                .set_duration(end_seconds - start_seconds))

    clips.append(txt_clip)

# Combine all clips
final_video = CompositeVideoClip(clips)

# Step 6: Save the Modified Video
final_video_path = '/content/videoplayback_label.mp4'
final_video.write_videofile(final_video_path, codec='libx264')


Moviepy - Building video /content/videoplayback_label.mp4.
MoviePy - Writing audio in videoplayback_labelTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video /content/videoplayback_label.mp4





Moviepy - Done !
Moviepy - video ready /content/videoplayback_label.mp4


In [None]:
from IPython.display import HTML
from base64 import b64encode

def show_video(final_video_path, video_width = 1000):
  """Return an inline HTML video player for the file at `final_video_path`.

  The file's bytes are base64-encoded into a `data:video/mp4` URI, so the
  whole video is embedded in the returned HTML.

  Args:
    final_video_path: Path of the video file to embed.
    video_width: Value of the player's `width` attribute.

  Returns:
    An `IPython.display.HTML` object containing a `<video>` element.
  """
  # Read-only binary mode is sufficient, and the context manager closes the
  # handle (the previous "r+b" handle was never closed).
  with open(final_video_path, "rb") as video_stream:
    video_file = video_stream.read()

  video_url = f"data:video/mp4;base64,{b64encode(video_file).decode()}"
  return HTML(f"""<video width={video_width} controls><source src="{video_url}"></video>""")

show_video(final_video_path)