Local version of diarization and transcription

In [1]:
from faster_whisper import WhisperModel
import whisper
import numpy as np
import librosa
import soundfile as sf

audioPath = "C:/Users/Jose/Desktop/Hackathon/Speech2Text/"
audioName = 'convPepeTrijoMaggi.wav' ## Filtered audio file
model_size = "medium"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

def check_gpu():
    if torch.cuda.is_available():
        print(f"GPU is available! (Device name: {torch.cuda.get_device_name(0)})")
    else:
        print("GPU is not available. Using CPU.")

check_gpu()

GPU is available! (Device name: NVIDIA GeForce RTX 3050 Laptop GPU)


Local configurations to run the local models

In [3]:
import scipy
from pathlib import Path
from pyannote.audio import Pipeline
import os

def load_pipeline_from_pretrained(path_to_config: str):
    path_to_config = Path(path_to_config)

    print(f"Loading pyannote pipeline from {path_to_config}...")
    # the paths in the config are relative to the current working directory
    # so we need to change the working directory to the model path
    # and then change it back

    cwd = Path.cwd().resolve()  # store current working directory

    # first .parent is the folder of the config, second .parent is the folder containing the 'models' folder
    cd_to = path_to_config.parent.resolve()

    print(f"Changing working directory to {cd_to}")
    os.chdir(cd_to)

    pipeline = Pipeline.from_pretrained(path_to_config)

    print(f"Changing working directory back to {cwd}")
    os.chdir(cwd)

    return pipeline


In [4]:

PATH_TO_CONFIG = "C:/Users/Jose/Desktop/pyannoteLocal/config.yaml"
pipeline = load_pipeline_from_pretrained(PATH_TO_CONFIG)
pipeline.to(torch.device("cuda")) ## Send Pipeline to GPU, CUDA 11.7, torch 2.0.1

Loading pyannote pipeline from C:\Users\Jose\Desktop\pyannoteLocal\config.yaml...
Changing working directory to C:\Users\Jose\Desktop\pyannoteLocal
Changing working directory back to C:\Users\Jose\Desktop\Wilco (Hackathon)\wilco_radio_assistant\research_and_tests\speach_2_text


<pyannote.audio.pipelines.speaker_diarization.SpeakerDiarization at 0x1d44f07b1f0>

In [5]:
audio_file = audioPath + audioName

diarization = pipeline({'uri': 'filename', 'audio': audio_file})

segments = []
for turn, _, speaker in diarization.itertracks(yield_label=True):
    segments.append({
        'start': turn.start,
        'end': turn.end,
        'speaker': speaker
    })

del pipeline  # release VRAM

In [6]:
#model = whisper.model("small", device="cuda", compute_type="int8")
model = whisper.load_model("small", device="cuda")

def transcribe_segment(start, end, audio_path):
    y, sr = librosa.load(audio_path, sr=None, offset=start, duration=(end - start))
    sf.write("temp_segment.wav", y, sr)
    result = model.transcribe("temp_segment.wav")
    #print(result)
    segments = result['segments']
    #print(segments)
    counter = 0
    for segment in segments:
        counter += 1
        #print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
        continue
        
    #print(str(counter) + " Segments")
    try:
        return segment['text']
    except:
        return "Unidentified"

In [7]:
# Transcribe each segment and print
output_txt_path = 'LOCALtranscription.txt'
with open(output_txt_path, 'w') as f:
    f.write("Transcription:\n")
    for segment in segments:
        text = transcribe_segment(segment['start'], segment['end'], audio_file)
        print(f"{segment['speaker']} [{round(segment['start'], 2)} - {round(segment['end'], 2)}]: {text}")
        try:
            f.write(f"{segment['speaker']} [{round(segment['start'], 2)} - {round(segment['end'], 2)}]: {text} \n")
        except:
            f.write("Unidentified \n")

SPEAKER_01 [1.47 - 1.99]:  quindi 잘가ぁ는 영상 촬영io
SPEAKER_01 [3.27 - 8.65]:  les quería preguntar cómo le fue el día que yo la cagé con la tormenta
SPEAKER_00 [16.83 - 18.31]:  Да.
SPEAKER_00 [19.04 - 20.72]:  hasta el miércoles y lú
SPEAKER_00 [21.59 - 24.71]:  and I want to say
SPEAKER_02 [32.08 - 39.19]:  Yo al contrario, tuve luz zorrotos, no se me encortó menos mal, pero mi hermano vinieron hasta mis primas.
SPEAKER_02 [39.57 - 40.67]: いめろなー
SPEAKER_02 [40.81 - 45.85]:  Te dignos lujos, estamos cargando los colores de todos, de todos, de tus amigos.
SPEAKER_00 [46.09 - 48.74]:  ¡Espera, vida del amigo, de la mamá del amigo!
SPEAKER_02 [46.09 - 48.96]:  de la vida del amigo, de la mamá del amigo, y todo.
SPEAKER_02 [49.3 - 53.57]:  Alguien te lo lo sé como salió pero fue muy chito su paisada porca...
SPEAKER_00 [50.18 - 50.74]:  seis失 خصانت cloth
SPEAKER_02 [54.42 - 56.39]:  ...interruptor, o sea...
SPEAKER_02 [57.04 - 57.8]:  Into farewell?
SPEAKER_02 [58.38 - 61.83]:  En chifre nuev