In [6]:
import torch
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline
import numpy as np
import librosa
import soundfile as sf

# --- Notebook configuration -------------------------------------------------
# Working folder: holds the input audio and the Hugging Face token file.
audioPath = "C:/Users/Jose/Desktop/Hackathon/Speech2Text/"
# Name of the (already filtered) recording to diarize and transcribe.
audioName = "convPepeTrijoMaggi.wav"
# Whisper checkpoint size passed to WhisperModel.
model_size = "medium"


In [7]:
# Read the Hugging Face access token from the first line of a text file kept
# next to the audio, so the secret never appears in the notebook itself.
tokenFile = "HuggingFaceToken.txt"
# The context manager closes the file handle; strip() drops the trailing
# newline/whitespace that readline() keeps and that would corrupt the token.
with open(audioPath + tokenFile, 'r') as file1:
    token = file1.readline().strip()

First, use the pyannote pipeline to diarize the speakers, obtaining the beginning and end times of each of their phrases.

In [8]:
# Full path of the recording that is diarized here and transcribed later.
audio_file = audioPath + audioName

# Whisper model used later for transcription (CPU, int8 compute type).
model = WhisperModel(model_size, device="cpu", compute_type="int8")

# Pretrained pyannote speaker-diarization pipeline, authenticated with the token.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=token)

# Run the diarization over the whole recording.
diarization = pipeline({"uri": "filename", "audio": audio_file})

# Flatten the result into one dict per speaker turn: start, end, speaker label.
segments = []
for turn, _, speaker in diarization.itertracks(yield_label=True):
    segments.append(dict(start=turn.start, end=turn.end, speaker=speaker))

Then, process each segment by writing it to a temporary WAV file for Whisper to read. Sometimes Whisper is not able to recognize anything in the recording, so the function falls back to a placeholder text ("Unidentified") to keep the code running.
Leaving the language unset lets the model detect it automatically, which performs better and faster (forcing Spanish takes roughly 10 times longer for a single segment).

In [18]:
def transcribe_segment(start, end, audio_path):
    """Transcribe one time slice of an audio file with the global Whisper ``model``.

    The slice is loaded with librosa (``sr=None`` keeps the file's own sample
    rate), written to ``temp_segment.wav`` in the working directory and handed
    to ``model.transcribe``. The number of chunks Whisper produced is printed.

    Args:
        start: Slice start time, in seconds.
        end: Slice end time, in seconds.
        audio_path: Path of the source audio file.

    Returns:
        The text of every chunk Whisper produced for the slice, concatenated
        in order, or ``"Unidentified"`` when Whisper produced no chunk at all.
    """
    y, sr = librosa.load(audio_path, sr=None, offset=start, duration=(end - start))
    sf.write("temp_segment.wav", y, sr)
    chunks, _info = model.transcribe("temp_segment.wav")
    # Collect the text of *all* chunks; keeping only the last one would drop
    # the beginning of the phrase whenever Whisper splits the slice.
    texts = [chunk.text for chunk in chunks]
    print(str(len(texts)) + " Segments")
    if not texts:
        # Nothing recognised: return a placeholder so the caller can go on.
        return "Unidentified"
    # Chunk texts already carry their own leading space, so join without a separator.
    return "".join(texts)

Finally, transcribe the complete audio segment by segment and save the transcription to a text file.

In [23]:
# Transcribe each diarized segment, print it, and save it to a text file.
output_txt_path = 'transcription.txt'
# UTF-8 is requested explicitly: the platform default codec (e.g. cp1252 on
# Windows) cannot encode every character Whisper may emit, and the write would
# otherwise fail and lose the line.
with open(output_txt_path, 'w', encoding='utf-8') as f:
    f.write("Transcription:\n")
    for segment in segments:
        text = transcribe_segment(segment['start'], segment['end'], audio_file)
        # Build the "SPEAKER [start - end]: text" line once for both outputs.
        line = f"{segment['speaker']} [{round(segment['start'], 2)} - {round(segment['end'], 2)}]: {text}"
        print(line)
        f.write(line + " \n")

1 Segments
SPEAKER_01 [1.47 - 1.99]: 稍等一下
1 Segments
SPEAKER_01 [3.27 - 8.65]:  Les quería preguntar cómo les fue el día que llevó la cagada con la tormenta.
1 Segments
SPEAKER_00 [16.83 - 18.31]:  Ya, yuktu!
1 Segments
SPEAKER_00 [19.04 - 20.72]:  Hasta el miércoles sin luz.
1 Segments
SPEAKER_00 [21.59 - 24.71]:  y venía a Karlaúa a pensar energía.
1 Segments
SPEAKER_02 [32.08 - 39.19]:  Yo al contrario, tuve el gusto rato, no se me cortó menos mal, pero mis hermanos vinieron hasta mis primas.
1 Segments
SPEAKER_02 [39.57 - 40.67]: しめろなー
1 Segments
SPEAKER_02 [40.81 - 45.85]:  a pedirnos lujo, estamos cargando los celulares de todos, literalmente de todos, de los amigos.
1 Segments
SPEAKER_00 [46.09 - 48.74]:  De amigo, de amigo, de mamá de amigo, e todo.
1 Segments
SPEAKER_02 [46.09 - 48.96]:  de amigo, de mamá de amigo, e tos.
1 Segments
SPEAKER_02 [49.3 - 53.57]:  La cuenta de luz, no sé cómo salió, pero fue muy chistoso para estar por cada...
1 Segments
SPEAKER_00 [50.18 - 50.74]