In [1]:
import dataclasses
from pathlib import Path
from typing import Any

import pandas as pd
from faster_whisper import WhisperModel
from faster_whisper.transcribe import Segment, Word
from tqdm import tqdm
from whisper.utils import get_writer

# Get an example video

In [2]:
# The sample media is not committed (copyright reasons); point this at any file of your own
MEDIA_FILEPATH = "example.mkv"
# Language code handed to the transcription call as `language`
LANGUAGE = "it"

# Initializing the Whisper model

Notes on parameters:

* `device`: use `'cuda'` if you can (requires an NVIDIA graphics card and a CUDA installation)! Transcription on `'cpu'` is much slower!
* `compute_type`:
    * `float16`: the default when CUDA is enabled because GPUs can handle float16 operations more efficiently, leading to faster inference times and lower memory usage
    * `float32`: the default when using the CPU. CPUs generally perform better with higher-precision data types, and the precision benefits outweigh the performance and memory considerations

In [3]:
# Load the `large-v3` checkpoint, pinned to CPU inference with float32 computation
model = WhisperModel(
    'large-v3',
    compute_type='float32',  # library default: "default" (depends on `device`)
    device='cpu',  # library default: "auto"
)

# Prepare generator of transcription segments from `faster_whisper`

## Difference with Whisper

We get a generator and not the transcription for the whole file immediately

## Extra info for some important parameters

* `word_timestamps`: if you do not set this to True, you will not be able to generate subtitles, since this adds timings
* `task`: `'transcribe'` (default) or `'translate'`! But each task generates different timings...
* `initial_prompt`: you can provide additional context to help the transcription e.g. character names, names of places, invented words etc.
* `condition_on_previous_text`: if True, will use previous transcription results as context. May lead to more hallucination loops!

In [4]:
# `transcribe` hands back a generator of segments plus an `info` object;
# the segments themselves are only produced once the generator is iterated.
segments_gen, info = model.transcribe(
    MEDIA_FILEPATH,
    language=LANGUAGE,
    task='transcribe',
    word_timestamps=True,
    condition_on_previous_text=True,
    initial_prompt=None,
)

# Duration reported by `info`, rounded to 2 decimals; reused as the progress-bar total
media_duration = round(info.duration, ndigits=2)

print(f'The media is {media_duration} seconds long')

The media is 30.27 seconds long


# Iterate over segments

I added a little progress bar using `tqdm` 🐢

In [5]:
# Collect every segment from the generator while showing progress in audio seconds.
segments_data = []
current_time = 0  # end timestamp (seconds) of the last segment seen so far

with tqdm(total=media_duration, unit=' audio seconds') as progress_bar:
    for segment in segments_gen:
        segments_data.append(segment)

        # advance the bar by the audio time covered since the previous segment
        progress_bar.update(segment.end - current_time)
        current_time = segment.end

    # The last segment can end before the media does (e.g. trailing silence),
    # so top the bar up to the total instead of leaving it short of 100%.
    remaining_time = media_duration - current_time
    if remaining_time > 0:
        progress_bar.update(remaining_time)

100%|████████████████████████▉| 30.24/30.27 [00:49<00:00,  1.65s/ audio seconds]


# Convert output of `faster_whisper` to a format compatible with `Whisper`

`faster_whisper` uses named tuples whereas `Whisper` uses dictionaries.

We need to do that otherwise we will not be able to generate subtitles.

In [6]:
def _record_to_dict(record: Any) -> dict[str, Any]:
    """Return a plain-dict copy of one faster-whisper record (named tuple, dataclass or dict)."""
    if isinstance(record, dict):
        # already converted, e.g. by a recursive `dataclasses.asdict` on the parent segment
        return dict(record)
    if dataclasses.is_dataclass(record) and not isinstance(record, type):
        return dataclasses.asdict(record)
    # Note: `_asdict` is not a private method per se, it is the documented
    # conversion method of named tuples.
    return dict(record._asdict())


def faster_whisper_segment_to_openapi_whisper_segment(segment: Segment) -> dict[str, Any]:
    """
    Converts a faster-whisper segment to an OpenAI Whisper style segment dictionary.
    This is useful for using whisper utilities such as writing subtitles.

    Works whether `Segment` and `Word` are named tuples (older faster-whisper
    releases) or dataclasses (newer releases, where calling `_asdict()` on the
    nested words would fail because they have already been turned into dicts).

    Args:
        segment: one segment yielded by `WhisperModel.transcribe`.

    Returns:
        A dictionary with the segment's fields; its `'words'` entry is a list of
        word dictionaries, or `None` when the segment carries no word timings.

    Raises:
        KeyError: if the segment has no `words` field.
    """
    segment_dict = _record_to_dict(segment)
    words: list[Word] | None = segment_dict['words']
    if words is None:
        words_parsed = None
    else:
        words_parsed = [_record_to_dict(word) for word in words]
    segment_dict['words'] = words_parsed
    return segment_dict

In [7]:
# example of `faster-whisper`'s raw output: the first collected segment,
# echoed as the cell's last expression (a `Segment` record holding nested `Word` records)
segments_data[0]

Segment(id=1, seek=3000, start=0.0, end=2.86, text=' Oh no, la ciclope è in piena caccia!', tokens=[50390, 876, 572, 11, 635, 27464, 75, 1114, 4873, 294, 26274, 64, 269, 326, 2755, 0, 50513], temperature=0.0, avg_logprob=-0.16795300497956897, compression_ratio=1.3372093023255813, no_speech_prob=0.016745049506425858, words=[Word(start=0.0, end=0.92, word=' Oh', probability=0.9270329475402832), Word(start=0.92, end=1.3, word=' no,', probability=0.8142931461334229), Word(start=1.44, end=1.56, word=' la', probability=0.9834323525428772), Word(start=1.56, end=1.98, word=' ciclope', probability=0.8365635871887207), Word(start=1.98, end=2.12, word=' è', probability=0.7243817448616028), Word(start=2.12, end=2.14, word=' in', probability=0.999559223651886), Word(start=2.14, end=2.5, word=' piena', probability=0.9997018575668335), Word(start=2.5, end=2.86, word=' caccia!', probability=0.9998706777890524)])

In [8]:
# turn every collected faster-whisper segment into its Whisper-style dictionary
segments_data_converted = list(
    map(faster_whisper_segment_to_openapi_whisper_segment, segments_data)
)
# the first converted segment, i.e. what Whisper itself would have produced:
print(segments_data_converted[0])

{'id': 1, 'seek': 3000, 'start': 0.0, 'end': 2.86, 'text': ' Oh no, la ciclope è in piena caccia!', 'tokens': [50390, 876, 572, 11, 635, 27464, 75, 1114, 4873, 294, 26274, 64, 269, 326, 2755, 0, 50513], 'temperature': 0.0, 'avg_logprob': -0.16795300497956897, 'compression_ratio': 1.3372093023255813, 'no_speech_prob': 0.016745049506425858, 'words': [{'start': 0.0, 'end': 0.92, 'word': ' Oh', 'probability': 0.9270329475402832}, {'start': 0.92, 'end': 1.3, 'word': ' no,', 'probability': 0.8142931461334229}, {'start': 1.44, 'end': 1.56, 'word': ' la', 'probability': 0.9834323525428772}, {'start': 1.56, 'end': 1.98, 'word': ' ciclope', 'probability': 0.8365635871887207}, {'start': 1.98, 'end': 2.12, 'word': ' è', 'probability': 0.7243817448616028}, {'start': 2.12, 'end': 2.14, 'word': ' in', 'probability': 0.999559223651886}, {'start': 2.14, 'end': 2.5, 'word': ' piena', 'probability': 0.9997018575668335}, {'start': 2.5, 'end': 2.86, 'word': ' caccia!', 'probability': 0.9998706777890524}]}


# Let's have a closer look at the data using `pandas`

In [9]:
# One row per segment, indexed by the segment id
df = (
    pd.DataFrame(data=segments_data_converted)
    .set_index(keys='id')
)

# widen the column display for this cell only so the longer text cells are not cut as early
with pd.option_context('display.max_colwidth', 200):
    display(df)

Unnamed: 0_level_0,seek,start,end,text,tokens,temperature,avg_logprob,compression_ratio,no_speech_prob,words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3000,0.0,2.86,"Oh no, la ciclope è in piena caccia!","[50390, 876, 572, 11, 635, 27464, 75, 1114, 4873, 294, 26274, 64, 269, 326, 2755, 0, 50513]",0.0,-0.167953,1.337209,0.016745,"[{'start': 0.0, 'end': 0.92, 'word': ' Oh', 'probability': 0.9270329475402832}, {'start': 0.92, 'end': 1.3, 'word': ' no,', 'probability': 0.8142931461334229}, {'start': 1.44, 'end': 1.56, 'word':..."
2,3000,3.6,4.94,"Attento, non guardare!","[50556, 7298, 15467, 11, 2107, 6290, 543, 0, 50617]",0.0,-0.167953,1.337209,0.016745,"[{'start': 3.6000000000000005, 'end': 4.28, 'word': ' Attento,', 'probability': 0.9829465746879578}, {'start': 4.28, 'end': 4.42, 'word': ' non', 'probability': 0.999756395816803}, {'start': 4.42,..."
3,3000,5.04,5.96,Non sto mica guardando!,"[50619, 8774, 22784, 32483, 6290, 1806, 0, 50669]",0.0,-0.167953,1.337209,0.016745,"[{'start': 5.04, 'end': 5.08, 'word': ' Non', 'probability': 0.6705727577209473}, {'start': 5.08, 'end': 5.22, 'word': ' sto', 'probability': 0.9813330769538879}, {'start': 5.22, 'end': 5.4, 'word..."
4,3000,9.38,15.86,"Qui, agente Sora Lela. Chiedo immediati rinforzi.","[50994, 27361, 11, 623, 1576, 46639, 441, 4053, 13, 761, 36035, 3640, 6908, 367, 259, 2994, 3992, 13, 51165]",0.0,-0.167953,1.337209,0.016745,"[{'start': 9.38, 'end': 10.06, 'word': ' Qui,', 'probability': 0.9696592092514038}, {'start': 13.14, 'end': 13.64, 'word': ' agente', 'probability': 0.9499600827693939}, {'start': 13.64, 'end': 13..."
5,3000,16.12,17.94,Saremo lì tra cinque minuti.,"[51190, 318, 543, 3280, 287, 4749, 944, 6539, 1077, 13951, 72, 13, 51267]",0.0,-0.167953,1.337209,0.016745,"[{'start': 16.12, 'end': 16.8, 'word': ' Saremo', 'probability': 0.9510459899902344}, {'start': 16.8, 'end': 16.98, 'word': ' lì', 'probability': 0.9996640980243683}, {'start': 16.98, 'end': 17.08..."
6,3000,20.54,24.12,"Ehi, nascondiamoci nel museo! So che il martedì si entra gratis.","[51394, 462, 4954, 11, 297, 4806, 684, 7415, 537, 15373, 39138, 78, 0, 407, 947, 1930, 12396, 292, 4749, 1511, 22284, 10158, 271, 13, 51581]",0.0,-0.167953,1.337209,0.016745,"[{'start': 20.54, 'end': 21.22, 'word': ' Ehi,', 'probability': 0.7486812174320221}, {'start': 21.22, 'end': 21.9, 'word': ' nascondiamoci', 'probability': 0.8889192461967468}, {'start': 21.9, 'en..."
7,3026,30.0,30.24,Il martedì si entra gratis.,"[50365, 4416, 12396, 292, 4749, 1511, 22284, 10158, 271, 13, 51864]",0.0,-0.350285,0.777778,0.277545,"[{'start': 30.0, 'end': 30.24, 'word': ' Il', 'probability': 0.0010006525553762913}, {'start': 30.24, 'end': 30.24, 'word': ' martedì', 'probability': 0.41527816581947263}, {'start': 30.24, 'end':..."


# Generate subtitles 🤖!

In [10]:
# Whisper's writers expect a result dictionary with a 'segments' entry, so wrap the list accordingly
expected_whisper_structure = dict(segments=segments_data_converted)

# build an SRT writer targeting the current directory, then let it write the subtitle file
srt_writer = get_writer('srt', '.')
srt_writer(expected_whisper_structure, MEDIA_FILEPATH)

# Result

Opening the generated subtitle file

In [11]:
# The SRT writer was created with `output_dir='.'` and names its file after the media's
# base name, so read the subtitles from the working directory rather than from next to
# the media file (the two only coincide when the media sits in the working directory).
srt_filepath = Path('.') / f'{Path(MEDIA_FILEPATH).stem}.srt'
subtitles_text = srt_filepath.read_text(encoding='utf-8')

print(subtitles_text)

1
00:00:00,000 --> 00:00:02,860
Oh no, la ciclope è in piena caccia!

2
00:00:03,600 --> 00:00:04,940
Attento, non guardare!

3
00:00:05,040 --> 00:00:05,960
Non sto mica guardando!

4
00:00:09,380 --> 00:00:15,860
Qui, agente Sora Lela. Chiedo immediati rinforzi.

5
00:00:16,120 --> 00:00:17,940
Saremo lì tra cinque minuti.

6
00:00:20,540 --> 00:00:24,120
Ehi, nascondiamoci nel museo! So che il martedì si entra gratis.

7
00:00:30,000 --> 00:00:30,240
Il martedì si entra gratis.


