In [None]:
# IMPORTANT: The lines from here to "end of dev import" only put the local whisper checkout on sys.path so it is imported as the package. Delete them when whisper is installed normally.
import sys
import os
from pathlib import Path
sys.path.insert(0, str(Path(os.path.abspath('')).resolve().parents[1]))
# end of dev import
import whisper

import colorsys
from typing import List
from whisper.tokenizer import get_tokenizer
from IPython.display import HTML as html_print

In [None]:
# Load the "large-v2" Whisper model once; the cells below reuse `model`
# for device placement, language detection and decoding.
model = whisper.load_model("large-v2")

In [None]:
# Read the waveform from disk and immediately pad/trim it with whisper.pad_or_trim.
audio = whisper.pad_or_trim(whisper.load_audio("assets/230901_10min.wav"))

# Build the log-Mel spectrogram, then move it to the same device as the model.
mel = whisper.log_mel_spectrogram(audio)
mel = mel.to(model.device)

In [None]:
# Set `detect_lang` to True to let the model pick the language from `mel`;
# otherwise the fixed default below is used.
detect_lang = False
language = "en"

if detect_lang:
    print('Detecting language')
    _, probs = model.detect_language(mel)
    # Keep the language code with the highest probability.
    language = max(probs, key=probs.get)
    print(f"Detected language: {language}")

In [None]:
# Pass the configured/detected `language` explicitly so decoding uses the same
# language as the tokenizer that is later built for display.
options = whisper.DecodingOptions(language=language)
result = whisper.decode(model, mel, options)

In [None]:
def get_colored_text(tokens: List[int], token_probs: List[float], tokenizer, prompt: str=""):
    """Render tokens as an HTML string colored by their probability.

    Each token id is decoded on its own with ``tokenizer.decode([token_id])``,
    HTML-escaped, and wrapped in a ``<span>`` whose text color is interpolated
    in HSV space from red (probability 0) to green (probability 1).

    Args:
        tokens: Token ids to display.
        token_probs: One probability per token. Values outside ``[0, 1]`` are
            clamped so the hue stays between red and green.
        tokenizer: Any object exposing ``decode(list_of_ids) -> str``.
        prompt: Unused; kept so callers that already pass it keep working.

    Returns:
        The concatenated HTML markup, or ``""`` when there is nothing to show.
    """
    from html import escape  # local import keeps this cell self-contained

    pieces = []
    # zip() stops at the shorter input, so unmatched tokens/probabilities are skipped.
    for token_id, prob in zip(tokens, token_probs):
        confidence = min(max(prob, 0.0), 1.0)

        # Hue 0 is red and hue 1/3 is green; saturation and value stay at 1.
        r, g, b = colorsys.hsv_to_rgb(confidence * (1/3), 1, 1)
        color_code = f"#{int(r * 255):02x}{int(g * 255):02x}{int(b * 255):02x}"

        # Escape the decoded text so characters such as "<" or "&" are shown
        # literally instead of being parsed as markup.
        token_text = escape(tokenizer.decode([token_id]))
        pieces.append(f'<span style="color:{color_code}">{token_text}</span>')

    return "".join(pieces)


# Build a tokenizer matching the model and decoding task, then show the
# transcript with each token colored by its confidence.
tokenizer = get_tokenizer(
    multilingual=model.is_multilingual,
    language=language,
    task=options.task,
)
html_print(
    get_colored_text(
        tokens=result.tokens,
        token_probs=result.token_probs,
        tokenizer=tokenizer,
    )
)