In [None]:
%%capture
!sudo apt install ffmpeg
!pip install -q cohere openai tiktoken
!pip install -q git+https://github.com/openai/whisper.git

In [None]:
# Download audio from GitHub repository
!wget -nv https://github.com/PacktPublishing/Learn-OpenAI-Whisper/raw/main/Chapter01/Learn_OAI_Whisper_Sample_Audio01.mp3
!wget -nv https://github.com/PacktPublishing/Learn-OpenAI-Whisper/raw/main/Chapter01/Learn_OAI_Whisper_Spanish_Sample_Audio01.mp3
!wget -nv https://cdn.openai.com/API/examples/data/bbq_plans.wav
!wget -nv https://cdn.openai.com/API/examples/data/product_names.wav

audiofiles = ['Learn_OAI_Whisper_Sample_Audio01.mp3', 'Learn_OAI_Whisper_Spanish_Sample_Audio01.mp3', 'bbq_plans.wav', 'product_names.wav']

Using a GPU is the preferred way to run Whisper. If you are using a local machine, you can check whether a GPU is available: `torch.cuda.is_available()` returns `True` if a CUDA-compatible NVIDIA GPU is available and `False` otherwise. The `DEVICE` assignment then selects the GPU whenever one is available and falls back to the CPU if not.

In [None]:
# https://lablab.ai/t/whisper-tutorial
import numpy as np
import torch

# Query CUDA once and reuse the answer. A bare `torch.cuda.is_available()`
# in the middle of a cell is not displayed, so the result is printed instead.
cuda_available = torch.cuda.is_available()
DEVICE = "cuda" if cuda_available else "cpu"
print(f"CUDA available: {cuda_available}")
print(f"Using torch {torch.__version__} ({DEVICE})")

Please keep in mind that there are multiple models available. You can find all of them [here](https://github.com/openai/whisper/blob/main/model-card.md). Each one trades off accuracy against speed (compute needed). We will use the 'medium' model for this tutorial.

In [None]:
# Load the Whisper model onto the selected device and report what was loaded.
import whisper

model = whisper.load_model("medium", device=DEVICE)

# Summarise the checkpoint: language coverage and total number of parameters
model_kind = 'multilingual' if model.is_multilingual else 'English-only'
parameter_count = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {model_kind} and has {parameter_count:,} parameters.")

In [None]:
# NLTK helps to split the transcription sentence by sentence
# and shows it in a neat manner one below another. You will see it in the output below.

import nltk
# Recent NLTK releases load the sentence tokenizer from the 'punkt_tab'
# resource, while older ones use 'punkt'. Fetch both so `sent_tokenize`
# works whichever NLTK version is installed.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
from nltk import sent_tokenize

In [None]:
# Detect the spoken language of each sample and decode its first 30 seconds.
for audiofile in audiofiles:
    # Load audio and pad/trim it to fit 30 seconds
    audio = whisper.pad_or_trim(whisper.load_audio(audiofile))
    # Make log-Mel spectrogram and move to the same device as the model.
    # The number of mel bins is taken from the loaded model so the cell keeps
    # working with checkpoints whose mel-bin count differs from the default.
    mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
    # The `detect_language` function returns per-language probabilities;
    # keep the most probable language.
    _, probs = model.detect_language(mel)
    detected_language = max(probs, key=probs.get)
    print(f"----\nDetected language: {detected_language}")
    # Set up the decoding options (half precision only when running on the GPU)
    options = whisper.DecodingOptions(
        language=detected_language,
        without_timestamps=True,
        fp16=(DEVICE == "cuda"),
    )
    # Decode the audio and print the recognized text, one sentence per line
    result = whisper.decode(model, mel, options)
    print(f"Transcription of file '{audiofile}':")
    for sent in sent_tokenize(result.text):
        print(sent)

In [None]:
def process_file(audiofile, model, w_options, w_translate=False):
    """Transcribe an audio file and, optionally, translate it.

    Args:
        audiofile: Path of the audio file; passed unchanged to
            ``model.transcribe``.
        model: Object exposing ``transcribe(audio, **options)`` whose result
            has a ``"text"`` entry (here, the loaded Whisper model).
        w_options: Mapping of extra keyword options forwarded to every
            ``transcribe`` call. It must not contain a ``task`` key, because
            the task is set by this function.
        w_translate: When true, run a second pass with ``task="translate"``.

    Returns:
        A ``(transcription, translation)`` tuple of strings. ``translation``
        is the placeholder ``"N/A"`` when no translation was requested.
    """
    # The path goes straight to model.transcribe, so there is no separate
    # whisper.load_audio call here: its decoded result was never used.
    transcription = model.transcribe(audiofile, task="transcribe", **w_options)["text"]
    if not w_translate:
        return transcription, "N/A"

    translation = model.transcribe(audiofile, task="translate", **w_options)["text"]
    return transcription, translation

In [None]:
# Transcribe the Spanish sample and translate it.
w_options = dict(without_timestamps=True, fp16=(DEVICE == "cuda"))
audiofile = 'Learn_OAI_Whisper_Spanish_Sample_Audio01.mp3'
# Request the translation pass too: this cell prints a translation section,
# which would otherwise only ever show the "N/A" placeholder.
transcription, translation = process_file(audiofile, model, w_options, True)

print(f"------\nTranscription of file '{audiofile}':")
for sent in sent_tokenize(transcription):
    print(sent)
print(f"------\nTranslation of file '{audiofile}':")
for sent in sent_tokenize(translation):
    print(sent)

# Audio player so the recording can be compared with the text above
import ipywidgets as widgets
widgets.Audio.from_file(audiofile, autoplay=False, loop=False)

In [None]:
# Transcribe the product-names clip with an empty initial prompt.
audiofile = 'product_names.wav'
w_options = {
    "without_timestamps": True,
    "fp16": DEVICE == "cuda",
    "temperature": 0,
    "initial_prompt": "",
}
transcription, translation = process_file(audiofile, model, w_options)

# Print the transcription one sentence per line
print(f"------\nTranscription of file '{audiofile}':")
for sentence in sent_tokenize(transcription):
    print(sentence)

# Audio player so the clip can be compared with the transcription
import ipywidgets as widgets
widgets.Audio.from_file(audiofile, autoplay=False, loop=False)

In [None]:
# Transcribe the product-names clip again, this time passing the expected
# company and product spellings as the initial prompt.
audiofile = 'product_names.wav'
w_options = {
    "without_timestamps": True,
    "fp16": DEVICE == "cuda",
    "temperature": 0,
    "initial_prompt": "Quirk Quid Quill Inc., P3-Quattro, O3-Omni, B3-BondX, E3-Equity, W3-WrapZ, O2-Outlier, U3-UniFund, M3-Mover",
}
transcription, translation = process_file(audiofile, model, w_options)

# Print the transcription one sentence per line
print(f"------\nTranscription of file '{audiofile}':")
for sentence in sent_tokenize(transcription):
    print(sentence)

In [None]:
# Transcribe the BBQ-plans clip with an empty initial prompt.
audiofile = 'bbq_plans.wav'
w_options = {
    "without_timestamps": True,
    "fp16": DEVICE == "cuda",
    "temperature": 0,
    "initial_prompt": "",
}
transcription, translation = process_file(audiofile, model, w_options)

# Print the transcription one sentence per line
print(f"------\nTranscription of file '{audiofile}':")
for sentence in sent_tokenize(transcription):
    print(sentence)

# Audio player so the clip can be compared with the transcription
import ipywidgets as widgets
widgets.Audio.from_file(audiofile, autoplay=False, loop=False)

In [None]:
# Transcribe the BBQ-plans clip with the friends' names given in the
# initial prompt.
audiofile = 'bbq_plans.wav'
w_options = {
    "without_timestamps": True,
    "fp16": DEVICE == "cuda",
    "temperature": 0,
    "initial_prompt": "Friends: Aimee, Shawn",
}
transcription, translation = process_file(audiofile, model, w_options)

# Print the transcription one sentence per line
print(f"------\nTranscription of file '{audiofile}':")
for sentence in sent_tokenize(transcription):
    print(sentence)

In [None]:
# Transcribe the BBQ-plans clip with a full-sentence initial prompt.
# The prompt is a plain quoted string: the earlier four-quote opener made it a
# triple-quoted string that began with a stray '"' character, which was then
# sent to the model as part of the prompt.
w_options = dict(
    without_timestamps=True,
    fp16=(DEVICE == "cuda"),
    temperature=0,
    initial_prompt="Aimee and Shawn had whisky, doughnuts, omelets at a BBQ.",
)
audiofile = 'bbq_plans.wav'
transcription, translation = process_file(audiofile, model, w_options)

print(f"------\nTranscription of file '{audiofile}':")
for sent in sent_tokenize(transcription):
    print(sent)