# Installing Whisper

The Python packages needed to use Whisper models and evaluate the transcription results (Whisper itself and `jiwer`) must already be installed; this copy of the notebook does not include the install commands.

# Loading the LibriSpeech dataset

The following will load the test-clean split of the LibriSpeech corpus using torchaudio.

In [None]:
import os

# Location of the local Whisper checkout. Set the WHISPER_REPO environment
# variable to run the notebook on another machine without editing this cell.
# (A raw string needs single backslashes; the doubled ones were redundant.)
WHISPER_REPO = os.environ.get("WHISPER_REPO", r"C:\work\Github\whisper")
os.chdir(WHISPER_REPO)
os.getcwd()

In [None]:
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
class LibriSpeech(torch.utils.data.Dataset):
    """
    Wraps torchaudio's LibriSpeech so that each item is a
    (log-Mel spectrogram, transcript) pair ready for Whisper.

    Audio is padded or trimmed to 30 seconds, so the final seconds of the few
    utterances longer than that are discarded.
    """

    def __init__(self, split="test-clean", device=DEVICE):
        # The corpus is downloaded into the user's cache directory on first use.
        cache_dir = os.path.expanduser("~/.cache")
        self.dataset = torchaudio.datasets.LIBRISPEECH(root=cache_dir, url=split, download=True)
        self.device = device

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        waveform, rate, transcript, _, _, _ = self.dataset[item]
        assert rate == 16000
        # Pad/trim on the original device, then move to the target device
        # before computing the spectrogram there.
        padded = whisper.pad_or_trim(waveform.flatten())
        mel = whisper.log_mel_spectrogram(padded.to(self.device))

        return (mel, transcript)

In [None]:
dataset = LibriSpeech("test-clean")
loader = torch.utils.data.DataLoader(dataset, batch_size=16)

# Running inference on the dataset using a base Whisper model

The following will take a few minutes to transcribe all utterances in the dataset.

In [None]:
model = whisper.load_model("base.en")

# Work out the two pieces of the summary first, then print them.
variant = "multilingual" if model.is_multilingual else "English-only"
n_params = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {variant} and has {n_params:,} parameters.")

In [None]:
# predict without timestamps for short-form transcription
options = whisper.DecodingOptions(language="en", without_timestamps=True, fp16 = False)

In [None]:
%%time
# Collect model output and ground truth for the whole split, batch by batch.
hypotheses, references = [], []

for mels, texts in tqdm(loader):
    results = model.decode(mels, options)
    hypotheses += [result.text for result in results]
    references += texts
    

In [None]:
data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
data

# Calculating the word error rate

Now, we use our English normalizer implementation to standardize the transcription and calculate the WER.

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
data["hypothesis_clean"] = [normalizer(text) for text in data["hypothesis"]]
data["reference_clean"] = [normalizer(text) for text in data["reference"]]
data

In [None]:
wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))

print(f"WER: {wer * 100:.2f} %")

In [None]:
audio, sample_rate, text, _, _, _  = dataset.dataset[10]


In [None]:
%%time
# Transcribe one utterance step by step, printing each intermediate shape.
# The duration uses the sample rate returned with the audio, not a literal.
print(type(audio), audio.shape, f"{audio.shape[1]/sample_rate}")
w_audio = whisper.pad_or_trim(audio.flatten())
print(type(w_audio), w_audio.shape)
# whisper.load_model() puts the model on CUDA when it is available, while the
# spectrogram is computed on the CPU. Move it to the model's device so that
# decode() does not fail with a device mismatch.
mel = whisper.log_mel_spectrogram(w_audio).to(model.device)
print(type(mel), mel.shape)
results = model.decode(mel, options)
print(f"|{results.text}|")

# Try different models

In [None]:
model_tiny = whisper.load_model("tiny")

# Work out the two pieces of the summary first, then print them.
variant = "multilingual" if model_tiny.is_multilingual else "English-only"
n_params = sum(np.prod(p.shape) for p in model_tiny.parameters())
print(f"Model is {variant} and has {n_params:,} parameters.")

In [None]:
%%time
audio, sample_rate, text, _, _, _  = dataset.dataset[0]

# Transcribe one utterance with the tiny model, printing each intermediate
# shape. The duration uses the sample rate returned with the audio.
print(type(audio), audio.shape, f"{audio.shape[1]/sample_rate}")
w_audio = whisper.pad_or_trim(audio.flatten())
print(type(w_audio), w_audio.shape)
# whisper.load_model() puts the model on CUDA when it is available, while the
# spectrogram is computed on the CPU. Move it to the model's device so that
# decode() does not fail with a device mismatch.
mel = whisper.log_mel_spectrogram(w_audio).to(model_tiny.device)
print(type(mel), mel.shape)
results = model_tiny.decode(mel, options)
print(f"|{results.text}|")

# Load and transcribe a local file (16 kHz)

In [None]:
import librosa
def load_wav_to_tensor(file_path, target_sr=16000):
    """Load an audio file as a mono waveform tensor of shape (1, num_samples).

    Args:
        file_path: Path of the audio file to read with librosa.
        target_sr: Sampling rate to resample to. Defaults to 16000 because the
            rest of this notebook (duration printouts, Whisper's pad/trim and
            spectrogram) assumes 16 kHz audio. Pass ``None`` to keep the
            file's native rate.

    Returns:
        ``(tensor_waveform, sample_rate)`` where ``sample_rate`` is the rate
        of the returned samples as reported by librosa.
    """
    # librosa down-mixes to mono and resamples only when the rate differs.
    waveform, sample_rate = librosa.load(file_path, sr=target_sr, mono=True)

    # Add a leading channel axis so the result looks like a torchaudio waveform.
    tensor_waveform = torch.tensor(waveform).unsqueeze(0)

    return tensor_waveform, sample_rate

# Example usage
file_path = r"C:\work\local4test\sampleAudio\shortTestRecording.wav"
audio, sample_rate = load_wav_to_tensor(file_path)

print(type(audio), audio.shape, sample_rate)

In [None]:
%%time

# Transcribe the local recording with the tiny model, printing each
# intermediate shape. The duration uses the file's reported sample rate.
print(type(audio), audio.shape, f"{audio.shape[1]/sample_rate}")
w_audio = whisper.pad_or_trim(audio.flatten())
print(type(w_audio), w_audio.shape)
# whisper.load_model() puts the model on CUDA when it is available, while the
# spectrogram is computed on the CPU. Move it to the model's device so that
# decode() does not fail with a device mismatch.
mel = whisper.log_mel_spectrogram(w_audio).to(model_tiny.device)
print(type(mel), mel.shape)
results = model_tiny.decode(mel, options)
print(f"|{results.text}|")

# [decode code ](https://github.com/openai/whisper/blob/main/whisper/decoding.py)


# fast decode

https://github.com/openai/whisper/discussions/937
https://github.com/guillaumekln/faster-whisper

# [exploring OpenAI](https://deepgram.com/learn/exploring-whisper) 

https://github.com/huggingface/transformers/issues/22612



# Run from here

In [1]:
import os

# Location of the local Whisper checkout. Set the WHISPER_REPO environment
# variable to run the notebook on another machine without editing this cell.
# (A raw string needs single backslashes; the doubled ones were redundant.)
WHISPER_REPO = os.environ.get("WHISPER_REPO", r"C:\work\Github\whisper")
os.chdir(WHISPER_REPO)
os.getcwd()

'C:\\work\\Github\\whisper'

In [2]:
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
model_tiny = whisper.load_model("tiny")

# Work out the two pieces of the summary first, then print them.
variant = "multilingual" if model_tiny.is_multilingual else "English-only"
n_params = sum(np.prod(p.shape) for p in model_tiny.parameters())
print(f"Model is {variant} and has {n_params:,} parameters.")

Model is multilingual and has 37,184,640 parameters.


In [4]:
import librosa
def load_wav_to_tensor(file_path, target_sr=16000):
    """Load an audio file as a mono waveform tensor of shape (1, num_samples).

    Args:
        file_path: Path of the audio file to read with librosa.
        target_sr: Sampling rate to resample to. Defaults to 16000 because the
            rest of this notebook (duration printouts, Whisper's pad/trim and
            spectrogram) assumes 16 kHz audio. Pass ``None`` to keep the
            file's native rate.

    Returns:
        ``(tensor_waveform, sample_rate)`` where ``sample_rate`` is the rate
        of the returned samples as reported by librosa.
    """
    # librosa down-mixes to mono and resamples only when the rate differs.
    waveform, sample_rate = librosa.load(file_path, sr=target_sr, mono=True)

    # Add a leading channel axis so the result looks like a torchaudio waveform.
    tensor_waveform = torch.tensor(waveform).unsqueeze(0)

    return tensor_waveform, sample_rate

# Example usage
file_path = r"C:\work\local4test\sampleAudio\shortTestRecording.wav"
audio, sample_rate = load_wav_to_tensor(file_path)

print(type(audio), audio.shape, sample_rate)

<class 'torch.Tensor'> torch.Size([1, 262640]) 16000


In [5]:
# Decoding options for this run: English, with timestamp tokens ENABLED
# (without_timestamps=False) and fp16 turned off.
options = whisper.DecodingOptions(language="en", 
                                  without_timestamps=False, 
                                  fp16 = False)
# Optional, currently disabled: pass beam_size=20 as a further argument above.
#                                  beam_size=20)

In [6]:
%%time

# Transcribe the local recording with the tiny model, printing each
# intermediate shape. The duration uses the file's reported sample rate.
print(type(audio), audio.shape, f"{audio.shape[1]/sample_rate}")
w_audio = whisper.pad_or_trim(audio.flatten())
print(type(w_audio), w_audio.shape)
# whisper.load_model() puts the model on CUDA when it is available, while the
# spectrogram is computed on the CPU. Move it to the model's device so that
# decode() does not fail with a device mismatch.
mel = whisper.log_mel_spectrogram(w_audio).to(model_tiny.device)
print(type(mel), mel.shape)
results = model_tiny.decode(mel, options)
print(f"|{results.text}|")

<class 'torch.Tensor'> torch.Size([1, 262640]) 16.415
<class 'torch.Tensor'> torch.Size([480000])
<class 'torch.Tensor'> torch.Size([80, 3000])
TVdbg: calling DecodingTask(model, options).run(mel)
tvdbg True, language='en', transcribe
tvdbg next_tokens=tensor([50364])
tvdbg values  tensor([-0.0778, -4.7494, -5.0728, -5.2159, -5.4058, -5.5530, -5.5588, -5.5881,
        -5.5893, -5.7145])
tvdbg indices tensor([50364, 50376, 50372, 50374, 50375, 50378, 50373, 50368, 50380, 50370])
tvdbg <class 'list'> [50364, 50376, 50372, 50374, 50375, 50378, 50373, 50368, 50380, 50370]
['']
tvdbg <class 'torch.Tensor'> torch.Size([1]) tensor([-0.0778]) self.eot=50257
tvdbg 0 tensor([50258, 50259, 50359, 50364])
tvdbg next_tokens=tensor([45517])
tvdbg values  tensor([-0.2734, -1.9554, -4.3461, -4.9601, -4.9666, -5.4980, -5.8983, -6.3826,
        -6.6220, -6.6550])
tvdbg indices tensor([45517,  4997,  9279,  3165,   314,  1500,   502,  6921,  1449, 11019])
tvdbg <class 'list'> [45517, 4997, 9279, 3165, 31

# current code

whisper/decoding.py

    class GreedyDecoder(TokenDecoder):
        """Greedy / temperature-sampling token decoder with ``tvdbg`` debug prints.

        Besides choosing the next token, every call to ``update`` prints the ten
        highest log-probability candidates for the first sequence in the batch.
        """

        def __init__(self, temperature: float, eot: int):
            self.temperature = temperature
            self.eot = eot
            # Debug-only tokenizer, hard-coded to English transcription whatever
            # options the decoding task was created with.
            # NOTE(review): first argument is presumably the `multilingual` flag — confirm.
            self.tokenizer = get_tokenizer(True, language='en', task='transcribe')
    
    
        def update(
            self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
        ) -> Tuple[Tensor, bool]:
            """Choose the next token for each sequence and append it to ``tokens``.

            Takes the argmax of ``logits`` when ``temperature`` is 0, otherwise
            samples from ``Categorical(logits / temperature)``. ``sum_logprobs``
            is updated in place, only for sequences whose last token is not
            ``eot``; sequences that already ended are given ``eot`` again.

            Returns the extended ``tokens`` and ``completed``, which is true once
            every sequence ends in ``eot`` (it is the result of ``.all()`` on a
            tensor rather than a plain Python bool).
            """
            if self.temperature == 0:
                next_tokens = logits.argmax(dim=-1)
            else:
                next_tokens = Categorical(logits=logits / self.temperature).sample()
    
            print(f"tvdbg {next_tokens=}")
            logprobs = F.log_softmax(logits.float(), dim=-1)
            #print(f"tvdbg {logprobs[0][:10]}")
            # Debug: rank every candidate for batch element 0 only and show the top ten.
            sorted_tensor, sorted_indices = torch.sort(logprobs[0][:], descending=True)
            print(f"tvdbg values  {sorted_tensor[:10]}")
            print(f"tvdbg indices {sorted_indices[:10]}")
            #print("tvdbg {[self.tokenizer.decode([[t.item()]]).strip() for t in sorted_indices[:10]]}")
            #print(f"tvdbg {[self.tokenizer.decode([[t.item()]]).strip() for t in sorted_indices[:10]]}")
            #print(f"tvdbg {[t for t in sorted_indices[:10]]}")
            #tokens1: List[List[int]] = [t[i].tolist() for i, t in zip([selected], tokens)]
            #tokens1 = [50258, 50259, 50359, 50364, 45517,  3165,    74, 21409,  3165,    65,   1373,  4084,   257,  6465,    44,   281,   536,   437,   264, 11150, 2709,   505,   257,  2099,  3636,   337,  4997,    13, 50964, 50257]
            # Flat list of the ten candidate token ids as Python ints.
            tokens1: List[int] = [t.item() for t in sorted_indices[:10]]
            print(f"tvdbg {type(tokens1)} {tokens1}")
            #texts: List[str] = [self.tokenizer.decode(t).strip() for t in [tokens1]]
            # The ten ids are decoded together as one sequence, so `texts` holds a
            # single string, not one string per candidate.
            texts: List[str] = [self.tokenizer.decode(t) for t in [tokens1]]
            print(texts)
    
    
            # Log-probability of the token actually chosen for each sequence.
            current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens]
            print(f"tvdbg {type(current_logprobs)} {current_logprobs.shape} {current_logprobs[:10]} {self.eot=}")
            # The mask is zero for sequences that already ended, leaving their score unchanged.
            sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot)
    
            # Sequences that already ended keep emitting eot.
            next_tokens[tokens[:, -1] == self.eot] = self.eot
            tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)
    
            completed = (tokens[:, -1] == self.eot).all()
            return tokens, completed


# remember 

whisper/decoding.py

tokens1 = [50258, 50259, 50359, 50364, 45517,  3165,    74, 21409,  3165,    65,   1373,  4084,   257,  6465,    44,   281,   536,   437,   264, 11150, 2709,   505,   257,  2099,  3636,   337,  4997,    13, 50964, 50257]
        print(f"tvdbg {type(tokens1)} ")
        texts: List[str] = [self.tokenizer.decode(t).strip() for t in [tokens1]]
        print(texts)
        
['<|startoftranscript|><|en|><|transcribe|> Testing 16kHz 16bts creating a PCM to see what the recognition gives us a short message for testing.<|endoftext|>']


    class GreedyDecoder(TokenDecoder):
        """Earlier debugging variant of the greedy / temperature-sampling decoder.

        Each ``update`` call prints the top-ten log-probabilities for the first
        sequence in the batch and decodes a fixed, hard-coded list of token ids.
        """

        def __init__(self, temperature: float, eot: int):
            self.temperature = temperature
            self.eot = eot
            # Debug-only tokenizer, hard-coded to English transcription.
            # NOTE(review): first argument is presumably the `multilingual` flag — confirm.
            self.tokenizer = get_tokenizer(True, language='en', task='transcribe')
    
    
        def update(
            self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
        ) -> Tuple[Tensor, bool]:
            """Choose the next token for each sequence and append it to ``tokens``.

            Takes the argmax of ``logits`` when ``temperature`` is 0, otherwise
            samples from ``Categorical(logits / temperature)``. ``sum_logprobs``
            is updated in place, only for sequences whose last token is not
            ``eot``; sequences that already ended are given ``eot`` again.

            Returns the extended ``tokens`` and ``completed``, which is true once
            every sequence ends in ``eot``.
            """
            if self.temperature == 0:
                next_tokens = logits.argmax(dim=-1)
            else:
                next_tokens = Categorical(logits=logits / self.temperature).sample()
    
            print(f"tvdbg {next_tokens=}")
            logprobs = F.log_softmax(logits.float(), dim=-1)
            #print(f"tvdbg {logprobs[0][:10]}")
            # Debug: rank every candidate for batch element 0 only and show the top ten.
            sorted_tensor, sorted_indices = torch.sort(logprobs[0][:], descending=True)
            print(f"tvdbg values  {sorted_tensor[:10]}")
            print(f"tvdbg indices {sorted_indices[:10]}")
            #print("tvdbg {[self.tokenizer.decode([[t.item()]]).strip() for t in sorted_indices[:10]]}")
            #print(f"tvdbg {[self.tokenizer.decode([[t.item()]]).strip() for t in sorted_indices[:10]]}")
            #print(f"tvdbg {[t for t in sorted_indices[:10]]}")
            #tokens1: List[List[int]] = [t[i].tolist() for i, t in zip([selected], tokens)]
            # Fixed token ids, independent of the current logits, decoded on every
            # call to check what the tokenizer returns for a whole sequence.
            tokens1 = [50258, 50259, 50359, 50364, 45517,  3165,    74, 21409,  3165,    65,   1373,  4084,   257,  6465,    44,   281,   536,   437,   264, 11150, 2709,   505,   257,  2099,  3636,   337,  4997,    13, 50964, 50257]
            print(f"tvdbg {type(tokens1)} ")
            texts: List[str] = [self.tokenizer.decode(t).strip() for t in [tokens1]]
            print(texts)
    
    
            # Log-probability of the token actually chosen for each sequence.
            current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens]
            print(f"tvdbg {type(current_logprobs)} {current_logprobs.shape} {current_logprobs[:10]} {self.eot=}")
            # The mask is zero for sequences that already ended, leaving their score unchanged.
            sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot)
    
            # Sequences that already ended keep emitting eot.
            next_tokens[tokens[:, -1] == self.eot] = self.eot
            tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)
    
            completed = (tokens[:, -1] == self.eot).all()
            return tokens, completed


In [None]:
results

# [silero-vad](https://github.com/snakers4/silero-vad)

In [None]:
# https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb#scrollTo=pSifus5IilRp

#https://github.com/snakers4/silero-vad/blob/master/utils_vad.py


In [None]:
from IPython.display import Audio
from pprint import pprint
import librosa
import librosa.display
import matplotlib.pyplot as plt


In [None]:
# Fetch the Silero VAD model and its helper functions through torch.hub.
# NOTE(review): this rebinds `model`, the name the cells above use for the
# Whisper base.en model; re-running those cells afterwards would pick up the
# VAD model instead.
# NOTE(review): force_reload=True asks torch.hub for a fresh copy of the repo
# on every run — confirm this is intended, since it needs network access.
USE_ONNX = False # change this to True if you want to test onnx model
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=USE_ONNX)


In [None]:
(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils

In [None]:
SAMPLING_RATE = 16000

In [None]:
wav = read_audio(file_path, sampling_rate=SAMPLING_RATE)
# get speech timestamps from full audio file
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE, visualize_probs=False)
pprint(speech_timestamps)

In [None]:
wav.shape

In [None]:
for seg in speech_timestamps:
    print(f"{seg['start']/SAMPLING_RATE} to {seg['end']/SAMPLING_RATE}")

In [None]:
plt.close()
#plt.figure(figsize=(10, 4))
librosa.display.waveshow(wav.numpy(), sr=SAMPLING_RATE)
#plt.xlabel("Time (s)")
#plt.ylabel("Amplitude")
#plt.title("Audio Waveform")
#plt.tight_layout()
#plt.show()


In [None]:
plt.show()

In [None]:
# With nrows=1 (and the default ncols=1) plt.subplots returns a single Axes,
# not an array, so it must be used directly; indexing it raises TypeError.
# The plot shows `wav`, the recording read for the VAD step.
fig, ax = plt.subplots(nrows=1, sharex=True)
librosa.display.waveshow(wav.numpy(), sr=SAMPLING_RATE, ax=ax)
ax.set(title='Envelope view, mono')
ax.label_outer()

# Games

In [None]:
import torch
from torch.distributions import Categorical

# Create a tensor of logits (unnormalized log-probabilities)
logits = torch.tensor([1.0, 2.0, 3.0])

# Create a Categorical distribution
distribution = Categorical(logits=logits)

# Sample from the distribution
sample = distribution.sample()  # Returns an index tensor (0, 1, or 2)

# Compute the log probability of a specific value (e.g., index 2).
# log_prob requires a tensor; passing a plain Python int raises ValueError.
log_prob = distribution.log_prob(torch.tensor(2))

In [None]:
import whisper
from whisper.tokenizer import get_tokenizer


# `whisper.tokenizer` is a module, so calling it raises TypeError. Build the
# tokenizer with its `get_tokenizer` factory instead, using the same arguments
# as the GreedyDecoder snippet from whisper/decoding.py shown earlier.
tokenizer = get_tokenizer(True, language="en", task="transcribe")

# Round trip: text -> token ids -> text.
t = tokenizer.encode("This is a string")

decoded_string = tokenizer.decode(t).strip()

print(decoded_string)
# This is a string

In [None]:
whisper.tokenizer
