# Whisper with transformers

https://huggingface.co/openai/whisper-medium

https://github.com/huggingface/transformers/tree/main



In [3]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# Pull the pretrained Whisper-medium processor and model from the Hub.
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
# Clear the forced decoder ids stored in the model config.
model.config.forced_decoder_ids = None

# Use the first row of the dummy LibriSpeech validation split as the audio sample.
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]

# Convert the raw waveform into model input features (PyTorch tensors).
input_features = processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
).input_features

# Generate token ids, then print the decoded text twice: first with the
# special tokens kept, then with them stripped.
predicted_ids = model.generate(input_features)
for skip_special in (False, True):
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=skip_special)
    print(transcription)


Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]



In [7]:
# Check which Python type holds the decoded waveform of the dataset sample.
type(sample["array"])



numpy.ndarray

In [8]:
import librosa
def load_wav_to_tensor(file_path, sr=None):
    """Load an audio file as a mono waveform.

    Despite the function's name, no torch tensor is created: the waveform is
    returned exactly as ``librosa.load`` produces it (a ``numpy.ndarray``),
    which is the form the Whisper processor is given elsewhere in this
    notebook.

    Args:
        file_path: Path of the audio file to read.
        sr: Target sampling rate in Hz. ``None`` (the default) keeps the
            file's native rate; pass e.g. ``16000`` to have librosa resample
            while loading.

    Returns:
        A ``(waveform, sample_rate)`` tuple, where ``sample_rate`` is the rate
        of the returned waveform.
    """
    # mono=True down-mixes multi-channel recordings to a single channel.
    waveform, sample_rate = librosa.load(file_path, sr=sr, mono=True)
    return waveform, sample_rate

# Example usage.
# The clip location is machine-specific, so it can be overridden through the
# WHISPER_SAMPLE_WAV environment variable; the original path stays the default.
import os

file_path = os.environ.get(
    "WHISPER_SAMPLE_WAV",
    r"C:\work\local4test\sampleAudio\shortTestRecording.wav",
)
audio, sample_rate = load_wav_to_tensor(file_path)

# Show the container type, shape and sampling rate of the loaded clip.
print(type(audio), audio.shape, sample_rate)

<class 'numpy.ndarray'> (262640,) 16000


In [14]:
%%time
# Choose the checkpoint to time, then load its processor and model.
# Alternative checkpoint: "openai/whisper-medium"
model_name = "openai/whisper-tiny"
processor, model = (
    WhisperProcessor.from_pretrained(model_name),
    WhisperForConditionalGeneration.from_pretrained(model_name),
)

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

CPU times: total: 922 ms
Wall time: 38 s


In [15]:
%%time

# Pass the sampling rate reported when the clip was loaded rather than a
# hard-coded 16000, so the processor can validate the real rate of `audio`
# instead of being told a value that may not match the file.
input_features = processor(audio, sampling_rate=sample_rate, return_tensors="pt").input_features

# generate token ids
predicted_ids = model.generate(input_features)
# decode token ids to text, keeping the special tokens visible
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
print(transcription)

['<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Testing 16kHz 16bts creating a PCM to see what the recognition gives us a short message for testing.<|endoftext|>']
CPU times: total: 1.55 s
Wall time: 709 ms


# model_name = "openai/whisper-medium"

Audio: 16 s clip, C:\work\local4test\sampleAudio\shortTestRecording.wav

['<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Testing 16 kilohertz, 16 bits, creating a PCM to see what the recognition gives us. A short message for testing.<|endoftext|>']
CPU times: total: 32.3 s
Wall time: 12.2 s

#model_name = "openai/whisper-tiny"
['<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Testing 16kHz 16bts creating a PCM to see what the recognition gives us a short message for testing.<|endoftext|>']
CPU times: total: 1.55 s
Wall time: 709 ms