# Baseline: Whisper Small Zero-Shot

In [None]:
# Run this if required
# %pip install transformers evaluate datasets librosa jiwer

### Library Imports

In [None]:
import torch
import evaluate

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

### Model

In [None]:
# Fetch the pretrained Whisper Small checkpoint: the generation model and the
# processor used below for feature extraction and for decoding token ids.
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-small"
)
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small"
)

# Decoder prompt ids requesting English transcription from the model.
forced_decoder_ids = processor.get_decoder_prompt_ids(
    task="transcribe",
    language="english",
)


### Load Custom Dataset

In [None]:
# Location of the custom dataset handed to `load_dataset` below. The path is
# relative, so it is resolved against the notebook's working directory.
DATASET_PATH = "../data/custom/hf_dataset/"

In [None]:
# Open the dataset in streaming mode so examples are read lazily.
# NOTE(review): no `split=` is passed, so per the `datasets` documentation this
# should be a mapping of split name -> dataset rather than a single dataset.
ds = load_dataset(DATASET_PATH, streaming=True)

### Evaluation on 1 sample

In [None]:
# `ds` was loaded without `split=`, so it is a mapping of split name ->
# dataset; iterating it directly yields split-name strings, not examples.
# Take the first example of the first split instead.
first_split = next(iter(ds.values()))
input_speech = next(iter(first_split))["audio"]

# Convert the raw waveform into the feature tensor the model expects.
input_features = processor(
    input_speech["array"],
    sampling_rate=input_speech["sampling_rate"],
    return_tensors="pt",
).input_features

# generate (inference only, so no gradients are tracked; the inputs are moved
# to whichever device the model currently lives on)
with torch.no_grad():
    generated_ids = model.generate(
        input_features.to(model.device),
        forced_decoder_ids=forced_decoder_ids,
    )

# decode to text
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(transcription)

### Evaluation on entire dataset

In [None]:
def map_to_pred(batch):
    """Transcribe audio and attach normalized reference/prediction text.

    Works both with ``Dataset.map(..., batched=True)``, where every column in
    ``batch`` is a list with one entry per example, and with a single example,
    where the columns hold scalar values.

    Args:
        batch: Mapping with an ``"audio"`` column (each entry exposing
            ``"array"`` and ``"sampling_rate"``) and a ``"transcript"``
            column holding the reference text.

    Returns:
        The same ``batch`` with ``"reference"`` and ``"prediction"`` added,
        both passed through the tokenizer's text normalizer. The added
        columns are lists for batched input and plain strings otherwise.
    """
    audio = batch["audio"]
    transcripts = batch["transcript"]

    # With batched=True the columns arrive as lists; wrap a single example so
    # the rest of the function can treat both cases uniformly.
    is_batched = isinstance(audio, (list, tuple))
    if not is_batched:
        audio, transcripts = [audio], [transcripts]

    # The feature extractor takes one sampling rate for the whole batch, so
    # all clips in a batch are assumed to share the first clip's rate.
    input_features = processor(
        [clip["array"] for clip in audio],
        sampling_rate=audio[0]["sampling_rate"],
        return_tensors="pt",
    ).input_features

    with torch.no_grad():
        predicted_ids = model.generate(
            # Follow the model's device instead of hard-coding "cuda".
            input_features.to(model.device),
            # Same language/task prompt as the single-sample check.
            forced_decoder_ids=forced_decoder_ids,
        )
    transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    normalize = processor.tokenizer._normalize
    references = [normalize(text) for text in transcripts]
    predictions = [normalize(text) for text in transcriptions]

    batch["reference"] = references if is_batched else references[0]
    batch["prediction"] = predictions if is_batched else predictions[0]
    return batch


# Load the dataset again (non-streaming, so it can be mapped and indexed).
ds = load_dataset(DATASET_PATH)

# Run inference on the GPU when one is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

result = ds.map(map_to_pred, batched=True, batch_size=8)

# `load_dataset` without `split=` returns a mapping of splits, so gather the
# text columns across every split to score the entire dataset.
predictions = [text for split in result.values() for text in split["prediction"]]
references = [text for split in result.values() for text in split["reference"]]

# Calculate WER (the metric returns a single float, not a dict)
wer = evaluate.load("wer")
wer_score = wer.compute(predictions=predictions, references=references)

print(f"WER: {wer_score:.3f}")