# Baseline: Whisper Small Zero-Shot

In [None]:
# Run this cell if the dependencies below are not already installed
# %pip install transformers evaluate datasets librosa jiwer

### Library Imports

In [None]:
import torch
import evaluate

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_from_disk

### Model

In [None]:
# Prefer the GPU, but fall back to CPU so the notebook still runs (slowly)
# on machines without CUDA instead of failing when the model is moved.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Load the processor and the model from the same pretrained checkpoint,
# then move the model onto DEVICE.
MODEL_NAME = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)

# Decoder prompt ids requesting English transcription; passed to
# model.generate() further down.
forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="english",
    task="transcribe",
)


### Load Custom Dataset

In [None]:
# Relative path of the on-disk dataset directory handed to load_from_disk.
DATASET_PATH = "../data/custom/hf_dataset/"

In [None]:
# Load the dataset stored under DATASET_PATH.
# NOTE(review): later cells index it with ds[0] and read the "audio" and
# "annotation" columns -- presumably a single split, not a DatasetDict; confirm.
ds = load_from_disk(DATASET_PATH)

In [None]:
# Show the first example to inspect its fields.
ds[0]

### Evaluation on 1 sample

In [None]:
# Audio entry of the first example; read below for its "array" and
# "sampling_rate" fields.
input_speech = ds[0]["audio"]

# Run the processor on the waveform and keep the resulting feature tensor.
processor_output = processor(
    input_speech["array"],
    sampling_rate=input_speech["sampling_rate"],
    return_tensors="pt",
)
input_features = processor_output.input_features

# Generate token ids on DEVICE using the English/transcribe decoder prompt.
generated_ids = model.generate(
    input_features.to(DEVICE),
    forced_decoder_ids=forced_decoder_ids,
)

# Decode to text with special tokens dropped; one input gives a one-item batch.
decoded_batch = processor.batch_decode(generated_ids, skip_special_tokens=True)
transcription = decoded_batch[0]
print(transcription)

### Evaluation on entire dataset

In [None]:
def normalize_to_til(transcript: str) -> str:
    """Rewrite ``transcript`` using only upper-case letters and spaces.

    TIL output is purely uppercase alphabet and space, so every alphabetic
    character is upper-cased and every other character (digits, punctuation,
    whitespace) is replaced by a single space. Runs of spaces are not
    collapsed and the result is not stripped.
    """
    pieces = []
    for ch in transcript:
        if ch.isalpha():
            pieces.append(ch.upper())
        else:
            pieces.append(" ")
    return "".join(pieces)

In [None]:
def map_to_pred(batch):
    """Transcribe a batch of audio and attach normalized predictions and references.

    Written for ``ds.map(..., batched=True)``, where ``batch`` is a dict of
    columns.

    Args:
        batch: Batched examples with an ``"audio"`` column (each item providing
            ``"array"`` and ``"sampling_rate"``) and an ``"annotation"`` column
            holding the ground-truth transcripts.

    Returns:
        The same ``batch`` with two added columns, ``"reference"`` and
        ``"prediction"``. Both are passed through the tokenizer's normalizer
        and then ``normalize_to_til`` so they are compared on equal footing.
    """
    audio = batch["audio"]
    waveforms = [sample["array"] for sample in audio]

    # One sampling rate is passed per call, so the first clip's rate is used
    # for the whole batch (assumes every clip shares it -- TODO confirm).
    input_features = processor(
        waveforms,
        sampling_rate=audio[0]["sampling_rate"],
        return_tensors="pt",
    ).input_features

    # NOTE(review): `_normalize` is a private tokenizer method; check whether
    # the installed transformers version offers a public equivalent.
    batch["reference"] = [
        normalize_to_til(processor.tokenizer._normalize(transcript))
        for transcript in batch["annotation"]
    ]

    with torch.no_grad():
        # Use the same English/transcribe decoder prompt as the single-sample
        # evaluation so the full-dataset run decodes under the same settings.
        predicted_ids = model.generate(
            input_features.to(DEVICE),
            forced_decoder_ids=forced_decoder_ids,
        )

    # Drop special tokens at decode time rather than relying on the text
    # normalizer to remove them afterwards.
    transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    batch["prediction"] = [
        normalize_to_til(processor.tokenizer._normalize(text))
        for text in transcriptions
    ]
    return batch

# Transcribe the whole dataset in batches of 8; map_to_pred adds the
# "prediction" and "reference" columns read below.
scored_ds = ds.map(map_to_pred, batch_size=8, batched=True)

# Calculate WER
wer = evaluate.load("wer")
result = wer.compute(
    references=scored_ds["reference"],
    predictions=scored_ds["prediction"],
)

# The score is multiplied by 100 for display.
print(f"WER: {result * 100:.3f}")

In [None]:
def map_to_pred_eval(batch):
    """Transcribe a batch of audio and attach normalized predictions only.

    Same inference path as ``map_to_pred`` but without building a
    ``"reference"`` column, so the batch does not need an ``"annotation"``
    column.

    Args:
        batch: Batched examples with an ``"audio"`` column (each item providing
            ``"array"`` and ``"sampling_rate"``).

    Returns:
        The same ``batch`` with an added ``"prediction"`` column, passed
        through the tokenizer's normalizer and then ``normalize_to_til``.
    """
    audio = batch["audio"]
    waveforms = [sample["array"] for sample in audio]

    # One sampling rate is passed per call, so the first clip's rate is used
    # for the whole batch (assumes every clip shares it -- TODO confirm).
    input_features = processor(
        waveforms,
        sampling_rate=audio[0]["sampling_rate"],
        return_tensors="pt",
    ).input_features

    with torch.no_grad():
        # Use the same English/transcribe decoder prompt as the single-sample
        # evaluation so every run decodes under the same settings.
        predicted_ids = model.generate(
            input_features.to(DEVICE),
            forced_decoder_ids=forced_decoder_ids,
        )

    # Drop special tokens at decode time rather than relying on the text
    # normalizer to remove them afterwards.
    # NOTE(review): `_normalize` is a private tokenizer method; check whether
    # the installed transformers version offers a public equivalent.
    transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    batch["prediction"] = [
        normalize_to_til(processor.tokenizer._normalize(text))
        for text in transcriptions
    ]
    return batch

## Next Steps
- Error Analysis
- Denoising of Data