# Evaluation of Fine-Tuned Whisper Small

In [None]:
# Run this if required
%pip install transformers evaluate datasets librosa jiwer

Collecting transformers
  Using cached transformers-4.29.2-py3-none-any.whl (7.1 MB)
Collecting evaluate
  Using cached evaluate-0.4.0-py3-none-any.whl (81 kB)
Collecting datasets
  Using cached datasets-2.12.0-py3-none-any.whl (474 kB)
Collecting librosa
  Using cached librosa-0.10.0.post2-py3-none-any.whl (253 kB)
Collecting jiwer
  Using cached jiwer-3.0.1-py3-none-any.whl (21 kB)
Collecting huggingface-hub<1.0,>=0.14.1
  Using cached huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
Collecting regex!=2019.12.17
  Using cached regex-2023.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (756 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting multiprocess
  Using cached multiprocess-0.70.14-py37-none-any.whl (115 kB)
Collecting dill
  Using cached dill-0.3.6-py3-none-any.whl (110 kB)
Collecting responses<0.19
  Using cached responses-0.18.0-py3-none-any.whl (38 kB)


### Library Imports

In [None]:
import csv
import torch
import evaluate

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_from_disk

In [None]:
# Check whether PyTorch can see a CUDA device before the model is moved to one
torch.cuda.is_available()

True

### Model

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# load model and processor
MODEL_PATH = "<path-to-model>"
# The processor bundles the feature extractor (audio -> input features) and
# the tokenizer (token ids -> text).
processor = WhisperProcessor.from_pretrained(MODEL_PATH)
# The model itself must be a WhisperForConditionalGeneration: it is the object
# that provides .generate() and can be moved to DEVICE.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_PATH).to(DEVICE)
# Decoder prompt that forces English transcription (used in the single-sample cell).
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")


### Load Custom Dataset

In [None]:
# Location of the saved training dataset that is passed to load_from_disk
DATASET_PATH = "gs://cloud-ai-platform-e8edc327-855c-4911-bb8e-205517f8c899/asr/data/train/til_asr_base_train"

In [None]:
# Load the previously saved dataset from DATASET_PATH
ds = load_from_disk(DATASET_PATH)

In [None]:
# Inspect the first training example (path, annotation and decoded audio)
ds["train"][0]

{'path': 'audio/train_03701.wav',
 'annotation': 'THERE IS ONLY ONE WAY TO SUCCESS AND THAT IS HARD WORK AND DETERMINATION',
 'audio': {'path': 'train_03701.wav',
  'array': array([-3.96728516e-04, -5.18798828e-04, -4.88281250e-04, ...,
         -3.05175781e-05, -3.05175781e-05, -9.15527344e-05]),
  'sampling_rate': 16000}}

In [None]:
# Show the available splits, their columns and row counts
ds

DatasetDict({
    train: Dataset({
        features: ['path', 'annotation', 'audio'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['path', 'annotation', 'audio'],
        num_rows: 750
    })
})

### Evaluation on 1 sample

In [None]:
# Take the audio of the first training example and convert it to model inputs
input_speech = ds["train"][0]["audio"]
processed = processor(
    input_speech["array"],
    sampling_rate=input_speech["sampling_rate"],
    return_tensors="pt",
)
input_features = processed.input_features

# generate token ids, forcing the English/transcribe decoder prompt
generated_ids = model.generate(
    input_features.to(DEVICE),
    forced_decoder_ids=forced_decoder_ids,
)

# decode to text (special tokens dropped) and show the only item in the batch
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
transcription = decoded[0]
print(transcription)



 There is only one way to success and that is hard work and determination.


### Evaluation on entire dataset

In [None]:
def normalize_to_til(transcript: str) -> str:
    """Normalize a transcript to the TIL annotation format.

    Every alphabetic character is upper-cased and every other character
    (digits, punctuation, whitespace) is treated as a word separator. Runs of
    separators collapse to a single space, and leading/trailing separators are
    removed so the result never starts or ends with a space.

    Args:
        transcript: Raw or pre-normalized transcript text.

    Returns:
        Upper-case words joined by single spaces; "" if there are no letters.
    """
    # TIL output is purely uppercase alphabet and space
    # so we normalize the output to that
    mapped = "".join(c.upper() if c.isalpha() else " " for c in transcript)
    # split() with no argument drops leading/trailing whitespace and collapses
    # repeated spaces in one pass.
    return " ".join(mapped.split())

In [None]:
def map_to_pred(batch):
    """Transcribe a batch of labelled examples and attach normalized text.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        batch: Batch with an "audio" column (items providing "array" and
            "sampling_rate") and an "annotation" column of reference text.

    Returns:
        The same batch with two added columns: "reference" (normalized
        annotations) and "prediction" (normalized model transcriptions).
    """
    audio = batch["audio"]
    raw = [i["array"] for i in audio]

    # The sampling rate of the first item is used for the whole batch.
    input_features = processor(raw, sampling_rate=audio[0]["sampling_rate"], return_tensors="pt").input_features
    batch["reference"] = [normalize_to_til(processor.tokenizer._normalize(transcript)) for transcript in batch["annotation"]]

    # NOTE(review): unlike the single-sample cell, no forced_decoder_ids is
    # passed here, so generation uses the model's default settings - confirm
    # this is intended.
    with torch.no_grad():
        predicted_ids = model.generate(input_features.to(DEVICE))

    # Drop special tokens at decode time instead of relying on the text
    # normalizer to remove them afterwards.
    transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    batch["prediction"] = [
        normalize_to_til(processor.tokenizer._normalize(transcription))
        for transcription in transcriptions
    ]
    return batch

# Transcribe every split of the dataset in batches of 32 examples
preds = ds.map(map_to_pred, batched=True, batch_size=32)

# Calculate WER (test split first, then train)
wer = evaluate.load("wer")
test_split = preds["test"]
test_result = wer.compute(
    predictions=test_split["prediction"],
    references=test_split["reference"],
)
train_split = preds["train"]
train_result = wer.compute(
    predictions=train_split["prediction"],
    references=train_split["reference"],
)
# Scores are shown as percentages
print(f"Train WER: {train_result * 100:.3f}")
print(f"Test WER: {test_result * 100:.3f}")

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Train WER: 3.665
Test WER: 3.489


In [None]:
def map_to_pred_eval(batch):
    """Transcribe a batch of unlabelled examples and attach normalized text.

    Intended for use with ``Dataset.map(..., batched=True)`` on a dataset that
    has no "annotation" column.

    Args:
        batch: Batch with an "audio" column whose items provide "array" and
            "sampling_rate".

    Returns:
        The same batch with an added "prediction" column of normalized
        model transcriptions.
    """
    audio = batch["audio"]
    raw = [i["array"] for i in audio]

    # The sampling rate of the first item is used for the whole batch.
    input_features = processor(raw, sampling_rate=audio[0]["sampling_rate"], return_tensors="pt").input_features
    with torch.no_grad():
        predicted_ids = model.generate(input_features.to(DEVICE))

    # Drop special tokens at decode time instead of relying on the text
    # normalizer to remove them afterwards.
    transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    batch["prediction"] = [
        normalize_to_til(processor.tokenizer._normalize(transcription))
        for transcription in transcriptions
    ]
    return batch

In [None]:
# Location of the saved evaluation dataset
EVAL_DATASET = "gs://cloud-ai-platform-e8edc327-855c-4911-bb8e-205517f8c899/asr/data/test/til_asr_base_eval"
# Load it from disk and display its columns and row count
eval_ds = load_from_disk(EVAL_DATASET)
eval_ds

Dataset({
    features: ['path', 'audio'],
    num_rows: 12000
})

In [None]:
# Run map_to_pred_eval over the evaluation set in batches of 32; the mapped
# dataset is stored separately in eval_preds (eval_ds itself is not reassigned).
eval_preds = eval_ds.map(map_to_pred_eval, batched=True, batch_size=32)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [None]:
def create_csv(paths, predictions, output_filename, path_prefix="audio/"):
    """Write a two-column CSV with the header ``path,annotation``.

    Args:
        paths: Sequence of audio path strings, one per prediction.
        predictions: Sequence of transcription strings, aligned with ``paths``.
        output_filename: Destination file; overwritten if it already exists.
        path_prefix: Leading directory prefix removed from each path before
            writing. Paths that do not start with it are written unchanged.
            Pass "" or None to disable stripping.

    Raises:
        AssertionError: If ``paths`` and ``predictions`` differ in length.
    """
    # Ensure that paths and predictions are of the same length
    assert len(paths) == len(predictions), "Lists must have the same length"

    # newline='' lets the csv module control line endings; the encoding is
    # explicit so the output does not depend on the platform default.
    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['path', 'annotation']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for path, prediction in zip(paths, predictions):
            # Strip the prefix only when it is actually present, instead of
            # blindly dropping a fixed number of leading characters.
            if path_prefix and path.startswith(path_prefix):
                path = path[len(path_prefix):]
            writer.writerow({'path': path, 'annotation': prediction})

# Build the submission from the mapped dataset: the "prediction" column is
# added by map_to_pred_eval, so it exists on eval_preds, not on eval_ds.
# Both columns are read from eval_preds so that rows stay aligned.
paths = eval_preds["path"]
predictions = eval_preds["prediction"]

create_csv(paths, predictions, 'eval_submission_whisper_small_zero_shot_defaultdecoding.csv')

## Next Steps
- Error Analysis
- Denoising of Data