In [1]:
%pip install evaluate jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Collecting rapidfuzz>=3.9.7
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Toy example pair: compared with the reference, the prediction drops the word
# "a" and corrupts the last word ("sentence." -> "sentenceWrong.").
reference, prediction = (
    "This is a test sentence.",
    "This is test sentenceWrong.",
)

In [None]:
from evaluate import load

# Load the word error rate metric and score the toy example pair defined above.
# `compute` takes parallel lists, so each single string is wrapped in a list.
wer_metric = load("wer")
wer = wer_metric.compute(
    predictions=[prediction],
    references=[reference],
)

0.4

In [11]:
# Report the WER together with its complement (1 - WER) as a rough accuracy.
print(
    f"Word Error Rate (WER): {wer:.3f} (lower is better)",
    f"Accuracy: {1 - wer:.3f} (higher is better)",
    sep="\n",
)

Word Error Rate (WER): 0.400 (lower is better)
Accuracy: 0.600 (higher is better)


In [None]:
from evaluate import load

# Same comparison as for WER, but scored with the character error rate metric.
cer_metric = load("cer")
cer = cer_metric.compute(
    predictions=[prediction],
    references=[reference],
)

In [14]:
# Report the CER together with its complement (1 - CER) as a rough accuracy.
# The label previously read "(WER)" even though the value printed is the CER.
print(f"Character Error Rate (CER): {cer:.3f} (lower is better)")
print(f"Accuracy: {1 - cer:.3f} (higher is better)")

Character Error Rate (WER): 0.292 (lower is better)
Accuracy: 0.708 (higher is better)


CER is much more forgiving than WER, because small errors in the transcription are not penalized as much. This is because CER is based on character-level accuracy, while WER is based on word-level accuracy. In many cases, a small error in a word can lead to a large increase in WER, while the same error may only slightly affect CER.

However, most of the time CER is not used. This is because CER only looks at the characters in the transcription and does not take into account the meaning of the words, or aspects of the language such as grammar. We want to encourage the model to gain a better understanding of the language, and not just of the characters. This is why WER is more commonly used.


## Normalization
When one normalizes a dataset for ASR, one removes any casing and punctuation. This makes the speech recognition task easier, as the model does not have to learn to recognize different cases and punctuation marks, such as the difference between "Hello" and "hello", or "Hello," and "Hello". This has actually been shown to dramatically improve the performance of ASR models.

In [15]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Run an orthographic (cased, punctuated) model transcription through the
# Whisper basic text normalizer and display the result.
normalizer = BasicTextNormalizer()

prediction = (
    " He tells us that at this festive season of the year, with Christmas and "
    "roast beef looming before us, similarly is drawn from eating and its "
    "results occur most readily to the mind."
)
normalized_prediction = normalizer(prediction)

normalized_prediction

' he tells us that at this festive season of the year with christmas and roast beef looming before us similarly is drawn from eating and its results occur most readily to the mind '

In [16]:
# The reference transcript is upper-case and unpunctuated; normalizing it the
# same way as the prediction means only genuine word differences are scored.
reference = (
    "HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAS AND "
    "ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING AND ITS RESULTS "
    "OCCUR MOST READILY TO THE MIND"
)
normalized_referece = normalizer(reference)

wer = wer_metric.compute(
    predictions=[normalized_prediction],
    references=[normalized_referece],
)
wer

0.03488372093023256

## Fine-tuning ASR Model

In [None]:
import torch
from transformers import pipeline

# Use the first CUDA device with half precision when a GPU is available;
# otherwise fall back to the CPU with full precision.
device, torch_dtype = (
    ("cuda:0", torch.float16)
    if torch.cuda.is_available()
    else ("cpu", torch.float32)
)

# Speech recognition pipeline backed by the pretrained Whisper small checkpoint.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    torch_dtype=torch_dtype,
    device=device,
)

In [17]:
from huggingface_hub import notebook_login

# Renders the interactive Hugging Face login widget in the notebook.
# NOTE(review): presumably required so the Common Voice dataset loaded in the
# next cell can be downloaded with the user's credentials — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset

# Test split of the "dv" configuration of Common Voice 13, used as the
# evaluation set for the cells below.
common_voice_test = load_dataset(
    path="mozilla-foundation/common_voice_13_0",
    name="dv",
    split="test",
)

In [None]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Run streamed inference over the whole test split and collect the transcribed
# text of every example, in dataset order.
#
# A comprehension is used instead of a `for prediction in ...` loop so that the
# iteration variable does not leak into the notebook namespace and overwrite
# the `prediction` string defined in the normalization example above.
all_predictions = [
    output["text"]
    for output in tqdm(
        pipe(
            # hand the pipeline only the "audio" field of each example
            KeyDataset(common_voice_test, "audio"),
            max_new_tokens=128,
            generate_kwargs={"task": "transcribe"},
            batch_size=32,
        ),
        # the pipeline output is consumed as a stream, so give tqdm the length
        total=len(common_voice_test),
    )
]

In [None]:
from evaluate import load

wer_metric = load("wer")

# Orthographic WER: raw predictions are scored against the raw reference
# sentences (no normalization), expressed as a percentage.
wer_ortho = 100 * wer_metric.compute(
    predictions=all_predictions,
    references=common_voice_test["sentence"],
)
wer_ortho

In [None]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer()

# compute normalised WER: apply the same normalizer to both sides
all_predictions_norm = [normalizer(pred) for pred in all_predictions]
all_references_norm = [normalizer(label) for label in common_voice_test["sentence"]]

# Predictions and references are paired by position below, so fail loudly if
# the two lists ever get out of sync (e.g. after a partial inference run).
assert len(all_predictions_norm) == len(all_references_norm), (
    "number of predictions does not match number of references"
)

# Filtering step: only evaluate samples whose reference still contains text
# after normalization. The pairs are filtered together so the two lists cannot
# drift apart. `strip()` is used rather than a plain length check because the
# normalizer does not trim its output (see the leading/trailing spaces in the
# normalized example earlier in this notebook), so a reference that loses all
# of its words would presumably come back as whitespace rather than "".
kept_pairs = [
    (pred, ref)
    for pred, ref in zip(all_predictions_norm, all_references_norm)
    if ref.strip()
]
all_predictions_norm = [pred for pred, _ in kept_pairs]
all_references_norm = [ref for _, ref in kept_pairs]

# NOTE: here `wer` is a percentage (0-100), unlike the 0-1 fraction stored
# under the same name in the earlier toy examples.
wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)

wer