In [1]:
from datasets import load_dataset
import whisper
import random
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import pipeline
import requests
import json
import time


from transformers.models.whisper.english_normalizer import BasicTextNormalizer
import evaluate

normalizer = BasicTextNormalizer()
wer_metric = evaluate.load("wer")

def normalize_text(text: str) -> str:
    """Normalize text using Whisper's basic text normalizer."""
    return normalizer(text.strip())

def compute_wer(reference: str, prediction: str) -> float:
    """Compute WER between two strings after normalization."""
    norm_ref = normalize_text(reference)
    norm_pred = normalize_text(prediction)
    return wer_metric.compute(references=[norm_ref], predictions=[norm_pred])

# Load the dataset
dataset = load_dataset("google/fleurs", "lb_lu")
samples = dataset["test"]

prepared_samples = []

import tempfile
import soundfile as sf
import torch
import torchaudio
from tqdm import tqdm

for sample in tqdm(samples):
    audio_array = sample["audio"]["array"]
    sample_rate = sample["audio"]["sampling_rate"]
    reference = sample["transcription"].strip()

    # Resample if necessary
    if sample_rate != 16000:
        audio_array = torchaudio.functional.resample(
            torch.tensor(audio_array), orig_freq=sample_rate, new_freq=16000
        ).numpy()

    # Save to temp file
    tmp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    sf.write(tmp_file.name, audio_array, 16000)

    prepared_samples.append({
        "path": tmp_file.name,
        "reference": reference
    })

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/191M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/473M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/245k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/593k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

100%|██████████| 934/934 [00:15<00:00, 61.63it/s] 


In [3]:
model_names = ["large-v2", "large-v3"]

for model_name in model_names:
    try:
        model = whisper.load_model(model_name)
        print(f"Loaded model: {model_name}")
    except Exception as e:
        print(f"Model {model_name} not available: {e}")
        continue  # Skip to next model if this one fails

    refs_preds = []
    wers = []

    for sample in tqdm(prepared_samples, desc=f"Evaluating {model_name}"):
        try:
            result = model.transcribe(sample["path"], language="lb", task="transcribe")
            prediction = result["text"].strip()

            error = compute_wer(sample["reference"], prediction)
            wers.append(error)
            refs_preds.append((sample["reference"], prediction, error))

        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print(f"CUDA OOM on: {sample['path']}. Skipping.")
            else:
                raise e

        torch.cuda.empty_cache()

    if wers:
        average_wer = sum(wers) / len(wers)
        print(f"[{model_name}] Average WER: {average_wer:.2%}")
    else:
        print(f"[{model_name}] No valid WER results.")

    for ref, pred, err in random.sample(refs_preds, min(5, len(refs_preds))):
        print(f"Reference: {ref}")
        print(f"Predicted: {pred}")
        print(f"WER: {err:.2%}\n")


100%|██████████████████████████████████████| 2.87G/2.87G [00:27<00:00, 113MiB/s]


Loaded model: large-v2


Evaluating large-v2: 100%|██████████| 934/934 [49:05<00:00,  3.15s/it]  


[large-v2] Average WER: 92.92%
Reference: de service gëtt dacks vun der schëfffaart inklusiv sportbooter esouwéi expeditioune verwent déi bedarf un externen daten a sproocherkennung hunn
Predicted: Des servies gedags von der Schöf-Arst inklusive Spurtboote sowie Expeditione verwandt de Bedrof on externen Daten o spurtakannung hon.
WER: 90.00%

Reference: am waarme klima vum noen oste waren haiser net esou wichteg
Predicted: Am warme klima vum Nooën-Osten waren Heusan net so wichtig.
WER: 54.55%

Reference: dat sinn heiansdo iwwerfëllt familljestränn mat enger gudder auswiel u butteker laanscht d'küst schwammen ass sécher
Predicted: WHEN nooit inderdaad aara trimmed je je ap' aa stik et zoom je ergap, SUBUSIEERDE AARE
WER: 100.00%

Reference: verkéiersfloss ass d'untersuchung vun der beweegung vun eenzele chaufferen a gefierer tëschent zwee punkten an d'wiesselwierkungen déi se openeen hunn
Predicted: Tvrkaja flos ast unta sorung vonder bewegjung von enzele shofören and gefejrd tswische

Evaluating large-v3: 100%|██████████| 934/934 [29:55<00:00,  1.92s/it]  

[large-v3] Average WER: 85.76%
Reference: nuets goufen tëschent 150 bis 200 kopië gemaach déi haut als dunlap broadsides bekannt sinn
Predicted: Nuts giffen tischen 250 bis 200 Kopiergema die Haut als Dunlop Broad Sides bekannt ging.
WER: 73.33%

Reference: d'kuuscht ass op der zougewanter säit ongeféier 70 km an op der ofgewanter säit ongeféier 100 km déck
Predicted: Die Kuh ist op der zugewandte Seite ungefähr 70 km an, auf der aufgewandte Seite ungefähr 100 km dick.
WER: 57.89%

Reference: virun allem gëtt behaapt datt een erkenne kann ob eng persoun litt andeem ee mikroausdréck richteg interpretéiert
Predicted: Für in allem geht behaabt, dat eh noch keine Kern ob beim Personenlid, an dem ihr Mikroeisträg richtig interpretiert.
WER: 100.00%

Reference: am norden an einfach ze erreechen befënnt sech d'romantesch a faszinéierend stad sintra déi no enger feiereger duerstellung vun hirer pruecht déi vum lord byron opgezeechent gouf fir auslänner berüümt gouf
Predicted: Am Norden an Einf


