In [1]:
from datasets import load_dataset
import whisper
import random
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import pipeline
import requests
import json
import time


from transformers.models.whisper.english_normalizer import BasicTextNormalizer
import evaluate

# Shared text normalizer and WER metric used by the helper functions below.
# The keyword arguments spell out the library defaults: diacritics are kept and
# words are not split into individual letters.
normalizer = BasicTextNormalizer(remove_diacritics=False, split_letters=False)
wer_metric = evaluate.load(path="wer")

def normalize_text(text: str) -> str:
    """Return ``text`` after trimming it and applying the shared normalizer.

    Args:
        text: Raw transcript (a reference or a model prediction).

    Returns:
        The result of passing the whitespace-trimmed text through the
        module-level ``normalizer`` (a Whisper ``BasicTextNormalizer``).
    """
    trimmed = text.strip()
    return normalizer(trimmed)

def compute_wer(reference: str, prediction: str) -> float:
    """Score one prediction against one reference with the WER metric.

    Both strings go through ``normalize_text`` first, so the score is computed
    on the normalized forms rather than on the raw transcripts.

    Args:
        reference: Ground-truth transcript.
        prediction: Model output to score.

    Returns:
        Whatever the module-level ``wer_metric`` computes for this single
        reference/prediction pair.
    """
    metric_inputs = {
        "references": [normalize_text(reference)],
        "predictions": [normalize_text(prediction)],
    }
    return wer_metric.compute(**metric_inputs)

# Fetch the "lb_lu" configuration of google/fleurs and evaluate on its test split
dataset = load_dataset(path="google/fleurs", name="lb_lu")
samples = dataset["test"]

# Filled by the preparation loop with one {"path", "reference"} record per clip
prepared_samples = list()

import tempfile
import soundfile as sf
import torch
import torchaudio
from tqdm import tqdm

# Materialize every test clip as a 16 kHz .wav file on disk and remember its
# path together with the reference transcript.
for sample in tqdm(samples):
    audio_array = sample["audio"]["array"]
    sample_rate = sample["audio"]["sampling_rate"]
    reference = sample["transcription"].strip()

    # Resample if necessary so that every file on disk is 16 kHz
    if sample_rate != 16000:
        audio_array = torchaudio.functional.resample(
            torch.tensor(audio_array), orig_freq=sample_rate, new_freq=16000
        ).numpy()

    # Reserve a unique .wav path and close the handle immediately. Without the
    # `with`, one OS file handle stays open per sample, and writing to a path
    # that is still held open fails on Windows. `delete=False` keeps the file
    # on disk after the handle is closed so the evaluation code can read it.
    # NOTE(review): nothing visible here removes these files afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        wav_path = tmp_file.name
    sf.write(wav_path, audio_array, 16000)

    prepared_samples.append({
        "path": wav_path,
        "reference": reference
    })

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|██████████| 934/934 [00:04<00:00, 231.46it/s]


In [2]:
import gc
import random

import soundfile as sf
import torch
from tqdm import tqdm
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, GenerationConfig

# Whisper checkpoints fine-tuned on Luxembourgish; the two repositories differ
# only in the trailing "excluded" / "included" part of their name.
whisper_model_names = [
    f"Tun-Wellens/whisper-medium-lb-{variant}"
    for variant in ("excluded", "included")
]

# Evaluate each fine-tuned Whisper checkpoint on the prepared 16 kHz clips and
# print its mean per-utterance WER plus a few random example transcriptions.

# The target device is the same for every model, so resolve it once.
pipeline_device = 0 if torch.cuda.is_available() else -1

for model_name in whisper_model_names:
    print(f"\nEvaluating {model_name}")

    # Drop the previous iteration's model and pipeline *before* loading the next
    # checkpoint. Otherwise the old pipeline keeps its model alive (and on the
    # GPU) until `asr_pipeline` is rebound further down, so two models are
    # resident at the same time.
    model = asr_pipeline = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Load model
    model = WhisperForConditionalGeneration.from_pretrained(model_name)

    # Try to load the checkpoint's own generation config; fall back to the base
    # model's config when that fails.
    # NOTE(review): `except Exception` also hides unrelated failures (e.g. network
    # errors) behind the fallback; consider narrowing once the expected error
    # type is confirmed.
    try:
        gen_config = GenerationConfig.from_pretrained(model_name)
    except Exception:
        print(f"Could not load generation config from '{model_name}', using default config.")
        gen_config = GenerationConfig.from_pretrained("openai/whisper-medium")

    # Attach generation config to model
    model.generation_config = gen_config

    # Load processor with the same fallback strategy (same caveat as above)
    try:
        processor = WhisperProcessor.from_pretrained(model_name)
    except Exception:
        print(f"Could not load processor from '{model_name}', using default processor.")
        processor = WhisperProcessor.from_pretrained("openai/whisper-medium")

    # Create ASR pipeline
    # NOTE(review): no language/task is forced here, so generation relies on the
    # generation config loaded above — confirm this matches how the checkpoints
    # were fine-tuned.
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        device=pipeline_device,
    )

    predictions_and_refs = []  # (reference, prediction, wer) per scored clip
    wers = []                  # per-clip WER values, same order as above
    num_skipped = 0            # clips rejected by the long-form check below

    for sample in tqdm(prepared_samples, desc=f"Whisper Eval: {model_name}"):
        try:
            # Load already-preprocessed 16kHz audio
            speech_array, _ = sf.read(sample["path"])

            result = asr_pipeline({
                "array": speech_array,
                "sampling_rate": 16000
            })
            predicted_text = result["text"].strip()

            reference_text = sample["reference"]
            wer = compute_wer(reference_text, predicted_text)

            predictions_and_refs.append((reference_text, predicted_text, wer))
            wers.append(wer)

        except ValueError as e:
            # Only the "too many mel features" error is treated as a skippable
            # clip; every other ValueError is re-raised unchanged.
            if "more than 3000 mel input features" in str(e):
                num_skipped += 1
                continue
            else:
                raise

    if wers:
        # Unweighted mean of the per-clip WERs: every clip counts equally,
        # regardless of how many words its reference contains.
        avg_wer = sum(wers) / len(wers)
        print(f"\nAverage WER ({model_name}) over {len(wers)} valid samples: {avg_wer:.2%}")
    else:
        print(f"\nNo valid samples for {model_name}")

    if num_skipped:
        print(f"Skipped {num_skipped} sample(s) due to long-form constraint (>30s)")

    if predictions_and_refs:
        print(f"\nSample predictions ({model_name}):\n")
        # Show at most five randomly chosen examples (unseeded, so the selection
        # differs between runs).
        for ref, pred, err in random.sample(predictions_and_refs, min(5, len(predictions_and_refs))):
            print(f"Reference : {ref}")
            print(f"Predicted : {pred}")
            print(f"WER       : {err:.2%}\n")



Evaluating Tun-Wellens/whisper-medium-lb-excluded


config.json:   0%|          | 0.00/2.28k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

Could not load generation config from 'Tun-Wellens/whisper-medium-lb-excluded', using default config.
Could not load processor from 'Tun-Wellens/whisper-medium-lb-excluded', using default processor.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Whisper Eval: Tun-Wellens/whisper-medium-lb-excluded:   0%|          | 0/934 [00:00<?, ?it/s]Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Whisper Eval: Tun-Wellens/whisper-medium-lb-excluded:   1%|          | 10/934 [00:11<17:50,  1.16s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Whisper Eval: Tun-Wellens/whisper-medium-lb-excluded: 100%|██████████| 934/934 [20:09<00:00,  1.29s/it]



Average WER (Tun-Wellens/whisper-medium-lb-excluded) over 934 valid samples: 50.23%

Sample predictions (Tun-Wellens/whisper-medium-lb-excluded):

Reference : d'debatt gouf duerch kontroversen iwwer ausgabe fir hëllef an ëremopbau nom hurrikan katrina ausgeléist e puer finanzpolitesch konservativer hu se humorvoll als dem bush säin new-orleans-deal bezeechent
Predicted : De Bat gouf duerch kontroversen iwwer Ausgabe fir Hëllef an erëmopbauen um Hurrikan Katrina ausgeléiste Profinanz, politesch konservativer, hu se humorfull als d'Ëm Bush säin New Orleans Deal bezeechent.
WER       : 36.67%

Reference : jugendlech ausräisser hunn eventuell schwéier kandsmësshandlung oder en trauma erlieft éier se opgi goufen oder fortgelaf sinn
Predicted : Jugendlech Ausreisser hunn eventuell schwéier Kandesmisshandlung oder en Trauma erlieft, éier se opgi goufen oder fortbelaf sinn.
WER       : 17.65%

Reference : planze produzéieren sauerstoff deen d'mënschen ootmen a se huele kuelendioxid an dee mën

config.json:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Whisper Eval: Tun-Wellens/whisper-medium-lb-included: 100%|██████████| 934/934 [19:37<00:00,  1.26s/it]


Average WER (Tun-Wellens/whisper-medium-lb-included) over 934 valid samples: 49.23%

Sample predictions (Tun-Wellens/whisper-medium-lb-included):

Reference : de cuomo huet ufank vun dësem joer mat 53 säi gouverneursamt ugetrueden an de leschte mount e gesetzentworf fir d'legaliséierung vu gläichgeschlechtleche bestietnesser ënnerschriwwen
Predicted : De Komer huet ufang vun dësen Joer mat 53 säi Gouverneursamt getrueden an de leschte Mount am Gesetz entwort fir d'Legaliséierung vu gläichgeschlechtlechtechten Bestietnesser ënnerschriwwen.
WER       : 32.00%

Reference : schliisslech ginn et vill kleng kazen inklusiv fräilafend hauskazen déi déi vill méi grouss zuel u klenge fäng ewéi insekten nagedéieren eidechsen a vulle friessen
Predicted : Schlisslech ginn et vill kleng Kazen, inklusiv fräi lafen d'Hauskazen, déi déi vill méi grouss zur elo klenge Fange wéin ze akten, nagetéieren, eidacksen a Vullefriessen.
WER       : 56.00%

Reference : d'strategie huet sech als wierksam erwisen 


