In [6]:
from datasets import load_dataset
import whisper
import random
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import pipeline
import requests
import json
import time


from transformers.models.whisper.english_normalizer import BasicTextNormalizer
import evaluate

# Shared text normalizer instance; compute_wer() runs both the reference and
# the prediction through it (via normalize_text) before scoring.
normalizer = BasicTextNormalizer()
# Metric object obtained from the `evaluate` library under the name "wer".
wer_metric = evaluate.load("wer")

def normalize_text(text: str) -> str:
    """Strip surrounding whitespace from ``text`` and run it through the shared normalizer.

    Args:
        text: Raw transcript (reference or model output).

    Returns:
        Whatever the module-level ``normalizer`` returns for the stripped text.
    """
    stripped = text.strip()
    return normalizer(stripped)

def compute_wer(reference: str, prediction: str) -> float:
    """Score a single prediction against its reference with the shared WER metric.

    Both strings are passed through ``normalize_text`` first, so the score is
    computed on the normalized forms.

    Args:
        reference: Ground-truth transcript.
        prediction: Transcript produced by the model under test.

    Returns:
        The value returned by ``wer_metric.compute`` for this one pair.
    """
    return wer_metric.compute(
        references=[normalize_text(reference)],
        predictions=[normalize_text(prediction)],
    )

# Load the dataset
# "lb_lu" is the configuration name passed for google/fleurs (presumably the
# Luxembourgish subset -- confirm against the dataset card).
dataset = load_dataset("google/fleurs", "lb_lu")
# Only the "test" split is evaluated in this notebook.
samples = dataset["test"]

# Filled by the preparation loop below with one {"path", "reference"} dict per sample.
prepared_samples = []

import tempfile
import soundfile as sf
import torch
import torchaudio
from tqdm import tqdm

# Sample rate every evaluation cell assumes for the prepared WAV files.
TARGET_SAMPLE_RATE = 16000

# Convert every dataset sample into a WAV file on disk plus its reference
# transcript, so each model can be evaluated on exactly the same audio.
for sample in tqdm(samples):
    audio_array = sample["audio"]["array"]
    sample_rate = sample["audio"]["sampling_rate"]
    reference = sample["transcription"].strip()

    # Resample if necessary
    if sample_rate != TARGET_SAMPLE_RATE:
        audio_array = torchaudio.functional.resample(
            torch.tensor(audio_array),
            orig_freq=sample_rate,
            new_freq=TARGET_SAMPLE_RATE,
        ).numpy()

    # Reserve a unique path, then close the handle immediately: the file is
    # written by soundfile through its *name*, and leaving the handle open
    # both leaks a descriptor until garbage collection and prevents the path
    # from being reopened on Windows. delete=False keeps the file on disk for
    # the evaluation cells that read it back later.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        wav_path = tmp_file.name
    sf.write(wav_path, audio_array, TARGET_SAMPLE_RATE)

    prepared_samples.append({
        "path": wav_path,
        "reference": reference
    })

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|██████████| 934/934 [00:03<00:00, 240.04it/s]


In [1]:
"""# Load the Whisper model
model = whisper.load_model("medium")

refs_preds = []
wers = []

for sample in tqdm(prepared_samples):
    # Transcribe the audio
    result = model.transcribe(sample["path"], language="lb", task="transcribe")
    prediction = result["text"].strip()

    # Compute WER
    error = compute_wer(sample["reference"], prediction)
    wers.append(error)
    refs_preds.append((sample["reference"], prediction, error))

# calculate average WER and print results
average_wer = sum(wers) / len(wers)
print(f"Average WER: {average_wer:.2%}")

# show a few random samples

for ref, pred, err in random.sample(refs_preds, min(5, len(refs_preds))):
    print(f"Reference: {ref}")
    print(f"Predicted: {pred}")
    print(f"WER: {err:.2%}\n")"""

'# Load the Whisper model\nmodel = whisper.load_model("medium")\n\nrefs_preds = []\nwers = []\n\nfor sample in tqdm(prepared_samples):\n    # Transcribe the audio\n    result = model.transcribe(sample["path"], language="lb", task="transcribe")\n    prediction = result["text"].strip()\n\n    # Compute WER\n    error = compute_wer(sample["reference"], prediction)\n    wers.append(error)\n    refs_preds.append((sample["reference"], prediction, error))\n\n# calculate average WER and print results\naverage_wer = sum(wers) / len(wers)\nprint(f"Average WER: {average_wer:.2%}")\n\n# show a few random samples\n\nfor ref, pred, err in random.sample(refs_preds, min(5, len(refs_preds))):\n    print(f"Reference: {ref}")\n    print(f"Predicted: {pred}")\n    print(f"WER: {err:.2%}\n")'

In [9]:
# Baseline: stock Whisper "medium" checkpoint evaluated on the prepared WAV files.
model = whisper.load_model("medium")
refs_preds = []  # (reference, prediction, wer) triples for later inspection
wers = []        # per-sample WER values

for sample in tqdm(prepared_samples):
    # Transcribe the audio
    # NOTE(review): `language` is not pinned here, so the transcript language
    # is left to Whisper itself; pass language="lb" if a forced-Luxembourgish
    # baseline is what should be measured.
    result = model.transcribe(sample["path"], task="transcribe")
    prediction = result["text"].strip()

    # Compute WER
    error = compute_wer(sample["reference"], prediction)
    wers.append(error)
    refs_preds.append((sample["reference"], prediction, error))

# calculate average WER and print results
# Guarded so an empty `prepared_samples` reports the problem instead of
# raising ZeroDivisionError.
if wers:
    average_wer = sum(wers) / len(wers)
    print(f"Average WER: {average_wer:.2%}")
else:
    print("No samples were evaluated")

# show a few random samples

for ref, pred, err in random.sample(refs_preds, min(5, len(refs_preds))):
    print(f"Reference: {ref}")
    print(f"Predicted: {pred}")
    print(f"WER: {err:.2%}\n")

  checkpoint = torch.load(fp, map_location=device)
100%|██████████| 934/934 [1:00:08<00:00,  3.86s/it]

Average WER: 95.69%
Reference: doduerch ass se ofwäertskompabitel mat 802.11a 802.11b an 802.11g virausgesat datt d'basisstatioun iwwer zwee funkapparater verfüügt
Predicted: Dadurch ist also aufwärtskompatibel mit Etonaut 2,11a, Etonaut 2,11b und Etonaut 2,11g herausgesagt, dass Basis Station IWA 2 von dem Operator verfügt.
WER: 110.00%

Reference: d'mënsche schreiwen elo noriichten op computerbildschiermer a brauche kee spëtzer méi
Predicted: Mnče škravno ljudno rešeno kompiutabil šema, a brak kajn špacomej.
WER: 91.67%

Reference: d'haaptstad vu moldawien ass chişinău. d'landessprooch ass rumänesch awer russesch ass wäit verbreet
Predicted: Die Hauptstadt von Moldawien ist Chisinau. Die Landessprache ist Rumänisch, aber Russisch ist weit verbreitet.
WER: 93.33%

Reference: virtuell gerüster sinn an der software verënnerlecht a solle verfaren hannerfroen opfuerderen an erklären déi fir de schüler ze schwiereg gewiescht kéinte sinn se eleng ze bewältegen
Predicted: ושתו אל גרויסטה זינ




In [10]:
# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and processor
model_name = "Lemswasabi/wav2vec2-base-luxembourgish-4h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device).eval()

# Store results
refs_preds = []  # (reference, transcription, wer) triples for later inspection
wers = []        # per-sample WER values

# Evaluate
for sample in tqdm(prepared_samples):
    # Load resampled audio from disk
    speech_array, _ = sf.read(sample["path"])  # already 16kHz from preprocessing

    # Tokenize
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to(device)
    # The processor only returns an attention mask for some configurations;
    # None is the model's default for that argument, so a single forward call
    # covers both cases.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    # Inference
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    # Greedy decoding: most likely token per frame, collapsed by the processor.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True).strip()

    # Compute WER
    reference = sample["reference"]
    error = compute_wer(reference, transcription)

    wers.append(error)
    refs_preds.append((reference, transcription, error))

# calculate average WER and print results
# Guarded so an empty `prepared_samples` reports the problem instead of
# raising ZeroDivisionError.
if wers:
    average_wer = sum(wers) / len(wers)
    print(f"Average WER: {average_wer:.2%}")
else:
    print("No samples were evaluated")

# show a few random samples

for ref, pred, err in random.sample(refs_preds, min(5, len(refs_preds))):
    print(f"Reference: {ref}")
    print(f"Predicted: {pred}")
    print(f"WER: {err:.2%}\n")


Some weights of the model checkpoint at Lemswasabi/wav2vec2-base-luxembourgish-4h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Lemswasabi/wav2vec2-base-luxembourgish-4h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You shoul

Average WER: 62.36%
Reference: de 15 august 1940 sinn d'alliéierter a südfrankräich agefall d'invasioun gouf operation dragoon genannt
Predicted: de aurgust sin ea liéierten a süd frankräich agefal dim vasioun gouf op peration dra gon genant
WER: 100.00%

Reference: d'höl läit op engem bierg nërdlech vu mekka an ass komplett vum rescht vun der welt isoléiert
Predicted: toul läit op engem bierg nartlech vum macker an as komplet vum recht vun der welt isoléiert
WER: 44.44%

Reference: onnéideg ze soen wann dir eng romanesch sprooch kennt gëtt et méi einfach portugisesch ze léieren
Predicted: onéidegh ze soen wan dir eng homanesch sprochkënt gët et méi einfach portugisesch zu dieren
WER: 50.00%

Reference: dës koppele kënne sech entscheeden en adoptiounsplang fir hire puppelchen ze maachen
Predicted: dës copeleu këne sech entscheden a at optionsplang fir re popochen ze machen
WER: 75.00%

Reference: zu de regionalen a saisonalen onwiederphenomeener gehéiere blizzarden schnéistierm äisstie




In [12]:
# List of pgilles Whisper models fine-tuned on Luxembourgish
whisper_models = [
    "Tun-Wellens/pgilles-whisper-tiny-lb",
    "Tun-Wellens/pgilles-whisper-medium-lb",
    "Tun-Wellens/pgilles-whisper-large-lb",
]

# Evaluate each checkpoint in turn on the same prepared WAV files and report
# its average WER, the number of skipped clips, and a few example predictions.
for whisper_model_name in whisper_models:
    print(f"\n Evaluating {whisper_model_name}")

    asr = pipeline(
        "automatic-speech-recognition",
        model=whisper_model_name,
        device=0 if torch.cuda.is_available() else -1
    )

    refs_preds = []  # (reference, transcription, wer) triples
    wers = []        # per-sample WER values
    skipped = 0      # clips rejected by the pipeline as too long

    for sample in tqdm(prepared_samples, desc=f"Whisper Eval: {whisper_model_name}"):
        try:
            # Load already-preprocessed 16kHz audio from file
            speech_array, _ = sf.read(sample["path"])

            # Run transcription
            result = asr(
                {"array": speech_array, "sampling_rate": 16000}
            )
            transcription = result["text"].strip()

            # Compute WER
            reference = sample["reference"]
            error = compute_wer(reference, transcription)

            wers.append(error)
            refs_preds.append((reference, transcription, error))

        except ValueError as e:
            # Only the known "input too long" failure is tolerated; anything
            # else propagates with its original traceback.
            if "more than 3000 mel input features" not in str(e):
                raise
            skipped += 1

    # Average WER
    if wers:
        average_wer_pgilles = sum(wers) / len(wers)
        print(f"\n Average WER ({whisper_model_name}) over {len(wers)} valid samples: {average_wer_pgilles:.2%}")
    else:
        print(f"\n No valid samples for {whisper_model_name}")

    if skipped:
        print(f" Skipped {skipped} sample(s) due to long-form constraint (>30s)")

    # Show sample predictions
    if refs_preds:
        print(f"\n Sample predictions ({whisper_model_name}):\n")
        for ref, pred, err in random.sample(refs_preds, min(5, len(refs_preds))):
            print(f"Reference: {ref}")
            print(f"Predicted: {pred}")
            print(f"WER: {err:.2%}\n")

    # Drop this checkpoint before the next one is loaded; otherwise the old
    # pipeline stays referenced until `asr` is rebound, and two models are
    # resident at the same time.
    del asr
    if torch.cuda.is_available():
        torch.cuda.empty_cache()



 Evaluating Tun-Wellens/pgilles-whisper-tiny-lb


config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

Whisper Eval: Tun-Wellens/pgilles-whisper-tiny-lb:   0%|          | 0/934 [00:00<?, ?it/s]Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Whisper Eval: Tun-Wellens/pgilles-whisper-tiny-lb:   1%|          | 10/934 [00:02<03:20,  4.61it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Whisper Eval: Tun-Wellens/pgilles-whisper-tiny-lb: 100%|██████████| 934/934 [04:02<00:00,  3.85it/s]



 Average WER (Tun-Wellens/pgilles-whisper-tiny-lb) over 934 valid samples: 80.17%

 Sample predictions (Tun-Wellens/pgilles-whisper-tiny-lb):

Reference: no der revolutioun waren aarbechtsplaze fir all männlech bewerber op wat et den éiergäizegsten a erfollegräichsten erméiglecht huet et ze packen
Predicted: No der Revolutioun war en Aarbechtsplaz fir handlännesch Bewerb op, wat et den éiergäizegsten an erfollegräistener Méiglechthescht huet, an ze packen.
WER: 47.62%

Reference: wat d'spannung méi niddereg wat d'existent liewenskraaft méi positiv jiddwer mënsch huet d'potenzial absolutte fridden an zefriddenheet ze fannen
Predicted: Wat Spannung meng Niddere schafen, déi sech dann sech déi gesinn d' Liewenskraaft méi positiv jiddere Mënsch huet, datt sech sel op zesumme Fridden anzefidden net ze fannen.
WER: 86.36%

Reference: dem aristoteles seng usiichte goufen an alle beräicher vun der wëssenschaft akzeptéiert inklusiv der psychologie
Predicted: De Marie-Soudle seng u sech dee gou

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.46k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Whisper Eval: Tun-Wellens/pgilles-whisper-medium-lb: 100%|██████████| 934/934 [18:03<00:00,  1.16s/it]



 Average WER (Tun-Wellens/pgilles-whisper-medium-lb) over 934 valid samples: 49.79%

 Sample predictions (Tun-Wellens/pgilles-whisper-medium-lb):

Reference: genee esou musst dir mat engem schengen-visa net fir jiddwer schenge-memberstat separat e visa ufroen wat zäit suen a schreifaarbecht erspuert
Predicted: Den hir sou muss du mat engem Schengem Visa net fir jidde wa Schengem Memberstad, separat de Visa ufruen, verzäit Suen a schreiwaarbeje schwuet.
WER: 69.57%

Reference: schinneschwelle goufen zimmlech fréi ageféiert fir d'schinnen op der plaz ze halen no an no huet sech awer erausgestallt datt d'schinne méi effizient wieren wa se en eisestaf uewen hätten
Predicted: Schinnen e schwelle goufen zimlech fräi ageféiert fir d'Schinnen op de Plaz ze halen. No an no huet se sech awer erausgestallt datt d'Schinnen er méi effizient wieren, wa se een eise Staf uewen hätten.
WER: 37.50%

Reference: mierkt iech och wann d'musek op den haaptbüne dem enn zou geet kann et nach beräicher vum fes

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Whisper Eval: Tun-Wellens/pgilles-whisper-large-lb: 100%|██████████| 934/934 [26:09<00:00,  1.68s/it]


 Average WER (Tun-Wellens/pgilles-whisper-large-lb) over 934 valid samples: 57.97%

 Sample predictions (Tun-Wellens/pgilles-whisper-large-lb):

Reference: et kann ee sech nëmme froen wat mat der tastatur geschitt wann eppes méi neits kënnt
Predicted: Et kann ee sech nëmme froen, wat mat der Tastatur geschitt, wann eppes Neits gëtt.
WER: 12.50%

Reference: den ofstuerz huet sech héich uewen a biergegem terrain ereegent an et gëtt ugeholl datt et d'resultat vu feindlechem beschoss war
Predicted: Den Ofstuert huet sech héich uewen a biergergem Terrain eréignert an et gëtt ugeholl, datt et d'Resultat vu feindleche Beschoss war.
WER: 18.18%

Reference: déi dräi kinnekräicher war ee vun den bluddegsten zäitalteren am ale china dausende mënsche si beim kampf ëm den héchste sëtz am grousse palais zu xi'an gestuerwen
Predicted: Déi dräi Kinnekräicher waren ee vun de bluddegsten Zäitalteren an aller China. Dausende Mënsche si beim Kampf ëm den héckschte Sëtz an de grousse Palette z'Uxiane gest




In [13]:
# LuxASR endpoint
LUXASR_API = "https://luxasr.uni.lu/v2/asr?diarization=Disabled&outfmt=text"
LUXASR_TIMEOUT_S = 30  # per-request timeout handed to requests
LUXASR_PAUSE_S = 1     # pause between consecutive requests

refs_preds = []  # (reference, predicted, wer) triples for later inspection
wers = []        # per-sample WER values

for sample in tqdm(prepared_samples):
    with open(sample["path"], "rb") as audio_file:
        files = {"audio_file": ("audio.wav", audio_file, "audio/wav")}
        response = requests.post(LUXASR_API, files=files, timeout=LUXASR_TIMEOUT_S)

    # Fail loudly on HTTP errors instead of parsing (or scoring) an error body
    # as if it were a transcript.
    response.raise_for_status()
    # The body is decoded as JSON and must yield the transcript as a plain
    # string, because compute_wer() calls str methods on it.
    predicted = response.json()
    if not isinstance(predicted, str):
        raise TypeError(
            f"Unexpected LuxASR payload for {sample['path']}: {type(predicted).__name__}"
        )

    # Compute WER
    error = compute_wer(sample["reference"], predicted)
    wers.append(error)
    refs_preds.append((sample["reference"], predicted, error))

    time.sleep(LUXASR_PAUSE_S)

# Average WER
# Guarded so an empty `prepared_samples` reports the problem instead of
# raising ZeroDivisionError.
if wers:
    average_wer = sum(wers) / len(wers)
    print(f"\n Average WER (LuxASR) over {len(wers)} samples: {average_wer:.2%}")
else:
    print("\n No samples were evaluated (LuxASR)")

# Show sample predictions
print("\n Sample predictions (LuxASR):\n")
for ref, pred, err in random.sample(refs_preds, min(5, len(refs_preds))):
    print(f"Reference: {ref}")
    print(f"Predicted: {pred}")
    print(f"WER: {err:.2%}\n")

100%|██████████| 934/934 [35:24<00:00,  2.27s/it]


 Average WER (LuxASR) over 934 samples: 23.48%

 Sample predictions (LuxASR):

Reference: d'zentral autoritéit vun der kierch war zanter iwwer dausend joer zu roum an dës konzentratioun vu muecht a suen huet der vill dozou bruecht dorun ze zweiwelen ob dëse grondsaz erfëllt gouf
Predicted: D'zentral Autoritéit vun der Kierch war zanter iwwer dausend Joer zu Rou, an dës Konzentratioun vu Muecht a Suen huet der vill dozou bruecht, dorun ze zweifelen, ob dësem Grondsätz erfëllt gouf.
WER: 12.12%

Reference: de scotturb bus 403 fiert reegelméisseg op sintra a bleift a cabo da roca stoen
Predicted: De Scott-E-Bus véierhonnertdräi féiert reegelméisseg op Sintra a bleift a Cabuda Roca stoen.
WER: 40.00%

Reference: esou ewéi de mound eng unzéiungskraaft op d'äerd ausüübt a gezäite verursaacht esou üübt d'mëllechstrooss eng kraaft op d'sagittarius-galaxie aus
Predicted: Esou wéi de Mound eng Unzéiungskraaft op d'Äert ausüübt a Gezäite verursacht, esou üübt d'Mëllechstrooss eng Kraaft op der P


