# Evaluate Fine-tuned Whisper-LoRA on TORGO Test Set

Evaluates the fine-tuned Whisper Small + LoRA model on the TORGO processed test set.

Produces the same table format as the baseline evaluation notebook:
- Overall WER / CER
- Per-group breakdown (dysarthric vs healthy)
- Error type counts (substitutions, deletions, insertions)
- Side-by-side comparison with baseline Whisper Small

In [1]:
import os
import json
import torch
import numpy as np
from collections import defaultdict
from pathlib import Path

from jiwer import wer, cer, process_words
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoProcessor
from peft import PeftModel
from datasets import Dataset, DatasetDict, Audio

# Walk upward from the current working directory until a directory that
# contains an "asr" folder is found; that directory is used as the project
# root so the relative paths in the rest of the notebook resolve.
_start_dir = Path(os.getcwd()).resolve()
PROJECT_ROOT = _start_dir
while PROJECT_ROOT != PROJECT_ROOT.parent and not (PROJECT_ROOT / "asr").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent

# The loop also terminates at the filesystem root when no "asr" folder exists
# anywhere above the start directory. Fail here with a clear message instead
# of silently changing the working directory to the filesystem root.
if not (PROJECT_ROOT / "asr").exists():
    raise FileNotFoundError(
        f"Could not locate a project root containing an 'asr' directory above {_start_dir}"
    )
os.chdir(PROJECT_ROOT)

# Device preference order: Apple MPS first, then CUDA, then CPU.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
print(f"Project root: {PROJECT_ROOT}")

  from .autonotebook import tqdm as notebook_tqdm


Device: mps
Project root: /Users/sidharthbildikar/Desktop/code/ASR Project


## Configuration

In [2]:
# Model artefacts: base checkpoint identifier and the LoRA adapter directory.
BASE_MODEL = "openai/whisper-small"
ADAPTER_PATH = "asr/whisper-lora-final"

# Processed TORGO data; SAMPLING_RATE is the rate the audio column is cast to
# when the test dataset is built.
PROCESSED_DIR = "audio/torgo/processed"
METADATA_PATH = "audio/torgo/processed/metadata.json"
SAMPLING_RATE = 16000

# Baseline reports are optional: they are only read later if this file exists.
BASELINE_RESULTS_PATH = "asr/baseline_results.json"

# Both the adapter and the metadata are required, so stop here if either is missing.
adapter_present = os.path.exists(ADAPTER_PATH)
metadata_present = os.path.exists(METADATA_PATH)
assert adapter_present, f"Adapter not found at {ADAPTER_PATH}"
assert metadata_present, f"Metadata not found at {METADATA_PATH}"
print("Paths verified.")

Paths verified.


## Load Test Dataset

Loads the processed test set from metadata.json with speech status information for per-group evaluation.

In [3]:
import re

# Read the split metadata. JSON is read explicitly as UTF-8 so the result does
# not depend on the platform's default locale encoding.
with open(METADATA_PATH, encoding="utf-8") as f:
    metadata = json.load(f)

# The "test" entry maps each wav filename to a record holding the reference
# transcription and, optionally, the speaker's speech status.
test_meta = metadata["test"]

audio_paths, transcriptions, speech_statuses = [], [], []
skipped_files = []  # entries dropped because the audio or the transcription is missing
for filename, meta in test_meta.items():
    wav_path = os.path.join(PROCESSED_DIR, "test", filename)
    transcription = meta.get("transcription")
    if not (os.path.exists(wav_path) and transcription):
        skipped_files.append(filename)
        continue
    audio_paths.append(wav_path)
    transcriptions.append(transcription)
    speech_statuses.append(meta.get("speech_status", "unknown"))

test_dataset = Dataset.from_dict({
    "audio": audio_paths,
    "transcription": transcriptions,
    "speech_status": speech_statuses,
})
# Casting the path column to Audio makes each row expose the waveform and its
# sampling rate, which the evaluation loop reads.
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))

print(f"Test samples: {len(test_dataset)}")
if skipped_files:
    # Surface dropped items so an incomplete test set is not evaluated unnoticed.
    print(
        f"  WARNING: skipped {len(skipped_files)} metadata entries "
        "(missing audio file or empty transcription)"
    )
status_counts = defaultdict(int)
for s in speech_statuses:
    status_counts[s] += 1
for status, count in sorted(status_counts.items()):
    print(f"  {status}: {count}")

Test samples: 1655
  dysarthria: 558
  healthy: 1097


## Load Fine-tuned Model

Loads the base Whisper Small model, applies the LoRA adapter weights on top, and merges them into the base model for inference.

In [4]:
# The processor is loaded from the base checkpoint and configured for English
# transcription.
processor = AutoProcessor.from_pretrained(BASE_MODEL, language="en", task="transcribe")

# Wrap the base model with the LoRA adapter, then merge the adapter and unload
# the PEFT wrapper before moving the result to the target device in eval mode.
base_model = WhisperForConditionalGeneration.from_pretrained(BASE_MODEL)
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).merge_and_unload()
model = model.to(device).eval()

# Reset the forced decoder ids and the suppressed-token list on the
# generation config used by `generate`.
generation_config = model.generation_config
generation_config.forced_decoder_ids = None
generation_config.suppress_tokens = []

print(f"Model loaded and merged on {device}.")

Loading weights: 100%|██████████| 479/479 [00:00<00:00, 938.13it/s, Materializing param=model.encoder.layers.11.self_attn_layer_norm.weight]   


Model loaded and merged on mps.


## Evaluation Functions

In [5]:
def transcribe_audio(model, processor, audio_array, sr, device, max_new_tokens=100):
    """Transcribe a single audio clip and return normalised text.

    Args:
        model: Object exposing ``generate(input_features, max_new_tokens=...)``.
        processor: Callable that turns raw audio into a batch with an
            ``input_features`` tensor, and that provides ``batch_decode``.
        audio_array: Raw audio samples for one clip.
        sr: Sampling rate of ``audio_array``, forwarded to the processor.
        device: Device the input features are moved to before generation.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``. Defaults to 100, the value previously hard-coded.

    Returns:
        The decoded text of the first generated sequence, stripped of
        surrounding whitespace and lower-cased.
    """
    input_features = processor(
        audio_array, sampling_rate=sr, return_tensors="pt"
    ).input_features.to(device)
    # Inference only: gradients are not needed.
    with torch.no_grad():
        predicted_ids = model.generate(input_features, max_new_tokens=max_new_tokens)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription.strip().lower()

def evaluate_on_test(model, processor, dataset, device, model_name="whisper-lora",
                     model_id="whisper-small + LoRA"):
    """Transcribe every sample in ``dataset`` and build a WER/CER report.

    Samples are grouped by their ``speech_status`` value ("unknown" when the
    field is absent). Samples whose reference is empty after stripping are
    skipped. References are stripped and lower-cased before scoring, and
    hypotheses come from ``transcribe_audio``, which normalises the same way.

    Args:
        model: Model passed through to ``transcribe_audio``.
        processor: Processor passed through to ``transcribe_audio``.
        dataset: Iterable of dict-like samples with ``"transcription"`` and
            ``"audio"`` (with ``"array"`` and ``"sampling_rate"``) entries.
        device: Device passed through to ``transcribe_audio``.
        model_name: Value stored under ``"model"`` in the report.
        model_id: Value stored under ``"model_id"`` in the report. The default
            keeps the label that was previously hard-coded.

    Returns:
        A dict with ``"model"``, ``"model_id"`` and ``"groups"`` (per-group
        wer, cer, substitutions, deletions, insertions, num_samples). When at
        least one sample was scored it also contains ``"overall_wer"``,
        ``"overall_cer"`` and ``"total_samples"``.
    """
    group_results = defaultdict(lambda: {"refs": [], "hyps": []})
    total = len(dataset)

    for i, sample in enumerate(dataset):
        reference = sample["transcription"].strip().lower()
        if not reference:
            continue

        status = sample.get("speech_status", "unknown")
        audio = sample["audio"]

        hypothesis = transcribe_audio(
            model, processor, audio["array"], audio["sampling_rate"], device
        )

        group_results[status]["refs"].append(reference)
        group_results[status]["hyps"].append(hypothesis)

        if (i + 1) % 100 == 0:
            print(f"  Processed {i + 1}/{total} samples...")

    report = {"model": model_name, "model_id": model_id, "groups": {}}
    all_refs, all_hyps = [], []

    for status, data in group_results.items():
        refs, hyps = data["refs"], data["hyps"]
        all_refs.extend(refs)
        all_hyps.extend(hyps)

        # One word-level alignment provides both the WER and the error counts,
        # so the group is not aligned a second time just to get the WER.
        word_output = process_words(refs, hyps)

        report["groups"][status] = {
            "wer": word_output.wer,
            "cer": cer(refs, hyps),
            "substitutions": word_output.substitutions,
            "deletions": word_output.deletions,
            "insertions": word_output.insertions,
            "num_samples": len(refs),
        }

    if all_refs:
        report["overall_wer"] = wer(all_refs, all_hyps)
        report["overall_cer"] = cer(all_refs, all_hyps)
        report["total_samples"] = len(all_refs)
        print(f"  Done. Overall WER: {report['overall_wer']*100:.1f}%")
    else:
        # Do not report a perfect 0.0% WER when nothing was actually scored.
        print("  Done. No samples were evaluated; overall metrics are unavailable.")
    return report

## Run Evaluation

In [6]:
# Score the merged model on the whole test set; the resulting report is
# reused by the comparison, saving and inspection cells below.
print("Evaluating fine-tuned Whisper-LoRA on test set...")
finetuned_report = evaluate_on_test(
    model=model,
    processor=processor,
    dataset=test_dataset,
    device=device,
    model_name="whisper-lora",
)

Evaluating fine-tuned Whisper-LoRA on test set...


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformer

  Processed 100/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 200/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 300/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 400/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 500/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 600/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 700/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 800/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 900/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 1000/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 1100/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 1200/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 1300/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 1400/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 1500/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Processed 1600/1655 samples...


Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=448) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Done. Overall WER: 2.4%


## Results: Baseline vs Fine-tuned

Compares the fine-tuned LoRA model against the baseline Whisper models (tiny, base, small) using the same table format.

In [None]:
def _dysarthric_group(report):
    """Return the report's dysarthric group entry under either spelling, or {}."""
    groups = report.get("groups", {})
    for key in ("dysarthric", "dysarthria"):
        entry = groups.get(key)
        if entry:
            return entry
    return {}


def print_comparison(reports):
    """Print a comparison table for a list of evaluation reports.

    Each report is a dict with a ``"model"`` name, optional ``"overall_wer"``
    and ``"overall_cer"`` values and a ``"groups"`` mapping. The dysarthric
    group is looked up as "dysarthric" or "dysarthria" for every report, so
    reports using either label are handled the same way.

    Three sections are printed:
      1. One summary row per report.
      2. The improvement of the "whisper-lora" report over the "small"
         report, when both are present.
      3. Per-group details and the dysarthric/healthy WER gap for the
         "whisper-lora" report, when it is present.

    Args:
        reports: List of report dicts.

    Returns:
        None. Output is written with ``print``.
    """
    print("\n" + "=" * 78)
    print("TORGO TEST SET EVALUATION RESULTS")
    print("=" * 78)

    print(f"\n{'Model':>15} {'Overall WER':>12} {'Overall CER':>12} {'Dysarthric':>12} {'Healthy':>12}")
    print("-" * 66)
    for r in reports:
        dys_wer = _dysarthric_group(r).get("wer", 0)
        healthy_wer = r.get("groups", {}).get("healthy", {}).get("wer", 0)
        print(
            f"{r['model']:>15} "
            f"{r.get('overall_wer', 0)*100:>11.1f}% "
            f"{r.get('overall_cer', 0)*100:>11.1f}% "
            f"{dys_wer*100:>11.1f}% "
            f"{healthy_wer*100:>11.1f}%"
        )

    # Highlight fine-tuned vs baseline small
    ft_report = next((r for r in reports if r["model"] == "whisper-lora"), None)
    bl_report = next((r for r in reports if r["model"] == "small"), None)
    if ft_report is not None and bl_report is not None:
        print(f"\n{'─' * 66}")
        # Reports without overall metrics (nothing evaluated) are skipped
        # rather than raising KeyError.
        if "overall_wer" in ft_report and "overall_wer" in bl_report:
            improvement = (bl_report["overall_wer"] - ft_report["overall_wer"]) * 100
            print(f"  Improvement over baseline Whisper Small: {improvement:+.1f}% WER")

        ft_dys = _dysarthric_group(ft_report).get("wer")
        bl_dys = _dysarthric_group(bl_report).get("wer")
        # Compare against None, not truthiness: a WER of exactly 0.0 is a
        # valid result and must still be reported.
        if ft_dys is not None and bl_dys is not None:
            dys_improvement = (bl_dys - ft_dys) * 100
            print(f"  Improvement on dysarthric speech:       {dys_improvement:+.1f}% WER")

    # Per-group details for fine-tuned model
    if ft_report is None:
        # Nothing more to show when only baseline reports were supplied.
        return

    ft_groups = ft_report.get("groups", {})
    print(f"\n{'Per-Group Details (whisper-lora)':=^78}")
    print(f"  {'Group':<14} {'WER':>8} {'CER':>8} {'Samples':>8} {'Sub':>6} {'Del':>6} {'Ins':>6}")
    print("  " + "-" * 56)
    for group, data in sorted(ft_groups.items()):
        print(
            f"  {group:<14} "
            f"{data['wer']*100:>7.1f}% "
            f"{data['cer']*100:>7.1f}% "
            f"{data['num_samples']:>8} "
            f"{data['substitutions']:>6} "
            f"{data['deletions']:>6} "
            f"{data['insertions']:>6}"
        )

    dys = _dysarthric_group(ft_report)
    healthy = ft_groups.get("healthy", {})
    if dys and healthy:
        gap = (dys.get("wer", 0) - healthy.get("wer", 0)) * 100
        print(f"\n  WER gap (dysarthric - healthy): {gap:.1f}%")

# Collect the reports to compare: stored baseline reports first (when the
# file exists), followed by the fine-tuned report produced above.
reports = []
if not os.path.exists(BASELINE_RESULTS_PATH):
    print("No baseline results found — showing fine-tuned results only.")
else:
    with open(BASELINE_RESULTS_PATH) as baseline_file:
        reports = json.load(baseline_file)
    print(f"Loaded {len(reports)} baseline results.")

reports.append(finetuned_report)
print_comparison(reports)

In [None]:
# Persist the fine-tuned report as indented JSON.
output_path = "asr/finetuned_evaluation_results.json"
with Path(output_path).open("w") as results_file:
    json.dump(finetuned_report, results_file, indent=2)
print(f"Results saved to {output_path}")

## Sample Predictions

Shows a few example transcriptions to qualitatively inspect model output.

In [None]:
# Qualitative check: re-transcribe the first few test samples and print each
# reference next to the model's prediction, with a marker for exact matches.
print("Sample predictions (first 10 test samples):\n")
print(f"{'#':>3}  {'Status':<12} {'Reference':<35} {'Prediction':<35}")
print("-" * 88)

preview_count = min(10, len(test_dataset))
for idx in range(preview_count):
    sample = test_dataset[idx]
    audio = sample["audio"]
    status = sample.get("speech_status", "?")
    ref = sample["transcription"].strip().lower()

    pred = transcribe_audio(model, processor, audio["array"], audio["sampling_rate"], device)

    # Texts longer than the 35-character column are cut to 33 characters plus "..".
    if len(ref) > 35:
        ref_display = ref[:33] + ".."
    else:
        ref_display = ref
    if len(pred) > 35:
        pred_display = pred[:33] + ".."
    else:
        pred_display = pred

    match = "✗" if ref != pred else "✓"

    print(f"{idx+1:>3}  {status:<12} {ref_display:<35} {pred_display:<35} {match}")