In [None]:
!pip install -q transformers datasets accelerate jiwer librosa evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m111.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_from_disk
import torch
from tqdm import tqdm
import jiwer
import evaluate
import numpy as np
from peft import PeftModel
from jiwer import wer, mer, wil, wip
from transformers import (
    WhisperProcessor,
    WhisperTokenizer,
    WhisperForConditionalGeneration,
    pipeline
)
import torch
import pandas as pd
import librosa
import os
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/Drive; the checkpoint and dataset paths used
# in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/Drive')

Mounted at /content/Drive


In [None]:
# Directory of the fine-tuned checkpoint: the tokenizer and the PEFT adapter are loaded from here.
model_dir = "/content/Drive/MyDrive/vin-capstone/whisper-dementia-final-v19_corrected"
# Preprocessed test split saved on disk (read with datasets.load_from_disk).
data_dir  = "/content/Drive/MyDrive/vin-capstone/data/preprocessed_whisper_large_v3_features_v3_final/test"
# Hub id of the base checkpoint that supplies the processor and the base model weights.
base_model_id = "openai/whisper-large-v3"

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Load the saved test split from disk, then show its row count and column schema.
test_dataset = load_from_disk(data_dir)
print(test_dataset)
print(test_dataset.features)

Dataset({
    features: ['input_features', 'labels', 'input_length', 'audio_path', 'offset_sec', 'end_sec', 'source'],
    num_rows: 7339
})
{'input_features': List(List(Value('float16'))), 'labels': List(Value('int64')), 'input_length': Value('int64'), 'audio_path': Value('string'), 'offset_sec': Value('float64'), 'end_sec': Value('float64'), 'source': Value('string')}


In [None]:
def get_filename(example):
    """Add a 'filename' key holding the basename of example['audio_path'].

    The mapping is updated in place and the same object is returned, which is
    the shape expected by ``Dataset.map``.
    """
    audio_path = example['audio_path']
    example['filename'] = os.path.basename(audio_path)
    return example

# Attach a 'filename' column (basename of 'audio_path') to every row.
test_dataset_with_filename = test_dataset.map(get_filename)

# Deduplicate - keep first occurrence of each file.
# The 'filename' column is read once instead of indexing full rows, so the
# large 'input_features' arrays are not materialised for every example.
seen = set()
indices_to_keep = []
for i, filename in enumerate(test_dataset_with_filename['filename']):
    if filename not in seen:
        seen.add(filename)
        indices_to_keep.append(i)

test_dataset_dedup = test_dataset_with_filename.select(indices_to_keep)

print(f"Original: {len(test_dataset)}")
print(f"Deduplicated: {len(test_dataset_dedup)}")

Map:   0%|          | 0/7339 [00:00<?, ? examples/s]

Original: 7339
Deduplicated: 6441


# Fine-Tuned (General)


In [None]:
FILLER_TOKENS = ["[UH]", "[UM]", "[ER]", "[AH]", "[HM]", "[UNINTELLIGIBLE]"]
# Vocabulary size the fine-tuned checkpoint requires once the filler tokens are present.
EXPECTED_VOCAB_SIZE = 51872

# Tokenizer comes from the fine-tuned checkpoint directory; add_tokens registers
# any filler token that is not already in its vocabulary.
tokenizer = WhisperTokenizer.from_pretrained(model_dir, language="en", task="transcribe")
tokenizer.add_tokens(FILLER_TOKENS, special_tokens=False)
print(f"Tokenizer vocab size: {len(tokenizer)}")  # Must be 51872
if len(tokenizer) != EXPECTED_VOCAB_SIZE:
    # Fail here with a clear message instead of resizing the embeddings to a
    # size that no longer matches the tokenizer.
    raise ValueError(
        f"Tokenizer has {len(tokenizer)} tokens, expected {EXPECTED_VOCAB_SIZE}"
    )

# Feature extractor settings come from the base checkpoint; the tokenizer is
# swapped for the extended one so decoding knows the filler tokens.
processor = WhisperProcessor.from_pretrained(base_model_id, language="en", task="transcribe")
processor.tokenizer = tokenizer

# The embedding matrix is resized before the adapter is attached.
base_model = WhisperForConditionalGeneration.from_pretrained(base_model_id)
base_model.resize_token_embeddings(EXPECTED_VOCAB_SIZE)

model = PeftModel.from_pretrained(base_model, model_dir)
model = model.to(device)
model.eval()

# Pin generation to English transcription.
model.generation_config.language = "en"
model.generation_config.task = "transcribe"

print(f" Embedding size: {model.get_input_embeddings().num_embeddings}")

Tokenizer vocab size: 51872


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


 Embedding size: 51872


In [None]:
# Sanity check: report CUDA availability, the device holding the model's first
# parameter, and the device string chosen earlier.
print("torch.cuda.is_available():", torch.cuda.is_available())
print("model device:", next(model.parameters()).device)
print("device variable:", device)

torch.cuda.is_available(): True
model device: cuda:0
device variable: cuda


In [None]:
BATCH_SIZE = 16
# Baseline results file; used only for the filename cross-check after the loop.
BASELINE_RESULTS_CSV = "test_baseline_results_with_gt.csv"

# Accumulators filled batch by batch, all in dataset order.
pred_texts = []
ref_texts = []
filenames = []

model.generation_config.language = "en"
model.generation_config.task = "transcribe"

for start in tqdm(range(0, len(test_dataset_dedup), BATCH_SIZE)):
    end = min(start + BATCH_SIZE, len(test_dataset_dedup))
    # Slicing a Dataset returns a dict of column name -> list of values.
    batch = test_dataset_dedup[start:end]

    # Track filenames
    filenames.extend(batch["filename"])

    # Features are stored as float16 nested lists; convert to float32 tensors.
    feats_list = batch["input_features"]
    feat_tensors = [torch.tensor(f, dtype=torch.float32) for f in feats_list]
    max_len = max(t.shape[-1] for t in feat_tensors)

    # Right-pad shorter feature matrices with zeros along the last axis so the
    # batch can be stacked into a single tensor.
    padded = []
    for t in feat_tensors:
        pad_len = max_len - t.shape[-1]
        if pad_len > 0:
            pad = torch.zeros(t.shape[0], pad_len, dtype=torch.float32)
            t = torch.cat([t, pad], dim=-1)
        padded.append(t)

    input_features = torch.stack(padded, dim=0).to(device)

    # Deterministic beam search (no sampling); gradients are not needed.
    with torch.no_grad():
        pred_ids = model.generate(
            input_features,
            max_new_tokens=128,
            num_beams=5,
            do_sample=False
        )

    batch_preds = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Decode the stored label ids into reference text. Entries equal to -100
    # are dropped first (presumably the loss-masking pad value - confirm
    # against the preprocessing code).
    labels_batch = batch["labels"]
    batch_refs = []
    for ref_ids in labels_batch:
        if hasattr(ref_ids, "tolist"):
            ref_ids = ref_ids.tolist()
        ref_ids = [i for i in ref_ids if i != -100]
        ref_text = processor.tokenizer.decode(ref_ids, skip_special_tokens=True)
        batch_refs.append(ref_text)

    pred_texts.extend(batch_preds)
    ref_texts.extend(batch_refs)

print(f"Processed {len(pred_texts)} samples")

# Verify filenames match baseline
# `baseline_filenames` is not assigned in any cell visible in this notebook, so
# on a fresh kernel it is built from the baseline CSV's 'filename' column. If it
# is already defined, the existing value is used unchanged.
if globals().get("baseline_filenames") is None and os.path.exists(BASELINE_RESULTS_CSV):
    baseline_filenames = set(pd.read_csv(BASELINE_RESULTS_CSV)["filename"])

if globals().get("baseline_filenames") is None:
    # Skip the check instead of raising NameError after the long inference loop.
    print("Filenames match baseline: skipped (no baseline filenames available)")
else:
    print(f"Filenames match baseline: {set(filenames) == baseline_filenames}")

100%|██████████| 403/403 [1:29:43<00:00, 13.36s/it]

Processed 6441 samples
Filenames match baseline: True





In [None]:
# Text normalization applied before scoring: lowercase, collapse repeated
# spaces, and strip surrounding whitespace. No punctuation step is included.
transform = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
])

In [None]:
# Module-level normalized copies of the references and predictions.
# compute_metrics normalizes its own arguments, so it does not read these.
ref_norm  = [transform(r) for r in ref_texts]
pred_norm = [transform(p) for p in pred_texts]

In [None]:
cer_metric = evaluate.load("cer")

def compute_metrics(predictions, references):
    """Score ASR predictions against reference transcripts.

    Both inputs are normalized with the module-level ``transform`` before any
    metric is computed.

    Args:
        predictions: Sequence of hypothesis strings.
        references: Sequence of reference strings, aligned one-to-one with
            ``predictions``.

    Returns:
        dict with ``wer``, ``mer``, ``wil``, ``wip`` and ``cer`` as fractions,
        a ``*_percentage`` entry for each, ``num_samples``, the count and
        percentage of exact matches after normalization, and the mean word
        count of the normalized predictions and references.

    Raises:
        ValueError: If the inputs are empty or have different lengths.
    """
    # Materialise the inputs so len() and emptiness checks work for any
    # iterable (a bare truth test is ambiguous for arrays and Series).
    predictions = list(predictions)
    references = list(references)
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references "
            f"({len(references)}) must have the same length"
        )
    if not predictions:
        # Error rates are undefined without data; reject up front rather than
        # failing part-way through the metric calls.
        raise ValueError("compute_metrics requires at least one prediction/reference pair")

    # normalize
    pred_norm = [transform(p) for p in predictions]
    ref_norm = [transform(r) for r in references]

    metrics = {}

    # One alignment pass yields all four word-level measures.
    word_output = jiwer.process_words(ref_norm, pred_norm)
    for name in ("wer", "mer", "wil", "wip"):
        value = getattr(word_output, name)
        metrics[name] = value
        metrics[f"{name}_percentage"] = value * 100

    metrics["cer"] = cer_metric.compute(
        predictions=pred_norm,
        references=ref_norm
    )
    metrics["cer_percentage"] = metrics["cer"] * 100

    metrics["num_samples"] = len(predictions)
    # Exact string equality after normalization.
    metrics["perfect_transcriptions"] = sum(
        p == r for p, r in zip(pred_norm, ref_norm)
    )
    metrics["perfect_transcriptions_percentage"] = (
        metrics["perfect_transcriptions"] / len(predictions) * 100
    )

    # Lengths are whitespace-separated word counts.
    metrics["avg_prediction_length"] = float(
        np.mean([len(p.split()) for p in pred_norm])
    )
    metrics["avg_reference_length"] = float(
        np.mean([len(r.split()) for r in ref_norm])
    )

    return metrics

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Score the fine-tuned model's predictions against the references; the bare
# name on the last line displays the resulting dict.
metrics = compute_metrics(pred_texts, ref_texts)
metrics

{'wer': 0.16564971168358725,
 'wer_percentage': 16.564971168358724,
 'mer': 0.16347965575189694,
 'mer_percentage': 16.347965575189694,
 'wil': 0.2617526052210467,
 'wil_percentage': 26.175260522104672,
 'wip': 0.7382473947789533,
 'wip_percentage': 73.82473947789533,
 'cer': 0.057943475002676374,
 'cer_percentage': 5.794347500267637,
 'num_samples': 6441,
 'perfect_transcriptions': 2248,
 'perfect_transcriptions_percentage': 34.90141282409564,
 'avg_prediction_length': 11.712777519018786,
 'avg_reference_length': 12.037106039434871}

# Fine-Tuned (Dementia Only)


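#### Run the cell below before the DementiaBank inference cell: it filters the test set to DementiaBank samples only and defines `test_dataset_db_dedup` (uncomment the code first if it is commented out)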

In [1]:
# Build the DementiaBank-only evaluation set. The inference cell in this
# section reads `test_dataset_db_dedup`, so this cell has to run first.
def is_dementiabank(example):
    """Return True for rows whose 'source' column equals 'dementiabank'."""
    return example['source'] == 'dementiabank'

test_dataset_db = test_dataset.filter(is_dementiabank)

# Add a 'filename' column (basename of 'audio_path'); the dict returned by the
# mapped function is merged into each example.
test_dataset_db = test_dataset_db.map(
    lambda example: {'filename': os.path.basename(example['audio_path'])}
)

# deduplicate - keep the first occurrence of each file.
# Only the 'filename' column is read, so full rows are not loaded per index.
seen = set()
indices_to_keep = []
for i, filename in enumerate(test_dataset_db['filename']):
    if filename not in seen:
        seen.add(filename)
        indices_to_keep.append(i)

test_dataset_db_dedup = test_dataset_db.select(indices_to_keep)

print(f"Original: {len(test_dataset)}")
print(f"DementiaBank only: {len(test_dataset_db)}")
print(f"DementiaBank deduplicated: {len(test_dataset_db_dedup)}")

NameError: name 'test_dataset' is not defined

In [None]:
FILLER_TOKENS = ["[UH]", "[UM]", "[ER]", "[AH]", "[HM]", "[UNINTELLIGIBLE]"]
# Vocabulary size the fine-tuned checkpoint requires once the filler tokens are present.
EXPECTED_VOCAB_SIZE = 51872

# Tokenizer comes from the fine-tuned checkpoint directory; add_tokens registers
# any filler token that is not already in its vocabulary.
tokenizer = WhisperTokenizer.from_pretrained(model_dir, language="en", task="transcribe")
tokenizer.add_tokens(FILLER_TOKENS, special_tokens=False)
print(f"Tokenizer vocab size: {len(tokenizer)}")  # Must be 51872
if len(tokenizer) != EXPECTED_VOCAB_SIZE:
    # Fail here with a clear message instead of resizing the embeddings to a
    # size that no longer matches the tokenizer.
    raise ValueError(
        f"Tokenizer has {len(tokenizer)} tokens, expected {EXPECTED_VOCAB_SIZE}"
    )

# Feature extractor settings come from the base checkpoint; the tokenizer is
# swapped for the extended one so decoding knows the filler tokens.
processor = WhisperProcessor.from_pretrained(base_model_id, language="en", task="transcribe")
processor.tokenizer = tokenizer

# The embedding matrix is resized before the adapter is attached.
base_model = WhisperForConditionalGeneration.from_pretrained(base_model_id)
base_model.resize_token_embeddings(EXPECTED_VOCAB_SIZE)

model = PeftModel.from_pretrained(base_model, model_dir)
model = model.to(device)
model.eval()

# Pin generation to English transcription.
model.generation_config.language = "en"
model.generation_config.task = "transcribe"

print(f" Embedding size: {model.get_input_embeddings().num_embeddings}")

Tokenizer vocab size: 51872
 Embedding size: 51872


In [None]:
# Sanity check: report CUDA availability, the device holding the model's first
# parameter, and the device string chosen earlier.
print("torch.cuda.is_available():", torch.cuda.is_available())
print("model device:", next(model.parameters()).device)
print("device variable:", device)

torch.cuda.is_available(): True
model device: cuda:0
device variable: cuda


In [None]:
BATCH_SIZE = 16

# Per-sample outputs, accumulated in dataset order.
pred_texts, ref_texts, filenames = [], [], []  # Track filenames

# Pin generation to English transcription.
model.generation_config.language = "en"
model.generation_config.task = "transcribe"

for start in tqdm(range(0, len(test_dataset_db_dedup), BATCH_SIZE)):
    end = min(start + BATCH_SIZE, len(test_dataset_db_dedup))
    # Slicing a Dataset yields a dict of column name -> list of values.
    batch = test_dataset_db_dedup[start:end]

    filenames += batch["filename"]

    # Convert the stored features to float32 tensors, then zero-pad each one on
    # the right of its last axis up to the longest item so they can be stacked.
    feat_tensors = [
        torch.tensor(feats, dtype=torch.float32) for feats in batch["input_features"]
    ]
    max_len = max(t.shape[-1] for t in feat_tensors)
    padded = [
        torch.nn.functional.pad(t, (0, max_len - t.shape[-1])) for t in feat_tensors
    ]
    input_features = torch.stack(padded, dim=0).to(device)

    # Beam search without sampling; no gradients are required.
    with torch.no_grad():
        pred_ids = model.generate(
            input_features,
            max_new_tokens=128,
            num_beams=5,
            do_sample=False,
        )

    batch_preds = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Rebuild reference text from the label ids, skipping -100 entries.
    batch_refs = []
    for ref_ids in batch["labels"]:
        ids = ref_ids.tolist() if hasattr(ref_ids, "tolist") else ref_ids
        kept_ids = [tok for tok in ids if tok != -100]
        batch_refs.append(
            processor.tokenizer.decode(kept_ids, skip_special_tokens=True)
        )

    pred_texts += batch_preds
    ref_texts += batch_refs

print(f"Processed {len(pred_texts)} samples")

100%|██████████| 11/11 [04:37<00:00, 25.24s/it]

Processed 167 samples





In [None]:
# Text normalization applied before scoring: lowercase, collapse repeated
# spaces, and strip surrounding whitespace. No punctuation step is included.
transform = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
])

In [None]:
# Module-level normalized copies of the references and predictions.
# compute_metrics normalizes its own arguments, so it does not read these.
ref_norm  = [transform(r) for r in ref_texts]
pred_norm = [transform(p) for p in pred_texts]

In [None]:
cer_metric = evaluate.load("cer")

def compute_metrics(predictions, references):
    """Score ASR predictions against reference transcripts.

    Both inputs are normalized with the module-level ``transform`` before any
    metric is computed.

    Args:
        predictions: Sequence of hypothesis strings.
        references: Sequence of reference strings, aligned one-to-one with
            ``predictions``.

    Returns:
        dict with ``wer``, ``mer``, ``wil``, ``wip`` and ``cer`` as fractions,
        a ``*_percentage`` entry for each, ``num_samples``, the count and
        percentage of exact matches after normalization, and the mean word
        count of the normalized predictions and references.

    Raises:
        ValueError: If the inputs are empty or have different lengths.
    """
    # Materialise the inputs so len() and emptiness checks work for any
    # iterable (a bare truth test is ambiguous for arrays and Series).
    predictions = list(predictions)
    references = list(references)
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references "
            f"({len(references)}) must have the same length"
        )
    if not predictions:
        # Error rates are undefined without data; reject up front rather than
        # failing part-way through the metric calls.
        raise ValueError("compute_metrics requires at least one prediction/reference pair")

    # normalize
    pred_norm = [transform(p) for p in predictions]
    ref_norm = [transform(r) for r in references]

    metrics = {}

    # One alignment pass yields all four word-level measures.
    word_output = jiwer.process_words(ref_norm, pred_norm)
    for name in ("wer", "mer", "wil", "wip"):
        value = getattr(word_output, name)
        metrics[name] = value
        metrics[f"{name}_percentage"] = value * 100

    metrics["cer"] = cer_metric.compute(
        predictions=pred_norm,
        references=ref_norm
    )
    metrics["cer_percentage"] = metrics["cer"] * 100

    metrics["num_samples"] = len(predictions)
    # Exact string equality after normalization.
    metrics["perfect_transcriptions"] = sum(
        p == r for p, r in zip(pred_norm, ref_norm)
    )
    metrics["perfect_transcriptions_percentage"] = (
        metrics["perfect_transcriptions"] / len(predictions) * 100
    )

    # Lengths are whitespace-separated word counts.
    metrics["avg_prediction_length"] = float(
        np.mean([len(p.split()) for p in pred_norm])
    )
    metrics["avg_reference_length"] = float(
        np.mean([len(r.split()) for r in ref_norm])
    )

    return metrics

In [None]:
# Score the predictions produced by the DementiaBank inference loop; the bare
# name on the last line displays the resulting dict.
metrics = compute_metrics(pred_texts, ref_texts)
metrics

{'wer': 0.5077878103837472,
 'wer_percentage': 50.77878103837472,
 'mer': 0.501840490797546,
 'mer_percentage': 50.1840490797546,
 'wil': 0.6595877399977266,
 'wil_percentage': 65.95877399977266,
 'wip': 0.34041226000227337,
 'wip_percentage': 34.04122600022734,
 'cer': 0.25451508255459254,
 'cer_percentage': 25.451508255459252,
 'num_samples': 167,
 'perfect_transcriptions': 0,
 'perfect_transcriptions_percentage': 0.0,
 'avg_prediction_length': 39.59880239520958,
 'avg_reference_length': 53.053892215568865}

# Baseline

In [None]:
# Load the saved baseline results; later cells read its 'ground_truth' and
# 'baseline_whisper_prediction' columns.
test_baseline_results_with_gt = pd.read_csv("test_baseline_results_with_gt.csv")

In [None]:
# Uncomment to run eval on just dementiabank

# db_baseline_only = test_baseline_results_with_gt[~test_baseline_results_with_gt['filename'].str.contains('common_voice')]
# print(f"Original: {len(test_baseline_results_with_gt)}")
# print(f"DementiaBank only: {len(db_baseline_only)}")

Original: 6441
DementiaBank only: 167


In [None]:
# Set to True to score only the DementiaBank rows of the baseline results.
DEMENTIABANK_ONLY = False

baseline_eval_df = test_baseline_results_with_gt
if DEMENTIABANK_ONLY:
    # Drop rows whose filename contains 'common_voice', leaving DementiaBank only.
    baseline_eval_df = baseline_eval_df[
        ~baseline_eval_df['filename'].str.contains('common_voice')
    ]

ref_texts = baseline_eval_df['ground_truth'].tolist()
# A blank prediction cell in the CSV is read back as NaN (a float); map it to
# "" so the text normalization step always receives a string.
pred_texts = baseline_eval_df['baseline_whisper_prediction'].fillna("").tolist()

In [None]:
# Text normalization applied before scoring: lowercase, collapse repeated
# spaces, and strip surrounding whitespace. No punctuation step is included.
transform = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
])

In [None]:
# Module-level normalized copies of the references and predictions.
# compute_metrics normalizes its own arguments, so it does not read these.
ref_norm  = [transform(r) for r in ref_texts]
pred_norm = [transform(p) for p in pred_texts]

In [None]:
cer_metric = evaluate.load("cer")

def compute_metrics(predictions, references):
    """Score ASR predictions against reference transcripts.

    Both inputs are normalized with the module-level ``transform`` before any
    metric is computed.

    Args:
        predictions: Sequence of hypothesis strings.
        references: Sequence of reference strings, aligned one-to-one with
            ``predictions``.

    Returns:
        dict with ``wer``, ``mer``, ``wil``, ``wip`` and ``cer`` as fractions,
        a ``*_percentage`` entry for each, ``num_samples``, the count and
        percentage of exact matches after normalization, and the mean word
        count of the normalized predictions and references.

    Raises:
        ValueError: If the inputs are empty or have different lengths.
    """
    # Materialise the inputs so len() and emptiness checks work for any
    # iterable (a bare truth test is ambiguous for arrays and Series).
    predictions = list(predictions)
    references = list(references)
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references "
            f"({len(references)}) must have the same length"
        )
    if not predictions:
        # Error rates are undefined without data; reject up front rather than
        # failing part-way through the metric calls.
        raise ValueError("compute_metrics requires at least one prediction/reference pair")

    # normalize
    pred_norm = [transform(p) for p in predictions]
    ref_norm = [transform(r) for r in references]

    metrics = {}

    # One alignment pass yields all four word-level measures.
    word_output = jiwer.process_words(ref_norm, pred_norm)
    for name in ("wer", "mer", "wil", "wip"):
        value = getattr(word_output, name)
        metrics[name] = value
        metrics[f"{name}_percentage"] = value * 100

    metrics["cer"] = cer_metric.compute(
        predictions=pred_norm,
        references=ref_norm
    )
    metrics["cer_percentage"] = metrics["cer"] * 100

    metrics["num_samples"] = len(predictions)
    # Exact string equality after normalization.
    metrics["perfect_transcriptions"] = sum(
        p == r for p, r in zip(pred_norm, ref_norm)
    )
    metrics["perfect_transcriptions_percentage"] = (
        metrics["perfect_transcriptions"] / len(predictions) * 100
    )

    # Lengths are whitespace-separated word counts.
    metrics["avg_prediction_length"] = float(
        np.mean([len(p.split()) for p in pred_norm])
    )
    metrics["avg_reference_length"] = float(
        np.mean([len(r.split()) for r in ref_norm])
    )

    return metrics

In [None]:
# Score the baseline predictions loaded from the CSV against their ground
# truth; the bare name on the last line displays the resulting dict.
metrics = compute_metrics(pred_texts, ref_texts)
metrics

{'wer': 0.5286567824739987,
 'wer_percentage': 52.86567824739987,
 'mer': 0.5257482394366197,
 'mer_percentage': 52.574823943661976,
 'wil': 0.6776453829543905,
 'wil_percentage': 67.76453829543905,
 'wip': 0.32235461704560947,
 'wip_percentage': 32.23546170456095,
 'cer': 0.2839757077276205,
 'cer_percentage': 28.397570772762048,
 'num_samples': 167,
 'perfect_transcriptions': 0,
 'perfect_transcriptions_percentage': 0.0,
 'avg_prediction_length': 38.17964071856287,
 'avg_reference_length': 54.119760479041915}