# Benchmark whisper large fine tuned / whisper large fine tuned distilled

In [1]:
import time
import torch
import librosa
import json
import os
import pandas as pd
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from tqdm import tqdm  

# ============================================
# Config device
# ============================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"▶️ Utilisation du device : {device}")

# === Charger TSV du subset ===
tsv_path = "data/subset10K/validated_subset.tsv"
clips_dir = "data/subset10K/clips"
df = pd.read_csv(tsv_path, sep="\t")

# === Utiliser tout le subset ===
subset = df

# === Charger les modèles ===
model_name_non = "./models/whisper-large-v3-french"
processor_non = AutoProcessor.from_pretrained(model_name_non)
model_non = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_non).to(device)
model_non.eval()

model_name_distil = "./models/whisper-large-v3-french-distil-dec16"
processor_distil = AutoProcessor.from_pretrained(model_name_distil)
model_distil = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_distil).to(device)
model_distil.eval()

# === Fichier JSONL pour écrire au fur et à mesure ===
output_dir = "results"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "benchmark_large_only.jsonl")

# ============================================
# Boucle sur tous les fichiers audio avec tqdm
# ============================================
for idx, row in enumerate(tqdm(subset.itertuples(), total=len(subset), desc="Transcription")):
    mp3_file = row.path
    sentence = str(row.sentence)
    audio_path = os.path.join(clips_dir, mp3_file)

    if not os.path.exists(audio_path):
        tqdm.write(f"⚠️ Fichier introuvable : {audio_path}, ignoré.")
        continue

    # Charger audio (mp3 -> PCM 16kHz mono float32)
    audio, sr = librosa.load(audio_path, sr=16000, mono=True)
    duration_s = len(audio) / 16000.0

    file_results = {"audio_file": mp3_file, "duration_s": duration_s, "raw_text": sentence}

    # ====== Large HF ======
    start_global = time.time()
    inputs = processor_non(audio, sampling_rate=16000, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    start_inf = time.time()
    with torch.no_grad():
        generated_ids = model_non.generate(inputs["input_features"])
        transcription = processor_non.batch_decode(generated_ids, skip_special_tokens=True)[0]
    end_inf = time.time()
    end_global = time.time()

    file_results["whisper_large_non_distilled"] = {
        "transcription": transcription,
        "inference_time_s": end_inf - start_inf,
        "elapsed_time_s": end_global - start_global,
        "real_time_factor": (end_inf - start_inf) / duration_s
    }

    # ====== Large-distilled HF ======
    start_global = time.time()
    inputs = processor_distil(audio, sampling_rate=16000, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    start_inf = time.time()
    with torch.no_grad():
        generated_ids = model_distil.generate(inputs["input_features"])
        transcription = processor_distil.batch_decode(generated_ids, skip_special_tokens=True)[0]
    end_inf = time.time()
    end_global = time.time()

    file_results["whisper_large_distilled"] = {
        "transcription": transcription,
        "inference_time_s": end_inf - start_inf,
        "elapsed_time_s": end_global - start_global,
        "real_time_factor": (end_inf - start_inf) / duration_s
    }

    # ====== Écriture immédiate dans JSONL ======
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(file_results, ensure_ascii=False) + "\n")

print(f"✅ Benchmark terminé. Résultats sauvegardés dans {output_file}")


▶️ Utilisation du device : cuda


Transcription:   0%|                                             | 0/10000 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Transcription:  44%|█████████████▏                | 4393/10000 [2:30:23<3:07:55,  2.01s/it]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

