# Benchmark whisper large fined tuned french distilled + CT2

In [1]:
import os
import json
import pandas as pd
import time
from faster_whisper import WhisperModel
from jiwer import wer, cer
import librosa
import numpy as np
from tqdm import tqdm 

# ============================================
# Config GPU
# ============================================
device = "cuda"  # forcer GPU
print(f"▶️ Utilisation du device : {device}")

# === Charger TSV du subset ===
tsv_path = "data/subset10K/validated_subset.tsv"
clips_dir = "data/subset10K/clips"

df = pd.read_csv(tsv_path, sep="\t")
subset = df  # tout le subset

# === Charger modèle Faster-Whisper sur GPU (float16) ===
ct2_model_path = "./models/whisper-large-v3-french-distil-dec16-ct2/ctranslate2"
model_ct2 = WhisperModel(
    ct2_model_path,
    device=device,
    compute_type="float16"
)

# === Fichier JSONL de sortie ===
output_jsonl = "benchmark_results_ct2_gpu.jsonl"
# Vider le fichier si déjà existant
open(output_jsonl, "w").close()

# ===========================================
# Boucle sur tous les fichiers audio avec tqdm
# ===========================================
for idx, row in tqdm(subset.iterrows(), total=len(subset), desc="Transcription audio"):
    mp3_file = row["path"]
    sentence = str(row["sentence"])
    audio_path = os.path.join(clips_dir, mp3_file)

    if not os.path.exists(audio_path):
        print(f"⚠️ Fichier introuvable : {audio_path}, ignoré.")
        continue

    # Transcription Faster-Whisper
    start_global = time.time()
    start_inf = time.time()
    segments, info = model_ct2.transcribe(
        audio_path,
        beam_size=5,
        language="fr",
        condition_on_previous_text=False
    )
    end_inf = time.time()
    end_global = time.time()

    transcription = " ".join([s.text for s in segments])
    duration_s = librosa.get_duration(filename=audio_path)

    file_results = {
        "mp3_file": mp3_file,
        "duration_s": duration_s,
        "raw_text": sentence,
        "whisper_ct2_gpu": {
            "transcription": transcription,
            "language_detected": info.language,
            "inference_time_s": end_inf - start_inf,
            "elapsed_time_s": end_global - start_global,
            "real_time_factor": (end_inf - start_inf) / duration_s
        }
    }

    # Écriture immédiate dans le JSONL
    with open(output_jsonl, "a", encoding="utf-8") as f:
        f.write(json.dumps(file_results, ensure_ascii=False) + "\n")

# ===========================================
# Calcul du WER, CER sur le fichier JSONL
# ===========================================
results = []
with open(output_jsonl, "r", encoding="utf-8") as f:
    for line in f:
        results.append(json.loads(line))

wers, cers, times, elapsed_times, rtfs = [], [], [], [], []
for res in results:
    ref = str(res["raw_text"]).lower().strip()
    hyp = str(res["whisper_ct2_gpu"]["transcription"]).lower().strip()
    wers.append(wer(ref, hyp))
    cers.append(cer(ref, hyp))
    times.append(res["whisper_ct2_gpu"]["inference_time_s"])
    elapsed_times.append(res["whisper_ct2_gpu"]["elapsed_time_s"])
    rtfs.append(res["whisper_ct2_gpu"]["real_time_factor"])

report = {
    "WER": float(np.mean(wers)) if wers else None,
    "CER": float(np.mean(cers)) if cers else None,
    "avg_inference_time_s": float(np.mean(times)) if times else None,
    "avg_elapsed_time_s": float(np.mean(elapsed_times)) if elapsed_times else None,
    "sum_elapsed_time_s": float(np.sum(elapsed_times)) if elapsed_times else None,
    "avg_real_time_factor": float(np.mean(rtfs)) if rtfs else None
}

# Sauvegarde du rapport global
report_file = "benchmark_report_ct2_gpu.json"
with open(report_file, "w", encoding="utf-8") as f:
    json.dump(report, f, ensure_ascii=False, indent=4)

print(f"✅ Benchmark terminé. Résultats sauvegardés dans {output_jsonl} et rapport dans {report_file}")


▶️ Utilisation du device : cuda


	This alias will be removed in version 1.0.
  duration_s = librosa.get_duration(filename=audio_path)
Transcription audio:  57%|█████████████▋          | 5705/10000 [1:37:25<1:17:43,  1.09s/it]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

