# CPU benchmark: Whisper large vs. large distilled vs. large distilled + CT2, on a 10K-clip Common Voice subset

In [1]:
import time
import torch
import librosa
import json
import os
import pandas as pd
import numpy as np
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from faster_whisper import WhisperModel
from jiwer import wer, cer

# === Locate the 10K Common Voice (fr) subset and load its metadata table ===
tsv_path = (
    "data/common_voice_mozilla_fr/cv-corpus-21.0-2025-03-14/fr/subset10K/"
    "validated_subset.tsv"
)
clips_dir = (
    "data/common_voice_mozilla_fr/cv-corpus-21.0-2025-03-14/fr/subset10K/"
    "clips"
)

# The metadata file is tab-separated; the benchmark loop below reads its
# "path" and "sentence" columns.
df = pd.read_csv(tsv_path, delimiter="\t")

# === Use the whole subset ===
def get_all_commonvoice_audios(df):
    """Return ``df`` as-is so that every row of the subset is benchmarked.

    No sampling or filtering is applied, and no copy is made: the object
    that comes back is the very one that was passed in.
    """
    selected = df  # no sampling, everything is kept
    return selected

# Rows to benchmark, and the per-clip results (filled in by the benchmark
# loop, keyed by clip file name).
subset = get_all_commonvoice_audios(df)
results = {}

########################################
# Load every model exactly once
########################################
def _load_hf_with_int8(model_dir):
    """Load a Hugging Face checkpoint and build its int8 counterpart.

    Args:
        model_dir: directory of the checkpoint to load.

    Returns:
        A ``(processor, model, model_int8)`` tuple. ``model`` has been switched
        to eval mode; ``model_int8`` is the result of dynamically quantizing
        its ``torch.nn.Linear`` layers to ``torch.qint8``.
    """
    processor = AutoProcessor.from_pretrained(model_dir)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_dir)
    model.eval()
    model_int8 = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )
    return processor, model, model_int8


# 1. Non-distilled model
model_name = "./models/whisper-large-v3-french"
processor_non, model_non, model_non_int8 = _load_hf_with_int8(model_name)

# 2. Distilled model
model_name = "./models/whisper-large-v3-french-distil-dec16"
processor_distil, model_distil, model_distil_int8 = _load_hf_with_int8(model_name)

# 3. Distilled model converted to CTranslate2 (int8 on CPU)
ct2_model_path = "./models/whisper-large-v3-french-distil-dec16-ct2/ctranslate2"
model_ct2 = WhisperModel(ct2_model_path, device="cpu", compute_type="int8")

########################################
# Loop over every audio file
########################################
SAMPLE_RATE_HZ = 16000  # sampling rate every clip is resampled to


def _benchmark_hf_model(processor, model, audio, duration_s):
    """Transcribe ``audio`` with a Hugging Face model and time the run.

    Args:
        processor: object used both to turn the waveform into input features
            and to decode the generated token ids.
        model: object whose ``generate`` method produces the token ids.
        audio: mono waveform sampled at ``SAMPLE_RATE_HZ``.
        duration_s: clip duration in seconds (must be > 0); used for the
            real-time factor.

    Returns:
        dict with the transcription, ``inference_time_s`` (generation and
        token decoding), ``elapsed_time_s`` (feature extraction included) and
        ``real_time_factor`` (inference time / clip duration).
    """
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    global_start = time.perf_counter()
    inputs = processor(audio, sampling_rate=SAMPLE_RATE_HZ, return_tensors="pt")
    input_features = inputs["input_features"]

    inf_start = time.perf_counter()
    with torch.no_grad():
        generated_ids = model.generate(input_features)
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    end = time.perf_counter()

    inference_time_s = end - inf_start
    return {
        "transcription": transcription,
        "inference_time_s": inference_time_s,
        "elapsed_time_s": end - global_start,
        "real_time_factor": inference_time_s / duration_s,
    }


def _benchmark_ct2_model(model, audio_path, duration_s):
    """Transcribe the file at ``audio_path`` with faster-whisper and time it.

    Args:
        model: ``WhisperModel`` instance.
        audio_path: path of the clip to transcribe.
        duration_s: clip duration in seconds (must be > 0); used for the
            real-time factor.

    Returns:
        dict with the transcription, the language reported by the model, the
        inference/elapsed time (identical here: both cover the whole call)
        and the real-time factor.
    """
    # NOTE(review): this path is given the file path and uses beam_size=5,
    # whereas the Hugging Face path receives the pre-loaded waveform and calls
    # generate() with its default settings — confirm this is the intended
    # comparison.
    start = time.perf_counter()
    segments, info = model.transcribe(
        audio_path,
        beam_size=5,
        language="fr",
        condition_on_previous_text=False,
    )
    # `segments` is a lazy generator: the transcription is only computed while
    # it is being iterated. It must therefore be exhausted BEFORE the clock is
    # stopped, otherwise the measured time excludes the actual decoding.
    transcription = " ".join([segment.text for segment in segments])
    end = time.perf_counter()

    elapsed_s = end - start
    return {
        "transcription": transcription,
        "language_detected": info.language,
        "inference_time_s": elapsed_s,
        "elapsed_time_s": elapsed_s,
        "real_time_factor": elapsed_s / duration_s,
    }


for idx, row in subset.iterrows():
    mp3_file = row["path"]
    sentence = str(row["sentence"])
    audio_path = os.path.join(clips_dir, mp3_file)

    if not os.path.exists(audio_path):
        print(f"?? Fichier introuvable : {audio_path}, ignoré.")
        continue

    # Load the audio (mp3 -> 16 kHz mono PCM float32)
    audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE_HZ, mono=True)
    duration_s = len(audio) / float(SAMPLE_RATE_HZ)

    # An empty clip has no meaningful real-time factor (division by zero);
    # skip it rather than abort the whole benchmark run.
    if duration_s == 0:
        print(f"Audio vide : {audio_path}, ignoré.")
        continue

    file_results = {"duration_s": duration_s, "raw_text": sentence}

    # ====== Non-distilled model ======
    file_results["whisper_large_non_distilled"] = _benchmark_hf_model(
        processor_non, model_non_int8, audio, duration_s
    )

    # ====== Distilled model ======
    file_results["whisper_large_distilled"] = _benchmark_hf_model(
        processor_distil, model_distil_int8, audio, duration_s
    )

    # ====== Distilled model, CTranslate2 ======
    file_results["whisper_large_distilled_ct2"] = _benchmark_ct2_model(
        model_ct2, audio_path, duration_s
    )

    results[mp3_file] = file_results

########################################
# WER, CER, average times and RTF per model
########################################
def _summarize_model(results, model_key):
    """Aggregate the per-clip results of one model into summary metrics.

    Args:
        results: mapping clip name -> per-clip dict holding ``"raw_text"`` and
            one sub-dict per model key.
        model_key: key of the model to summarize.

    Returns:
        dict of metrics; every value is ``None`` when ``results`` is empty.
        ``"WER"``/``"CER"`` are means of the per-clip rates (unchanged from
        the historical report). ``"corpus_WER"``/``"corpus_CER"`` are computed
        over all clips at once, so each clip weighs in proportion to its
        reference length instead of short sentences being over-weighted.
    """
    refs, hyps = [], []
    wers, cers, times, elapsed_times, rtfs = [], [], [], [], []
    for res in results.values():
        model_res = res[model_key]
        # Only case and surrounding whitespace are normalized; punctuation
        # differences still count as errors.
        ref = str(res["raw_text"]).lower().strip()
        hyp = str(model_res["transcription"]).lower().strip()
        refs.append(ref)
        hyps.append(hyp)
        wers.append(wer(ref, hyp))
        cers.append(cer(ref, hyp))
        times.append(model_res["inference_time_s"])
        elapsed_times.append(model_res["elapsed_time_s"])
        rtfs.append(model_res["real_time_factor"])

    return {
        "WER": float(np.mean(wers)) if wers else None,
        "CER": float(np.mean(cers)) if cers else None,
        "corpus_WER": float(wer(refs, hyps)) if refs else None,
        "corpus_CER": float(cer(refs, hyps)) if refs else None,
        "avg_inference_time_s": float(np.mean(times)) if times else None,
        "avg_elapsed_time_s": float(np.mean(elapsed_times)) if elapsed_times else None,
        "sum_elapsed_time_s": float(np.sum(elapsed_times)) if elapsed_times else None,
        "avg_real_time_factor": float(np.mean(rtfs)) if rtfs else None,
    }


report = {}
for model_key in ["whisper_large_non_distilled", "whisper_large_distilled", "whisper_large_distilled_ct2"]:
    report[model_key] = _summarize_model(results, model_key)

# Persist the per-clip results together with the aggregated report
final_output = {"results": results, "report": report}
output_path = "benchmark_results_subset.json"
with open(output_path, "w", encoding="utf-8") as json_file:
    # ensure_ascii=False keeps accented characters as-is in the file
    json.dump(final_output, json_file, ensure_ascii=False, indent=4)

print("? Benchmark terminé sur tout le subset. Résultats sauvegardés dans benchmark_results_subset.json")


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_non_int8 = torch.quantization.quantize_dynamic(
For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantizatio

? Benchmark terminé sur tout le subset. Résultats sauvegardés dans benchmark_results_subset.json
