# FasterWhisper benchmark

In [2]:
import json
import random
import time
from pathlib import Path
from faster_whisper import WhisperModel
from tqdm import tqdm
import soundfile as sf  # pour calculer la durée audio

def get_audio_duration(path: Path) -> float:
    """Retourne la durée d'un fichier audio en secondes."""
    try:
        f = sf.SoundFile(str(path))
        return len(f) / f.samplerate
    except Exception:
        return None

def benchmark_models(
    input_json: str,
    audio_root: str = "data",
    models: list = ["small", "medium", "large"],
    sample_size: int = 1000,
    device: str = "cuda",
    compute_type: str = "float16",
    beam_size: int = 1,
    language: str = "fr"
):
    """
    Benchmark vitesse d'inférence pour plusieurs modèles Whisper.
    -> Séquentiel
    -> Échantillon aléatoire de N fichiers
    -> Ajoute 'inference_time' et 'audio_duration' dans le JSONL
    """

    # Charger tout le dataset
    with open(input_json, "r", encoding="utf-8") as fin:
        data = json.load(fin)

    # Tirer un échantillon aléatoire
    sampled_data = random.sample(data, min(sample_size, len(data)))
    print(f"🎯 Benchmark sur {len(sampled_data)} fichiers audio.")

    for model_size in models:
        print(f"\n🚀 Benchmark du modèle {model_size}...")

        # Charger le modèle sur GPU
        model = WhisperModel(model_size, device=device, compute_type=compute_type)

        output_jsonl = input_json.replace(".json", f"_{model_size}_speed_benchmark.jsonl")

        total_start = time.time()

        with open(output_jsonl, "w", encoding="utf-8") as fout:
            for item in tqdm(sampled_data, desc=f"Transcription {model_size}"):
                audio_path = Path(audio_root) / item["audio_file"]

                audio_duration = get_audio_duration(audio_path)
                item["audio_duration"] = audio_duration

                if not audio_path.exists():
                    item[f"model_prediction_{model_size}"] = None
                    item[f"inference_time_{model_size}"] = None
                else:
                    try:
                        start_time = time.time()
                        segments, _ = model.transcribe(
                            str(audio_path),
                            language=language,
                            beam_size=beam_size
                        )
                        inference_time = time.time() - start_time

                        item[f"model_prediction_{model_size}"] = " ".join([seg.text for seg in segments])
                        item[f"inference_time_{model_size}"] = inference_time
                    except Exception as e:
                        item[f"model_prediction_{model_size}"] = f"Erreur: {str(e)}"
                        item[f"inference_time_{model_size}"] = None

                fout.write(json.dumps(item, ensure_ascii=False) + "\n")

        elapsed = time.time() - total_start
        print(f"✅ Modèle {model_size} terminé en {elapsed/60:.2f} minutes")
        print(f"📂 Résultats sauvegardés dans {output_jsonl}")


In [3]:
benchmark_models(
    input_json="data/voxpopuli_fr_train/train_metadata_full.json",
    audio_root="data"
)


🎯 Benchmark sur 100 fichiers audio.

🚀 Benchmark du modèle small...


Transcription small: 100%|██████████████████████████████| 100/100 [00:58<00:00,  1.70it/s]


✅ Modèle small terminé en 0.98 minutes
📂 Résultats sauvegardés dans data/voxpopuli_fr_train/train_metadata_full_small_speed_benchmark.jsonl

🚀 Benchmark du modèle medium...


Transcription medium: 100%|█████████████████████████████| 100/100 [01:31<00:00,  1.09it/s]


✅ Modèle medium terminé en 1.52 minutes
📂 Résultats sauvegardés dans data/voxpopuli_fr_train/train_metadata_full_medium_speed_benchmark.jsonl

🚀 Benchmark du modèle large...


Transcription large: 100%|██████████████████████████████| 100/100 [02:15<00:00,  1.36s/it]

✅ Modèle large terminé en 2.26 minutes
📂 Résultats sauvegardés dans data/voxpopuli_fr_train/train_metadata_full_large_speed_benchmark.jsonl





In [1]:
import json
import random
import time
from pathlib import Path
from faster_whisper import WhisperModel
from tqdm import tqdm
import soundfile as sf  # pour calculer la durée audio

def get_audio_duration(path: Path) -> float:
    """Retourne la durée d'un fichier audio en secondes."""
    try:
        f = sf.SoundFile(str(path))
        return len(f) / f.samplerate
    except Exception:
        return None

def benchmark_models(
    input_json: str,
    audio_root: str = "data",
    models: list = ["small", "medium", "large"],
    sample_size: int = 1000,
    device: str = "cuda",
    compute_type: str = "float16",
    beam_size: int = 1,
    language: str = "fr"
):
    """
    Benchmark vitesse d'inférence pour plusieurs modèles Whisper.
    -> Séquentiel
    -> Échantillon aléatoire de N fichiers
    -> Ajoute 'inference_time' et 'audio_duration' dans le JSONL
    -> Sauvegarde un résumé global des temps par modèle
    """

    # Charger tout le dataset
    with open(input_json, "r", encoding="utf-8") as fin:
        data = json.load(fin)

    # Tirer un échantillon aléatoire
    sampled_data = random.sample(data, min(sample_size, len(data)))
    print(f"🎯 Benchmark sur {len(sampled_data)} fichiers audio.")

    summary = {}

    for model_size in models:
        print(f"\n🚀 Benchmark du modèle {model_size}...")

        # Charger le modèle sur GPU
        model = WhisperModel(model_size, device=device, compute_type=compute_type)

        output_jsonl = input_json.replace(".json", f"_{model_size}_speed_benchmark.jsonl")

        total_start = time.time()
        total_inference_time = 0.0
        total_audio_duration = 0.0

        with open(output_jsonl, "w", encoding="utf-8") as fout:
            for item in tqdm(sampled_data, desc=f"Transcription {model_size}"):
                audio_path = Path(audio_root) / item["audio_file"]

                audio_duration = get_audio_duration(audio_path)
                item["audio_duration"] = audio_duration
                if audio_duration:
                    total_audio_duration += audio_duration

                if not audio_path.exists():
                    item[f"model_prediction_{model_size}"] = None
                    item[f"inference_time_{model_size}"] = None
                else:
                    try:
                        start_time = time.time()
                        segments, _ = model.transcribe(
                            str(audio_path),
                            language=language,
                            beam_size=beam_size
                        )
                        inference_time = time.time() - start_time
                        total_inference_time += inference_time

                        item[f"model_prediction_{model_size}"] = " ".join([seg.text for seg in segments])
                        item[f"inference_time_{model_size}"] = inference_time
                    except Exception as e:
                        item[f"model_prediction_{model_size}"] = f"Erreur: {str(e)}"
                        item[f"inference_time_{model_size}"] = None

                fout.write(json.dumps(item, ensure_ascii=False) + "\n")

        elapsed = time.time() - total_start
        print(f"✅ Modèle {model_size} terminé en {elapsed/60:.2f} minutes")
        print(f"📂 Résultats sauvegardés dans {output_jsonl}")

        # Stocker résumé
        summary[model_size] = {
            "total_wall_time_sec": elapsed,
            "total_inference_time_sec": total_inference_time,
            "total_audio_duration_sec": total_audio_duration,
            "rtf": total_inference_time / total_audio_duration if total_audio_duration > 0 else None  # Real-Time Factor
        }

    # Sauvegarder le résumé global
    summary_file = input_json.replace(".json", "_benchmark_summary.json")
    with open(summary_file, "w", encoding="utf-8") as fsum:
        json.dump(summary, fsum, indent=2, ensure_ascii=False)

    print(f"\n📊 Résumé global sauvegardé dans {summary_file}")
    return summary


In [2]:
benchmark_models(
    input_json="data/voxpopuli_fr_train/train_metadata_full.json",
    audio_root="data"
)


🎯 Benchmark sur 1000 fichiers audio.

🚀 Benchmark du modèle small...


Transcription small: 100%|████████████████████████████| 1000/1000 [10:04<00:00,  1.66it/s]


✅ Modèle small terminé en 10.07 minutes
📂 Résultats sauvegardés dans data/voxpopuli_fr_train/train_metadata_full_small_speed_benchmark.jsonl

🚀 Benchmark du modèle medium...


Transcription medium: 100%|███████████████████████████| 1000/1000 [15:41<00:00,  1.06it/s]


✅ Modèle medium terminé en 15.70 minutes
📂 Résultats sauvegardés dans data/voxpopuli_fr_train/train_metadata_full_medium_speed_benchmark.jsonl

🚀 Benchmark du modèle large...


Transcription large: 100%|████████████████████████████| 1000/1000 [23:19<00:00,  1.40s/it]


✅ Modèle large terminé en 23.32 minutes
📂 Résultats sauvegardés dans data/voxpopuli_fr_train/train_metadata_full_large_speed_benchmark.jsonl

📊 Résumé global sauvegardé dans data/voxpopuli_fr_train/train_metadata_full_benchmark_summary.json


{'small': {'total_wall_time_sec': 604.2079555988312,
  'total_inference_time_sec': 59.06663155555725,
  'total_audio_duration_sec': 10258.871874999992,
  'rtf': 0.005757614704156474},
 'medium': {'total_wall_time_sec': 941.7749347686768,
  'total_inference_time_sec': 55.47819924354553,
  'total_audio_duration_sec': 10258.871874999992,
  'rtf': 0.005407826505635697},
 'large': {'total_wall_time_sec': 1399.4844186306,
  'total_inference_time_sec': 56.45949172973633,
  'total_audio_duration_sec': 10258.871874999992,
  'rtf': 0.005503479565557629}}

# WHisper Medusa 

https://github.com/aiola-lab/whisper-medusa

## Télécharge Miniconda (recommandé pour serveur)
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
echo "export PATH=$HOME/miniconda/bin:$PATH" >> ~/.bashrc
source ~/.bashrc

conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r



conda create -n whisper-medusa python=3.11 -y
conda activate whisper-medusa
pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118


git clone https://github.com/aiola-lab/whisper-medusa.git
cd whisper-medusa
pip install -e .


# The model is optimized for English audio with sampling rate of 16kHz.

In [None]:
import torch
import torchaudio
import json
import random
from pathlib import Path
from tqdm import tqdm
from whisper_medusa import WhisperMedusaModel
from transformers import WhisperProcessor

def transcribe_medusa_json_sample(
    input_json: str,
    output_jsonl: str,
    audio_root: str = "data",
    sample_size: int = 10,
    model_name: str = "aiola/whisper-medusa-linear-libri",
    device: str = "cuda",
    language: str = "fr",
    regulation_factor: float = 1.01,
    regulation_start: int = 140
):
    """
    Transcrit un petit échantillon du dataset JSON avec Whisper-Medusa.
    - sample_size définit le nombre de fichiers à traiter.
    - Ajoute inference_time et audio_duration dans le JSONL.
    """
    # Charger le dataset complet
    with open(input_json, "r", encoding="utf-8") as fin:
        data = json.load(fin)

    # Tirer un échantillon aléatoire
    sampled_data = random.sample(data, min(sample_size, len(data)))
    print(f"🎯 Transcription d'un échantillon de {len(sampled_data)} fichiers audio.")

    # Charger le modèle et le processor
    model = WhisperMedusaModel.from_pretrained(model_name).to(device)
    processor = WhisperProcessor.from_pretrained(model_name)

    with open(output_jsonl, "w", encoding="utf-8") as fout:
        for item in tqdm(sampled_data, desc="Transcription Medusa"):
            audio_path = Path(audio_root) / item["audio_file"]
            if not audio_path.exists():
                item["model_prediction_medusa"] = None
                item["inference_time_medusa"] = None
                fout.write(json.dumps(item, ensure_ascii=False) + "\n")
                continue

            # Charger l'audio
            waveform, sr = torchaudio.load(audio_path)
            if waveform.shape[0] > 1:
                waveform = waveform.mean(dim=0, keepdim=True)
            SAMPLING_RATE = 16000
            if sr != SAMPLING_RATE:
                waveform = torchaudio.transforms.Resample(sr, SAMPLING_RATE)(waveform)

            item["audio_duration"] = waveform.shape[-1] / SAMPLING_RATE

            # Préparer les features avec attention_mask
            input_features = processor(
                waveform.squeeze(),
                return_tensors="pt",
                sampling_rate=SAMPLING_RATE
            )
            input_features = input_features.to(device)
            attention_mask = input_features.attention_mask if hasattr(input_features, "attention_mask") else None

            # Transcription
            start_time = torch.cuda.Event(enable_timing=True)
            end_time = torch.cuda.Event(enable_timing=True)
            start_time.record()

            model_output = model.generate(
                input_features.input_features,
                attention_mask=attention_mask,
                language=language,
                exponential_decay_length_penalty=(regulation_start, regulation_factor)
            )

            end_time.record()
            torch.cuda.synchronize()
            item["inference_time_medusa"] = start_time.elapsed_time(end_time) / 1000.0  # secondes

            predict_ids = model_output[0]
            item["model_prediction_medusa"] = processor.decode(predict_ids, skip_special_tokens=True)

            fout.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"✅ Résultats sauvegardés dans {output_jsonl}")

if __name__ == "__main__":
    transcribe_medusa_json_sample(
        input_json="/home/projet9/projet9/notebooks/data/voxpopuli_fr_train/train_metadata_full.json",
        output_jsonl="/home/projet9/projet9/notebooks/data/voxpopuli_fr_train/train_sample10_medusa.jsonl",
        audio_root="/home/projet9/projet9/notebooks/data",
        sample_size=10,
        model_name="aiola/whisper-medusa-linear-libri",
        device="cuda",
        language="fr"
    )



# Benchmark Whisper Base

In [None]:
sudo apt update
sudo apt install ffmpeg -y


In [1]:
import json
import random
import time
from pathlib import Path
import soundfile as sf
from tqdm import tqdm
import whisper  # OpenAI Whisper officiel

def get_audio_duration(path: Path) -> float:
    """Retourne la durée d'un fichier audio en secondes."""
    try:
        with sf.SoundFile(str(path)) as f:
            return len(f) / f.samplerate
    except Exception:
        return None

def benchmark_whisper(
    input_json: str,
    audio_root: str = "data",
    models: list = ["small", "medium", "large"],
    sample_size: int = 1000,
    device: str = "cuda",
    beam_size: int = 1,
    language: str = "fr"
):
    """
    Benchmark officiel Whisper (OpenAI) sur un échantillon de fichiers.
    - Séquentiel
    - Ajoute 'inference_time' et 'audio_duration'
    - Sauvegarde résumé global par modèle
    """
    # Charger dataset
    with open(input_json, "r", encoding="utf-8") as fin:
        data = json.load(fin)

    sampled_data = random.sample(data, min(sample_size, len(data)))
    print(f"🎯 Benchmark sur {len(sampled_data)} fichiers audio.")

    summary = {}

    for model_size in models:
        print(f"\n🚀 Benchmark du modèle Whisper {model_size}...")

        model = whisper.load_model(model_size, device=device)

        output_jsonl = input_json.replace(".json", f"_{model_size}_speed_benchmark.jsonl")
        total_start = time.time()
        total_inference_time = 0.0
        total_audio_duration = 0.0

        with open(output_jsonl, "w", encoding="utf-8") as fout:
            for item in tqdm(sampled_data, desc=f"Transcription {model_size}"):
                audio_path = Path(audio_root) / item["audio_file"]
                audio_duration = get_audio_duration(audio_path)
                item["audio_duration"] = audio_duration
                if audio_duration:
                    total_audio_duration += audio_duration

                if not audio_path.exists():
                    item[f"model_prediction_{model_size}"] = None
                    item[f"inference_time_{model_size}"] = None
                else:
                    try:
                        start_time = time.time()
                        result = model.transcribe(str(audio_path), language=language, beam_size=beam_size)
                        inference_time = time.time() - start_time
                        total_inference_time += inference_time

                        item[f"model_prediction_{model_size}"] = result["text"]
                        item[f"inference_time_{model_size}"] = inference_time
                    except Exception as e:
                        item[f"model_prediction_{model_size}"] = f"Erreur: {str(e)}"
                        item[f"inference_time_{model_size}"] = None

                fout.write(json.dumps(item, ensure_ascii=False) + "\n")

        elapsed = time.time() - total_start
        print(f"✅ Modèle {model_size} terminé en {elapsed/60:.2f} minutes")
        print(f"📂 Résultats sauvegardés dans {output_jsonl}")

        summary[model_size] = {
            "total_wall_time_sec": elapsed,
            "total_inference_time_sec": total_inference_time,
            "total_audio_duration_sec": total_audio_duration,
            "rtf": total_inference_time / total_audio_duration if total_audio_duration > 0 else None
        }

    # Sauvegarder résumé global
    summary_file = input_json.replace(".json", "_benchmark_summary.json")
    with open(summary_file, "w", encoding="utf-8") as fsum:
        json.dump(summary, fsum, indent=2, ensure_ascii=False)

    print(f"\n📊 Résumé global sauvegardé dans {summary_file}")
    return summary


In [2]:
    summary = benchmark_whisper(
        input_json='data/voxpopuli_fr_train/train_metadata_full.json',
        audio_root='data',
        models=["small", "medium", "large"],  # les modèles Whisper à tester
        sample_size=1000,                     # échantillon aléatoire de fichiers
        device="cuda",                        # ou "cpu"
        beam_size=1,                          # beam size pour la transcription
        language="fr"                         # français
    )

    print("Résumé global :", summary)


🎯 Benchmark sur 1000 fichiers audio.

🚀 Benchmark du modèle Whisper small...


Transcription small: 100%|████████████████████████████| 1000/1000 [10:51<00:00,  1.53it/s]


✅ Modèle small terminé en 10.86 minutes
📂 Résultats sauvegardés dans data/voxpopuli_fr_train/train_metadata_full_small_speed_benchmark.jsonl

🚀 Benchmark du modèle Whisper medium...


Transcription medium: 100%|███████████████████████████| 1000/1000 [18:34<00:00,  1.11s/it]


✅ Modèle medium terminé en 18.57 minutes
📂 Résultats sauvegardés dans data/voxpopuli_fr_train/train_metadata_full_medium_speed_benchmark.jsonl

🚀 Benchmark du modèle Whisper large...


Transcription large: 100%|████████████████████████████| 1000/1000 [46:25<00:00,  2.79s/it]

✅ Modèle large terminé en 46.43 minutes
📂 Résultats sauvegardés dans data/voxpopuli_fr_train/train_metadata_full_large_speed_benchmark.jsonl

📊 Résumé global sauvegardé dans data/voxpopuli_fr_train/train_metadata_full_benchmark_summary.json
Résumé global : {'small': {'total_wall_time_sec': 651.5620958805084, 'total_inference_time_sec': 649.6017773151398, 'total_audio_duration_sec': 10036.407249999997, 'rtf': 0.06472453350427165}, 'medium': {'total_wall_time_sec': 1114.286206960678, 'total_inference_time_sec': 1113.2970192432404, 'total_audio_duration_sec': 10036.407249999997, 'rtf': 0.11092585140397135}, 'large': {'total_wall_time_sec': 2785.6306076049805, 'total_inference_time_sec': 2784.463607311249, 'total_audio_duration_sec': 10036.407249999997, 'rtf': 0.27743629148879445}}



