In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os

# Chemin du dossier de base sur Kaggle
base_dir = "/kaggle/input/test-audio"


# Chargement du fichier CSV
df = pd.read_csv(f"{base_dir}/Audio/metadata_balises_h1_p.csv")

for index, row in df.iterrows():
    # Construction du chemin absolu vers le fichier audio
    audio_path = os.path.join(base_dir, row["audio"])

    # Vérifie l'existence du fichier audio
    if not os.path.exists(audio_path):
        print(f"Fichier manquant : {audio_path}")

    # Vérifie la validité du HTML
    try:
        BeautifulSoup(row["html"], 'html.parser')
    except Exception as e:
        print(f"HTML invalide à la ligne {index} : {e}")

In [2]:
import pandas as pd
import librosa
import os

# Chemin de base du dataset sur Kaggle
base_dir = "/kaggle/input/test-audio"

# Charger le CSV
df = pd.read_csv(f"{base_dir}/Audio/metadata_balises_h1_p.csv")

# Parcourir les échantillons
for index, row in df.iterrows():
    # Construire le chemin absolu du fichier audio
    audio_path = os.path.join(base_dir, row["audio"])
    transcription = row["transcription"]
    html = row["html"]

    # Vérifier que le fichier audio existe avant de le charger
    if os.path.exists(audio_path):
        # Charger l'audio avec librosa
        audio, sr = librosa.load(audio_path, sr=16000)  # 16kHz recommandé pour Whisper
        print(f"Audio: {row['audio']}, Transcription: {transcription}, HTML: {html}")
    else:
        print(f"Fichier audio manquant : {audio_path}")


Audio: Audio/16k_h11.wav, Transcription: balise h1 contenu Titre Principal, HTML: <h1>Titre Principal</h1>
Audio: Audio/16k_h12.wav, Transcription: balise h1 class=titre-Principal contenu Accueil, HTML: <h1 class="titre-Principal">Accueil</h1>
Audio: Audio/16k_h13.wav, Transcription: balise h1 id=entete contenu Bienvenue, HTML: <h1 id="entete">Bienvenue</h1>
Audio: Audio/16k_h14.wav, Transcription: balise h1 style=color:blue contenu Promotions, HTML: <h1 style="color:blue">Promotions</h1>
Audio: Audio/16k_h15.wav, Transcription: balise h1 contenu Nouveautés, HTML: <h1>Nouveautés</h1>
Audio: Audio/16k_h16.wav, Transcription: balise h1 class=section contenu À propos, HTML: <h1 class="section">À propos</h1>
Audio: Audio/16k_h17.wav, Transcription: balise h1 id=entete-Principal contenu Services, HTML: <h1 id="entete">Services</h1>
Audio: Audio/16k_h18.wav, Transcription: balise h1 style=font-size contenu Contact, HTML: <h1 style="font-size">Contact</h1>
Audio: Audio/16k_h19.wav, Transcript

In [3]:
import pandas as pd
from datasets import Dataset, Audio
import os

def prepare_data(csv_path, audio_base_dir):
    df = pd.read_csv(csv_path)
    data = []
    
    for _, row in df.iterrows():
        audio_path = os.path.join(audio_base_dir, row['audio'])
        if not os.path.exists(audio_path):
            print(f"Fichier introuvable : {audio_path}")
            continue
            
        data.append({
            "audio": audio_path,
            "text": row["transcription"]
        })
    
    dataset = Dataset.from_dict({
        "audio": [d["audio"] for d in data],
        "text": [d["text"] for d in data]
    })
    
    # Rééchantillonnage audio à 16 kHz
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    return dataset

# Définition des chemins avec base audio
csv_path = "/kaggle/input/test-audio/Audio/metadata_balises_h1_p.csv"
audio_base_dir = "/kaggle/input/test-audio"

dataset = prepare_data(csv_path, audio_base_dir)

# Attention : dans Kaggle, tu ne peux pas écrire dans le dossier courant
# dataset.save_to_disk("whisper_dataset")  # ça risque de planter ici

# Tu peux éventuellement sauvegarder sur /kaggle/working qui est accessible en écriture :
dataset.save_to_disk("/kaggle/working/whisper_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

In [4]:
!pip install -q openai-whisper

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m:00:0

In [5]:
import whisper
import pandas as pd
import os

# 1. Charger le modèle Whisper
model = whisper.load_model("base")

# 2. Lire le CSV
csv_path = "/kaggle/input/test-audio/Audio/metadata_balises_h1_p.csv"
df = pd.read_csv(csv_path)

# 3. Tester chaque enregistrement
for index, row in df.iterrows():
    audio_file = os.path.join("/kaggle/input/test-audio", row["audio"])  # Correction ici
    expected_text = row["transcription"]
    
    if not os.path.exists(audio_file):
        print(f"Fichier audio manquant : {audio_file}")
        continue

    result = model.transcribe(audio_file, language="fr", task="transcribe")
    transcript = result["text"].strip()

    print(f"\nFichier : {audio_file}")
    print(f"Attendu : {expected_text}")
    print(f"Obtenu  : {transcript}")

    match = "✅" if expected_text.lower() == transcript.lower() else "❌"
    print(f"Correspondance : {match}")

100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 153MiB/s]



Fichier : /kaggle/input/test-audio/Audio/16k_h11.wav
Attendu : balise h1 contenu Titre Principal
Obtenu  : BallyzH1, grand tenure t2 principale.
Correspondance : ❌

Fichier : /kaggle/input/test-audio/Audio/16k_h12.wav
Attendu : balise h1 class=titre-Principal contenu Accueil
Obtenu  : Bali, H1, classe titre principal, continue à c'est
Correspondance : ❌

Fichier : /kaggle/input/test-audio/Audio/16k_h13.wav
Attendu : balise h1 id=entete contenu Bienvenue
Obtenu  : Baleeze elements et il a souvent antennae
Correspondance : ❌

Fichier : /kaggle/input/test-audio/Audio/16k_h14.wav
Attendu : balise h1 style=color:blue contenu Promotions
Obtenu  : Valise-là un style color blue. Continue promotion.
Correspondance : ❌

Fichier : /kaggle/input/test-audio/Audio/16k_h15.wav
Attendu : balise h1 contenu Nouveautés
Obtenu  : Valise h1 nope Now울K
Correspondance : ❌

Fichier : /kaggle/input/test-audio/Audio/16k_h16.wav
Attendu : balise h1 class=section contenu À propos
Obtenu  : Bâler le H1 class sect

In [6]:
from datasets import load_dataset, Audio
import os

csv_path = "/kaggle/input/test-audio/Audio/metadata_balises_h1_p.csv"
audio_base_dir = "/kaggle/input/test-audio/Audio"

# 1. Charger le dataset
dataset = load_dataset("csv", data_files=csv_path)["train"]

# 2. Corriger les chemins audio pour éviter le doublon "Audio/Audio"
def fix_audio_path(example):
    audio_path = example["audio"]
    # Si le chemin commence par "Audio/", on l'enlève pour éviter le doublon
    if audio_path.startswith("Audio/"):
        audio_path = audio_path[len("Audio/"):]
    example["audio"] = os.path.join(audio_base_dir, audio_path)
    return example

dataset = dataset.map(fix_audio_path)

# 3. Cast colonne audio
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# 4. Vérification des chemins
def check_audio_paths(example):
    if isinstance(example["audio"], dict):
        path = example["audio"]["path"]
    else:
        path = example["audio"]
    if not os.path.exists(path):
        raise FileNotFoundError(f"Fichier {path} introuvable")
    return example

dataset = dataset.map(check_audio_paths)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [7]:
dataset.save_to_disk("dataset_processed")

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

In [8]:
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import load_from_disk
import torch
from torch.nn.utils.rnn import pad_sequence

# 1. Chargement des données
dataset = load_from_disk("dataset_processed")

# 2. Initialisation du modèle
model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name, language="fr", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# 3. Préparation des batches (suppression de tout print de débogage)
def prepare_batch(batch):
    audio = batch["audio"]
    input_features = processor(
        audio["array"], 
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt"
    ).input_features[0]  # Accès au tensor directement
    
    labels = processor.tokenizer(batch["transcription"], return_tensors="pt").input_ids[0]
    
    return {
        "input_features": input_features,
        "labels": labels
    }

# Appliquer la préparation des batches
dataset = dataset.map(prepare_batch)

# 4. Data Collator (conserver les impressions pour débogage si nécessaire)
def data_collator(features):
    input_features = torch.stack([torch.tensor(f["input_features"]) for f in features])
    
    # Gestion du padding des labels
    labels = [torch.tensor(f["labels"]) for f in features]
    labels = pad_sequence(labels, batch_first=True, padding_value=processor.tokenizer.pad_token_id)
    
    # Débogage : vérifier les formes (peut être commenté si plus nécessaire)
    print("Input features shape:", input_features.shape)
    print("Labels shape:", labels.shape)
    
    return {
        "input_features": input_features,
        "labels": labels
    }

# 5. Configuration de l'entraînement
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper_finetuned_html",
    run_name="whisper_finetune_run_1",  # Nom unique pour WandB
    per_device_train_batch_size=1,  # Taille réduite pour éviter les problèmes de mémoire
    learning_rate=1e-5,
    num_train_epochs=3,
    fp16=True,  # Précision mixte pour optimiser la mémoire GPU
    save_steps=100,
    logging_steps=10,  # Log fréquent pour surveiller
    logging_strategy="steps",
    logging_first_step=True,
    report_to="none",  # Désactiver WandB (retirer si vous utilisez WandB)
)

# 6. Vérification du dataset avant entraînement
print("Taille du dataset :", len(dataset))
#print("Exemple de données :", dataset[0])

# 7. Lancement de l'entraînement
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=processor,  # Utilisation de processing_class pour éviter FutureWarning
    data_collator=data_collator,
)

# S'assurer que le modèle n'est pas enveloppé dans DataParallel (pour éviter l'avertissement _functions.py)
if torch.cuda.device_count() > 1:
    print("Multiple GPUs detected, but using single GPU to avoid parallel warning.")
    model = model.to("cuda:0")
else:
    model = model.to("cuda")

trainer.train()

2025-06-04 11:33:30.931682: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749036811.098040      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749036811.148031      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Taille du dataset : 20
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 17])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 16])


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,5.7193
10,2.5499
20,0.8876
30,0.618


Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 23])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 15])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 21])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 19])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 23])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 18])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 17])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 20])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 17])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 17])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 20])
Input features shape: torch.Size([2, 80, 3000])
Labels shape: torch.Size([2, 23])
Input features s



TrainOutput(global_step=30, training_loss=1.4574635028839111, metrics={'train_runtime': 35.8534, 'train_samples_per_second': 1.673, 'train_steps_per_second': 0.837, 'total_flos': 1.73151240192e+16, 'train_loss': 1.4574635028839111, 'epoch': 3.0})

In [9]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.

In [10]:
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
from datasets import load_from_disk
import evaluate
import torch

# 1. Charger le processeur et le modèle fine-tuné
checkpoint_dir = "./whisper_finetuned_html/checkpoint-30"  # Remplacez par le bon checkpoint si différent
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="fr", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_dir)

In [11]:
# 2. Configurer la pipeline
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

Device set to use cuda


In [14]:
# 3. Charger le dataset (votre dataset local)
dataset = load_from_disk("dataset_processed")  # Remplacez par le chemin de votre dataset de test si différent

In [19]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


In [20]:
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor, GenerationConfig
from datasets import load_from_disk
import evaluate
import torch
import numpy as np

# 1. Charger le processeur et le modèle fine-tuné
checkpoint_dir = "./whisper_finetuned_html/checkpoint-30"  # Remplacez par le bon checkpoint si différent
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="fr", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_dir)

# 2. Réinitialiser la configuration de génération pour éviter les conflits
model.generation_config = GenerationConfig.from_pretrained(
    "openai/whisper-small",
    language="french",
    task="transcribe",
    forced_decoder_ids=None  # Désactiver explicitement forced_decoder_ids
)

# 3. Configurer la pipeline
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# 4. Charger le dataset
dataset = load_from_disk("dataset_processed")  # Remplacez par le chemin de votre dataset de test si différent

# 5. Test sur un exemple
audio_sample = dataset["audio"][0]
transcription = transcriber(
    audio_sample["array"],  # Passer directement le tableau NumPy brut
    generate_kwargs={
        "language": "french",
        "task": "transcribe",
        "forced_decoder_ids": None  # Désactiver forced_decoder_ids
    }
)
print(f"Résultat de la transcription : {transcription['text']}")
print(f"Transcription réelle : {dataset['transcription'][0]}")

# 6. Évaluer avec la métrique WER (Word Error Rate)
wer_metric = evaluate.load("wer")
predictions = []
references = dataset["transcription"]

for audio in dataset["audio"]:
    pred = transcriber(
        audio["array"],  # Passer directement le tableau NumPy brut
        generate_kwargs={
            "language": "french",
            "task": "transcribe",
            "forced_decoder_ids": None  # Désactiver forced_decoder_ids
        }
    )["text"]
    predictions.append(pred)

wer = wer_metric.compute(predictions=predictions, references=references)
print(f"WER : {wer:.2%}")

Device set to use cuda


Résultat de la transcription : balise h1 contenu Titre Principal
Transcription réelle : balise h1 contenu Titre Principal


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


WER : 22.43%


In [None]:
pmspm s pms