## Install Libraries

In [None]:
!pip install datasets evaluate jiwer
!pip install transformers[torch]
!pip install accelerate -U
!pip install transformers==4.28.0
!pip install soundfile
!pip install librosa

## Import libraries

In [None]:
from datasets import load_dataset, Audio
from dataclasses import dataclass
import evaluate
import numpy as np
import torch
from transformers import AutoProcessor, AutoModelForCTC, TrainingArguments, Trainer
from typing import Dict, List, Union

## Loading the dataset

In [None]:
# Load the first 100 French (fr-FR) examples of PolyAI/minds14 and hold out 20% of them as a test set.
# A fixed seed keeps the train/test split identical across kernel restarts, so results are reproducible.
minds = load_dataset("PolyAI/minds14", name="fr-FR", split="train[:100]")
minds = minds.train_test_split(test_size=0.2, seed=42)
minds

In [None]:
# Drop the columns that the rest of this notebook never reads.
unused_columns = ["english_transcription", "intent_class", "lang_id"]
minds = minds.remove_columns(unused_columns)

In [None]:
# Load the processor published with the pretrained French Wav2Vec2-XLSR-53 checkpoint.
pretrained_checkpoint = "facebook/wav2vec2-large-xlsr-53-french"
processor = AutoProcessor.from_pretrained(pretrained_checkpoint)

In [None]:
# Re-declare the audio column at 16 kHz for compatibility with the model,
# then display the first training example as a sanity check.
resampled_audio = Audio(sampling_rate=16_000)
minds = minds.cast_column("audio", resampled_audio)
minds["train"][0]

## Preprocessing the data

In [None]:
# Upper-case every transcription so that it matches the tokenizer's vocabulary.
def uppercase(example):
    """Return a column update holding the upper-cased transcription of ``example``."""
    text = example["transcription"]
    return dict(transcription=text.upper())

# Apply the upper-casing to every example of the dataset.
minds = minds.map(function=uppercase)

In [None]:
def prepare_dataset(batch):
    """Encode one example with the module-level ``processor``.

    The audio array (with its sampling rate) and the transcription text are
    passed to the processor in a single call; the length of the first entry
    of ``input_values`` is then stored under ``input_length``.

    Returns:
        The processor output, extended with the ``input_length`` entry.
    """
    waveform = batch["audio"]
    encoded = processor(
        waveform["array"],
        sampling_rate=waveform["sampling_rate"],
        text=batch["transcription"],
    )
    # Record how long the encoded audio is.
    encoded["input_length"] = len(encoded["input_values"][0])
    return encoded

# Encode every split with four worker processes and drop the original columns,
# keeping only what prepare_dataset returns.
original_columns = minds.column_names["train"]
encoded_minds = minds.map(
    prepare_dataset,
    remove_columns=original_columns,
    num_proc=4,
)

## Defining a custom DataCollatorCTCWithPadding

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """Collate encoded examples into one padded batch for CTC training.

    Attributes:
        processor: Processor whose ``pad`` method is used for both the audio
            inputs and the label ids.
        padding: Padding strategy forwarded to ``processor.pad``.
    """

    processor: AutoProcessor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``labels`` entry added."""
        # Audio inputs and labels have different lengths, so they are collected
        # separately and padded by two independent calls.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"][0]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=label_items, padding=self.padding, return_tensors="pt")

        # Positions that are padding (attention mask != 1) become -100 so the
        # loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
        return batch

In [None]:
# Build the collator; each batch is padded to the length of its longest member.
data_collator = DataCollatorCTCWithPadding(
    padding="longest",
    processor=processor,
)

## Defining a custom metric calculation function

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (WER) for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (logits; the predicted
            token id is the argmax over the last axis) and ``label_ids``
            (reference token ids, where -100 marks positions to ignore).

    Returns:
        dict: ``{"wer": score}`` with the value returned by the ``wer`` metric.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Work on a copy: writing the pad id straight into pred.label_ids would
    # overwrite the caller's -100 markers for anything that reads them later.
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode the predicted and reference sequences; references are decoded
    # without token grouping.
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # Keep the metric object and the score under separate names.
    wer_metric = evaluate.load("wer")
    wer_score = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_score}

## Training

In [None]:
# Load the pretrained CTC model, averaging the CTC loss and using the
# tokenizer's padding token id as the model's pad token id.
ctc_pad_token_id = processor.tokenizer.pad_token_id
model = AutoModelForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53-french",
    pad_token_id=ctc_pad_token_id,
    ctc_loss_reduction="mean",
)

In [None]:
# Optimisation schedule: batches of 8 per device with 2 accumulation steps,
# 500 warm-up steps and 2000 training steps in total.
optimisation_settings = dict(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=2000,
    gradient_checkpointing=True,
    group_by_length=True,
)

# Evaluate and save every 1000 steps, log every 25, and reload the checkpoint
# with the lowest WER once training ends.
evaluation_settings = dict(
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    eval_steps=1000,
    save_steps=1000,
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

training_args = TrainingArguments(
    output_dir="../model/fine_tuned_wav2vec2_large_xlsr_53_french_model",
    **optimisation_settings,
    **evaluation_settings,
)

# Wire the model, the encoded splits, the collator and the WER metric together.
trainer_settings = {
    "model": model,
    "args": training_args,
    "train_dataset": encoded_minds["train"],
    "eval_dataset": encoded_minds["test"],
    "tokenizer": processor,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_settings)

# Run the fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

## Inference

In [None]:
# Load the dataset for inference purposes and configure the sampling rate for the audio data.
dataset = load_dataset("PolyAI/minds14", "fr-FR", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
sampling_rate = dataset.features["audio"].sampling_rate
audio_file = dataset[0]["audio"]["path"]

In [None]:
# Process the sample audio file
processor = AutoProcessor.from_pretrained("../model/fine_tuned_wav2vec2_large_xlsr_53_french_model/checkpoint-10")
inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

In [None]:
# Load the fine-tuned model from the checkpoint and generate logits for the input audio.
model = AutoModelForCTC.from_pretrained("../model/fine_tuned_wav2vec2_large_xlsr_53_french_model/checkpoint-10")
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
# Compute the predicted IDs from the logits and decode the predicted transcription.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription