Wandb un outil pour suivre lentrainement des poid de son model

In [1]:
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2ForCTC,Wav2Vec2Processor,Trainer,TrainingArguments,Wav2Vec2FeatureExtractor
import os
os.environ['WANDB_DISABLED '] = 'True'

2024-12-23 17:28:37.780224: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## PRETRAITEMENT SPEED UP

In [2]:
import wave
import numpy as np
import os
from pathlib import Path

def change_speed(input_file, output_file, speed_factor):
    """
    Change la vitesse d'un fichier WAV.

    Parameters:
    input_file (str): Chemin du fichier WAV d'entrée
    output_file (str): Chemin du fichier WAV de sortie
    speed_factor (float): Facteur de multiplication de la vitesse
                         (0.5 = vitesse divisée par 2, 2.0 = vitesse doublée)
    """
    try:
        with wave.open(input_file, 'rb') as wf:
            # Obtenir les paramètres du fichier
            n_channels = wf.getnchannels()
            sampwidth = wf.getsampwidth()
            framerate = wf.getframerate()
            n_frames = wf.getnframes()

            # Lire tous les frames
            frames = wf.readframes(n_frames)

            # Convertir les frames en tableau numpy
            signal = np.frombuffer(frames, dtype=np.int16)

            # Remodeler le signal si stéréo
            if n_channels == 2:
                signal = signal.reshape(-1, 2)

            # Calculer le nombre de frames à conserver
            new_length = int(len(signal) / speed_factor)

            # Rééchantillonner le signal
            indices = np.round(np.linspace(0, len(signal) - 1, new_length)).astype(int)
            new_signal = signal[indices]

            # Écrire le signal modifié dans le fichier de sortie
            with wave.open(output_file, 'wb') as wf_out:
                wf_out.setnchannels(n_channels)
                wf_out.setsampwidth(sampwidth)
                wf_out.setframerate(framerate)
                wf_out.writeframes(new_signal.tobytes())
        return True
    except Exception as e:
        print(f"Erreur lors du traitement de {input_file}: {str(e)}")
        return False

def process_directory(input_dir, output_dir, speed_factor):
    """
    Traite tous les fichiers WAV dans un dossier.

    Parameters:
    input_dir (str): Chemin du dossier contenant les fichiers WAV
    output_dir (str): Chemin du dossier où sauvegarder les fichiers modifiés
    speed_factor (float): Facteur de multiplication de la vitesse
    """
    # Créer le dossier de sortie s'il n'existe pas
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Compteurs pour le suivi
    total_files = 0
    successful_files = 0

    # Parcourir tous les fichiers WAV du dossier
    for file in os.listdir(input_dir):
        if file.lower().endswith('.wav'):
            total_files += 1
            input_path = os.path.join(input_dir, file)
            # Créer le nom du fichier de sortie avec indication de la vitesse
            speed_indicator = f"_{int(speed_factor*100)}percent"
            output_filename = file.rsplit('.', 1)[0] + speed_indicator + '.wav'
            output_path = os.path.join(output_dir, output_filename)

            print(f"Traitement de: {file}")
            if change_speed(input_path, output_path, speed_factor):
                successful_files += 1

    # Afficher le résumé
    print(f"\nRésumé du traitement:")
    print(f"Fichiers traités: {successful_files}/{total_files}")
    if total_files > 0:
        print(f"Taux de réussite: {(successful_files/total_files)*100:.1f}%")

# Exemple d'utilisation
if __name__ == "__main__":
    # Définir les chemins des dossiers
    input_directory = "../data/YembaEGRA/audio/CI1/W1"    # Dossier contenant les fichiers WAV originaux(chemin vers dssier du drive)
    output_directory = "output_wav"   # Dossier où seront sauvegardés les fichiers modifiés
    speed_factor = 1.5               # Facteur de vitesse (2.0 = double vitesse)

    # Traiter tous les fichiers du dossier
    process_directory(input_directory, output_directory, speed_factor)

Traitement de: spkr_7_word_1_occ_2_ci_1_l_3.wav
Traitement de: spkr_9_word_1_occ_1_ci_1_l_3.wav
Traitement de: spkr_14_word_1_occ_2_ci_1_l_3.wav
Traitement de: spkr_63_word_1_occ_2_ci_1_l_3.wav
Traitement de: spkr_34_word_1_occ_1_ci_1_l_3.wav
Traitement de: spkr_43_word_1_occ_1_ci_1_l_3.wav
Traitement de: spkr_65_word_1_occ_1_ci_1_l_3.wav
Traitement de: spkr_12_word_1_occ_1_ci_1_l_3.wav
Traitement de: spkr_1_word_1_occ_1_ci_1_l_3.wav
Traitement de: spkr_45_word_1_occ_2_ci_1_l_3.wav
Traitement de: spkr_32_word_1_occ_2_ci_1_l_3.wav
Traitement de: spkr_50_word_1_occ_2_ci_1_l_3.wav
Traitement de: spkr_27_word_1_occ_2_ci_1_l_3.wav
Traitement de: spkr_29_word_1_occ_1_ci_1_l_3.wav
Traitement de: spkr_21_word_1_occ_1_ci_1_l_3.wav
Traitement de: spkr_56_word_1_occ_1_ci_1_l_3.wav
Traitement de: spkr_58_word_1_occ_2_ci_1_l_3.wav
Traitement de: spkr_28_word_1_occ_2_ci_1_l_3.wav
Traitement de: spkr_26_word_1_occ_1_ci_1_l_3.wav
Traitement de: spkr_20_word_1_occ_2_ci_1_l_3.wav
Traitement de: spkr_59_

In [3]:
import logging
import transformers
transformers.logging.get_verbosity = lambda: logging.NOTSET

In [4]:
transformers.logging.get_verbosity()

0

## CREATION DU VOCABULAIRE POUR CREER LE DECODEUR CTC Tokeniser

#### Pretraitement du corpus

In [5]:
import pandas as pd
dfmeta =pd.read_csv("../data/YembaEGRA/metadata/words_corpus.csv")
dfmeta

Unnamed: 0,id_word,Yemba,Phonetique,Français,English,id_ci,Centre d'interetI(FR),Center of Interest(EN)
0,1,Mbeŋ,Mbǝŋ,La pluie,The rain,1,La nature,The nature
1,2,Mbīŋ,Mbǝŋ,La forêt,The forest,1,La nature,The nature
2,3,Míá ntshi,Míá tse,La rivière,A river,1,La nature,The nature
3,4,Lekwɛ̄t,Lekwɛ̄t,La montagne,A mountain,1,La nature,The nature
4,5,Meŋwɛ́’tsāŋ,Mǝŋwɛ́’tsāŋ,Le chat,A cat,1,La nature,The nature
5,6,Nzenzhɛ,Nzǝnzhɛ,la mouche,A fly,1,La nature,The nature
6,7,Nu,Nu,Le soleil,The sun,1,La nature,The nature
7,8,Ŋgāp,Ŋgāp,La poule,The hen,1,La nature,The nature
8,9,Ŋkā’,Ŋkā’,La plantation,The plantation,2,"Le village, La ville","The village, the city"
9,10,Aphíɛ ntsō,Aphíɛ ntsō,Le semis,Sowing,2,"Le village, La ville","The village, the city"


In [6]:
import os
import pandas as pd

def recuperer_fichiers_wav(dossier):
    """
    Parcourt récursivement l'arborescence d'un dossier pour récupérer les noms des fichiers .wav
    et les stocker dans un DataFrame Pandas.

    Args:
        dossier (str): Chemin du dossier à analyser.

    Returns:
        pd.DataFrame: Un DataFrame contenant les chemins complets et les noms des fichiers .wav.
    """
    fichiers_wav = []

    # Parcourir récursivement l'arborescence du dossier
    for chemin, _, fichiers in os.walk(dossier):
        for fichier in fichiers:
            if fichier.lower().endswith('.wav'):  # Vérifie si l'extension est .wav (insensible à la casse)
                fichiers_wav.append({
                    "Chemin complet": os.path.join(chemin, fichier),
                    "Nom du fichier": fichier
                })

    # Créer un DataFrame à partir de la liste des fichiers .wav
    df = pd.DataFrame(fichiers_wav)

    return df

# Exemple d'utilisation
chemin_dossier = "../data/YembaEGRA/audio/CI1/W1"
df_wav = recuperer_fichiers_wav(chemin_dossier)
print(df_wav)

                                        Chemin complet  \
0    ../data/YembaEGRA/audio/CI1/W1/spkr_7_word_1_o...   
1    ../data/YembaEGRA/audio/CI1/W1/spkr_9_word_1_o...   
2    ../data/YembaEGRA/audio/CI1/W1/spkr_14_word_1_...   
3    ../data/YembaEGRA/audio/CI1/W1/spkr_63_word_1_...   
4    ../data/YembaEGRA/audio/CI1/W1/spkr_34_word_1_...   
..                                                 ...   
129  ../data/YembaEGRA/audio/CI1/W1/spkr_6_word_1_o...   
130  ../data/YembaEGRA/audio/CI1/W1/spkr_8_word_1_o...   
131  ../data/YembaEGRA/audio/CI1/W1/spkr_62_word_1_...   
132  ../data/YembaEGRA/audio/CI1/W1/spkr_35_word_1_...   
133  ../data/YembaEGRA/audio/CI1/W1/spkr_42_word_1_...   

                        Nom du fichier  
0     spkr_7_word_1_occ_2_ci_1_l_3.wav  
1     spkr_9_word_1_occ_1_ci_1_l_3.wav  
2    spkr_14_word_1_occ_2_ci_1_l_3.wav  
3    spkr_63_word_1_occ_2_ci_1_l_3.wav  
4    spkr_34_word_1_occ_1_ci_1_l_3.wav  
..                                 ...  
129   spkr_6_wor

In [7]:
import pandas as pd
import re

def extraire_infos_fichiers(df):
    """
    Ajoute des colonnes (spkr, word, occ, ci, l) au DataFrame en extrayant les informations
    des noms de fichiers au format 'spkr_50_word_14_occ_2_ci_2_l_3.wav'.

    Args:
        df (pd.DataFrame): DataFrame contenant une colonne 'Nom du fichier' avec les noms des fichiers.

    Returns:
        pd.DataFrame: Le DataFrame avec les nouvelles colonnes extraites.
    """
    # Définir une fonction pour extraire les informations d'un nom de fichier
    def extraire_infos(fichier):
        # Regex pour correspondre au format attendu
        pattern = r"spkr_(\d+)_word_(\d+)_occ_(\d+)_ci_(\d+)_l_(\d+).wav"
        match = re.match(pattern, fichier)
        if match:
            return {
                "spkr": int(match.group(1)),
                "word": int(match.group(2)),
                "occ": int(match.group(3)),
                "ci": int(match.group(4)),
                "l": int(match.group(5))
            }
        else:
            # Retourner None si le format ne correspond pas
            return {"spkr": None, "word": None, "occ": None, "ci": None, "l": None}

    # Appliquer cette fonction à la colonne 'Nom du fichier' et créer les nouvelles colonnes
    infos_extraites = df["Nom du fichier"].apply(extraire_infos)
    df_infos = pd.DataFrame(list(infos_extraites))

    # Ajouter les nouvelles colonnes au DataFrame d'origine
    df = pd.concat([df, df_infos], axis=1)

    return df



In [8]:


df_fichier = extraire_infos_fichiers(df_wav)


In [9]:
from datasets import Dataset
import pandas as pd





division du dataset en trois 2 pour lentrainement pour faire des traitement parrallel et un pour le test

In [10]:
from sklearn.model_selection import train_test_split
resultat = pd.merge(df_fichier, dfmeta, left_on="word", right_on="id_word", how="inner")

train,test = train_test_split(resultat, test_size=0.008, random_state=42)
common_voice_train_1 = Dataset.from_pandas(train[int(len(train)/2):])
common_voice_train_2 = Dataset.from_pandas(train[0:int(len(train)/2)])
common_voice_test = Dataset.from_pandas(test)
common_voice_train_1

Dataset({
    features: ['Chemin complet', 'Nom du fichier', 'spkr', 'word', 'occ', 'ci', 'l', 'id_word', 'Yemba', 'Phonetique', 'Français', 'English', 'id_ci', "Centre d'interetI(FR)", 'Center of Interest(EN)', '__index_level_0__'],
    num_rows: 66
})

In [11]:
common_voice_train_1 = common_voice_train_1.remove_columns([ 'Nom du fichier', 'spkr', 'word', 'occ', 'ci', 'l', 'id_word', 'Phonetique', 'Français', 'English', 'id_ci', "Centre d'interetI(FR)", 'Center of Interest(EN)'])
common_voice_train_2 = common_voice_train_2.remove_columns([ 'Nom du fichier', 'spkr', 'word', 'occ', 'ci', 'l', 'id_word', 'Phonetique', 'Français', 'English', 'id_ci', "Centre d'interetI(FR)", 'Center of Interest(EN)'])

common_voice_test = common_voice_test.remove_columns([ 'Nom du fichier', 'spkr', 'word', 'occ', 'ci', 'l', 'id_word', 'Phonetique', 'Français', 'English', 'id_ci', "Centre d'interetI(FR)", 'Center of Interest(EN)'])

In [12]:
import random
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [13]:
show_random_elements(common_voice_train_1, num_examples=20)

Unnamed: 0,Chemin complet,Yemba,__index_level_0__
0,../data/YembaEGRA/audio/CI1/W1/spkr_29_word_1_occ_1_ci_1_l_3.wav,Mbeŋ,13
1,../data/YembaEGRA/audio/CI1/W1/spkr_56_word_1_occ_2_ci_1_l_3.wav,Mbeŋ,114
2,../data/YembaEGRA/audio/CI1/W1/spkr_69_word_1_occ_1_ci_1_l_3.wav,Mbeŋ,70
3,../data/YembaEGRA/audio/CI1/W1/spkr_21_word_1_occ_1_ci_1_l_3.wav,Mbeŋ,14
4,../data/YembaEGRA/audio/CI1/W1/spkr_61_word_1_occ_1_ci_1_l_3.wav,Mbeŋ,75
5,../data/YembaEGRA/audio/CI1/W1/spkr_33_word_1_occ_1_ci_1_l_3.wav,Mbeŋ,29
6,../data/YembaEGRA/audio/CI1/W1/spkr_50_word_1_occ_1_ci_1_l_3.wav,Mbeŋ,115
7,../data/YembaEGRA/audio/CI1/W1/spkr_65_word_1_occ_2_ci_1_l_3.wav,Mbeŋ,103
8,../data/YembaEGRA/audio/CI1/W1/spkr_43_word_1_occ_2_ci_1_l_3.wav,Mbeŋ,107
9,../data/YembaEGRA/audio/CI1/W1/spkr_1_word_1_occ_2_ci_1_l_3.wav,Mbeŋ,105


In [14]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    batch["Yemba"] = re.sub(chars_to_ignore_regex, '', batch["Yemba"]).lower() + " "
    return batch

In [15]:
common_voice_train_1 = common_voice_train_1.map(remove_special_characters)
common_voice_train_2 = common_voice_train_2.map(remove_special_characters)

common_voice_test = common_voice_test.map(remove_special_characters)

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [16]:
show_random_elements(common_voice_train_1)

Unnamed: 0,Chemin complet,Yemba,__index_level_0__
0,../data/YembaEGRA/audio/CI1/W1/spkr_8_word_1_occ_1_ci_1_l_3.wav,mbeŋ,130
1,../data/YembaEGRA/audio/CI1/W1/spkr_12_word_1_occ_1_ci_1_l_3.wav,mbeŋ,7
2,../data/YembaEGRA/audio/CI1/W1/spkr_42_word_1_occ_2_ci_1_l_3.wav,mbeŋ,23
3,../data/YembaEGRA/audio/CI1/W1/spkr_51_word_1_occ_2_ci_1_l_3.wav,mbeŋ,121
4,../data/YembaEGRA/audio/CI1/W1/spkr_29_word_1_occ_1_ci_1_l_3.wav,mbeŋ,13
5,../data/YembaEGRA/audio/CI1/W1/spkr_63_word_1_occ_1_ci_1_l_3.wav,mbeŋ,111
6,../data/YembaEGRA/audio/CI1/W1/spkr_50_word_1_occ_1_ci_1_l_3.wav,mbeŋ,115
7,../data/YembaEGRA/audio/CI1/W1/spkr_28_word_1_occ_1_ci_1_l_3.wav,mbeŋ,123
8,../data/YembaEGRA/audio/CI1/W1/spkr_17_word_1_occ_1_ci_1_l_3.wav,mbeŋ,58
9,../data/YembaEGRA/audio/CI1/W1/spkr_42_word_1_occ_1_ci_1_l_3.wav,mbeŋ,133


In [17]:
def extract_all_chars(batch):
  all_text = " ".join(batch["Yemba"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [18]:
vocab_train_2 = common_voice_train_2.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train_2.column_names)

vocab_train_1 = common_voice_train_1.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train_1.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [19]:
vocab_train_2 = common_voice_train_2.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train_2.column_names)

vocab_train_1 = common_voice_train_1.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train_1.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [20]:
vocab_list = list(set(vocab_train_2["vocab"][0]) |set(vocab_train_1["vocab"][0]) | set(vocab_test["vocab"][0]))
print(vocab_list)

[' ', 'e', 'ŋ', 'b', 'm']


#### CREATION DU VOCABULAIRE

In [21]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{' ': 0, 'e': 1, 'ŋ': 2, 'b': 3, 'm': 4}

In [22]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [23]:
vocab_dict["[UNK]"] = len(vocab_dict)  #unknown token
vocab_dict["[PAD]"] = len(vocab_dict)  #epselon du language
len(vocab_dict)

7

In [24]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

#### CREATION DU CTC

In [25]:
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

## FINETUNING DE LENCODEUR

#### feature extractor

In [26]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
feature_extractor.sampling_rate

16000

In [27]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [28]:
processor.save_pretrained("./ASR YEMBA Data/wav2vec2-large-xlsr-Yemba")

[]

In [29]:
common_voice_train_1[0]

{'Chemin complet': '../data/YembaEGRA/audio/CI1/W1/spkr_12_word_1_occ_1_ci_1_l_3.wav',
 'Yemba': 'mbeŋ ',
 '__index_level_0__': 7}

In [30]:
import soundfile as sf
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = sf.read(batch["Chemin complet"])
    batch["speech"] = speech_array
    batch["sampling_rate"] = 16000
    batch["target_text"] = batch["Yemba"]
    return batch

def speech_file_to_array_fn_test(batch):
    speech_array, sampling_rate = sf.read(batch["Chemin complet"])
    batch["speech"] = speech_array
    batch["sampling_rate"] = 16000
    return batch

convertion des son en vecteur

In [31]:
common_voice_train_1 = common_voice_train_1.map(speech_file_to_array_fn, remove_columns=common_voice_train_1.column_names,num_proc=3)
common_voice_train_2 = common_voice_train_2.map(speech_file_to_array_fn, remove_columns=common_voice_train_2.column_names,num_proc=3)

common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names,num_proc=2)

Map (num_proc=3):   0%|          | 0/66 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/66 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

In [32]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(common_voice_train_1)-1)

ipd.Audio(data=np.asarray(common_voice_train_1[rand_int]["speech"]), autoplay=True, rate=16000)

In [33]:
rand_int = random.randint(0, len(common_voice_train_1)-1)

print("Target text:", common_voice_train_1[rand_int]["target_text"])
print("Input array shape:", np.asarray(common_voice_train_1[rand_int]["speech"]).shape)
print("Sampling rate:", common_voice_train_1[rand_int]["sampling_rate"])

Target text: mbeŋ 
Input array shape: (21862, 2)
Sampling rate: 16000


In [34]:
def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values

    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

In [35]:
def prepare_dataset_test(batch):
    # check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], padding=True,sampling_rate=batch["sampling_rate"][0]).input_values

    return batch

In [36]:
common_voice_train_1.shape,common_voice_test.shape

((66, 3), (2, 3))

In [37]:
common_voice_train_1 = common_voice_train_1.map(prepare_dataset, remove_columns=common_voice_train_1.column_names, batch_size=8, num_proc=4, batched=True)
common_voice_train_2 = common_voice_train_2.map(prepare_dataset, remove_columns=common_voice_train_2.column_names, batch_size=8, num_proc=4, batched=True)

Map (num_proc=4):   0%|          | 0/66 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/66 [00:00<?, ? examples/s]



In [38]:
from datasets import concatenate_datasets

common_voice_train = concatenate_datasets([common_voice_train_1, common_voice_train_2])


### ETRAINEMENT

In [39]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

from tqdm import tqdm
import torch

In [40]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [41]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [42]:
# prompt: code necessaire pour wer_metric = load_metric("wer")

# D'abord installer evaluate
!pip install jiwer
!pip install evaluate
from evaluate import load
wer_metric = load("wer")

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew P

In [43]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [44]:
len(processor.tokenizer)

9

In [45]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
model.freeze_feature_extractor()



In [47]:
training_args = TrainingArguments(
  output_dir="./wav2vec2-large-xlsr-Yemba",
  group_by_length=True,
  per_device_train_batch_size=16,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=40,
  fp16=True,
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=3e-4,
  warmup_steps=1000,
  save_total_limit=2,
)



In [48]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,
)

  trainer = Trainer(


ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [None]:
os.environ['WANDB_MODE'] = 'dryrun'
os.environ['WANDB_DISABLED '] = 'True'

In [None]:
common_voice_train.shape

In [None]:
trainer.train()