# Setup

In [1]:
# %%capture
# !pip install --upgrade huggingface_hub
# !pip install transformers
# !pip install torchaudio
# !pip install librosa
# !pip install jiwer
# !pip install soundfile
# !pip install evaluate
# !pip uninstall accelerate -y
# !pip install accelerate -U
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [1]:
!pip show transformers

Name: transformers
Version: 4.37.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\maron\anaconda3\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [None]:
import json
import torch
import torch.multiprocessing as mp
import torch.distributed as dist
import sys
import os
import toml
import warnings
import datetime
warnings.filterwarnings('ignore')
os.chdir(r"/mnt/f/IA/WOLOF/SPEECH_TO_TEXT")
# to add the path of the different on module
# sys.path.append(r'CODES/ASR-Wav2vec-Finetune-main')

In [None]:
import librosa
import numpy as np

def load_wav(path, sr):
    return librosa.load(path, sr = sr)[0]

def subsample(data, sub_sample_length):
    assert np.ndim(data) == 1, f"Only support 1D data. The dim is {np.ndim(data)}"
    length = len(data)

    if length > sub_sample_length:
        start = np.random.randint(length - sub_sample_length)
        end = start + sub_sample_length
        data = data[start:end]
        assert len(data) == sub_sample_length
        return data
    elif length < sub_sample_length:
        data = np.append(data, np.zeros(sub_sample_length - length, dtype=np.float32))
        assert len(data) == sub_sample_length
        return data
    else:
        return data

In [None]:
import sys
import torch

# from utils.feature import load_wav
from typing import Dict

class InstanceDataset:
    def __init__(self, data, sr):
        self.data = data
        self.sr = sr
        
    def __len__(self) -> int:
        return len(self.data)
        
    def __getitem__(self, idx) -> tuple:
        item = self.data.iloc[idx]
        feature = load_wav(item['path'], sr = self.sr)
        
        return feature, item['transcript']

In [None]:
import pandas as pd
import sys
import os
import re
import regex
import librosa
import numpy as np
from pandarallel import pandarallel
from typing import Dict, List

# For testing 
sys.path.append('..')

from sklearn.model_selection import train_test_split
# from utils.feature import load_wav
from tqdm import tqdm
from torch.utils.data import Dataset
# from dataloader.dataset import Dataset as InstanceDataset


class BaseDataset(Dataset):
    def __init__(self, path, sr, delimiter, special_tokens, transform = None):
        # self.rank = rank
        self.dist = dist
        self.sr = sr
        # Special characters to remove in your data 
        self.chars_to_ignore = r'[,?.!\-;:"“%\'�]'
        self.chars_to_ignore = r'[кɲớˈ\'\xa0\r\n]'
        self.chars_to_keep = r'[^a-zA-Z\sёñïóŋöäàéîā́сđớ\'ˈоɗɲtx їüúaëçèĩã̈ûjämсукéеɓìs️öŋïõăаrýànóvñlò̃qẽyfƭhgziâwíńồpêáôùёībkр]'
        self.replace_dict = {'ï': 'a', 'î': 'i', 'ā': 'a', 'ƭ': 'c', 'ī': 'i', 'ä': 'a', 'ɗ': 'nd', 'ń': 'ñ', 'ồ': 'o',
                    'ї': 'i', 'ü': 'u', 'ù': 'u', 'ú': 'u', 'ă': 'ã', '̃': '', 'â': 'a', '́': '', 'û': 'u',
                    '̈': '', 'è': 'e', 'ç': 's', 'ö': 'o', 'ý': 'y', 'ì': 'i', 'í': 'i', '̀': '', 'ɓ':'b', 'ô':'o',
                    'ê':'e', 'à':'a'}
        self.transform = transform
        self.df = self.load_data(path, delimiter)
        self.special_tokens = special_tokens
    
    def remove_special_characters(self, transcript) -> str:
        cleaned_transcript = ""
        for word in transcript.split(','):
            if cleaned_transcript != "":
                cleaned_transcript = cleaned_transcript + ", " + word.rstrip().lstrip()
            else:
                cleaned_transcript = word
        cleaned_transcript = regex.sub(self.chars_to_keep, '', cleaned_transcript).lower() + " "
        cleaned_transcript = regex.sub(self.chars_to_ignore, '', cleaned_transcript) + " "

        for key, value in self.replace_dict.items():
            cleaned_transcript = regex.sub(key, value, cleaned_transcript) + " "
        return cleaned_transcript

    def get_vocab_dict(self) -> Dict[int, str]:
        # Read https://huggingface.co/blog/fine-tune-wav2vec2-english for more information
        all_text = " ".join(list(self.df["transcript"]))
        #  remove special tokens in all_text, otherwise it will tokenize the special tokens' characters. Eg: <unk> -> '<', 'u', 'n', 'k', '>'
        for v in self.special_tokens.values():
            all_text = all_text.replace(v, '')
        vocab_list = list(set(all_text))
        vocab_list.sort()
        vocab_dict = {v: k for k, v in enumerate(vocab_list)}

        vocab_dict["|"] = vocab_dict[" "]
        del vocab_dict[" "]
        for v in self.special_tokens.values():
            vocab_dict[v] = len(vocab_dict)
        print(vocab_dict)
        return vocab_dict

    def preload_dataset(self, paths, sr) -> List:
        wavs = []
        print("Preloading {} data".format(self.mode))
        for path in tqdm(paths, total = len(paths)):
            wav = load_wav(path, sr)
            wavs += [wav]
        return wavs

    def load_data(self, path, delimiter) -> pd.DataFrame:
        df = pd.read_csv(path, delimiter = delimiter)
        return df

    def get_data(self) -> Dataset:
        ds = InstanceDataset(self.df, self.sr, self.preload_data, self.transform)
        return ds

In [2]:
import numpy as np
import pandas as pd
import os
import pickle

In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
os.chdir(r"C:\Users\maron\OneDrive\02-Documents\03.PROJETS\00.INFORMATIQUE\02.AI\WOLOF")

In [None]:
special_tokens = {"bos_token" : "<bos>",
"eos_token" : "<eos>",
"unk_token" : "<unk>",
"pad_token" : "<pad>"}
sr = 16000
delimiter = "|"
path_train = r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\train_data.csv"
path_validation = r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\validation_data.csv"
train_base_ds = BaseDataset(path_train, sr, delimiter, special_tokens)
validation_base_ds = BaseDataset(path_train, sr, delimiter, special_tokens)

In [None]:
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor

vocab_dict = train_base_ds.get_vocab_dict()
with open(r'F:\IA\WOLOF\SPEECH_TO_TEXT\CODES\MODELS\WAV2VEC2\ASR\vocabs/vocab.json', 'w+') as f:
    json.dump(vocab_dict, f)
    f.close()
dist.barrier()
# Create processor
tokenizer = Wav2Vec2CTCTokenizer(r'F:\IA\WOLOF\SPEECH_TO_TEXT\CODES\MODELS\WAV2VEC2\ASR\vocabs/vocab.json', unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size = 1, sampling_rate = 16000, padding_value = 0.0, do_normalize = True, return_attention_mask = True)
processor = Wav2Vec2Processor(feature_extractor = feature_extractor, tokenizer = tokenizer)

In [None]:
model_path = r"SPEECH_TO_TEXT\CODES\MODELS\WAV2VEC2"
processor.save_pretrained(model_path)

## Create my data extractore

### Augmentation data testing

In [196]:
from torch.utils.data import Dataset
import librosa
import IPython.display as ipd

def augmentations(audio):
    rate = np.random.uniform(0.7, 1.3)
    scale_vol = np.random.uniform(0.5, 2)
    stretched_audio = librosa.effects.time_stretch(audio, rate = rate)
    stretched_audio = stretched_audio*scale_vol
    stretched_shifted_audio = np.roll(stretched_audio, 16000)
    wn = np.random.randn(len(stretched_shifted_audio))
    augmented_audio = stretched_shifted_audio + 0.005*wn
    return augmented_audio

with open(r"SPEECH_TO_TEXT\DATA\PREPROCESSED\test\test_data_2.pkl", 'rb') as f:
    batch = pickle.load(f)
        
audio = batch['audio'].iloc[0]

ipd.Audio(data = audio, autoplay = True, rate = 16000)

In [197]:
# Apply augmentations
audio = augmentations(audio)

ipd.Audio(data = audio, autoplay = True, rate = 16000)

### Dataset class implementation

In [212]:
from torch.utils.data import Dataset
import librosa

class MyDataset_train(Dataset):
    def __init__(self, directory, processor):
        self.directory = directory
        self.file_list = os.listdir(directory)
        self.processor = processor

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        with open(os.path.join(self.directory, self.file_list[idx]), 'rb') as f:
            batch = pickle.load(f)
        
        audio = batch['audio'].iloc[0]

        # Apply augmentations
        audio = self.augmentation(audio)

        input_values = self.processor(audio, sampling_rate=16000).input_values[0]
        labels = self.processor(text=batch['transcription'].iloc[0]).input_ids

        return {'input_values': input_values, 'labels': labels}
    
    def augmentation(self, audio):
        rate = np.random.uniform(0.7, 1.3)
        scale_vol = np.random.uniform(0.5, 2)
        stretched_audio = librosa.effects.time_stretch(audio, rate = rate)
        stretched_audio = stretched_audio*scale_vol
        stretched_shifted_audio = np.roll(stretched_audio, 16000)
        wn = np.random.randn(len(stretched_shifted_audio))
        augmented_audio = stretched_shifted_audio + 0.005*wn
        return augmented_audio


In [199]:
class MyDataset_validation(Dataset):
    def __init__(self, directory, processor):
        self.directory = directory
        self.file_list = os.listdir(directory)
        self.processor = processor

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        with open(os.path.join(self.directory, self.file_list[idx]), 'rb') as f:
            batch = pickle.load(f)
        
        audio = batch['audio'].iloc[0]
        input_values = self.processor(audio, sampling_rate=16000).input_values[0]
        labels = self.processor(text=batch['transcription'].iloc[0]).input_ids

        return {'input_values': input_values, 'labels': labels}

## Extraction of the vocabulary

In [7]:
def extract_all_chars():
    all_text = ""
    directory = r"SPEECH_TO_TEXT\DATA\PREPROCESSED\total_data"
    file_name = r"data_transcription_tot.csv"
    with open(os.path.join(directory, file_name), 'rb') as f:
        batch = pd.read_csv(f)
    for k in range(len(batch)):
        all_text += " " + batch['transcription'][k]

    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocab = extract_all_chars()

In [8]:
vocab_list = list(set(vocab["vocab"][0]))
vocab_list = list(set(vocab_list))

In [9]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'ŋ': 0,
 'l': 1,
 'о': 2,
 'а': 3,
 'ó': 4,
 'k': 5,
 '️': 6,
 'ñ': 7,
 'd': 8,
 'g': 9,
 'r': 10,
 'p': 11,
 'n': 12,
 'õ': 13,
 'с': 14,
 'i': 15,
 'e': 16,
 'ẽ': 17,
 'a': 18,
 'р': 19,
 'c': 20,
 'у': 21,
 'z': 22,
 'v': 23,
 'w': 24,
 'á': 25,
 'é': 26,
 ' ': 27,
 'u': 28,
 'o': 29,
 'ё': 30,
 'y': 31,
 'x': 32,
 'ã': 33,
 's': 34,
 'f': 35,
 'b': 36,
 'ë': 37,
 'j': 38,
 'q': 39,
 'h': 40,
 't': 41,
 'm': 42}

In [10]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [11]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

45

In [12]:
vocab_path = r"SPEECH_TO_TEXT\CODES\MODELS\WAV2VEC2"

import json
with open(vocab_path +'/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

## Preprocessor

In [13]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer(vocab_path + "/vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [14]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size = 1, sampling_rate = 16000, padding_value = 0.0, do_normalize = True, return_attention_mask = True)

In [15]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor = feature_extractor, tokenizer = tokenizer)

In [16]:
model_path = r"SPEECH_TO_TEXT\CODES\MODELS\WAV2VEC2"
processor.save_pretrained(model_path)

[]

## Dataset setting

In [213]:
train_dataset = MyDataset_train(r"SPEECH_TO_TEXT\DATA\PREPROCESSED\test", processor)
validation_dataset = MyDataset_validation(r"SPEECH_TO_TEXT\DATA\PREPROCESSED\test", processor)

In [214]:
# train_dataset = MyDataset_validation(r"SPEECH_TO_TEXT\DATA\PREPROCESSED\test", processor)
# validation_dataset = MyDataset_validation(r"SPEECH_TO_TEXT\DATA\PREPROCESSED\test", processor)

# Training

## Set-up Trainer

In [257]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # labels_batch = self.processor.pad(
        #     label_features,
        #     padding=self.padding,
        #     max_length=self.max_length_labels,
        #     pad_to_multiple_of=self.pad_to_multiple_of_labels,
        #     return_tensors="pt",
        # )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [258]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [259]:
from evaluate import load

wer_metric = load("wer")

In [260]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis = -1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [261]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [262]:
model.freeze_feature_encoder()

In [263]:
model.gradient_checkpointing_enable()

In [267]:
from transformers import TrainingArguments
model_path = r"SPEECH_TO_TEXT\CODES\MODELS\WAV2VEC2"
training_args = TrainingArguments(
  # output_dir="/content/gdrive/MyDrive/wav2vec2-large-xlsr-turkish-demo",
  output_dir = model_path + "\wav2vec2-large-xlsr-wolof",
  group_by_length = True,
  per_device_train_batch_size = 8,
  gradient_accumulation_steps = 4,
  evaluation_strategy = "steps",
  num_train_epochs = 30,
  fp16 = False,
  save_steps = 1,
  eval_steps = 1,
  logging_steps = 10,
  learning_rate = 3e-4,
  warmup_steps = 500,
  save_total_limit = 2,
)

In [268]:
from transformers import Trainer

trainer = Trainer(
    model = model,
    data_collator = data_collator,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = train_dataset,
    eval_dataset = validation_dataset,
    tokenizer = processor.feature_extractor,
)

## Training

In [269]:
trainer.train()

  0%|          | 0/60 [00:00<?, ?it/s]



  0%|          | 0/8 [00:00<?, ?it/s]



{'eval_loss': 9.45880126953125, 'eval_wer': 1.004158004158004, 'eval_runtime': 320.047, 'eval_samples_per_second': 0.194, 'eval_steps_per_second': 0.025, 'epoch': 0.5}


PermissionError: [WinError 5] Access is denied: 'SPEECH_TO_TEXT\\CODES\\MODELS\\WAV2VEC2/wav2vec2-large-xlsr-wolof\\tmp-checkpoint-1' -> 'SPEECH_TO_TEXT\\CODES\\MODELS\\WAV2VEC2/wav2vec2-large-xlsr-wolof\\checkpoint-1'