### Setup

In [1]:
## path
import os
path = 'drive/MyDrive/Colab Notebooks/'
path_model = os.path.join(path, '/model/wav2vec2-large-xlsr-french-23May')

import warnings
warnings.filterwarnings('ignore')

In [2]:
# installing
!pip install datasets
!pip install git+https://github.com/huggingface/transformers
!pip install wandb==0.10.25
!pip install jiwer

Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/46/1a/b9f9b3bfef624686ae81c070f0a6bb635047b17cdb3698c7ad01281e6f9a/datasets-1.6.2-py3-none-any.whl (221kB)
[K     |████████████████████████████████| 225kB 15.1MB/s 
[?25hCollecting xxhash
[?25l  Downloading https://files.pythonhosted.org/packages/7d/4f/0a862cad26aa2ed7a7cd87178cbbfa824fc1383e472d63596a0d018374e7/xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243kB)
[K     |████████████████████████████████| 245kB 26.0MB/s 
Collecting huggingface-hub<0.1.0
  Downloading https://files.pythonhosted.org/packages/32/a1/7c5261396da23ec364e296a4fb8a1cd6a5a2ff457215c6447038f18c0309/huggingface_hub-0.0.9-py3-none-any.whl
Collecting fsspec
[?25l  Downloading https://files.pythonhosted.org/packages/bc/52/816d1a3a599176057bf29dfacb1f8fadb61d35fbd96cb1bab4aaa7df83c0/fsspec-2021.5.0-py3-none-any.whl (111kB)
[K     |████████████████████████████████| 112kB 23.1MB/s 
Installing collected packages: xxhash, huggingfac

In [3]:
## load packages
# standard python packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import os.path
import time
import gc

# pre-processing
import librosa as lb

# torch
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch.utils.data import random_split

import torch.nn as nn
import torch.nn.functional as F

# transformers
from transformers import Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor
from transformers import TrainingArguments
from transformers import Trainer
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup

# data collator for model
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

#
import wandb

# metric
from datasets import load_metric

In [4]:
## seeding
random.seed(10)
np.random.seed(10)
torch.manual_seed(10)
torch.cuda.manual_seed_all(10)

In [6]:
## Configuration
wandb.login()

sweep_config = {
    "method": "grid"
}

metric = {
    "name": "loss",
    "goal": "minimize"   
    }

sweep_config["metric"] = metric

parameters_dict = {
    "learning_rate": {
        "value": 4.4e-4,
    },
    "batch_size": {
        "value": 10,
    },
    "warmup_steps": {
        "value": 100,
    },
    "attention_dropout": {
        "value": 0.024,
    },
    "hidden_dropout": {
        "value": 0.024,
    }, 
    "feat_proj_dropout": {
        "value": 0.0,
    },
    "mask_time_prob": {
        "value": 0.057,
    },
    "layerdrop": {
        "value": 0.024,
    },
    }

sweep_config["parameters"] = parameters_dict
sweep_id = wandb.sweep(sweep_config, project="zindi-asr")

NameError: ignored

In [5]:
torch.cuda.is_available()

True

In [6]:
## mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
## prevent disconnecting
# right mouse click -> inspect -> Console tab and insert code
'''
function ConnectButton(){
    console.log("Connect pushed");
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
}
setInterval(ConnectButton,60000);
'''

'\nfunction ConnectButton(){\n    console.log("Connect pushed");\n    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()\n}\nsetInterval(ConnectButton,60000);\n'

In [9]:
## useful links
# https://heartbeat.fritz.ai/the-3-deep-learning-frameworks-for-end-to-end-speech-recognition-that-power-your-devices-37b891ddc380
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

### Data

In [10]:
## useful links
# https://www.openslr.org/12
# https://www.machinecurve.com/index.phjp/2021/02/17/easy-speech-recognition-with-machine-learning-and-huggingface-transformers/
# audio processing: https://librosa.org/doc/main/generated/librosa.load.html
# https://maelfabien.github.io/machinelearning/wav2vec/#

In [7]:
## read into memory (small) -> storing takes around 2hrs
df = pd.read_csv('drive/MyDrive/Colab Notebooks/data/ASR_train.csv')
nsamples = len(df)

# check if already existent
if os.path.isfile('drive/MyDrive/Colab Notebooks/data/ASR_train_audio'+str(nsamples)+'.ft'):
    print ("File exist")
    df = pd.read_feather('drive/MyDrive/Colab Notebooks/data/ASR_train_audio'+str(nsamples)+'.ft')
else:
    print("File does not exist")

    # initialize with list
    audio_signals = len(df['ID'])*[[0]]
    df['audio_signal'] = audio_signals

    # functional but not elegant (nor fast probably)
    for k in range(nsamples):
      id = df.iloc[k]['ID']
      path_data = os.path.join('drive/MyDrive/Colab Notebooks/data/clips/', id+'.mp3')
      waveform, rate = lb.load(path_data, sr=16*1e3)
      df.at[k, 'audio_signal'] = waveform

      if k % 100 == 0:
        print('file '+ str(k))

    # store as faster feather format
    df[:nsamples].to_feather('drive/MyDrive/Colab Notebooks/data/ASR_train_audio'+str(nsamples)+'.ft')

    #
    df = df[:nsamples]

File exist


In [16]:
## train valid split
from sklearn.model_selection import train_test_split

# df_train -> used in optimization
# df_valid -> used to evaluate model during optimization
# df_valid2 -> independent set for testing
df_train, df_valid = train_test_split(df, test_size=0.15, random_state=1234)

In [9]:
# only take read samples
dfFrench = pd.read_csv('drive/MyDrive/Colab Notebooks/data/ASR_French/fn_text.txt', delimiter="wav ")[:6298]
dfFrench.columns = ["ID", "transcription"]

In [10]:
import random
train_idsF = random.sample(range(len(dfFrench)), int(0.2* len(df_train)))

# initialize with list
audio_signals = len(dfFrench)*[[0]]
dfFrench['audio_signal'] = audio_signals

# functional but not elegant (nor fast probably)
for k, i in enumerate(train_idsF):
  id = dfFrench.iloc[i]['ID']
  path_data = os.path.join('drive/MyDrive/Colab Notebooks/data/ASR_French/'+id[13:25]+id[36:47], id[47:]+'wav')
  waveform, rate = lb.load(path_data, sr=16*1e3)
  dfFrench.at[i, 'audio_signal'] = waveform

  if k % 100 == 0:
    print('file '+ str(k))

file 0
file 100
file 200
file 300
file 400
file 500
file 600
file 700
file 800
file 900
file 1000
file 1100


In [11]:
# only take read samples
dfWolof = pd.read_csv('drive/MyDrive/Colab Notebooks/data/ASR_Wolof/train/text', names=["mixed"])
dfWolof[["ID", "transcription"]] = dfWolof["mixed"].str.split(' ', 1, expand=True)
dfWolof = dfWolof.drop(["mixed"], axis=1)

In [12]:
import random
train_idsW = random.sample(range(len(dfWolof)), int(0.2* len(df_train)))

# initialize with list
audio_signals = len(dfWolof)*[[0]]
dfWolof['audio_signal'] = audio_signals

# functional but not elegant (nor fast probably)
for k, i in enumerate(train_idsW):
  id = dfWolof.iloc[i]['ID']
  path_data = os.path.join('drive/MyDrive/Colab Notebooks/data/ASR_Wolof/train/'+id[4:6], id+'.wav')
  waveform, rate = lb.load(path_data, sr=16*1e3)
  dfWolof.at[i, 'audio_signal'] = waveform

  if k % 100 == 0:
    print('file '+ str(k))

file 0
file 100
file 200
file 300
file 400
file 500
file 600
file 700
file 800
file 900
file 1000
file 1100


In [13]:
## listen to a sample
import IPython.display as ipd

idx = 35
print("Target text:", dfWolof["transcription"].values[train_idsW[idx]])
ipd.Audio(data=dfWolof["audio_signal"].values[train_idsW[idx]], autoplay=True, rate=16000)

Target text:  ngoon gu nekk nga defal ko benn ci rasu gi


In [17]:
print(len(dfWolof.iloc[train_idsW]))
print(len(dfFrench.iloc[train_idsF]))
print(len(df_train))

1136
1136
5680


In [18]:
df_train = df_train[['ID', 'transcription', 'audio_signal']]
df_train = pd.concat([df_train, dfFrench.iloc[train_idsF], dfWolof.iloc[train_idsW]])
print(len(df_train))

7952


In [20]:
## dataset library (1-2GB/s data processing)
!pip install nlp
from nlp import Dataset

data_train = Dataset.from_pandas(df_train[['ID', 'transcription', 'audio_signal']])
data_valid = Dataset.from_pandas(df_valid[['ID', 'transcription', 'audio_signal']])



In [21]:
## Lower casing (no punctuation included)
import re
def remove_special_characters(batch):
    batch['text'] = re.sub('[,().?!~;1234567890^]', '', batch["transcription"].lower()) + ' ' # lower casing + remove ( ) , " + word separator at the end
    return batch

data_train = data_train.map(remove_special_characters, batch_size=16, remove_columns=['transcription'])
data_valid = data_valid.map(remove_special_characters, batch_size=16, remove_columns=['transcription'])

HBox(children=(FloatProgress(value=0.0, max=7952.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))




In [39]:
data_vocab = Dataset.from_pandas(df_train[['ID', 'transcription', 'audio_signal']].iloc[random.sample(range(len(df_train)), int(0.2* len(df_train)))])
data_vocab = data_vocab.map(remove_special_characters, batch_size=16, remove_columns=['transcription'])

HBox(children=(FloatProgress(value=0.0, max=1590.0), HTML(value='')))




### Model

#### XLSR Model Transformers

In [18]:
## useful links
# https://github.com/pytorch/fairseq/tree/master/examples/wav2vec
# https://github.com/pytorch/fairseq/issues/3199
# https://bleepcoder.com/fairseq/708379224/wav2vec-2-0-inference-pipeline
# https://huggingface.co/transformers/training.html
# https://huggingface.co/blog/fine-tune-xlsr-wav2vec2 fine-tuning XLSR
# https://distill.pub/2017/ctc/ sequence modeling with CTC

#### Pre-Processing

In [22]:
## tokenizer (for output text)
#tokenizer = Wav2Vec2CTCTokenizer(vocab_path, bos_token='<s>', eos_token='</s>', unk_token='<unk>', pad_token='<pad>', word_delimiter_token=' ')
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-large-xlsr-53-french") # IMPORTANT: before used Wav2VecTokenizer (not CTC)

## feature extractor (best guess: for input to cut into windows, normalize etc.)
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

## processor (combine tokenizer and feature extractor)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=460.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=378.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=85.0, style=ProgressStyle(description_w…




#### Preprocess Data

In [20]:
## listen to a sample
'''
import IPython.display as ipd

rand_int = random.randint(0, len(data_train))
print("Target text:", data_train[rand_int]["text"])

ipd.Audio(data=np.asarray(data_train[rand_int]["audio_signal"]), autoplay=True, rate=16000)
'''

'\nimport IPython.display as ipd\n\nrand_int = random.randint(0, len(data_train))\nprint("Target text:", data_train[rand_int]["text"])\n\nipd.Audio(data=np.asarray(data_train[rand_int]["audio_signal"]), autoplay=True, rate=16000)\n'

In [23]:
## extract input_values (normalization)
def prepare_dataset(batch):
    batch["input_values"] = processor(batch["audio_signal"], sampling_rate=16*1e3).input_values

    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

# parameter num_proc does not exist in the currently used version of datasets
data_train = data_train.map(prepare_dataset, remove_columns=data_train.column_names, batch_size=16, batched=True)
data_valid = data_valid.map(prepare_dataset, remove_columns=data_valid.column_names, batch_size=16, batched=True)

HBox(children=(FloatProgress(value=0.0, max=497.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




#### Training

In [24]:
## data collator (dynamic padding)
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        # input_values, attention_mask, labels
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [25]:
## metric
wer_metric = load_metric("wer")

def compute_metrics(pred):
    # argmax of softmax
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # -100 id -> pad token
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    # prediction id -> character
    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics?
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

Collecting jiwer
  Downloading https://files.pythonhosted.org/packages/8c/cc/fb9d3132cba1f6d393b7d5a9398d9d4c8fc033bc54668cf87e9b197a6d7a/jiwer-2.2.0-py3-none-any.whl
Collecting python-Levenshtein
[?25l  Downloading https://files.pythonhosted.org/packages/2a/dc/97f2b63ef0fa1fd78dcb7195aca577804f6b2b51e712516cc0e902a9a201/python-Levenshtein-0.12.2.tar.gz (50kB)
[K     |████████████████████████████████| 51kB 5.5MB/s 
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25l[?25hdone
  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=149801 sha256=97e26f88465118b8d926e62a7a5392fb245a7fc954ec7806ed4bcb6d10df6ab6
  Stored in directory: /root/.cache/pip/wheels/b3/26/73/4b48503bac73f01cf18e52cd250947049a7f339e940c5df8fc
Successfully built python-Levenshtein
Installing collected packages: python-Levenshtein, jiwer
Successfully installed jiwer-2.2.0 python-Levenshtein-0.12.

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1947.0, style=ProgressStyle(description…




In [301]:
## training
def train(config=None):
  # Initialize a new wandb run
    with wandb.init(config=config):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        ## model
        ## Note: play around with hyperparameters (take training to laptop and perform grid search?)
        # https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/

        model = Wav2Vec2ForCTC.from_pretrained(
            "facebook/wav2vec2-large-xlsr-53-french",
            attention_dropout=config.attention_dropout,
            hidden_dropout=config.hidden_dropout,
            feat_proj_dropout=config.feat_proj_dropout,
            mask_time_prob=config.mask_time_prob,
            layerdrop=config.layerdrop,
            gradient_checkpointing=True, # save GPU memory
            ctc_loss_reduction="mean",
            pad_token_id=processor.tokenizer.pad_token_id, # define pad token
            #vocab_size=len(processor.tokenizer)# -> mis-match of last layer due to vocab size
        )

        model.to('cuda')
        model.freeze_feature_extractor()

        # freeze all layers
        for name, param in model.named_parameters():
          # param.requires_grad = False
          if 'lm_head' not in name:
            param.requires_grad = False

          if param.requires_grad:
            print(name)

        # new classifier
        model.lm_head = torch.nn.Linear(1024, out_features=len(processor.tokenizer), bias=True)

        #
        '''
        grouped_params = model.parameters()
        optimizer = AdamW(grouped_params, lr=config.learning_rate)
        scheduler = get_cosine_schedule_with_warmup(optimizer, config.warmup_steps, num_training_steps=int(len(data_train)/8*20))
        optimizers = optimizer, scheduler
        '''

        ## hyperparameters
        run_name = "parameter-sweep-lr"+ str(time.strftime("%d-%m-%Y %H:%M"))
        training_args = TrainingArguments(
          output_dir='./drive/MyDrive/Colab Notebooks/model/wav2vec2-large-xlsr-french-23May/'+run_name,
          group_by_length=True,
          per_device_train_batch_size=config.batch_size,
          gradient_accumulation_steps=2,
          evaluation_strategy="steps",
          num_train_epochs=10,
          fp16=True, # True only on cuda
          save_steps=780,
          eval_steps=50, #130
          logging_steps=65,
          learning_rate=config.learning_rate,
          warmup_steps=config.warmup_steps,
          save_total_limit=1,
          report_to="wandb",
          run_name=run_name,
        )

        ## Trainer
        trainer = Trainer(
            model=model,
            data_collator=data_collator,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=data_train,
            eval_dataset=data_train,
            tokenizer=processor.feature_extractor,
            #optimizers=optimizers,
        )

        ## Now train complete model
        model.freeze_feature_extractor()

        # freeze all layers
        for name, param in model.named_parameters():
          param.requires_grad = True

        model.freeze_feature_extractor()

        ## Trainer
        trainer = Trainer(
            model=model,
            data_collator=data_collator,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=data_train,
            eval_dataset=data_train,
            tokenizer=processor.feature_extractor,
            #optimizers=optimizers,
        )

        # garbage collector
        gc.collect()
        torch.cuda.empty_cache()

        ## start training
        model.train()
        trainer.train()
        wandb.finish()

In [302]:
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: ft7sy86e with config:
[34m[1mwandb[0m: 	attention_dropout: 0.024
[34m[1mwandb[0m: 	batch_size: 10
[34m[1mwandb[0m: 	feat_proj_dropout: 0
[34m[1mwandb[0m: 	hidden_dropout: 0.024
[34m[1mwandb[0m: 	layerdrop: 0.024
[34m[1mwandb[0m: 	learning_rate: 0.00044
[34m[1mwandb[0m: 	mask_time_prob: 0.057
[34m[1mwandb[0m: 	warmup_steps: 100
[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


lm_head.weight
lm_head.bias




Step,Training Loss,Validation Loss,Wer
50,No log,3.274332,1.0
100,10.479600,3.080422,1.0
150,3.131900,1.598549,1.177211
200,1.648600,0.720911,0.877557
250,1.648600,0.421977,0.711992
300,0.734100,0.260678,0.583569
350,0.426500,0.169657,0.464274


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/loss,0.16966
eval/wer,0.46427
eval/runtime,52.1352
eval/samples_per_second,14.597
train/epoch,9.99
train/global_step,380.0
_runtime,1572.0
_timestamp,1621766671.0
_step,12.0
train/loss,0.4265


0,1
eval/loss,██▄▂▂▁▁
eval/wer,▆▆█▅▃▂▁
eval/runtime,▃▇█▅▃▁▁
eval/samples_per_second,▆▂▁▄▆██
train/epoch,▁▁▂▃▃▄▄▅▅▆▇▇█
train/global_step,▁▁▂▃▃▄▄▅▅▆▇▇█
_runtime,▁▁▂▃▃▄▄▅▆▆▇██
_timestamp,▁▁▂▃▃▄▄▅▆▆▇██
_step,▁▂▂▃▃▄▅▅▆▆▇▇█
train/loss,█▃▂▁▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


### Evaluation

In [18]:
from datasets import load_metric
import difflib

#from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
if not 'model' in globals():
  print('Load model')
  model = Wav2Vec2ForCTC.from_pretrained('./drive/MyDrive/Colab Notebooks/model/checkpoint-8000/').to("cuda")

model.eval();

# new processor if not yet existent
if not 'processor' in globals():
  print('Load processor')
  tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-large-xlsr-53-french")
  feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
  processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
  # processor = Wav2Vec2Processor.from_pretrained('./drive/MyDrive/Colab Notebooks/model/wav2vec2-large-xlsr-french-test/checkpoint-150/') # for some reason the file is not found

#
def prepare_dataset(batch):
    return processor(batch, return_tensors="pt", sampling_rate=16*1e3)

wer_ = []

#
input_dict = df_valid2['audio_signal'].apply(prepare_dataset)
label_str = ''
pred_str = ''

for idx in range(len(df_valid2)):
  #print('-----------------')
  logits = model(input_dict.values[idx].input_values.to("cuda")).logits
  pred_ids = torch.argmax(logits, dim=-1)[0]
  pred_str+= processor.decode(pred_ids)+ ' '
  label_str+= df_valid2["transcription"].values[idx].lower()+ ' '

wer_.append(wer_metric.compute(predictions=[pred_str], references=[label_str]))
np.mean(wer_)

1337


0.03960628075931568