### Setup

Mount the drive, install pip packages, load libraries and set up wandb for training logging

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

## paths
# NOTE(review): relative path -- presumably resolved against the directory that
# contains the 'drive' mount point; confirm the working directory
path = 'drive/MyDrive/Colab Notebooks/Zindi ASR/'
# data and model folders both live under the project root
path_data, path_model = (
    os.path.join(path, subfolder) for subfolder in ('data/ASR_Zindi/', 'models/')
)

In [2]:
## mount drive
# Colab-only step: exposes Google Drive under /content/drive so that the
# project folders configured in `path` can be read and written
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
## installing pip packages
# NOTE(review): only wandb is pinned. transformers is installed from the
# repository head (the recorded output shows 4.7.0.dev0), so a later re-run may
# pull a different version -- consider pinning every package.
!pip install datasets
!pip install git+https://github.com/huggingface/transformers
!pip install wandb==0.10.25
!pip install jiwer
!pip install nlp

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-hk3as8eu
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-hk3as8eu
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (PEP 517) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.7.0.dev0-cp37-none-any.whl size=2308963 sha256=4a38fa1e91eb400c19df223c1cb7eb7c45e8b745cb8ae5ab59fcc8aaa696ecbc
  Stored in directory: /tmp/pip-ephem-wheel-cache-zwzq_xjv/wheels/70/d3/52/b3fa4f8b8ef04167ac62e5bb2accb62ae764db2a378247490e
Successfully built transformers


In [4]:
## load packages
# standard python packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import os.path
import time
import gc

# pre-processing
import librosa as lb
import re

# torch
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch.utils.data import random_split

import torch.nn as nn
import torch.nn.functional as F

# transformers
from transformers import Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor
from transformers import TrainingArguments
from transformers import Trainer
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup

# data collator for model
from nlp import Dataset
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

# wandb for logging
import wandb

# metric
from datasets import load_metric

In [5]:
## Seeding
# the same fixed seed for every random number generator used in this notebook
# (python, numpy, torch on CPU, torch on all CUDA devices)
for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seeder(10)

In [6]:
## Model & Learning Configuration
wandb.login()

# quantity the sweep is asked to minimize
# NOTE(review): confirm that "loss" matches a key that is actually logged during training
metric = {"name": "loss", "goal": "minimize"}

# every hyperparameter is pinned to a single value
fixed_values = {
    "learning_rate": 4.4e-4,
    "batch_size": 16,
    "warmup_steps": 100,
    "attention_dropout": 0.024,
    "hidden_dropout": 0.024,
    "feat_proj_dropout": 0.0,
    "mask_time_prob": 0.057,
    "layerdrop": 0.024,
}
# wandb expects each parameter wrapped as {"value": ...}
parameters_dict = {name: {"value": value} for name, value in fixed_values.items()}

sweep_config = {
    "method": "grid",
    "metric": metric,
    "parameters": parameters_dict,
}
sweep_id = wandb.sweep(sweep_config, project="zindi-asr")

[34m[1mwandb[0m: Currently logged in as: [33mromanengeler1805[0m (use `wandb login --relogin` to force relogin)


Create sweep with ID: gzp6ets6
Sweep URL: https://wandb.ai/romanengeler1805/zindi-asr/sweeps/gzp6ets6


### Data

Load the data and remove special characters

In [7]:
## Read data into memory
df = pd.read_csv(path_data+'/ASR_train.csv')
nsamples = len(df)

# the decoded audio is cached as a feather file; the sample count is part of its name
path_cache = path_data+'/ASR_train_audio'+str(nsamples)+'.ft'

# check if already existent
if os.path.isfile(path_cache):
    print ("File exist")
    df = pd.read_feather(path_cache)
else:
    print("File does not exist")

    # initialize with a placeholder list per row; each one is overwritten below
    df['audio_signal'] = nsamples*[[0]]

    # functional but not elegant (nor fast probably)
    for k in range(nsamples):
        clip_id = df.iloc[k]['ID']
        # use a separate variable for the clip path: path_data must stay the data
        # folder, it is needed for every following clip and for the cache file
        path_clip = os.path.join(path_data, 'clips', clip_id+'.mp3')
        # decode at 16 kHz, the rate the feature extractor is configured for
        waveform, _rate = lb.load(path_clip, sr=16000)
        df.at[k, 'audio_signal'] = waveform

        if k % 100 == 0:
            print('file '+ str(k))

    # store as faster feather format
    df.to_feather(path_cache)

File exist


In [8]:
## train valid split
# 15% of the rows are held out for validation (1003 of 6683 according to the
# progress bars recorded further down); random_state is fixed so the split is repeatable
from sklearn.model_selection import train_test_split
df_train, df_valid = train_test_split(df, test_size=0.15, random_state=1234)

In [9]:
## dataset library (1-2GB/s data processing)
# both splits keep the same three columns
dataset_columns = ['ID', 'transcription', 'audio_signal']
data_train = Dataset.from_pandas(df_train[dataset_columns])
data_valid = Dataset.from_pandas(df_valid[dataset_columns])

In [10]:
## Lower casing (no punctuation included)
def remove_special_characters(batch):
    """
    Add a cleaned ``text`` entry derived from ``batch["transcription"]``.

    The transcription is lower-cased, the characters ``, ( ) . ? ! ~ ; ^`` and
    all digits are removed, and a single space is appended as word separator.
    The entry is written into ``batch`` itself, which is also returned.
    """
    lowered = batch["transcription"].lower()
    stripped = re.sub('[,().?!~;1234567890^]', '', lowered)
    batch['text'] = stripped + ' '
    return batch

# map() runs without batched=True, so remove_special_characters receives one
# example per call (the recorded progress bars count 5680 and 1003 single items);
# the former batch_size argument had no effect in this mode and was dropped
data_train = data_train.map(remove_special_characters, remove_columns=['transcription'])
data_valid = data_valid.map(remove_special_characters, remove_columns=['transcription'])

HBox(children=(FloatProgress(value=0.0, max=5680.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))




### Processor

Prepare tokenizer and feature extractor as well as data collator

In [11]:
# tokenizer (for the output text), loaded from the pretrained French checkpoint
# IMPORTANT: an earlier version used Wav2VecTokenizer (not the CTC one)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53-french"
)

# feature extractor (for the input: cut into windows, normalize etc.)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# processor: one object that bundles the feature extractor and the tokenizer
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [12]:
## extract input_values (normalization)
def prepare_dataset(batch):
    """
    Turn raw audio and cleaned text into model inputs and label ids.

    Reads ``batch["audio_signal"]`` and ``batch["text"]`` and writes
    ``batch["input_values"]`` (output of the module-level ``processor`` on the
    audio) and ``batch["labels"]`` (``input_ids`` produced by the processor in
    target mode on the text). Returns the same ``batch`` object.
    """
    audio_features = processor(batch["audio_signal"], sampling_rate=16*1e3)
    batch["input_values"] = audio_features.input_values

    # inside this context the processor handles text targets instead of audio
    with processor.as_target_processor():
        encoded_text = processor(batch["text"])
    batch["labels"] = encoded_text.input_ids

    return batch

# batched=True with batch_size=16: prepare_dataset is called on groups of 16
# examples (355 and 63 calls in the recorded progress bars); all columns that
# existed before the call are removed from the result
data_train = data_train.map(prepare_dataset, remove_columns=data_train.column_names, batch_size=16, batched=True)
data_valid = data_valid.map(prepare_dataset, remove_columns=data_valid.column_names, batch_size=16, batched=True)

HBox(children=(FloatProgress(value=0.0, max=355.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




In [13]:
## data collator (dynamic padding)
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate a list of examples into one batch, padding inputs and labels separately.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor whose ``pad`` method is used for both inputs and labels.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy handed to ``processor.pad``:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (no padding if only a
              single sequence is provided).
            * :obj:`'max_length'`: pad to the length given by :obj:`max_length`, or to the maximum acceptable
              input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding (the batch may contain sequences of different
              lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, ``input_values`` are padded to a multiple of this value.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set, ``labels`` are padded to a multiple of this value.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Pad one batch.

        Args:
            features: examples, each a dict with the keys ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch returned by ``processor.pad`` with an added
            ``"labels"`` entry in which padded positions are set to -100.
        """
        # inputs and labels have different lengths and need different padding
        # methods, so they are collected and padded independently
        audio_inputs = []
        text_targets = []
        for example in features:
            audio_inputs.append({"input_values": example["input_values"]})
            text_targets.append({"input_ids": example["labels"]})

        batch = self.processor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            padded_targets = self.processor.pad(
                text_targets,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # positions outside the attention mask are padding: mark them with -100
        # so that they are ignored by the loss
        is_padding = padded_targets.attention_mask.ne(1)
        batch["labels"] = padded_targets["input_ids"].masked_fill(is_padding, -100)

        # input_values, attention_mask, labels
        return batch


# collator instance handed to the Trainer (pads each batch to its longest member)
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

### Training

Initialize metric and train the model

In [14]:
## word error rate
wer_metric = load_metric("wer")

def compute_metrics(pred):
    """
    Compute the word error rate for one evaluation pass.

    Args:
        pred: object with the attributes ``predictions`` (model logits) and
            ``label_ids`` (label ids in which ignored positions are -100).

    Returns:
        dict with the single key ``"wer"``.
    """
    # most likely token per position: argmax over the last axis of the logits
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 id -> pad token; done on a copy so that the caller's label array
    # is left untouched
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    # prediction id -> character
    pred_str = processor.batch_decode(pred_ids)
    # the labels are decoded without grouping tokens
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [15]:
## training
def train(config=None):
    """
    Fine-tune the pretrained French wav2vec2 checkpoint inside one wandb run.

    Args:
        config: passed on to ``wandb.init``; the hyperparameters actually used
            (dropouts, mask_time_prob, layerdrop, batch_size, learning_rate,
            warmup_steps) are then read from ``wandb.config``.

    Uses the module-level ``processor``, ``data_collator``, ``compute_metrics``,
    ``data_train``, ``data_valid`` and ``path_model``.
    """
    # Initialize a new wandb run
    with wandb.init(config=config):
        config = wandb.config

        # model (vocab_size is deliberately not overridden: doing so led to a
        # mis-match of the last layer)
        model = Wav2Vec2ForCTC.from_pretrained(
            "facebook/wav2vec2-large-xlsr-53-french",
            attention_dropout=config.attention_dropout,
            hidden_dropout=config.hidden_dropout,
            feat_proj_dropout=config.feat_proj_dropout,
            mask_time_prob=config.mask_time_prob,
            layerdrop=config.layerdrop,
            gradient_checkpointing=True, # save GPU memory
            ctc_loss_reduction="mean",
            pad_token_id=processor.tokenizer.pad_token_id, # define pad token
        )

        # move to GPU and freeze the feature extractor
        model.to('cuda')
        model.freeze_feature_extractor()

        # timestamp is taken exactly once, so output_dir and run_name always agree
        # (two separate strftime calls could straddle a minute boundary)
        run_path = path_model + time.strftime("%d-%m-%Y %H:%M")

        # hyperparameters for training
        training_args = TrainingArguments(
            output_dir=run_path,
            group_by_length=True,
            per_device_train_batch_size=config.batch_size,
            gradient_accumulation_steps=2,
            evaluation_strategy="steps",
            num_train_epochs=30,
            fp16=True, # True only on cuda
            save_steps=560,
            eval_steps=130,
            logging_steps=130,
            learning_rate=config.learning_rate,
            warmup_steps=config.warmup_steps,
            save_total_limit=1,
            report_to="wandb",
            run_name=run_path,
        )

        # Trainer
        trainer = Trainer(
            model=model,
            data_collator=data_collator,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=data_train,
            eval_dataset=data_valid,
            tokenizer=processor.feature_extractor,
        )

        # garbage collector: free python and cached GPU memory before training starts
        gc.collect()
        torch.cuda.empty_cache()

        # start training
        model.train()
        trainer.train()
        # NOTE(review): leaving the with-block presumably finishes the run as
        # well, which would make this call redundant -- confirm
        wandb.finish()

In [16]:
# start the sweep agent: it calls train() for the runs of sweep `sweep_id`
# (the recorded output shows one run being started with the fixed configuration)
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: 47wo7hou with config:
[34m[1mwandb[0m: 	attention_dropout: 0.024
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	feat_proj_dropout: 0
[34m[1mwandb[0m: 	hidden_dropout: 0.024
[34m[1mwandb[0m: 	layerdrop: 0.024
[34m[1mwandb[0m: 	learning_rate: 0.00044
[34m[1mwandb[0m: 	mask_time_prob: 0.057
[34m[1mwandb[0m: 	warmup_steps: 100
[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Step,Training Loss,Validation Loss


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
