In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [55]:
import tensorflow 
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

['/device:GPU:0']

In [2]:
!mkdir dataset
!unzip -q "/content/drive/MyDrive/Saarthi Internship Task/task_data.zip" -d "/content/dataset"

In [3]:
!pip -q install transformers
!pip -q install datasets
!pip -q install jiwer

[K     |████████████████████████████████| 2.5MB 25.1MB/s 
[K     |████████████████████████████████| 3.3MB 46.8MB/s 
[K     |████████████████████████████████| 901kB 49.0MB/s 
[K     |████████████████████████████████| 245kB 30.2MB/s 
[K     |████████████████████████████████| 122kB 52.0MB/s 
[K     |████████████████████████████████| 245kB 52.5MB/s 
[K     |████████████████████████████████| 51kB 8.5MB/s 
[?25h  Building wheel for python-Levenshtein (setup.py) ... [?25l[?25hdone


In [4]:
import json

import librosa
import pandas as pd
from tqdm import tqdm_notebook
from datasets import Dataset, load_metric

from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor

from transformers import Trainer
from transformers import TrainingArguments

In [5]:
class Config:
    """This class has all the parameters that we require to set"""
    
    globals_ = {
        'seed': 42,
        'sr': 8000,
        'dataset_bs': 4,
        'dataset_num_workers': 2,
        'base_dir': '/content/dataset/task_data',
        'processor_out_dir': '/content/speech',
        'train_model_out_dir': '/content/res',
        'vocab_out_dir': '/content'
    }

    model_ = {
        'name': 'facebook/wav2vec2-base'
    }

    train_params_ = {
        
    }


In [6]:
def audio_feature(df):
  audio = list()
  duration = list()
  for each in tqdm_notebook(df.path):
    speech_array, sampling_rate = librosa.load(f"{Config.globals_['base_dir']}/{each}", sr=Config.globals_['sr'])
    d = librosa.get_duration(y=speech_array, sr=sampling_rate)
    audio.append(speech_array)
    duration.append(d)
  return audio, duration

def get_dataset():

  train_data = pd.read_csv(f"{Config.globals_['base_dir']}/train_data.csv")

  train_data["transcription"] = train_data["transcription"].str.replace("[\’\'\,\.\?]",'').str.lower()

  audio, duration = audio_feature(train_data)
  train_data["data"] = audio
  train_data["duration"] = duration

  valid_data = train_data.sample(frac=1, random_state=Config.globals_['seed'])[9000:].reset_index()
  train_data = train_data.sample(frac=1, random_state=Config.globals_['seed'])[:9000].reset_index()

  train_data=train_data.drop(['index', 'path', 'action', 'object', 'location'], axis=1)
  valid_data=valid_data.drop(['index', 'path', 'action', 'object', 'location'], axis=1)


  train_data = Dataset.from_pandas(train_data)
  valid_data = Dataset.from_pandas(valid_data)

  return train_data, valid_data

In [None]:
#train_data = train_data[train_data.duration>1]
#valid_data = valid_data[valid_data.duration>1]

In [7]:
train_data, valid_data = get_dataset()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=11566.0), HTML(value='')))




In [11]:
class Vocab:
  def __init__(self, train_data, valid_data):
    self.train_data=train_data
    self.valid_data=valid_data  

  def extract_all_chars(self, batch):
    all_text = " ".join(batch["transcription"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}
  
  def get_vocab(self):
    vocab_train = self.train_data.map(self.extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=self.train_data.column_names)
    vocab_test = self.valid_data.map(self.extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=self.valid_data.column_names)
 
    vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}  

    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]

    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    with open(f"{Config.globals_['vocab_out_dir']}/vocab.json", 'w') as vocab_file:
        json.dump(vocab_dict, vocab_file)


In [12]:
Vocab(train_data, valid_data).get_vocab()

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [13]:
!mkdir speech

tokenizer = Wav2Vec2CTCTokenizer(f"{Config.globals_['vocab_out_dir']}/vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=Config.globals_['sr'], padding_value=0.0, do_normalize=True, return_attention_mask=False)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(Config.globals_['processor_out_dir'])

mkdir: cannot create directory ‘speech’: File exists


In [14]:
import numpy as np

def speech_file_to_array_fn(batch):
    batch["speech"] = batch["data"]
    batch["sampling_rate"] = Config.globals_['sr']
    batch["target_text"] = batch["transcription"]
    return batch

train_data = train_data.map(speech_file_to_array_fn, remove_columns=train_data.column_names)
valid_data = valid_data.map(speech_file_to_array_fn, remove_columns=valid_data.column_names)

HBox(children=(FloatProgress(value=0.0, max=9000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2566.0), HTML(value='')))




In [15]:
import IPython.display as ipd
import numpy as np
import random

def get_random_example(df):
  rand_int = random.randint(0, len(df))

  print("Target text:", df[rand_int]["target_text"])
  print("Input array shape:", np.asarray(df[rand_int]["speech"]).shape)
  print("Sampling rate:", df[rand_int]["sampling_rate"])

  return ipd.Audio(data=np.asarray(df[rand_int]["speech"]), autoplay=True, rate=Config.globals_['sr'])

In [16]:
get_random_example(valid_data)

Target text: bring newspaper
Input array shape: (13654,)
Sampling rate: 8000


In [17]:
def prepare_dataset(batch):
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values

    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, batch_size=Config.globals_['dataset_bs'], num_proc=Config.globals_['dataset_num_workers'], batched=True)
valid_data = valid_data.map(prepare_dataset, remove_columns=valid_data.column_names, batch_size=Config.globals_['dataset_bs'], num_proc=Config.globals_['dataset_num_workers'], batched=True)

 

HBox(children=(FloatProgress(value=0.0, description=' #0', max=1125.0, style=ProgressStyle(description_width='…

  return array(a, dtype, copy=False, order=order)


 

HBox(children=(FloatProgress(value=0.0, description=' #1', max=1125.0, style=ProgressStyle(description_width='…

  return array(a, dtype, copy=False, order=order)




 

HBox(children=(FloatProgress(value=0.0, description=' #0', max=321.0, style=ProgressStyle(description_width='i…

  return array(a, dtype, copy=False, order=order)


 

HBox(children=(FloatProgress(value=0.0, description=' #1', max=321.0, style=ProgressStyle(description_width='i…

  return array(a, dtype, copy=False, order=order)






In [18]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [19]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

wer_metric = load_metric("wer")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1947.0, style=ProgressStyle(description…




In [20]:
from transformers import Wav2Vec2ForCTC
model = Wav2Vec2ForCTC.from_pretrained(
    Config.model_['name'], 
    gradient_checkpointing=True, 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

model.config.ctc_zero_infinity = True #https://discuss.huggingface.co/t/wav2vec2-how-to-correct-for-nan-in-training-and-validation-loss/6089
model.freeze_feature_extractor()
print(f'Model defined')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1843.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=380267417.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_q.weight', 'project_q.bias', 'project_hid.weight', 'quantizer.weight_proj.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

Model defined


In [None]:
!mkdir res

training_args = TrainingArguments(
  output_dir=Config.globals_['train_model_out_dir'],
  group_by_length=True,
  per_device_train_batch_size=16,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=15,
  fp16=True,
  save_steps=200,
  eval_steps=200,
  logging_steps=200,
  learning_rate=1e-4,
  warmup_steps=400,
  save_total_limit=1,
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=valid_data,
    tokenizer=processor.feature_extractor,
)


In [23]:
trainer.train()

***** Running training *****
  Num examples = 9000
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 4215
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
  args.max_grad_norm,


Step,Training Loss,Validation Loss,Wer
200,3.9043,3.05658,1.0
400,2.9047,2.463906,1.059827
600,2.1942,1.295012,0.695362
800,1.505,0.769933,0.545118
1000,1.1553,0.525393,0.453528
1200,0.9854,0.417515,0.403086
1400,0.8203,0.330255,0.360585
1600,0.7248,0.280924,0.344703
1800,0.6567,0.262446,0.340462
2000,0.6224,0.235089,0.321242


***** Running Evaluation *****
  Num examples = 2566
  Batch size = 8
Saving model checkpoint to /content/res/checkpoint-200
Configuration saved in /content/res/checkpoint-200/config.json
Model weights saved in /content/res/checkpoint-200/pytorch_model.bin
Configuration saved in /content/res/checkpoint-200/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 2566
  Batch size = 8
Saving model checkpoint to /content/res/checkpoint-400
Configuration saved in /content/res/checkpoint-400/config.json
Model weights saved in /content/res/checkpoint-400/pytorch_model.bin
Configuration saved in /content/res/checkpoint-400/preprocessor_config.json
Deleting older checkpoint [/content/res/checkpoint-200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2566
  Batch size = 8
Saving model checkpoint to /content/res/checkpoint-600
Configuration saved in /content/res/checkpoint-600/config.json
Model weights saved in /content/res/checkpoint-600/pytorch_mo

TrainOutput(global_step=4215, training_loss=0.9580889738044422, metrics={'train_runtime': 2839.7782, 'train_samples_per_second': 47.539, 'train_steps_per_second': 1.484, 'total_flos': 1.4347409545635794e+18, 'train_loss': 0.9580889738044422, 'epoch': 15.0})