Preparing the dataset requires the following specs:
* 64 GB RAM
* 300 GB disk storage

For training, the recommended specs are:
* 24 GB GPU RAM
* 1 GPU (NVIDIA RTX 4090 or better)
* 300 GB of free disk space

In [None]:
%%capture
# Generate a default Jupyter config and raise the buffer / IOPub data-rate limits.
# Background reading:
#https://towardsdatascience.com/leveraging-the-power-of-jupyter-notebooks-26b4b8d7c622
! jupyter notebook --generate-config
# NOTE(review): the next two commands look like they start a *new* notebook server
# with the option applied (and may block this cell) rather than reconfigure the
# server this kernel is running under — confirm, and consider putting the settings
# in the generated config file instead.
! jupyter notebook --NotebookApp.max_buffer_size=258000000000
! jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000

In [None]:
%conda install -c pytorch -c conda-forge -c huggingface pandas librosa numpy ipywidgets pytorch torchvision torchaudio datasets transformers wandb huggingface_hub accelerate

In [None]:
%pip install pandas 
%pip install datasets
%pip install transformers
%pip install librosa
%pip install wandb -qU
%pip install git+https://github.com/huggingface/huggingface_hub
%pip install jiwer
%pip install transformers[torch]
%pip install accelerate -U
%pip install ipywidgets
%pip install torchaudio

In [None]:
from huggingface_hub import notebook_login

# Log this notebook session in to the Hugging Face Hub (a later cell loads a model
# from the Hub by repository name).
notebook_login()

In [None]:
import wandb

# Authenticate with Weights & Biases; the training arguments below report to 'wandb'.
wandb.login()

In [None]:
import pandas as pd
import numpy as np
import random
from IPython.display import display, HTML
import os
import json
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
import IPython.display as ipd
import torchaudio


In [None]:
from transformers import Wav2Vec2ForCTC
from transformers import Wav2Vec2Processor
from datasets import load_dataset, load_metric
from datasets import Dataset
from datasets import ClassLabel
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from transformers import TrainingArguments
from transformers import Trainer
from jiwer import wer
import statistics
from transformers import Wav2Vec2CTCTokenizer

In [None]:
! mkdir tsv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/06_train.csv --output tsv/train.csv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/06_val.csv --output tsv/validation.csv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/05_benchmark.csv --output tsv/test.csv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/vocab.json --output vocab.json

In [None]:
# Read the three downloaded manifests into DataFrames (train, validation, test).
dataTrain, dataValid, dataTest = map(
    pd.read_csv,
    ("tsv/train.csv", "tsv/validation.csv", "tsv/test.csv"),
)

In [None]:
# Row counts of the train / validation / test manifests.
tuple(map(len, (dataTrain, dataValid, dataTest)))

In [None]:
# Directory holding the WAV files named by the manifests' `file_name` column
# (presumably 16 kHz audio, going by the directory name). Defined once so that a
# different machine only needs this one line changed.
WAV_DIR = '/media/monlamai/SSD/data/wav16k'

pd.options.mode.chained_assignment = None

# Add a `path` column (absolute path of the example's WAV file) to every split.
for split_df in (dataTest, dataValid, dataTrain):
    split_df['path'] = split_df['file_name'].apply(lambda name: f'{WAV_DIR}/{name}.wav')

In [None]:
import os
# Tally how many test-split paths exist on disk (True) versus are missing (False).
dataTest['path'].map(os.path.isfile).value_counts()
# dataValid['path'].apply(lambda x: os.path.isfile(x)).value_counts()
# batch_df['path'].apply(lambda x: os.path.isfile(x)).value_counts()

In [None]:
from datasets import Dataset

# Working copies of each split; these get overwritten by the later map() steps.
common_voice_train, common_voice_valid, common_voice_test = (
    Dataset.from_pandas(frame) for frame in (dataTrain, dataValid, dataTest)
)

# Separate, untouched copies that keep the original columns (transcript, path) so
# they can still be looked up after the working copies have been transformed.
common_voice_test_transcription, common_voice_valid_transcription = (
    Dataset.from_pandas(frame) for frame in (dataTest, dataValid)
)

In [None]:
def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows from `dataset` as an HTML table.

    Args:
        dataset: object supporting ``len()`` and indexing with a list of row
            positions (the result is passed to ``pd.DataFrame``).
        num_examples: how many distinct rows to show; must not exceed ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    last_index = len(dataset) - 1
    chosen = []
    # Keep drawing until enough *distinct* row positions have been collected.
    while len(chosen) < num_examples:
        candidate = random.randint(0, last_index)
        if candidate not in chosen:
            chosen.append(candidate)

    table = pd.DataFrame(dataset[chosen])
    display(HTML(table.to_html()))

In [None]:
# Preview five random training rows with the metadata columns hidden.
metadata_columns = ['dept', 'grade', 'wylie', 'char_len', 'audio_len', 'url']
show_random_elements(common_voice_train.remove_columns(metadata_columns), num_examples=5)

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters used by a batch of transcripts.

    Args:
        batch: mapping whose ``"uni"`` entry is a list of transcript strings.

    Returns:
        dict with ``"vocab"`` (a one-element list holding the list of distinct
        characters) and ``"all_text"`` (a one-element list holding the transcripts
        joined by single spaces).
    """
    joined_text = " ".join(batch["uni"])
    unique_chars = list(set(joined_text))
    result = {}
    result["vocab"] = [unique_chars]
    result["all_text"] = [joined_text]
    return result

In [None]:
# Run extract_all_chars over each split with batch_size=-1 and drop the original
# columns, so each result holds just that split's `vocab` / `all_text`.
vocab_train, vocab_test, vocab_valid = (
    split.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=split.column_names,
    )
    for split in (common_voice_train, common_voice_test, common_voice_valid)
)

In [None]:
# Union of the characters seen in the three splits. Sorted so the ids derived from
# this list are identical on every run; iteration order of a set of strings can
# differ between interpreter sessions.
vocab_list = sorted(
    set(vocab_train["vocab"][0])
    | set(vocab_test["vocab"][0])
    | set(vocab_valid["vocab"][0])
)

In [None]:
# Character -> integer id, where the id is the character's position in vocab_list.
vocab_dict = dict(zip(vocab_list, range(len(vocab_list))))
vocab_dict

In [None]:
# Re-key the space character as "|" (the tokenizer's word delimiter), keeping its id.
vocab_dict["|"] = vocab_dict.pop(" ")

In [None]:
# Append the unknown and padding tokens, each taking the next free id.
for special_token in ("[UNK]", "[PAD]"):
    vocab_dict[special_token] = len(vocab_dict)
len(vocab_dict)

In [None]:
# Display the final vocabulary (characters, "|" delimiter, [UNK], [PAD]) for inspection.
vocab_dict

In [None]:
# import json
# with open('vocab.json', 'w') as vocab_file:
#     json.dump(vocab_dict, vocab_file)

In [None]:
# ! aws s3 cp vocab.json s3://monlam.ai.stt/tsv/vocab.json

In [None]:
# Build the CTC tokenizer from a vocabulary file, using the same special tokens that
# were added to vocab_dict above.
# NOTE(review): this reads ./new_vocab.json, but the visible cells only download
# `vocab.json`, and the cell that would write a vocab file is commented out —
# confirm that new_vocab.json exists and is the intended vocabulary.
tokenizer = Wav2Vec2CTCTokenizer("./new_vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [None]:
# Feature extractor for mono 16 kHz waveforms: normalises each input and returns
# an attention mask alongside the padded values.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

In [None]:
# Bundle the feature extractor (audio side) and tokenizer (text side) into one processor.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
# Persist the processor next to the notebook.
# NOTE(review): this is a relative "wav2vec2_run10" directory, while training writes
# to /media/monlamai/SSD/wav2vec2/wav2vec2_run10 — confirm the two are meant to differ.
processor.save_pretrained("wav2vec2_run10")

In [None]:
from torchaudio.transforms import Resample

def speech_file_to_array_fn(batch):
    """Load one example's audio and attach waveform, sampling rate and transcript.

    Reads the file at ``batch["path"]``, resamples it to 16 kHz when its native
    rate differs, and stores:
      * ``batch["speech"]``        - first channel of the waveform as a NumPy array
      * ``batch["sampling_rate"]`` - rate of the stored waveform
      * ``batch["target_text"]``   - copy of ``batch["uni"]``

    Returns the same ``batch`` mapping with those keys added.
    """
    target_rate = 16000
    waveform, source_rate = torchaudio.load(batch["path"])

    if source_rate != target_rate:
        print("resampling")
        to_target_rate = Resample(orig_freq=source_rate, new_freq=target_rate)
        waveform = to_target_rate(waveform)
        source_rate = target_rate

    # Only channel 0 is kept.
    batch["speech"] = waveform[0].numpy()
    batch["sampling_rate"] = source_rate
    batch["target_text"] = batch["uni"]
    return batch

In [None]:
# Decode the training audio; only the keys added by speech_file_to_array_fn remain.
common_voice_train = common_voice_train.map(
    speech_file_to_array_fn,
    remove_columns=common_voice_train.column_names,
)

In [None]:
# Decode the test audio; only the keys added by speech_file_to_array_fn remain.
common_voice_test = common_voice_test.map(
    speech_file_to_array_fn,
    remove_columns=common_voice_test.column_names,
)

In [None]:
# Decode the validation audio; only the keys added by speech_file_to_array_fn remain.
common_voice_valid = common_voice_valid.map(
    speech_file_to_array_fn,
    remove_columns=common_voice_valid.column_names,
)

Hmm. This does not work.

In [None]:
# Display the training dataset summary to check the result of the map step above.
common_voice_train

In [None]:
rand_int = random.randint(0, len(common_voice_test)-1)

# Play the decoded waveform of a random test example. The audio samples live in the
# "speech" column; "path" was one of the original columns removed by the map step
# (and held a file-name string, not audio data).
ipd.Audio(data=np.asarray(common_voice_test[rand_int]["speech"]), autoplay=True, rate=16000)

In [None]:
# Inspect one training example: transcript, waveform shape and sampling rate.
# NOTE(review): rand_int was drawn from the *test* split's index range in the cell
# above but is used on the training split here — confirm that is intended.
example = common_voice_train[rand_int]

print("Target text:", example["target_text"])
print("Input array shape:", np.asarray(example["speech"]).shape)
print("Sampling rate:", example["sampling_rate"])

In [None]:
def prepare_dataset(batch):
    """Convert one example into model inputs and CTC label ids.

    Expects ``batch`` to carry ``"speech"``, ``"sampling_rate"`` and
    ``"target_text"``; adds:
      * ``batch["input_values"]`` - processor output for the waveform, squeezed to (n,)
      * ``batch["labels"]``       - token ids for the transcript

    Returns the same ``batch`` mapping.
    """
    audio_features = processor(batch["speech"], sampling_rate=batch["sampling_rate"])
    # drop the singleton axis so the stored array has shape (n,)
    batch["input_values"] = np.squeeze(audio_features.input_values)

    text_features = processor(text=batch["target_text"])
    batch["labels"] = text_features.input_ids
    return batch

In [None]:
# def prepare_dataset(batch):
#     # check that all files have the correct sampling rate
#     assert (
#         len(set(batch["sampling_rate"])) == 1
#     ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

#     batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values

#     with processor.as_target_processor():
#         batch["labels"] = processor(batch["target_text"]).input_ids
#     return batch

# common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, batch_size=4, num_proc=2, batched=True)
# common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=4, batched=True)
# common_voice_valid = common_voice_valid.map(prepare_dataset, remove_columns=common_voice_valid.column_names, batch_size=8, num_proc=4, batched=True)

In [None]:
# Build input_values/labels for the training split, then persist it to disk.
common_voice_train = common_voice_train.map(
    prepare_dataset,
    remove_columns=common_voice_train.column_names,
)
common_voice_train.save_to_disk("/media/monlamai/SSD/wav2vec2/train_prepare_dataset.arrow")

In [None]:
# Build input_values/labels for the test split, then persist it to disk.
common_voice_test = common_voice_test.map(
    prepare_dataset,
    remove_columns=common_voice_test.column_names,
)
common_voice_test.save_to_disk("/media/monlamai/SSD/wav2vec2/test_prepare_dataset.arrow")

In [None]:
# Build input_values/labels for the validation split, then persist it to disk.
common_voice_valid = common_voice_valid.map(
    prepare_dataset,
    remove_columns=common_voice_valid.column_names,
)
common_voice_valid.save_to_disk("/media/monlamai/SSD/wav2vec2/valid_prepare_dataset.arrow")

In [None]:
# from datasets import DatasetDict
# ddict = DatasetDict({
#     "train": common_voice_train,
#     "valid": common_voice_valid,
#     "test": common_voice_test,
# })
# ddict.push_to_hub("prepare_dataset_run8")

### Load the datasets from disk

In [None]:
from datasets import load_from_disk

# Reload the prepared splits from the locations the save_to_disk cells wrote to.
common_voice_train, common_voice_test, common_voice_valid = map(
    load_from_disk,
    (
        '/media/monlamai/SSD/wav2vec2/train_prepare_dataset.arrow',
        '/media/monlamai/SSD/wav2vec2/test_prepare_dataset.arrow',
        '/media/monlamai/SSD/wav2vec2/valid_prepare_dataset.arrow',
    ),
)

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and labels it receives.

    Audio inputs and label ids are padded separately because they have different
    lengths and use different padding values.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set will pad the ``labels`` to a multiple of the provided value.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch of tensors.

        Args:
            features: examples, each with ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` tensor in which
            padded positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Passing `labels=` pads with the tokenizer. This replaces the deprecated
        # `as_target_processor()` context manager and matches prepare_dataset,
        # which already tokenizes through `processor(text=...)`.
        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
# Collator instance used by the Trainer; padding=True is passed through to processor.pad.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Character error rate metric used by compute_metrics.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm the pinned `datasets` version
# still provides it before upgrading.
from datasets import load_metric
cer_metric = load_metric("cer")

In [None]:
def compute_metrics(pred):
    """Compute the character error rate for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (logits; argmax is taken over the
            last axis) and ``label_ids`` (reference token ids, with -100 at
            positions to ignore).

    Returns:
        dict ``{"cer": value}`` where the value is whatever ``cer_metric.compute``
        returns for the decoded predictions and references.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 cannot be decoded, so swap in the pad token id. np.where builds a new
    # array, leaving the caller's pred.label_ids unmodified.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

In [None]:
from transformers import Wav2Vec2ForCTC

# Load the pretrained XLSR-53 checkpoint with a CTC head sized to this notebook's
# vocabulary. To continue an earlier run instead, swap the checkpoint lines below
# (presumably also dropping the pad_token_id / vocab_size overrides — confirm).
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", # fresh start; comment out to continue training
    # "/media/monlamai/SSD/wav2vec2/wav2vec2_run9/checkpoint-80000", # enable to continue training
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True, # If True, use gradient checkpointing to save memory at the expense of slower backward pass.
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id, # comment out to continue training
    vocab_size=len(processor.tokenizer), # comment out to continue training
    # ignore_mismatched_sizes=True,
)

In [None]:
# Freeze the convolutional feature encoder so only the remaining layers are trained.
# `freeze_feature_encoder` is the current name of the deprecated
# `freeze_feature_extractor` method.
model.freeze_feature_encoder()

In [None]:
# Presumably forwarded to the CTC loss's zero_infinity option, so infinite losses
# are zeroed rather than propagated — confirm.
model.config.ctc_zero_infinity = True

In [None]:
from transformers import TrainingArguments
# Training configuration for run 10. Checkpoints go to output_dir; evaluation and
# checkpointing both happen every 5000 steps, and metrics are logged to W&B.
training_args = TrainingArguments(
  output_dir="/media/monlamai/SSD/wav2vec2/wav2vec2_run10",
  group_by_length=True,
  per_device_train_batch_size=8,
  gradient_accumulation_steps=2, # increase by 2x for every 2x decrease in batch size (8 x 2 = 16 examples per optimizer step per device)
  evaluation_strategy="steps",
  num_train_epochs=25,
  # fp16=True,
  save_steps=5000,
  eval_steps=5000,
  logging_steps=100,
  report_to=['wandb'],
  learning_rate=3e-5,
  warmup_steps=500,
  save_total_limit=10, # keep at most 10 checkpoints on disk
  push_to_hub=False,
)

In [None]:
from transformers import Trainer

# Wire model, collator, metrics and datasets together. Evaluation during training
# uses the validation split; the test split is only used by the cells further down.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_valid,
    tokenizer=processor.feature_extractor,  # the feature extractor is passed in the tokenizer slot
)

In [None]:
# To continue an interrupted run, use the resume_from_checkpoint variant below
# instead of the plain train() call.
# resume_from_checkpoint=True
# trainer.train(resume_from_checkpoint=True)

# Start training from the model loaded above.
trainer.train()

In [None]:
# Load the model and processor used for the evaluation cells below, on the GPU.
# NOTE(review): this loads the published "openpecha/wav2vec2_run9" weights, not the
# run-10 model trained above, and it rebinds `model` / `processor` — confirm intended.
model = Wav2Vec2ForCTC.from_pretrained("openpecha/wav2vec2_run9").to("cuda")
processor = Wav2Vec2Processor.from_pretrained("openpecha/wav2vec2_run9")

In [None]:
# Transcribe the first test example.
input_dict = processor(common_voice_test[0]["input_values"], return_tensors="pt", padding=True)

# Inference only: no_grad avoids building the autograd graph for the forward pass.
with torch.no_grad():
    logits = model(input_dict.input_values.to("cuda")).logits

# Greedy decoding: most likely token id at every frame of the (single) example.
pred_ids = torch.argmax(logits, dim=-1)[0]

In [None]:
print("Prediction:")
print(processor.decode(pred_ids))

# The reference transcript is stored in the "uni" column (the column the training
# targets were built from); the manifests are not shown to have a "sentence" column.
print("\nReference:")
print(common_voice_test_transcription[0]["uni"].lower())


In [None]:
# Transcribe every test example, collecting prediction, reference transcript and
# WAV file name in three parallel lists.
prediction = []
reference = []
paths = []

# Inference only: no_grad avoids building an autograd graph on every forward pass.
with torch.no_grad():
    for i in range(len(common_voice_test)):
        input_dict = processor(common_voice_test[i]["input_values"], return_tensors="pt", padding=True)
        logits = model(input_dict.input_values.to("cuda")).logits
        pred_ids = torch.argmax(logits, dim=-1)[0]

        prediction.append(processor.decode(pred_ids))

        # Fetch the untouched source row once; the transcript is in the "uni" column.
        source_row = common_voice_test_transcription[i]
        reference.append(source_row["uni"].lower())

        # keep only the file name (last path component)
        paths.append(source_row["path"].split("/")[-1])

In [None]:
# Print file name, reference and prediction for every test example, separated by "---".
for wav_name, expected_text, predicted_text in zip(paths, reference, prediction):
    print(wav_name, expected_text, predicted_text, "---", sep="\n")

In [None]:
# These are necessary for the statistics reporting
import re
import statistics

from jiwer import wer

# google.colab only exists inside Colab. Guard the import so the notebook also runs
# on a local machine; `files` is None there.
try:
    from google.colab import files
except ImportError:
    files = None

In [None]:
# Calculate Levenshtein Distance between two strings (character distance)
# https://colab.research.google.com/github/Alexjmsherman/nlp_practicum_cohort3_instructor/blob/master/lessons/lesson_8_text_similarity/text_similarity_solution.ipynb#scrollTo=sSj3zYpq-sc1

def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    Insertions, deletions and substitutions each cost 1.

    Args:
        seq1: first sequence (e.g. a string).
        seq2: second sequence.

    Returns:
        The distance as a NumPy float (the table is a float array).
    """
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    dist = np.zeros((rows, cols))

    # Distance from the empty prefix: i deletions down the first column,
    # j insertions along the first row.
    dist[:, 0] = np.arange(rows)
    dist[0, :] = np.arange(cols)

    for i, left in enumerate(seq1, start=1):
        for j, right in enumerate(seq2, start=1):
            if left == right:
                # matching elements add no cost
                dist[i, j] = dist[i - 1, j - 1]
            else:
                # cheapest of deletion, substitution, insertion - plus one edit
                dist[i, j] = 1 + min(dist[i - 1, j], dist[i - 1, j - 1], dist[i, j - 1])

    return dist[rows - 1, cols - 1]

In [None]:
#===============================================================================
# Evaluate checkpoints; calculate their word/character error rates and
# get the predictions for the sentences in the test set.
#===============================================================================

import csv
import io

checkpointNums = ["2900"]

medianStats = ""

# NOTE(review): `runId` is not defined in any visible cell — confirm it is set
# before this cell runs.
for ch in checkpointNums:

    checkpointNum = ch

    filename = "wav2vec2-res-" + str(runId) + "-ch" + ch + ".csv"
    idThisRun = "wav2vec2-" + str(runId)

    # model = Wav2Vec2ForCTC.from_pretrained("/content/wav2vec2-large-xlsr/checkpoint-"+ch).to("cuda")
    # processor = Wav2Vec2Processor.from_pretrained("/content/wav2vec2-large-xlsr")

    prediction = []
    reference = []
    paths = []

    # Inference only: no_grad avoids building an autograd graph on every forward pass.
    with torch.no_grad():
        for i in range(len(common_voice_test)):
            input_dict = processor(common_voice_test[i]["input_values"], return_tensors="pt", padding=True)
            logits = model(input_dict.input_values.to("cuda")).logits
            pred_ids = torch.argmax(logits, dim=-1)[0]

            prediction.append(processor.decode(pred_ids))

            # The reference transcript is in the "uni" column of the source row.
            source_row = common_voice_test_transcription[i]
            reference.append(source_row["uni"].lower())
            paths.append(source_row["path"].split("/")[-1])

    # Build the per-sentence CSV with csv.writer so that a transcript containing a
    # comma or quote is quoted instead of shifting the columns.
    csvBuffer = io.StringIO()
    csvWriter = csv.writer(csvBuffer, lineterminator="\n")
    csvWriter.writerow([
        "wav", "src", "res", "loss", "charDist", "charLen", "wordDist", "wordLen",
        "cer", "wer", "origin", "condition", "id", "typeMonoTri", "ngram",
    ])
    cerList = []
    werList = []

    for i in range(len(reference)):

        levDistChar = levenshtein(reference[i], prediction[i])
        cer = levDistChar / len(reference[i])

        werSent = wer(reference[i], prediction[i])
        charLen = len(reference[i])
        charDist = levDistChar
        # WER is normalised by the number of *reference* words, so the word count
        # used to turn it back into a distance must come from the reference too.
        wordLen = len(reference[i].split(' '))
        wordDist = werSent*wordLen

        cerList.append(cer)
        werList.append(werSent)

        wavFile = paths[i].replace(".wav","")

        csvWriter.writerow([
            wavFile, reference[i], prediction[i], "",
            str(charDist), str(charLen), str(wordDist), str(wordLen),
            str(round(cer,2)), str(round(werSent,2)),
            "wav2vec2", "standard-" + ch, str(idThisRun), "na", "na",
        ])

    # drop the trailing newline
    output = csvBuffer.getvalue()[:-1]
    #print(output)

    cerMedian = statistics.median(cerList)
    werMedian = statistics.median(werList)

    medianStats += str(runId) + "/" + ch + " Median CER:\t" + str(round(cerMedian,3)) + "\n"
    medianStats += str(runId) + "/" + ch + " Median WER:\t" + str(round(werMedian,3)) + "\n\n"

    # NOTE(review): `output` is only printed here, yet the visualization cell reads
    # `filename` from disk — confirm where the CSV is meant to be written.
    print(output)

print(medianStats)

In [None]:
# Prefix the collected median statistics with the run id and write them to the log
# directory.
# NOTE(review): `runId` and `datasetPath` are not defined in any visible cell —
# confirm they are set before this cell runs.
medianStats = "Run: " + str(runId) + "\n\n" + medianStats

statsFilename = "wav2vec2-res-"+str(runId)+"-stats-median.txt"
# `with` closes the file even if the write raises.
with open(datasetPath + "logs-wav2vec2-res/" + statsFilename, "w") as f:
    f.write(medianStats)

print(medianStats)

In [None]:
# Box plots of the per-sentence CER and WER, grouped by the `origin` column.
df = pd.read_csv(datasetPath + "logs-wav2vec2-res/" + filename)
df.boxplot(by =['origin'], column =['cer','wer'], grid = False)

# Second plot for a quick look: keep only sentences whose CER and WER are both below 2.
belowTwo = (df['cer'] < 2) & (df['wer'] < 2)
dfOnlyLessThanTwo = df[belowTwo]
dfOnlyLessThanTwo.boxplot(by =['origin'], column =['cer','wer'], grid = False)