In [95]:
import os
from pathlib import Path 

In [96]:
PATH = "E:/CSIT/7thsem/FYP/hugging_face"

In [97]:
# Set environment variables first
os.environ["HF_HOME"] = PATH
os.environ["HF_DATASETS_CACHE"] = PATH
os.environ["TRANSFORMERS_CACHE"] = PATH
os.environ['HUGGINGFACE_HUB_CACHE'] = PATH

In [98]:
import torchaudio
import torch
import librosa
import jiwer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import datasets
from datasets import Dataset, Audio, load_dataset
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer, EarlyStoppingCallback, AutoModelForCTC, AdamW, get_scheduler
import random
import IPython.display as ipd
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import evaluate
import GPUtil
import shutil
from tqdm import tqdm
import tarfile
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from transformers import TrainerCallback

In [99]:
# Check if CUDA is available 
if torch.cuda.is_available():
    print("CUDA is available")
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available")    

CUDA is available
Number of CUDA devices: 1
CUDA device name: NVIDIA GeForce RTX 4060 Laptop GPU


In [100]:
# # Downloading the english dataset
# common_voice_train = load_dataset("mozilla-foundation/common_voice_17_0", "en", split="train[:15000]", download_mode="force_redownload", trust_remote_code=True)
# common_voice_dev = load_dataset("mozilla-foundation/common_voice_17_0", "en", split="dev[:2000]", download_mode="force_redownload", trust_remote_code=True)
# common_voice_test = load_dataset("mozilla-foundation/common_voice_17_0", "en", split="test[:2000]", download_mode="force_redownload", trust_remote_code=True)


## Extracting Sub-Dataset

In [101]:
folder_path = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/audio/en/train"

In [102]:
files = [f for f in os.listdir(folder_path)]
file_count = len(files)

In [103]:
file_count

40000

In [104]:
import csv

In [105]:
# Function to filter entries according to the available audio file

def filter_audio_data(audio_folder, tsv_file, output_tsv):

    # Go through the audio directory and save the path in audio_files if file names match
    audio_files = set(os.path.splitext(f)[0] for f in os.listdir(audio_folder)
                      if os.path.isfile(os.path.join(audio_folder, f)))
    print(f"Found {len(audio_files)} audio files in the folder.")

    # Open the TSV file store filtered entries
    filtered_entries = []

    with open(tsv_file, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, delimiter="\t")

        for row in reader:
            # Extract the audio file name and strip whitespaces
            audio_file_name = os.path.splitext(os.path.basename(row['path']))[0]

            # Check if the audio file with given name exists in the extracted audio folder
            if audio_file_name in audio_files:
                # Update the row's path to the full local path
                row["path"] = os.path.join(audio_folder, f"{audio_file_name}.mp3")

                # Append the matched entry to the list
                filtered_entries.append(row)

    # Write the filtered data to a new TSV file
    with open(output_tsv, "w", encoding="utf-8", newline='') as f:
        # Get fieldnames from the original file
        fieldnames = reader.fieldnames       
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter="\t")

        writer.writeheader()
        writer.writerows(filtered_entries)

    print(f"filtered {len(filtered_entries)} entries from {tsv_file}")             

In [106]:
train_audio_folder = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/audio/en/train/"
test_audio_folder = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/audio/en/test/"
dev_audio_folder = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/audio/en/dev/"

In [107]:
train_tsv_file = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/meta_data/train.tsv"
test_tsv_file = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/meta_data/transcript_en_test.tsv"
dev_tsv_file = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/meta_data/transcript_en_dev.tsv"

In [108]:
train_output_tsv = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/processed_data/meta_data/filtered_train.tsv"
test_output_tsv = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/processed_data/meta_data/filtered_test.tsv"
dev_output_tsv = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/processed_data/meta_data/filtered_dev.tsv"

In [109]:
filter_audio_data(train_audio_folder, train_tsv_file, train_output_tsv)
filter_audio_data(test_audio_folder, test_tsv_file, test_output_tsv)
filter_audio_data(dev_audio_folder, dev_tsv_file, dev_output_tsv)

Found 40000 audio files in the folder.
filtered 40000 entries from E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/meta_data/train.tsv
Found 16393 audio files in the folder.
filtered 16393 entries from E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/meta_data/transcript_en_test.tsv
Found 16393 audio files in the folder.
filtered 16392 entries from E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/meta_data/transcript_en_dev.tsv


In [110]:
# Testing path

file_path = "E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/audio/en/train/common_voice_en_17924809.mp3"

if os.path.exists(file_path):
    print(f"The file is accessible: {file_path}")
else:
    print(f"The file does not exist or is inaccessible: {file_path}")


The file is accessible: E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/audio/en/train/common_voice_en_17924809.mp3


In [111]:
# Read the metadata file
common_voice_train = pd.read_csv(train_output_tsv, sep="\t")
common_voice_train.head()

Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,f15d2e0fd19c04421174108a8c02c3c2ef8e76365cdcc4...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,05001a328ff9f6589eb0ca7b8df65cd9662698a85e204e...,"Every evening, the dogs in our neighbourhood a...",,2,0,,,,,en,
1,f1619cc90b57fc09cad0f22e0034b9b70f9e1730043bba...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,1ad0ca90c9d90d3864a07ffe269d29cf746d810d7471b2...,A donor has since been found.,,2,0,,,,,en,
2,f1619cc90b57fc09cad0f22e0034b9b70f9e1730043bba...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,1ad9e86d5d4886d66460e453a00e844fee8b48dfc7d589...,New York at that time had become a premier cen...,,2,1,,,,,en,
3,f1619cc90b57fc09cad0f22e0034b9b70f9e1730043bba...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,1ae9db10a316627e0045280707d781f04f1a9f52537d4a...,Weise proceeded into the main corridor of the ...,,2,1,,,,,en,
4,f161e283a59077cdca316bcf7e2f46021a30a163548197...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,0438b75e57e8141ec5080893c825da6c301342a5b5b1cc...,When can I see him?,,2,1,,,,,en,


In [112]:
common_voice_validate = pd.read_csv(dev_output_tsv, sep="\t")
common_voice_validate.head()

Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,026ccc30041813bc12ec418a49ef522af835d6e7133d24...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,5472f4fe8e435f04c6fa1ddcb69f2a659a768deb1ed414...,He went to Georgia Southern University.,,2,1,,,,,en,
1,026ccc30041813bc12ec418a49ef522af835d6e7133d24...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,546c1cf75588dd80f1659b04cecaf6da25f4ca4eaf9c61...,A massive state effort was undertaken to follo...,,2,1,,,,,en,
2,026ccc30041813bc12ec418a49ef522af835d6e7133d24...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,548096bc2e4623cf7c402a38746bb24af6585f7da80231...,Slip Anchor returned to the track in the Septe...,,2,0,,,,,en,
3,026d2144316a2ac0bc0872288def657af5bac642c6c1b8...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,cdfcbf730909cd88865d2fe4f2bedf0b0c3bee00c5eae4...,He stood at stud at Fair Chance Farms in Washi...,,4,0,,,United States English,,en,
4,026d2144316a2ac0bc0872288def657af5bac642c6c1b8...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,cdfc21d770270de6a05123e431a4ba8351c4bf0f3e097e...,He repeated his thesis succinctly: she was not...,,2,1,,,United States English,,en,


In [113]:
common_voice_test = pd.read_csv(test_output_tsv, sep="\t")
common_voice_test.head()

Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,000abb3006b78ea4c1144e55d9d158f05a9db011016051...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,b2cf0b3308b6e00b46f3e48fe59682452ae737a596aa29...,"Joe Keaton disapproved of films, and Buster al...",,3,1,,,,,en,
1,0013037a1d45cc33460806cc3f8ecee9d536c45639ba4c...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,de9d68400821d73ee6868687d0562c60282ee9446d1191...,She'll be all right.,,2,1,,,,,en,
2,0014c5a3e5715a54855257779b89c2bb498d470b225866...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,14c0567a0a6c4ea49f2a1b2ac73de940b8c07779c70bf0...,six,,3,2,,,,,en,Benchmark
3,001509f4624a7dee75247f6a8b642c4a0d09f8be3eeea6...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,db626d9888c3fbc71f1be862592c7831f27f275e199510...,All's well that ends well.,,2,0,,,,,en,
4,001519f234e04528a2b36158c205dbe61c8da45ab0242f...,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,a975540ad0c7ceaaec94cd17ae3b55e1eaf89a61d4a579...,It is a busy market town that serves a large s...,,2,0,,,,,en,


In [114]:
common_voice_train = common_voice_train.drop(columns=["client_id", "sentence_id", "sentence_domain", "up_votes", "down_votes", "age", "gender", "accents", "variant", "locale", "segment"])
common_voice_train.head()

Unnamed: 0,path,sentence
0,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,"Every evening, the dogs in our neighbourhood a..."
1,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,A donor has since been found.
2,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,New York at that time had become a premier cen...
3,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,Weise proceeded into the main corridor of the ...
4,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,When can I see him?


In [115]:
common_voice_validate = common_voice_validate.drop(columns=["client_id", "sentence_id", "sentence_domain", "up_votes", "down_votes", "age", "gender", "accents", "variant", "locale", "segment"])
common_voice_validate.head()

Unnamed: 0,path,sentence
0,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,He went to Georgia Southern University.
1,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,A massive state effort was undertaken to follo...
2,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,Slip Anchor returned to the track in the Septe...
3,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,He stood at stud at Fair Chance Farms in Washi...
4,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,He repeated his thesis succinctly: she was not...


In [116]:
common_voice_test = common_voice_test.drop(columns=["client_id", "sentence_id", "sentence_domain", "up_votes", "down_votes", "age", "gender", "accents", "variant", "locale", "segment"])
common_voice_test.head()

Unnamed: 0,path,sentence
0,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,"Joe Keaton disapproved of films, and Buster al..."
1,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,She'll be all right.
2,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,six
3,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,All's well that ends well.
4,E:/CSIT/7thsem/FYP/hugging_face/common_voice_d...,It is a busy market town that serves a large s...


## Creating Tokenizer

In [117]:
# Converting pandas dataframe to the huggingface dataset
cv_train_dataset = Dataset.from_pandas(common_voice_train)
cv_validate_dataset = Dataset.from_pandas(common_voice_validate)
cv_test_dataset = Dataset.from_pandas(common_voice_test)

In [118]:
def replace_hatted_characters(batch):
    batch["sentence"] = re.sub('[ß]', 's', batch["sentence"])
    batch["sentence"] = re.sub('[à]', 'a', batch["sentence"])
    batch["sentence"] = re.sub('[á]', 'o', batch["sentence"])
    batch["sentence"] = re.sub('[â]', 'a', batch["sentence"])
    batch["sentence"] = re.sub('[ä]', 'a', batch["sentence"])
    batch["sentence"] = re.sub('[è]', 'a', batch["sentence"])
    batch["sentence"] = re.sub('[é]', 'e', batch["sentence"])
    batch["sentence"] = re.sub('[ê]', 'e', batch["sentence"])
    batch["sentence"] = re.sub('[ë]', 'e', batch["sentence"])
    batch["sentence"] = re.sub('[í]', 'e', batch["sentence"])
    batch["sentence"] = re.sub('[ï]', 'i', batch["sentence"])
    batch["sentence"] = re.sub('[ó]', 'u', batch["sentence"])
    batch["sentence"] = re.sub('[ô]', 'o', batch["sentence"])
    batch["sentence"] = re.sub('[ö]', 'o', batch["sentence"])
    batch["sentence"] = re.sub('[ü]', 'u', batch["sentence"])
    batch["sentence"] = re.sub('[ō]', 'o', batch["sentence"])
    batch["sentence"] = re.sub('[ř]', 's', batch["sentence"])
    batch["sentence"] = re.sub('[ž]', 'z', batch["sentence"])

    return batch

In [119]:
cv_train_dataset = cv_train_dataset.map(replace_hatted_characters)
cv_validate_dataset = cv_validate_dataset.map(replace_hatted_characters)
cv_test_dataset = cv_test_dataset.map(replace_hatted_characters)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/16392 [00:00<?, ? examples/s]

Map:   0%|          | 0/16393 [00:00<?, ? examples/s]

In [120]:
# Characters to remove
chars_to_ignore_regex = r'[\t\n\,\?\.\!\-\;\:\"\“\%\”\�\+\_\…\‘\&\–\—\/\\\(\)\’\[\]]'

# Function to remove the special characters

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    return batch



In [121]:
# Applying remove_special_characters to all items in the sentence column
cv_train_dataset = cv_train_dataset.map(remove_special_characters)
cv_validate_dataset = cv_validate_dataset.map(remove_special_characters)
cv_test_dataset = cv_test_dataset.map(remove_special_characters)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/16392 [00:00<?, ? examples/s]

Map:   0%|          | 0/16393 [00:00<?, ? examples/s]

In [122]:
# Function to concatenate all the transcriptions into one long transcription

def extract_all_chars(batch):
    all_text = " ".join(batch["sentence"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text":[all_text]}


In [123]:
#  Applying extract_all_chars to all datasets

vocab_train = cv_train_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=cv_train_dataset.column_names)
vocab_validate = cv_validate_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=cv_validate_dataset.column_names)
vocab_test = cv_test_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=cv_test_dataset.column_names)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/16392 [00:00<?, ? examples/s]

Map:   0%|          | 0/16393 [00:00<?, ? examples/s]

In [124]:
vocab_train

Dataset({
    features: ['vocab', 'all_text'],
    num_rows: 1
})

In [125]:
vocab_validate

Dataset({
    features: ['vocab', 'all_text'],
    num_rows: 1
})

In [126]:
vocab_test

Dataset({
    features: ['vocab', 'all_text'],
    num_rows: 1
})

In [127]:
# Unionizing all the characters to remove duplicates

vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_validate["vocab"][0]) | set(vocab_test["vocab"][0]))

In [128]:
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
vocab_dict

{' ': 0,
 "'": 1,
 '0': 2,
 '1': 3,
 '2': 4,
 '3': 5,
 '4': 6,
 '5': 7,
 '6': 8,
 '7': 9,
 '8': 10,
 '9': 11,
 'a': 12,
 'b': 13,
 'c': 14,
 'd': 15,
 'e': 16,
 'f': 17,
 'g': 18,
 'h': 19,
 'i': 20,
 'j': 21,
 'k': 22,
 'l': 23,
 'm': 24,
 'n': 25,
 'o': 26,
 'p': 27,
 'q': 28,
 'r': 29,
 's': 30,
 't': 31,
 'u': 32,
 'v': 33,
 'w': 34,
 'x': 35,
 'y': 36,
 'z': 37}

In [129]:
# Giving " " a more visible character |
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [130]:
# Adding padding and blank token to the vocab
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

40

In [131]:
vocab_dict

{"'": 1,
 '0': 2,
 '1': 3,
 '2': 4,
 '3': 5,
 '4': 6,
 '5': 7,
 '6': 8,
 '7': 9,
 '8': 10,
 '9': 11,
 'a': 12,
 'b': 13,
 'c': 14,
 'd': 15,
 'e': 16,
 'f': 17,
 'g': 18,
 'h': 19,
 'i': 20,
 'j': 21,
 'k': 22,
 'l': 23,
 'm': 24,
 'n': 25,
 'o': 26,
 'p': 27,
 'q': 28,
 'r': 29,
 's': 30,
 't': 31,
 'u': 32,
 'v': 33,
 'w': 34,
 'x': 35,
 'y': 36,
 'z': 37,
 '|': 0,
 '[UNK]': 38,
 '[PAD]': 39}

In [132]:
# Saving the vocabulary as a json file

import json

with open('vocab_cv.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [133]:
# Creating the tokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [134]:
tokenizer

Wav2Vec2CTCTokenizer(name_or_path='./', vocab_size=39, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	37: AddedToken("[UNK]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	38: AddedToken("[PAD]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	39: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	40: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Creating Feature Extractor Pipeline

In [135]:
# Creating the feature extractor for audio
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [136]:
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [137]:
# Creating the processor to wrap the tokenizer and feature extractor together
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [138]:
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='./', vocab_size=39, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	37: AddedToken("[UNK]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	38: AddedToken("[PAD]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	39: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	40: AddedToken("</s>", rstrip=False, lstrip=False, single_word=Fa

## Preprocess Data

In [139]:
cv_train_dataset

Dataset({
    features: ['path', 'sentence'],
    num_rows: 40000
})

In [140]:
cv_validate_dataset

Dataset({
    features: ['path', 'sentence'],
    num_rows: 16392
})

In [141]:
cv_test_dataset

Dataset({
    features: ['path', 'sentence'],
    num_rows: 16393
})

In [142]:
import random

random.seed(42)

In [143]:
# Creating a subset of the cv_train_dataset to have a size of 15k
indices = random.sample(range(len(cv_train_dataset)), 2000)

# Select the subset
cv_train_subset = cv_train_dataset.select(indices)

In [144]:
cv_train_subset

Dataset({
    features: ['path', 'sentence'],
    num_rows: 2000
})

In [145]:
# Creating a subset of the cv_validate_dataset to have a size of 2k
indices = random.sample(range(len(common_voice_validate)), 600)

# Select the subset
cv_validate_subset = cv_validate_dataset.select(indices)


In [146]:
cv_validate_subset

Dataset({
    features: ['path', 'sentence'],
    num_rows: 600
})

In [147]:
# Creating a subset of the cv_test_dataset to have a size of 2k
indices = random.sample(range(len(common_voice_validate)), 600)

# Select the subset
cv_test_subset = cv_test_dataset.select(indices)

In [148]:
cv_test_subset

Dataset({
    features: ['path', 'sentence'],
    num_rows: 600
})

In [149]:
# Cast the audio column to audio feature
cv_train_subset = cv_train_subset.cast_column("path", Audio(sampling_rate=16000))
cv_validate_subset = cv_validate_subset.cast_column("path", Audio(sampling_rate=16000))
cv_test_subset = cv_test_subset.cast_column("path", Audio(sampling_rate=16000))

In [150]:
cv_train_subset[0]["path"]

{'path': 'E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/audio/en/train/common_voice_en_19583596.mp3',
 'array': array([ 3.49245965e-10,  5.82076609e-10, -2.09547579e-09, ...,
         8.03382136e-07, -5.16767614e-07,  4.31085937e-07]),
 'sampling_rate': 16000}

In [151]:
cv_validate_subset[0]["path"]

{'path': 'E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/audio/en/dev/common_voice_en_18438162.mp3',
 'array': array([-7.27595761e-12, -1.81898940e-12, -3.75166564e-12, ...,
        -2.59348826e-06, -1.97724603e-07, -9.81580456e-07]),
 'sampling_rate': 16000}

In [152]:
cv_test_subset[0]["path"]

{'path': 'E:/CSIT/7thsem/FYP/hugging_face/common_voice_data/audio/en/test/common_voice_en_28192765.mp3',
 'array': array([-4.65661287e-10, -6.98491931e-10, -2.32830644e-10, ...,
        -1.25389372e-04, -1.81773561e-04, -4.70489904e-05]),
 'sampling_rate': 16000}

In [153]:
# Choose a random sample from the train subset
rand_int_train = random.randint(0, len(cv_train_subset) - 1)

# Display the transcript
print(cv_train_subset[rand_int_train]["sentence"])

# Play the audio
ipd.Audio(data=cv_train_subset[rand_int_train]["path"]["array"], autoplay=True, rate=16000)

jackson recorded the album alongside producer billy ray hearn


In [154]:
# Choose a random sample from the validate subset
rand_int_train = random.randint(0, len(cv_validate_subset) - 1)

# Display the transcript
print(cv_validate_subset[rand_int_train]["sentence"])

# Play the audio
ipd.Audio(data=cv_validate_subset[rand_int_train]["path"]["array"], autoplay=True, rate=16000)

studies have shown a linkage between dopamine and the urge to pick


In [155]:
# Choose a random sample from the test subset
rand_int_train = random.randint(0, len(cv_test_subset) - 1)

# Display the transcript
print(cv_test_subset[rand_int_train]["sentence"])

# Play the audio
ipd.Audio(data=cv_test_subset[rand_int_train]["path"]["array"], autoplay=True, rate=16000)

a man in a white shirt is playing the flute to someone in a red skirt


In [156]:
# Checking the shape of input speech, its transcription and corresponding sampling rate for train_dataset

print("Target text: ", cv_train_subset[rand_int_train]["sentence"])
print("Input array shape:", cv_train_subset[rand_int_train]["path"]["array"].shape)
print("Sampling rate:", cv_train_subset[rand_int_train]["path"]["sampling_rate"])

Target text:  these minelayers can carry several thousand mines and manoeuvre with high precision
Input array shape: (135936,)
Sampling rate: 16000


In [157]:
# Checking the shape of input speech, its transcription and corresponding sampling rate for train_dataset

print("Target text: ", cv_validate_subset[rand_int_train]["sentence"])
print("Input array shape:", cv_validate_subset[rand_int_train]["path"]["array"].shape)
print("Sampling rate:", cv_validate_subset[rand_int_train]["path"]["sampling_rate"])

Target text:  last week our fridge broke which forced us to eat lots of ice cream
Input array shape: (92544,)
Sampling rate: 16000


In [158]:
# Checking the shape of input speech, its transcription and corresponding sampling rate for train_dataset

print("Target text: ", cv_test_subset[rand_int_train]["sentence"])
print("Input array shape:", cv_test_subset[rand_int_train]["path"]["array"].shape)
print("Sampling rate:", cv_test_subset[rand_int_train]["path"]["sampling_rate"])

Target text:  a man in a white shirt is playing the flute to someone in a red skirt
Input array shape: (79104,)
Sampling rate: 16000


In [159]:
# Preparing the dataset to the format expected by the Wav2Vec2ForCTC training.

def prepare_dataset(batch):
    # Loading the file paths to the audio file in the dataset
    audio = batch["path"]

    # Processing the audio by normalizing and extracting input values for the model
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Process the transcript and encode it as labels for CTC
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids

    return batch    

In [160]:
prep_train_subset = cv_train_subset.map(prepare_dataset, remove_columns=cv_train_subset.column_names)
prep_validate_subset = cv_validate_subset.map(prepare_dataset, remove_columns=cv_validate_subset.column_names)
prep_test_subset = cv_test_subset.map(prepare_dataset, remove_columns=cv_test_subset.column_names)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

## Setting up Training Pipeline

In [161]:
# Decorator to automatically generate special methods 
# like __init__, __repr__, __eq__, and __hash__ for the DataCollatorCTCWithPadding class
@dataclass

class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    # Class attributes that store the processor and padding
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True


    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pads the audio features to ensure that all sequences in the batch are of the same length.
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            # Specifies that the padded batch should be returned as PyTorch tensors
            return_tensors="pt",
        )

        # Allows the labels to be padded seperately from the audio features
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                # Specifies that the padded batch should be returned as PyTorch tensors
                return_tensors = "pt",
            )

        # Replacing padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch 




In [162]:
# Defining the data collator
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [163]:
# Defining evaluation metric

wer_metric = evaluate.load("wer")

In [211]:
def compute_metrics(pred):

    # Extract the predictions (logits) from the pred object
    pred_logits = pred.predictions

    # Get the predicted token IDs by selecting the token with the highest probability
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Replace any padded labels (-100) with the pad_token_id to avoid using them in the metric calculation
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode the predicted token IDs into human-readable text
    pred_str = processor.batch_decode(pred_ids)

    # Decode the correct or true values while avoiding grouping tokens into readable format
    label_str = processor.batch_decode(pred.label_ids, group_tokens = False)

    # Computing WER by comparing the decoded predictions with correct or true values
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    # # Confusion matrix and ROC curve requires the raw predictions(probabilities) 
    # # and the true labels as integers for classification tasks.

    # # Get the true labels
    # true_labels = pred.label_ids

    # # Get the predicted probabilities before the argmax 
    # # ie raw logits that can be converted to probabilities
    # pred_probs = pred_logits 

    # # Create confusion matrix and save it
    # cm = confusion_matrix(true_labels, np.argmax(pred_probs, axis=-1))
    # save_confusion_matrix(cm)

    # # For ROC curve we need probabilites, so convert logits to probabilities

    # # Ensure it as a classification task with multiple classes
    # if len(pred_probs.shape) == 3:
    #     pred_probs = pred_probs[:, :, 1]

    # save_roc_curve(true_labels, pred_probs)    

    return {"wer": wer}

# output_dir = "../classification_report/"

# # Save confusion matrix plot
# def save_confusion_matrix(cm, output_dir):
#     os.makedirs(output_dir, exist_ok=True)
#     plt.figure(figsize=(8, 6))
#     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
#     plt.xlabel('Predicted Labels')
#     plt.ylabel('True Labels')
#     plt.title('Confusion Matrix')
#     plt.savefig(os.path.join(output_dir, "confusion_matrix.png"))
#     plt.close()

# # Save ROC curve plot
# def save_roc_curve(labels, probs, output_dir):
#     os.makedirs(output_dir, exist_ok=True)
#     fpr, tpr, _ = roc_curve(labels, probs)
#     roc_auc = auc(fpr, tpr)

#     plt.figure(figsize=(8, 6))
#     plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
#     plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')
#     plt.title('ROC Curve')
#     plt.legend(loc='lower right')
#     plt.grid(True)
#     plt.savefig(os.path.join(output_dir, "roc_curve.png"))
#     plt.close()

In [212]:
# Loading the pretrained checkpoint of wav2vec2-base-960h

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-960h", 
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.0,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    ignore_mismatched_sizes=True,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized because the shapes did not match:
- lm_head.bias: found shape torch.Size([32]) in the checkpoint and torch.Size([41]) in the model instantiated
- lm_head.weight: found shape torch.Size([32, 768]) in the checkpoint and torch.Size([41, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [213]:
model.lm_head.weight.shape

torch.Size([41, 768])

In [214]:
model.lm_head.bias.shape

torch.Size([41])

In [215]:
len(processor.tokenizer)

41

In [216]:
# Freezing layers as this part of the model has already been trained sufficiently
model.freeze_feature_extractor()



In [221]:
# Creating a custom callback to log and plot graphs
class PlottingCallback(TrainerCallback):

    def __init__(self, output_dir):
        self.output_dir = output_dir
        # Make sure the output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

        # Collecting metrics during training
        # Collects (step, loss)
        self.losses = []
        # Collects (epoch, loss)
        self.epoch_losses = []
        # Collect (step, WER)
        self.wers = []
        # Collects (epoch, WER)
        self.epoch_wers = []
        # Collects (step, lr)
        self.learning_rates = []
        # Collect steps
        self.steps = []

        # For Confusion Matrix and ROC
        # Predicted labels for confusion matrix and ROC
        self.predictions = []
        # True labels for confusion matrix and ROC
        self.true_labels = []
        # Predicted probabilities for ROC
        self.pred_probs = []

    # Collect metrics during training
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            if "loss" in logs:
                self.losses.append((state.global_step, logs["loss"]))
            if "eval_wer" in logs:
                self.wers.append((state.global_step, logs["eval_wer"]))
            if "learning_rate" in logs:
                self.learning_rates.append((state.global_step, logs["learning_rate"]))
            if "epoch" in logs and "loss" in logs:
                self.epoch_losses.append((logs["epoch"], logs["loss"]))
            if "epoch" in logs and "eval_wer" in logs:
                self.epoch_wers.append((logs["epoch"], logs["eval_wer"]))     

    # Collect predictions and true labels for later user 
    def on_predict(self, args, state, control, predictions, **kwargs):
        # Collect the predictions and true labels for ROC and confusion matrix
        # Predicted labels
        self.predictions.extend(np.argmax(predictions.predictions, axis=-1))   
        # True labels
        self.true_labels.extend(predictions.label_ids)
        # Predicted probabilities for ROC curve
        self.pred_probs.extend(predictions.predictions)            

    # Saving a plot to the specified file using Seaborn
    def save_plot(self, x, y, xlabel, ylabel, title, filename):
        
        sns.set_theme(style="whitegrid")
        plt.figure(figsize=(8,6))
        sns.lineplot(x=x, y=y, markers="o", linewidth=2.5)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title, fontsize=14, fontweight="bold")
        save_path = os.path.join(self.output_dir, filename)
        plt.tight_layout()
        plt.savefig(save_path)
        plt.close()

    # Generate and save the plot at the end of training
    def on_train_end(self, args, state, control, **kwargs):
        if self.losses:
            steps, loss_values = zip(*self.losses)
            self.save_plot(
                steps,
                loss_values,
                xlabel="Steps",
                ylabel="Loss",
                title="Loss vs Steps",
                filename="loss_vs_steps.png"
            )
        
        if self.epoch_losses:
            epochs, epoch_loss_values = zip(*self.epoch_losses)
            self.save_plot(
                epochs,
                epoch_loss_values,
                xlabel="Epochs",
                ylabel="Loss",
                title="Loss vs Epochs",
                filename="loss_vs_epochs.png"    
            )

        if self.wers:
            steps, wer_values = zip(*self.wers)
            self.save_plot(
                steps,
                wer_values,
                xlabel="Steps",
                ylabel="WER",
                title="WER vs Steps",
                filename="wer_vs_steps.png"
            )    

        if self.epoch_wers:
            epochs, epoch_wer_values = zip(*self.epoch_wers)
            self.save_plot(
                epochs,
                epoch_wer_values,
                xlabel="Epochs",
                ylabel="WER",
                title="WER vs Epochs",
                filename="wer_vs_epochs.png"
            )

        if self.learning_rates:
            steps, lr_values = zip(*self.learning_rates)
            self.save_plot(
                steps,
                lr_values,
                xlabel="Steps",
                ylabel="Learning Rate",
                title="Learning Rate vs Steps",
                filename="learning_rate_vs_steps.png"
            )

        # Generate and save Confusion Matrix and ROC curve at the end of training
        self.generate_confusion_matrix()
        self.generate_roc_curve()    
    
        print(f"All plots saved in: {self.output_dir}")

    # Confusion Matrix
    def generate_confusion_matrix(self):
        cm = confusion_matrix(self.true_labels, self.predictions)
        self.save_confusion_matrix(cm)    

    def save_confusion_matrix(self, cm):
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
        plt.xlabel('Predicted Labels')
        plt.ylabel('True Labels')
        plt.title('Confusion Matrix')
        plt.savefig(os.path.join(self.output_dir, "confusion_matrix.png"))
        plt.close()    

    # ROC Curve
    def generate_roc_curve(self):
        # Convert logits to probabilities for ROC
        # shape: (num_samples, num_classes)
        pred_probs = np.array(self.pred_probs)
        # Use the positive class (index 1 for binary)
        fpr, tpr, _ = roc_curve(self.true_labels, pred_probs[:, 1])
        roc_auc = auc(fpr, tpr)

        self.save_roc_curve(fpr, tpr, roc_auc)

    def save_roc_curve(self, fpr, tpr, roc_auc):
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='lower right')
        plt.grid(True)
        plt.savefig(os.path.join(self.output_dir, "roc_curve.png"))
        plt.close() 
            

In [222]:
# Setting up the training arguments

training_args = TrainingArguments(
    output_dir="../models/6th_test/",
    # Group sequences by length to optimize training
    group_by_length=True,
    # Lowered batch size due to limited data and memory
    per_device_train_batch_size=4,
    # Accumulate gradients over multiple steps to effectively increase batch size
    gradient_accumulation_steps=2,
    # Evaluate the model at specific intervals
    # evaluation_strategy="steps",
    evaluation_strategy="epoch",
    # Epochs must be 1% of the total data. For example: if we have a total of 100 entries in a dataset the epoch would be 1
    num_train_epochs=2,
    # Enable gradient checkpointing to save GPU memory
    gradient_checkpointing=True,
    # Use mixed precision training for reducing memory usage
    fp16=True,
    # # Save model every 400 steps for frequent checkpoints
    # save_steps=1000,
    save_strategy = "epoch",
    # # # Evaluate model every 400 steps
    # eval_steps=200,
    # Log training progress every 400 steps
    logging_steps=50,
    learning_rate=1e-5,
    save_total_limit=2,
    push_to_hub=False,
    greater_is_better=False,
    load_best_model_at_end=True,
)




In [223]:
# Intitialize the custom callback
output_dir = "../classification_report/"
stats_callback = PlottingCallback(output_dir)

# Passing all the instances to the Trainer
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=prep_train_subset,
    eval_dataset=prep_validate_subset,
    # Use the processor for both tokenization and feature extraction
    tokenizer=processor.feature_extractor,
    callbacks=[stats_callback],
)

  trainer = Trainer(


In [224]:
trainer.train()

  0%|          | 0/500 [00:00<?, ?it/s]



{'loss': 3.194, 'grad_norm': 7.919771194458008, 'learning_rate': 9.040000000000002e-06, 'epoch': 0.2}
{'loss': 3.1338, 'grad_norm': 11.010995864868164, 'learning_rate': 8.040000000000001e-06, 'epoch': 0.4}
{'loss': 3.1042, 'grad_norm': 14.673157691955566, 'learning_rate': 7.04e-06, 'epoch': 0.6}
{'loss': 3.0635, 'grad_norm': 3.5633761882781982, 'learning_rate': 6.040000000000001e-06, 'epoch': 0.8}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'loss': 3.041, 'grad_norm': 2.74058198928833, 'learning_rate': 5.04e-06, 'epoch': 1.0}


  0%|          | 0/75 [00:00<?, ?it/s]

{'eval_loss': 3.0733189582824707, 'eval_wer': 1.0, 'eval_runtime': 23.1471, 'eval_samples_per_second': 25.921, 'eval_steps_per_second': 3.24, 'epoch': 1.0}




{'loss': 3.0377, 'grad_norm': 2.6924924850463867, 'learning_rate': 4.04e-06, 'epoch': 1.2}
{'loss': 3.0208, 'grad_norm': 7.609466075897217, 'learning_rate': 3.04e-06, 'epoch': 1.4}
{'loss': 3.0231, 'grad_norm': 10.1987886428833, 'learning_rate': 2.04e-06, 'epoch': 1.6}
{'loss': 3.0177, 'grad_norm': 3.157449722290039, 'learning_rate': 1.04e-06, 'epoch': 1.8}
{'loss': 3.0128, 'grad_norm': 3.211790084838867, 'learning_rate': 4e-08, 'epoch': 2.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/75 [00:00<?, ?it/s]

{'eval_loss': 3.0392439365386963, 'eval_wer': 1.0, 'eval_runtime': 22.9684, 'eval_samples_per_second': 26.123, 'eval_steps_per_second': 3.265, 'epoch': 2.0}
{'train_runtime': 295.8887, 'train_samples_per_second': 13.519, 'train_steps_per_second': 1.69, 'train_loss': 3.0648519592285157, 'epoch': 2.0}


ValueError: zero-size array to reduction operation fmin which has no identity

<Figure size 800x600 with 0 Axes>

In [None]:
# save_directory = "../models/Saved/"
# model.save_pretrained(save_directory)
# processor.save_pretrained(save_directory)

In [None]:
# # Load pretrained model checkpoint
# model = AutoModelForCTC.from_pretrained("../models/Saved/")
# processor = Wav2Vec2Processor.from_pretrained("../models/Saved/")

In [None]:
# cv_test_subset