# Installations

Install and unzip TIMIT dataset from google drive

In [1]:
# ! gdown 15Nq4PdOY7h8AP54ge3EurUPKa91jvezc

In [2]:
# ! unzip timit.zip -d ./timit/

In [3]:
import os
import random
import json
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import IPython.display as ipd
import librosa
import soundfile as sf
from datasets import Audio, Dataset

import torch
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer
from transformers.trainer_utils import EvalPrediction
from dataclasses import dataclass
from jiwer import wer


from typing import List, Union, Set, Dict
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


# Dataset

This section contains a modified version of the original function, adapted from [here](https://www.kaggle.com/code/vitouphy/phoneme-recognition-with-wav2vec2/notebook) and [here](https://huggingface.co/blog/fine-tune-wav2vec2-english) to fit our specific task requirements.

In [4]:
timit_path = "./timit/"
timit_data_path = "./timit/data/"

In [5]:
df_train = pd.read_csv(os.path.join(timit_path, 'train_data.csv'))
df_test = pd.read_csv(os.path.join(timit_path, 'test_data.csv'))
df = pd.concat([df_train, df_test])
df = df[df['is_converted_audio'] == False]

In [6]:
df_train = df[df['test_or_train'] == "TRAIN"]
df_test  = df[df['test_or_train'] == "TEST"]

In [7]:
df_train

Unnamed: 0,index,test_or_train,dialect_region,speaker_id,filename,path_from_data_dir,path_from_data_dir_windows,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
1,2.0,TRAIN,DR4,MMDM0,SI1311.PHN,TRAIN/DR4/MMDM0/SI1311.PHN,TRAIN\\DR4\\MMDM0\\SI1311.PHN,False,False,False,True,False
2,3.0,TRAIN,DR4,MMDM0,SI1311.WRD,TRAIN/DR4/MMDM0/SI1311.WRD,TRAIN\\DR4\\MMDM0\\SI1311.WRD,False,False,True,False,False
3,4.0,TRAIN,DR4,MMDM0,SX321.PHN,TRAIN/DR4/MMDM0/SX321.PHN,TRAIN\\DR4\\MMDM0\\SX321.PHN,False,False,False,True,False
4,5.0,TRAIN,DR4,MMDM0,SX321.WRD,TRAIN/DR4/MMDM0/SX321.WRD,TRAIN\\DR4\\MMDM0\\SX321.WRD,False,False,True,False,False
5,6.0,TRAIN,DR4,MMDM0,SI681.TXT,TRAIN/DR4/MMDM0/SI681.TXT,TRAIN\\DR4\\MMDM0\\SI681.TXT,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
23093,23094.0,TRAIN,DR8,MRDM0,SX245.TXT,TRAIN/DR8/MRDM0/SX245.TXT,TRAIN\\DR8\\MRDM0\\SX245.TXT,False,False,False,,True
23094,23095.0,TRAIN,DR8,MRDM0,SI1044.PHN,TRAIN/DR8/MRDM0/SI1044.PHN,TRAIN\\DR8\\MRDM0\\SI1044.PHN,False,False,False,,False
23096,23097.0,TRAIN,DR8,MRDM0,SX245.WAV,TRAIN/DR8/MRDM0/SX245.WAV,TRAIN\\DR8\\MRDM0\\SX245.WAV,False,True,False,,False
23098,23099.0,TRAIN,DR8,MRDM0,SX335.WAV,TRAIN/DR8/MRDM0/SX335.WAV,TRAIN\\DR8\\MRDM0\\SX335.WAV,False,True,False,,False


In [8]:
df_test

Unnamed: 0,index,test_or_train,dialect_region,speaker_id,filename,path_from_data_dir,path_from_data_dir_windows,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
0,1.0,TEST,DR4,MGMM0,SX139.WAV,TEST/DR4/MGMM0/SX139.WAV,TEST\\DR4\\MGMM0\\SX139.WAV,False,True,False,False,False
2,3.0,TEST,DR4,MGMM0,SX139.TXT,TEST/DR4/MGMM0/SX139.TXT,TEST\\DR4\\MGMM0\\SX139.TXT,False,False,False,False,True
3,4.0,TEST,DR4,MGMM0,SI499.WRD,TEST/DR4/MGMM0/SI499.WRD,TEST\\DR4\\MGMM0\\SI499.WRD,False,False,True,False,False
4,5.0,TEST,DR4,MGMM0,SX319.WRD,TEST/DR4/MGMM0/SX319.WRD,TEST\\DR4\\MGMM0\\SX319.WRD,False,False,True,False,False
5,6.0,TEST,DR4,MGMM0,SX319.PHN,TEST/DR4/MGMM0/SX319.PHN,TEST\\DR4\\MGMM0\\SX319.PHN,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8394,8395.0,TEST,DR8,MPAM0,SX19.WAV,TEST/DR8/MPAM0/SX19.WAV,TEST\\DR8\\MPAM0\\SX19.WAV,False,True,False,False,False
8395,8396.0,TEST,DR8,MPAM0,SX109.TXT,TEST/DR8/MPAM0/SX109.TXT,TEST\\DR8\\MPAM0\\SX109.TXT,False,False,False,False,True
8397,8398.0,TEST,DR8,MPAM0,SX289.WRD,TEST/DR8/MPAM0/SX289.WRD,TEST\\DR8\\MPAM0\\SX289.WRD,False,False,True,False,False
8398,8399.0,TEST,DR8,MPAM0,SX109.WAV,TEST/DR8/MPAM0/SX109.WAV,TEST\\DR8\\MPAM0\\SX109.WAV,False,True,False,False,False


In this part, we will process the raw data to structure the dataset into three columns:

1. Path to the audio file
2. Text from the audio
3. Corresponding phonemes

Additionally, we will filter out any samples lacking phoneme data, as they cannot be used for training or testing without labels.

In [9]:
PHONETIC_KEY: str = "phonetic"
AUDIO_KEY:    str = "audio"
WORD_KEY:     str = "word"


def convert_phon61_to_phon39(sentence: str) -> str:
    phon61_map39 = {
        'iy':'iy',  'ih':'ih',  'eh':'eh',  'ae':'ae',   'ix':'ih',  'ax':'ah',  'ah':'ah',   'uw':'uw',
        'ux':'uw',  'uh':'uh',  'ao':'aa',  'aa':'aa',   'ey':'ey',  'ay':'ay',  'oy':'oy',   'aw':'aw',
        'ow':'ow',  'l':'l',    'el':'l',   'r':'r',     'y':'y',    'w':'w',    'er':'er',   'axr':'er',
        'm':'m',    'em':'m',   'n':'n',    'nx':'n',    'en':'n',   'ng':'ng',  'eng':'ng',  'ch':'ch',
        'jh':'jh',  'dh':'dh',  'b':'b',    'd':'d',     'dx':'dx',  'g':'g',    'p':'p',     't':'t',
        'k':'k',    'z':'z',    'zh':'sh',  'v':'v',     'f':'f',    'th':'th',  's':'s',     'sh':'sh',
        'hh':'hh',  'hv':'hh',  'pcl':'h#', 'tcl':'h#',  'kcl':'h#', 'qcl':'h#', 'bcl':'h#',  'dcl':'h#',
        'gcl':'h#', 'h#':'h#',  '#h':'h#',  'pau':'h#',  'epi':'h#', 'nx':'n',   'ax-h':'ah', 'q':'h#'
    }
    tokens = [phon61_map39[x] for x in sentence.split()]
    return " ".join(tokens)


def read_text_file(filepath: str) -> str:
    with open(filepath) as f:
        tokens = [line.split()[-1] for line in f]
        return " ".join(tokens)


def merge_item_data(df: pd.DataFrame) -> Dict[str, Dict[str, str]]:
    data = {}

    for _, row in tqdm(df.iterrows()):
        path = row['path_from_data_dir']
        entry_id = path.split('.')[0]
        entry_id = "-".join(entry_id.split('/')[-2:])

        if entry_id not in data:
            data[entry_id] = {}

        if row['is_audio'] is True:
            data[entry_id][AUDIO_KEY] = os.path.join(timit_data_path, path)
        elif row['is_word_file'] is True:
            data[entry_id][WORD_KEY] = read_text_file(os.path.join(timit_data_path, path))
        elif row['is_phonetic_file'] is True:
            phoneme_sentence = read_text_file(os.path.join(timit_data_path, path))
            data[entry_id][PHONETIC_KEY] = convert_phon61_to_phon39(phoneme_sentence)
    return data


def transform_dataset(df: pd.DataFrame) -> Dict[str, List[str]]:
    merged_data = merge_item_data(df)

    phonetic_files: List[str] = []
    audio_files:    List[str] = []
    word_files:     List[str] = []
    keys_set:       Set[str]  = set([PHONETIC_KEY, AUDIO_KEY, WORD_KEY])

    for _, value in merged_data.items():
        if set(value) == keys_set:
            phonetic_files.append(value[PHONETIC_KEY])
            audio_files.append(value[AUDIO_KEY])
            word_files.append(value[WORD_KEY])

    return {
        AUDIO_KEY: audio_files,
        WORD_KEY: word_files,
        PHONETIC_KEY: phonetic_files
    }

In [10]:
train_val = transform_dataset(df_train)
test = transform_dataset(df_test)

18480it [00:02, 7625.03it/s]
6720it [00:00, 7093.21it/s]


In [11]:
random.seed(42)
n = len(train_val[AUDIO_KEY])
train_n = int(n * 0.9)
val_n = n - train_n

val_ind = np.array(sorted(random.sample(range(0, n), val_n)))
train_ind = np.array(sorted(list(set(range(0, n)) - set(val_ind))))

In [12]:
train = dict()
val = dict()

for key in [AUDIO_KEY, WORD_KEY, PHONETIC_KEY]:
    train[key] = np.array(train_val[key])[train_ind].tolist()
    val[key]   = np.array(train_val[key])[val_ind].tolist()

In [13]:
print(f"Train set length: {len(train[AUDIO_KEY])}")
print(f"Validation set length: {len(val[AUDIO_KEY])}")
print(f"Test set length: {len(test[AUDIO_KEY])}")

Train set length: 1512
Validation set length: 168
Test set length: 1680


Now let's convert the data to a data type that the model will work with.

In [14]:
train_dataset = Dataset.from_dict(train)
val_dataset   = Dataset.from_dict(val)
test_dataset  = Dataset.from_dict(test)

In [15]:
train_dataset = train_dataset.cast_column(AUDIO_KEY, Audio(sampling_rate=16_000))
val_dataset   = val_dataset.cast_column(AUDIO_KEY, Audio(sampling_rate=16_000))
test_dataset  = test_dataset.cast_column(AUDIO_KEY, Audio(sampling_rate=16_000))

In [16]:
ind = 10

print("Text:", train_dataset[ind][WORD_KEY])
print("Phonetics:", train_dataset[ind][PHONETIC_KEY])
print("Input array shape:", train_dataset[ind][AUDIO_KEY]["array"].shape)
print("Sampling rate:", train_dataset[ind][AUDIO_KEY]["sampling_rate"])
ipd.Audio(data=train_dataset[ind][AUDIO_KEY]["array"], autoplay=False, rate=16000)

Text: get a calico cat to keep
Phonetics: h# g ih dx ih h# k ae l ih h# k ow h# k ae h# t ah h# k iy h# p h#
Input array shape: (32359,)
Sampling rate: 16000


#### Phonemes vocabulary

In [17]:
train_phonetics = [phone for x in train_dataset for phone in x[PHONETIC_KEY].split()]
val_phonetics   = [phone for x in val_dataset for phone in x[PHONETIC_KEY].split()]
test_phonetics  = [phone for x in test_dataset for phone in x[PHONETIC_KEY].split()]

In [18]:
vocab_list = list(set(train_phonetics) | set(val_phonetics) | set(test_phonetics) | set([' ']))

vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
print(vocab_dict)

{' ': 0, 'aa': 1, 'ae': 2, 'ah': 3, 'aw': 4, 'ay': 5, 'b': 6, 'ch': 7, 'd': 8, 'dh': 9, 'dx': 10, 'eh': 11, 'er': 12, 'ey': 13, 'f': 14, 'g': 15, 'h#': 16, 'hh': 17, 'ih': 18, 'iy': 19, 'jh': 20, 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'ng': 25, 'ow': 26, 'oy': 27, 'p': 28, 'r': 29, 's': 30, 'sh': 31, 't': 32, 'th': 33, 'uh': 34, 'uw': 35, 'v': 36, 'w': 37, 'y': 38, 'z': 39}


In [19]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

42


In [20]:
with open('./vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [21]:
del df
del df_test
del df_train

# Modeling

In [22]:
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, 
                                             sampling_rate=16000, 
                                             padding_value=0.0, 
                                             do_normalize=True, 
                                             return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [23]:
def prepare_dataset(batch):
    audio = batch[AUDIO_KEY]
    
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["phonetic"]).input_ids
    return batch

In [24]:
train_dataset = train_dataset.map(prepare_dataset)
val_dataset   = val_dataset.map(prepare_dataset)
test_dataset  = test_dataset.map(prepare_dataset)

Map: 100%|██████████| 1512/1512 [01:08<00:00, 21.98 examples/s]
Map: 100%|██████████| 168/168 [00:07<00:00, 23.37 examples/s]
Map: 100%|██████████| 1680/1680 [01:11<00:00, 23.57 examples/s]


In [25]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch

In [26]:
def calculate_metric(eval_pred: EvalPrediction) -> Dict[str, float]:
    pred_logits = eval_pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    
    eval_pred.label_ids[eval_pred.label_ids == -100] = tokenizer.pad_token_id
    
    pred_str = tokenizer.batch_decode(pred_ids)
    label_str = tokenizer.batch_decode(eval_pred.label_ids, group_tokens=False)
    
    per = wer(hypothesis=pred_str, reference=label_str)
    return {
        "per": per
    }

In [28]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base", 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size = len(vocab_dict)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
model.freeze_feature_encoder()

In [30]:
LEARNING_RATE = 1e-4
BATCH_SIZE    = 8
EPOCHS        = 20

training_args = TrainingArguments(
    output_dir="./results",
    push_to_hub=False,
    group_by_length=True,
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    eval_strategy="steps",
    fp16=True,
    gradient_checkpointing=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=LEARNING_RATE,
    warmup_steps=1000,
    load_best_model_at_end=True,
    no_cuda=False
    )

In [31]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=calculate_metric,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.feature_extractor
)

  trainer = Trainer(


In [34]:
trainer.train()



Step,Training Loss,Validation Loss,Per
500,3.9835,1.47457,1.005034
1000,0.7687,0.432232,0.75
1500,0.3931,0.407089,0.696309
2000,0.2737,0.43762,0.677852
2500,0.2081,0.458755,0.652685
3000,0.1574,0.476359,0.674497
3500,0.1236,0.501159,0.666107




TrainOutput(global_step=3780, training_loss=0.7892151948636171, metrics={'train_runtime': 1431.5519, 'train_samples_per_second': 21.124, 'train_steps_per_second': 2.64, 'total_flos': 8.573756131965979e+17, 'train_loss': 0.7892151948636171, 'epoch': 20.0})

In [39]:
trainer.evaluate(test_dataset)



{'eval_loss': 0.40407517552375793,
 'eval_per': 0.6711718874224837,
 'eval_runtime': 48.1282,
 'eval_samples_per_second': 34.907,
 'eval_steps_per_second': 4.363,
 'epoch': 20.0}