# HW1: Frame-Level Speech Recognition

## Dataset and Preparation
### imports

In [1]:
import torch
import numpy as np
# from torchsummaryX import summary
import torchinfo
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime
import wandb
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print('Device:', 'cpu' if device == 'cpu' else torch.cuda.get_device_name(0))

Device: NVIDIA GeForce RTX 4090


### Common Configs

In [2]:
def recursive_mkdir(dir_path):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
        print(f"Created directory: {dir_path}")
    else:
        print(f"Directory already exists: {dir_path}")

PHONEMES = [
            '[SIL]',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH',    '[SOS]', '[EOS]']
DATA_ROOT = "/mnt/e/Workspace/IDL/Data/hw1/11-785-s24-hw1p2/"


## Dataset

In [3]:
from torch.utils.data import Dataset, DataLoader
import time

class AudioDataset(Dataset):
    def __init__(self, root=DATA_ROOT, phonemes = PHONEMES, context=0, partition= "train-clean-100", use_cmn=False): #TODO: make no-pad works (BF2042 meme huh?) 
        self.max_context_length = 1145 #Magic number
        self.set_context_length(context)
        self.phonemes   = phonemes
        
        self.num_phonemes = len(self.phonemes)
        
        self.mfccs, self.transcripts = self._init_data(f"{root}/{partition}", use_cmn=use_cmn)
        self.length = len(self.mfccs)
        
        self.mfccs = np.concatenate([np.zeros((self.max_context_length, 27)), self.mfccs, np.zeros((self.max_context_length, 27))], axis=0)
        self.transcripts = np.concatenate([
                [self.phonemes.index('[SIL]') for _ in range(self.max_context_length)], 
                self.transcripts, 
                [self.phonemes.index('[SIL]') for _ in range(self.max_context_length)]
            ],axis=0)
        assert len(self.mfccs) == len(self.transcripts)
    
    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        start = time.perf_counter_ns()
        # given current context length, compute offset:
        lower = idx + self.max_context_length - self.context
        upper = idx + self.max_context_length + self.context + 1
            

        # TODO: Based on context and offset, return a frame at given index with context frames to the left, and right.
        frames = self.mfccs[lower:upper]
        
        # After slicing, you get an array of shape 2*context+1 x 27. But our MLP needs 1d data and not 2d.
        frames = frames.flatten() # Reshape to 1d array

        frames      = torch.FloatTensor(frames) # Convert to tensors
        phonemes    = torch.tensor(self.transcripts[idx + self.max_context_length]) # Get the phoneme at the index

        return frames, phonemes
        
        
    def _init_data(self, root: str, use_cmn = False):
        self.mfcc_dir       = f"{root}/mfcc"
        self.transcript_dir = f"{root}/transcript"
        mfcc_names          = os.listdir(self.mfcc_dir)
        transcript_names    = os.listdir(self.transcript_dir)
        
        assert len(mfcc_names) == len(transcript_names)

        self.mfccs, self.transcripts = [], []
        for i in tqdm(range(len(mfcc_names))):
        #   Load a single mfcc
            mfcc        = np.load(f"{self.mfcc_dir}/{mfcc_names[i]}")
        #   Do Cepstral Normalization of mfcc (explained in writeup)
            if use_cmn:
                mfcc = mfcc - np.mean(mfcc, axis=0)
        #   Load the corresponding transcript
            transcript  = np.load(f"{self.transcript_dir}/{transcript_names[i]}") 
            # Remove [SOS] and [EOS] from the transcript
            assert transcript[0] == '[SOS]' and transcript[-1] == '[EOS]'
            transcript = transcript[1:-1]
            #lookup phoneme index
            transcript = np.vectorize(self.phonemes.index)(transcript)
            assert len(mfcc) == len(transcript)
            # (Is there an efficient way to do this without traversing through the transcript?)
            # Note that SOS will always be in the starting and EOS at end, as the name suggests.
        #   Append each mfcc to self.mfcc, transcript to self.transcript
            self.mfccs.append(mfcc)
            self.transcripts.append(transcript)
            
        return np.concatenate(self.mfccs, axis=0), np.concatenate(self.transcripts, axis=0)
    
    def set_context_length(self, context):
        self.context = context
        
    def phoneme_reverse_lookup(self, idx: torch.tensor) -> str:
        return self.phonemes[idx]
         

In [4]:
class AudioTestDataset(AudioDataset):
    
    def _init_data(self, root: str, use_cmn):
        
        self.mfcc_dir = f"{root}/mfcc"

        mfcc_names = os.listdir(self.mfcc_dir)

        self.mfccs, self.transcripts = [], []
        
        for i in tqdm(range(len(mfcc_names))):
        #   Load a single mfcc
            mfcc        = np.load(f"{self.mfcc_dir}/{mfcc_names[i]}")
            transcript = np.array([0 for _ in range(len(mfcc))])
            
            assert len(mfcc) == len(transcript)
            
            self.mfccs.append(mfcc)
            self.transcripts.append(transcript)
            
        return np.concatenate(self.mfccs, axis=0), np.concatenate(self.transcripts, axis=0)
    
    def __getitem__(self, ind):
        return super().__getitem__(ind)[0]

### Create Dataset and tests

In [5]:
test_context = 5

train_data = AudioDataset(partition="train-clean-100", context=test_context, use_cmn=True)

val_data = AudioDataset(partition="dev-clean", context=test_context, use_cmn=True)

test_data = AudioTestDataset(partition="test-clean", context=test_context, use_cmn=True)

  0%|          | 0/28539 [00:00<?, ?it/s]

100%|██████████| 28539/28539 [03:09<00:00, 150.51it/s]
100%|██████████| 2703/2703 [00:21<00:00, 128.64it/s]
100%|██████████| 2620/2620 [00:09<00:00, 280.22it/s]


In [6]:
# Tests:

f, p = val_data[0]
print('Sample frame shape:', f.shape, 'Sample phoneme shape:', p.shape, 'phoneme type:', p.dtype)
f = test_data[0]
print('Sample frame shape:', f.shape)

Sample frame shape: torch.Size([297]) Sample phoneme shape: torch.Size([]) phoneme type: torch.int64
Sample frame shape: torch.Size([297])


# Model definition

In [7]:
from torch import nn

class NetV2(torch.nn.Module):
    def __init__(self, input_size, output_size, hidden_sizes, dropout_rate):
        super(NetV2, self).__init__()
        self.layers = []
        for i, hs in enumerate(hidden_sizes):
            self.layers.append(self._mlp_layer_provider(input_size, hs, dropout_rate))
            input_size = hs
        self.layers.append(nn.Linear(input_size, output_size)) # output
        self.model = nn.Sequential(*self.layers)
    
    def forward(self, x):
        out = self.model(x)
        return out
    
    def _mlp_layer_provider(self, input_size, hidden_size, dropout_rate):
        return nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.LeakyReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(dropout_rate)
        )
    

In [8]:
class NetV21(NetV2):
    def _mlp_layer_provider(self, input_size, hidden_size, dropout_rate):
        return nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.Mish(),
            nn.Dropout(dropout_rate)
        )

# Training
## Configs

In [9]:
config = {
    'sizes': np.array([1,2,4,2,1]) * 512, # 5 layers
    'epochs'        : 30,
    'batch_size'    : 32768,
    'context'       : 20,
    'init_lr'       : 2e-3,
    'architecture'  : 'v2_1_5layers',
    'dropout'       : 0.2,
    'weight_decay'  : 1e-5,
    'scheduler_params'     : {'patience': 7, 'factor': 0.2, 'min-lr': 1e-7},
    # Add more as you need them - e.g dropout values, weight decay, scheduler parameters
    'wandb_name': 'v2_1_5layers_4k_bs'
}

train_data.set_context_length(config['context'])
val_data.set_context_length(config['context'])
test_data.set_context_length(config['context'])

MODEL_ROOT = "/mnt/e/Workspace/IDL/Models/hw1/11-785-s24-hw1p2/v2_1_5l_4k_bs_256_size"
recursive_mkdir(MODEL_ROOT)

train_loader = torch.utils.data.DataLoader(
    dataset     = train_data,
    num_workers = 4,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = True
)

val_loader = torch.utils.data.DataLoader(
    dataset     = val_data,
    num_workers = 4,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
)

test_loader = torch.utils.data.DataLoader(
    dataset     = test_data,
    num_workers = 4,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
)


print("Batch size     : ", config['batch_size'])
print("Context        : ", config['context'])
print("Input size     : ", (2*config['context']+1)*27)
print("Output symbols : ", len(PHONEMES))

print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))


Created directory: /mnt/e/Workspace/IDL/Models/hw1/11-785-s24-hw1p2/v2_1_5l_4k_bs_256_size
Batch size     :  32768
Context        :  20
Input size     :  1107
Output symbols :  42
Train dataset samples = 36091157, batches = 1102
Validation dataset samples = 1928204, batches = 59
Test dataset samples = 1934138, batches = 60


In [10]:
# Testing code to check if your data loaders are working
for i, data in enumerate(train_loader):
    frames, phoneme = data
    print(frames.shape, phoneme.shape)
    break

torch.Size([32768, 1107]) torch.Size([32768])


In [11]:
# Summary:
INPUT_SIZE = (2*config['context']+1)*27
# model = NetV2(input_size=INPUT_SIZE, output_size=len(PHONEMES), hidden_sizes=config['sizes'], dropout_rate=config['dropout']).to(device)
model = NetV21(input_size=INPUT_SIZE, output_size=len(PHONEMES), hidden_sizes=config['sizes'], dropout_rate=config['dropout']).to(device)
torchinfo.summary(model, input_size=(config['batch_size'], INPUT_SIZE), device=device)

Layer (type:depth-idx)                   Output Shape              Param #
NetV21                                   [32768, 42]               --
├─Sequential: 1-1                        [32768, 42]               --
│    └─Sequential: 2-1                   [32768, 512]              --
│    │    └─Linear: 3-1                  [32768, 512]              567,296
│    │    └─Mish: 3-2                    [32768, 512]              --
│    │    └─Dropout: 3-3                 [32768, 512]              --
│    └─Sequential: 2-2                   [32768, 1024]             --
│    │    └─Linear: 3-4                  [32768, 1024]             525,312
│    │    └─Mish: 3-5                    [32768, 1024]             --
│    │    └─Dropout: 3-6                 [32768, 1024]             --
│    └─Sequential: 2-3                   [32768, 2048]             --
│    │    └─Linear: 3-7                  [32768, 2048]             2,099,200
│    │    └─Mish: 3-8                    [32768, 2048]             -

In [12]:
criterion = torch.nn.CrossEntropyLoss() # Defining Loss function.
# We use CE because the task is multi-class classification

optimizer = torch.optim.Adam(model.parameters(), lr= config['init_lr'], weight_decay=config['weight_decay'], ) #Defining Optimizer

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    patience=config['scheduler_params']['patience'], 
    min_lr=config['scheduler_params']['min-lr'], 
    factor=config['scheduler_params']['factor'],
    verbose=True
)

In [13]:
torch.cuda.empty_cache()
gc.collect()

21

# Training
## funcs

In [14]:
from torch.cuda.amp import GradScaler, autocast

#A epoch:
def train(model, dataloader, optimizer, criterion, scheduler, logger, log_freq=100, use_amp=False):
    if use_amp: raise NotImplementedError("AMP not implemented yet")
    else: return train_no_amp(model, dataloader, optimizer, criterion, scheduler, logger, log_freq)
    
def train_no_amp(model, dataloader, optimizer, criterion, scheduler, logger, log_freq=100):
    model.train()
    total_loss, total_acc = 0.0, 0.0
    
    batch_bar   = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')
    for i, data in enumerate(dataloader):
        frames, phonemes = data
        frames, phonemes = frames.to(device), phoneme.to(device)
        
        optimizer.zero_grad()
        logits = model(frames)
        loss = criterion(logits, phonemes)
        loss.backward()
        optimizer.step()
        # scheduler.step(loss)
        
        total_loss += loss.item()
        total_acc += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]
        
        del frames, phonemes, logits
        
        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))),
                                acc="{:.04f}%".format(float(total_acc*100 / (i + 1))))
        batch_bar.update()
        
        #Switch:
        # raise EOFError('Kill switch')
        
        if (i+1) % log_freq == 0:
            logger(i, total_loss / (i+1), total_acc / (i+1))
    
    batch_bar.close()
    total_loss   /= len(train_loader)
    total_acc /= len(train_loader)
    return total_loss, total_acc

In [15]:
def eval(model, dataloader):

    model.eval() # set model in evaluation mode
    vloss, vacc = 0, 0 # Monitoring loss and accuracy
    batch_bar   = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Move data to device (ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        # makes sure that there are no gradients computed as we are not training the model now
        with torch.inference_mode():
            ### Forward Propagation
            logits  = model(frames)
            ### Loss Calculation
            loss    = criterion(logits, phonemes)

        vloss   += loss.item()
        vacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        # Do you think we need loss.backward() and optimizer.step() here?

        batch_bar.set_postfix(loss="{:.04f}".format(float(vloss / (i + 1))),
                              acc="{:.04f}%".format(float(vacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()
        

    batch_bar.close()
    vloss   /= len(val_loader)
    vacc    /= len(val_loader)

    return vloss, vacc

## wandb run

In [16]:
wandb.login(key="a07bacf1f6490c2d1a0d4e22dd08701319310f93") 
run = wandb.init(
    name    = config['wandb_name'], ### Wandb creates random run names if you skip this field, we recommend you give useful names
    reinit  = True, ### Allows reinitalizing runs when you re-run this cell
    #id     = "y28t31uz", ### Insert specific run id here if you want to resume a previous run
    #resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw1p2", ### Project should be created in your wandb account
    config  = config, ### Wandb Config for your run
    dir = MODEL_ROOT ### Wandb local directory
)

model_arch  = str(model)

# ### Save it in a txt file
arch_file   = open(f"{MODEL_ROOT}/model_arch.txt", "w")
file_write  = arch_file.write(model_arch)
arch_file.close()

# ### log it in your wandb run with wandb.save()
wandb.save(arch_file.name)

def wandb_logger(epoch, loss, acc):
    wandb.log({"train_epoch": epoch, "train_loss": loss, "train_acc": acc})


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mzzyatcmu[0m ([33mschool_stuff[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/zzy/.netrc




## Experiment

In [17]:
torch.cuda.empty_cache()
gc.collect()
wandb.watch(model, log="all")


for epoch in range(config['epochs']):
    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    curr_lr                 = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc   = train(model, train_loader, optimizer, criterion, scheduler, logger=wandb_logger, log_freq=100, use_amp=False)
    scheduler.step(train_loss)
    val_loss, val_acc       = eval(model, val_loader)

    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    ### Log metrics at each epoch in your run
    # Optionally, you can log at each batch inside train/eval functions
    # (explore wandb documentation/wandb recitation)
    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss,
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})
    

    ### Highly Recommended: Save checkpoint in drive and/or wandb if accuracy is better than your current best
    torch.save(model.state_dict(), f"{MODEL_ROOT}/model_{epoch}.cpt")

### Finish your wandb run
run.finish()


Epoch 1/30


Train:   8%|▊         | 86/1102 [00:11<02:03,  8.23it/s, acc=16.2386%, loss=3.4907]

KeyboardInterrupt: 

Train:   8%|▊         | 86/1102 [00:25<02:03,  8.23it/s, acc=16.2386%, loss=3.4907]

# Test and Submission

In [None]:
def test(model, test_loader):
    ### What you call for model to perform inference?
    model.eval() # TODO train or eval?

    ### List to store predicted phonemes of test data
    test_predictions = []

    ### Which mode do you need to avoid gradients?
    with torch.no_grad(): 

        for i, mfccs in enumerate(tqdm(test_loader)):

            mfccs   = mfccs.to(device)

            logits  = model(mfccs)

            ### Get most likely predicted phoneme with argmax
            max_idxs = torch.argmax(logits, dim=1)
            
            predicted_phonemes = [test_loader.dataset.phoneme_reverse_lookup(max_idx) for max_idx in max_idxs]

            ### How do you store predicted_phonemes with test_predictions? Hint, look at eval
            test_predictions.extend(predicted_phonemes)
    return test_predictions

In [None]:
predictions = test(model, test_loader)
with open("./submission_latest.csv", "w+") as f:
    f.write("id,label\n")
    for i in range(len(predictions)):
        f.write("{},{}\n".format(i, predictions[i]))