# HW1: Frame-Level Speech Recognition

# In this homework, you will be working with MFCC data consisting of 28 features at each time step/frame. Your model should be able to recognize the phoneme occured in that frame.

# Libraries

In [1]:
!pip install torchsummaryX==1.1.0 wandb --quiet

In [2]:
!pip install pandas



In [3]:
import torch
import numpy as np
from torchsummaryX import summary
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm.auto import tqdm
import os
import datetime
import wandb
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


In [5]:
''' If you are using colab, you can import google drive to save model checkpoints in a folder
    If you want to use it, uncomment the two lines below
'''
# from google.colab import drive
# drive.mount('/content/drive')

' If you are using colab, you can import google drive to save model checkpoints in a folder\n    If you want to use it, uncomment the two lines below\n'

In [6]:
### PHONEME LIST
PHONEMES = [
            '[SIL]',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH',    '[SOS]', '[EOS]']

# Kaggle

This section contains code that helps you install kaggle's API, creating kaggle.json with you username and API key details. Make sure to input those in the given code to ensure you can download data from the competition successfully.

In [None]:
#!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
#!mkdir /root/.kaggle

#with open("/root/.kaggle/kaggle.json", "w+") as f:
    #f.write('{"username":"Replace this with your Kaggle Username","key":"Replace this with your kaggle API key"}')
    # Put your kaggle username & key here

#!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# commands to download data from kaggle
#!kaggle competitions download -c 11785-hw1p2-f24

#!unzip -qo /content/11785-hw1p2-f24.zip -d '/content'

# Dataset

This section covers the dataset/dataloader class for speech data. You will have to spend time writing code to create this class successfully. We have given you a lot of comments guiding you on what code to write at each stage, from top to bottom of the class. Please try and take your time figuring this out, as it will immensely help in creating dataset/dataloader classes for future homeworks.

Before running the following cells, please take some time to analyse the structure of data. Try loading a single MFCC and its transcipt, print out the shapes and print out the values. Do the transcripts look like phonemes?

In [7]:
class AudioDataset(torch.utils.data.Dataset):

    def __init__(self, root, phonemes = PHONEMES, context=0, partition= "train-clean-100"): 

        self.context = context
        self.phonemes = phonemes

        self.mfcc_dir       = os.path.join(root, partition, "mfcc")
        self.transcript_dir = os.path.join(root, partition, "transcript")

        mfcc_names          = sorted(os.listdir(self.mfcc_dir))
        transcript_names    = sorted(os.listdir(self.transcript_dir))

        assert len(mfcc_names) == len(transcript_names)

        self.mfccs, self.transcripts = [], []

        for i in range(len(mfcc_names)):
            mfcc        = np.load(os.path.join(self.mfcc_dir, mfcc_names[i]))
            mfcc = (mfcc - np.mean(mfcc, axis=0)) / np.std(mfcc, axis=0)
            transcript  = np.load(os.path.join(self.transcript_dir, transcript_names[i]))
            
            if transcript[0] == '[SOS]':
                transcript = transcript[1:]
            if transcript[-1] == '[EOS]':
                transcript = transcript[:-1]

            self.mfccs.append(mfcc)
            self.transcripts.append(transcript)

        self.mfccs          = np.concatenate(self.mfccs, axis=0)
        self.transcripts    = np.concatenate([np.array(transcript) for transcript in self.transcripts])
        self.length = len(self.mfccs)
        padding = np.zeros((context, self.mfccs.shape[1]))
        self.mfccs = np.vstack((padding, self.mfccs, padding)) # TODO
        self.transcripts = np.array([self.phonemes.index(p) for p in self.transcripts])

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        frames = self.mfccs[ind:ind + 2 * self.context + 1]
        frames = frames.flatten() 
        frames      = torch.FloatTensor(frames)
        phonemes    = torch.tensor(self.transcripts[ind])

        return frames, phonemes


In [8]:
class AudioTestDataset(torch.utils.data.Dataset):

    def __init__(self, root, context=0, partition="test-clean"): 
        self.context = context
        self.mfcc_dir = os.path.join(root, partition, "mfcc") 
        mfcc_names = sorted(os.listdir(self.mfcc_dir))
        self.mfccs = []
        for i in range(len(mfcc_names)):
            mfcc = np.load(os.path.join(self.mfcc_dir, mfcc_names[i]))
            mfcc = (mfcc - np.mean(mfcc, axis=0)) / np.std(mfcc, axis=0)
            self.mfccs.append(mfcc)
        self.mfccs = np.concatenate(self.mfccs, axis=0)
        self.length = len(self.mfccs)
        padding = np.zeros((context, self.mfccs.shape[1]))
        self.mfccs = np.vstack((padding, self.mfccs, padding))

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        frames = self.mfccs[ind:ind + 2 * self.context + 1]
        frames = frames.flatten()
        frames = torch.FloatTensor(frames)
        return frames


# Parameters Configuration

Storing your parameters and hyperparameters in a single configuration dictionary makes it easier to keep track of them during each experiment. It can also be used with weights and biases to log your parameters for each experiment and keep track of them across multiple experiments.

In [None]:
config = {
    'epochs'        : 5,
    'batch_size'    : 1024,
    'context'       : 20,
    'init_lr'       : 1e-3,
    'architecture'  : 'very-low-cutoff'
    # Add more as you need them - e.g dropout values, weight decay, scheduler parameters
}

# Create Datasets

In [None]:
#TODO: Create a dataset object using the AudioDataset class for the training data
ROOT="/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2"
train_data = AudioDataset(root=ROOT, context=config['context'], phonemes=PHONEMES)

# TODO: Create a dataset object using the AudioDataset class for the validation data
val_data = AudioDataset(root=ROOT, context=config['context'], phonemes=PHONEMES, partition='dev-clean')

# TODO: Create a dataset object using the AudioTestDataset class for the test data
test_data = AudioTestDataset(root=ROOT, context=config['context'], partition='test-clean')

In [None]:
from collections import Counter

In [None]:
transcript_dir = '/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2/dev-clean/transcript'

phoneme_counts = Counter()

for transcript_file in os.listdir(transcript_dir):
    if transcript_file.endswith('.npy'):
        transcript_path = os.path.join(transcript_dir, transcript_file)
        transcript = np.load(transcript_path, allow_pickle=True)
        phoneme_counts.update(transcript)


least_common_phoneme = phoneme_counts.most_common()[-1] 
print(f"Least common phoneme: {least_common_phoneme[0]} with count {least_common_phoneme[1]}")

transcript_dir = '/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2/dev-clean/transcript'

sil_count = 0

for transcript_file in os.listdir(transcript_dir):
    if transcript_file.endswith('.npy'):
        transcript_path = os.path.join(transcript_dir, transcript_file)
        transcript = np.load(transcript_path, allow_pickle=True)
        sil_count += np.sum(transcript == '[SIL]')

print(f'Total number of "SIL" in the dev set: {sil_count}')

In [None]:
# Define dataloaders for train, val and test datasets
# Dataloaders will yield a batch of frames and phonemes of given batch_size at every iteration
# We shuffle train dataloader but not val & test dataloader. Why?

train_loader = torch.utils.data.DataLoader(
    dataset     = train_data,
    num_workers = 4,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = True
)

val_loader = torch.utils.data.DataLoader(
    dataset     = val_data,
    num_workers = 2,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
)

test_loader = torch.utils.data.DataLoader(
    dataset     = test_data,
    num_workers = 2,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
)

print("Batch size     : ", config['batch_size'])
print("Context        : ", config['context'])
print("Input size     : ", (2*config['context']+1)*28)
print("Output symbols : ", len(PHONEMES))

print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))


In [None]:
# Testing code to check if your data loaders are working
for i, data in enumerate(train_loader):
    frames, phoneme = data
    print(frames.shape, phoneme.shape)
    break

# Network Architecture


This section defines your network architecture for the homework. We have given you a sample architecture that can easily clear the very low cutoff for the early submission deadline.

In [None]:
# This architecture will make you cross the very low cutoff
# However, you need to run a lot of experiments to cross the medium or high cutoff
class Network(torch.nn.Module):

    def __init__(self, input_size, output_size):

        super(Network, self).__init__()

        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_size, 512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, output_size)
        )

    def forward(self, x):
        out = self.model(x)

        return out

# Define Model, Loss Function and Optimizer

Here we define the model, loss function, optimizer and optionally a learning rate scheduler.

In [None]:
INPUT_SIZE  = (2*config['context'] + 1) * 28 # Why is this the case?
model       = Network(INPUT_SIZE, len(train_data.phonemes)).to(device)
summary(model, frames.to(device))
# Check number of parameters of your network
# Remember, you are limited to 20 million parameters for HW1 (including ensembles)

In [None]:
criterion = torch.nn.CrossEntropyLoss() # Defining Loss function.
# We use CE because the task is multi-class classification

optimizer = torch.optim.Adam(model.parameters(), lr= config['init_lr']) #Defining Optimizer
# Recommended : Define Scheduler for Learning Rate,
# including but not limited to StepLR, MultiStep, CosineAnnealing, CosineAnnealingWithWarmRestarts, ReduceLROnPlateau, etc.
# You can refer to Pytorch documentation for more information on how to use them.
#scheduler=torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, )
# Is your training time very high?
from torch import autocast
from torch.amp import GradScaler

# Look into mixed precision training if your GPU (Tesla T4, V100, etc) can make use of it
import torchaudio.transforms as tat
time_mask=tat.TimeMasking(time_mask_param=15)
freq_mask=tat.FrequencyMasking(freq_mask_param=15)

def apply_mask(mfccs):
    mfccs=time_mask(mfccs)
    mfccs=freq_mask(mfccs)
    return mfccs
# Refer - https://pytorch.org/docs/stable/notes/amp_examples.html

In [None]:
def apply_mask(mfccs):
    mfccs=time_mask(mfccs)
    mfccs=freq_mask(mfccs)
    return mfccs

In [None]:
from torch import autocast
from torch.amp import GradScaler

# Training and Validation Functions

This section covers the training, and validation functions for each epoch of running your experiment with a given model architecture. The code has been provided to you, but we recommend going through the comments to understand the workflow to enable you to write these loops for future HWs.

In [None]:
torch.cuda.empty_cache()
gc.collect()

In [None]:
scaler=GradScaler()
def train(model, dataloader, optimizer, criterion):

    model.train()
    tloss, tacc = 0, 0 # Monitoring loss and accuracy
    batch_bar   = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Move Data to Device (Ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        ### Forward Propagation
        with autocast(device_type='cuda', dtype=torch.float16):

            logits  = model(frames)

            ### Loss Calculation
            loss    = criterion(logits, phonemes)
            
        
        

        ### Backward Propagation
        scaler.scale(loss).backward()

        ### Gradient Descent
        scaler.step(optimizer)

        scaler.update()

        tloss   += loss.item()
        tacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        batch_bar.set_postfix(loss="{:.04f}".format(float(tloss / (i + 1))),
                              acc="{:.04f}%".format(float(tacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    batch_bar.close()
    tloss   /= len(train_loader)
    tacc    /= len(train_loader)

    return tloss, tacc

In [None]:
def eval(model, dataloader):

    model.eval() # set model in evaluation mode
    vloss, vacc = 0, 0 # Monitoring loss and accuracy
    batch_bar   = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Move data to device (ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        # makes sure that there are no gradients computed as we are not training the model now
        with torch.inference_mode():
            ### Forward Propagation
            logits  = model(frames)
            ### Loss Calculation
            loss    = criterion(logits, phonemes)

        vloss   += loss.item()
        vacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        # Do you think we need loss.backward() and optimizer.step() here?

        batch_bar.set_postfix(loss="{:.04f}".format(float(vloss / (i + 1))),
                              acc="{:.04f}%".format(float(vacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    batch_bar.close()
    vloss   /= len(val_loader)
    vacc    /= len(val_loader)

    return vloss, vacc

# Weights and Biases Setup

This section is to enable logging metrics and files with Weights and Biases. Please refer to wandb documentationa and recitation 0 that covers the use of weights and biases for logging, hyperparameter tuning and monitoring your runs for your homeworks. Using this tool makes it very easy to show results when submitting your code and models for homeworks, and also extremely useful for study groups to organize and run ablations under a single team in wandb.

We have written code for you to make use of it out of the box, so that you start using wandb for all your HWs from the beginning.

In [None]:
wandb.login(key="39e9c89279f6d046c7bae725e099c70ddf0fd98f") #API Key is in your wandb account, under settings (wandb.ai/settings)

In [None]:
# Create your wandb run
run = wandb.init(
    name    = "first-run", ### Wandb creates random run names if you skip this field, we recommend you give useful names
    reinit  = True, ### Allows reinitalizing runs when you re-run this cell
    #id     = "y28t31uz", ### Insert specific run id here if you want to resume a previous run
    #resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw1p2", ### Project should be created in your wandb account
    config  = config ### Wandb Config for your run
)

In [None]:
### Save your model architecture as a string with str(model)
model_arch  = str(model)

### Save it in a txt file
arch_file   = open("model_arch.txt", "w")
file_write  = arch_file.write(model_arch)
arch_file.close()

### log it in your wandb run with wandb.save()
wandb.save('model_arch.txt')

# Experiment

Now, it is time to finally run your ablations! Have fun!

In [None]:
best_val_acc = 0.0

for epoch in range(config['epochs']):

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    curr_lr = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc = eval(model, val_loader)

    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss,
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})
    
    if val_acc > best_val_acc:
        print("\tValidation accuracy improved from {:.04f}% to {:.04f}%! Saving checkpoint...".format(best_val_acc*100, val_acc*100))

        checkpoint_path = f"/kaggle/working/checkpoint_epoch_{epoch+1}.pth"
        
        torch.save({
            'epoch': epoch + 1,             
            'model_state_dict': model.state_dict(),  
            'optimizer_state_dict': optimizer.state_dict(),  
            'loss': val_loss,                
            'val_acc': val_acc               
        }, checkpoint_path)

        wandb.save(checkpoint_path)

        best_val_acc = val_acc


# Testing and submission to Kaggle

Before we get to the following code, make sure to see the format of submission given in *sample_submission.csv*. Once you have done so, it is time to fill the following function to complete your inference on test data. Refer the eval function from previous cells to get an idea of how to go about completing this function.

In [None]:
checkpoint_path="/kaggle/input/jkdsbcjkbdsjbcd/checkpoint_epoch_5.pth"
checkpoint_dict=torch.load(checkpoint_path)

model.load_state_dict(checkpoint_dict['model_state_dict'])


In [None]:
def test(model, test_loader):
    ### What you call for model to perform inference?
    model.eval() # TODO train or eval?

    ### List to store predicted phonemes of test data
    test_predictions = []

    ### Which mode do you need to avoid gradients?
    with torch.no_grad(): # TODO

        for i, mfccs in enumerate(tqdm(test_loader)):

            mfccs   = mfccs.to(device)

            logits  = model(mfccs)

            ### Get most likely predicted phoneme with argmax
            predicted_phonemes = torch.argmax(logits, dim=1)

            ### How do you store predicted_phonemes with test_predictions? Hint, look at eval
            # TODO
            test_predictions.extend(predicted_phonemes.cpu().numpy())
    

    return test_predictions

In [None]:
predictions = test(model, test_loader)

In [None]:
def map_id_to_phoneme(predicted_ids):
    return [PHONEMES[id] for id in predicted_ids]

In [None]:
phoneme_predictions=map_id_to_phoneme(predictions)

In [None]:
### Create CSV file with predictions
with open("./submission1.csv", "w+") as f:
    f.write("id,label\n")
    for i in range(len(phoneme_predictions)):
        f.write("{},{}\n".format(i, phoneme_predictions[i]))

In [None]:
### Finish your wandb run
run.finish()

In [None]:
### Submit to kaggle competition using kaggle API (Uncomment below to use)
# !kaggle competitions submit -c 11785-hw1p2-f24 -f ./submission.csv -m "Test Submission"

### However, its always safer to download the csv file and then upload to kaggle

In [None]:
config = {
    'epochs'        : 5,
    'batch_size'    : 256,
    'context'       : 16,
    'init_lr'       : 1e-3,
    'architecture'  : 'pyramid'
    # Add more as you need them - e.g dropout values, weight decay, scheduler parameters
}

In [None]:
class Pyramid(torch.nn.Module):

    def __init__(self, input_size, output_size):

        super(Pyramid, self).__init__()

        self.model=torch.nn.Sequential(

            torch.nn.Linear(input_size, max(1024, 10 * input_size)),
            torch.nn.ReLU(),
            torch.nn.Linear(max(1024, 10*input_size), 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, output_size)
            
        )

        self._init_weights()


    def _init_weights(self):

        for m in self.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)
                    
    def forward(self, x):

        out=self.model(x)

        return out



In [None]:
model_2=Pyramid(INPUT_SIZE, len(train_data.phonemes)).to(device)
summary(model_2, frames.to(device))


In [None]:
optimizer=torch.optim.Adam(model_2.parameters(), lr=config['init_lr'])

criterion=torch.nn.CrossEntropyLoss()

scheduler=torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

In [None]:
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Create your wandb run
run = wandb.init(
    name    = "second-run", ### Wandb creates random run names if you skip this field, we recommend you give useful names
    reinit  = True, ### Allows reinitalizing runs when you re-run this cell
    #id     = "y28t31uz", ### Insert specific run id here if you want to resume a previous run
    #resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw1p2", ### Project should be created in your wandb account
    config  = config ### Wandb Config for your run
)

In [None]:
### Save your model architecture as a string with str(model)
model_arch  = str(model_2)

### Save it in a txt file
arch_file   = open("model_arch.txt", "w")
file_write  = arch_file.write(model_arch)
arch_file.close()

### log it in your wandb run with wandb.save()
wandb.save('model_arch.txt')

In [None]:
best_val_acc = 0.0

for epoch in range(config['epochs']):

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    curr_lr = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc = train(model_2, train_loader, optimizer, criterion)
    val_loss, val_acc = eval(model_2, val_loader)
    scheduler.step()
    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss,
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})
    
    if val_acc > best_val_acc:
        print("\tValidation accuracy improved from {:.04f}% to {:.04f}%! Saving checkpoint...".format(best_val_acc*100, val_acc*100))

        checkpoint_path = f"/kaggle/working/checkpoint_model_2_epoch_{epoch+1}.pth"
        
        torch.save({
            'epoch': epoch + 1,             
            'model_state_dict': model_2.state_dict(),  
            'optimizer_state_dict': optimizer.state_dict(),  
            'loss': val_loss,                
            'val_acc': val_acc               
        }, checkpoint_path)

        wandb.save(checkpoint_path)

        best_val_acc = val_acc


In [None]:
checkpoint_path="/kaggle/input/model-2/checkpoint_model_2_epoch_4.pth"
checkpoint_dict=torch.load(checkpoint_path)

model_2.load_state_dict(checkpoint_dict['model_state_dict'])


In [None]:
predictions_2 = test(model_2, test_loader)

In [None]:
phoneme_predictions_2=map_id_to_phoneme(predictions_2)

In [None]:
### Create CSV file with predictions
with open("./submission2.csv", "w+") as f:
    f.write("id,label\n")
    for i in range(len(phoneme_predictions_2)):
        f.write("{},{}\n".format(i, phoneme_predictions_2[i]))

In [109]:
class InvertedPyramid(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(InvertedPyramid, self).__init__()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_size, 2048),
            torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Dropout(p=0.15),  
            torch.nn.Linear(2048, 2048),
            torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Dropout(p=0.15),  
            torch.nn.Linear(2048, 2048),
            torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Dropout(p=0.15), 
            torch.nn.Linear(2048, 2048),
            torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Dropout(p=0.15), 
            torch.nn.Linear(2048, 1024),
            torch.nn.BatchNorm1d(1024),
            torch.nn.GELU(),
            torch.nn.Dropout(p=0.15),  
            torch.nn.Linear(1024, 1024),
            torch.nn.BatchNorm1d(1024),
            torch.nn.GELU(),
            torch.nn.Linear(1024, 512),
            torch.nn.BatchNorm1d(512),
            torch.nn.GELU(),
            torch.nn.Linear(512, 256),
            torch.nn.BatchNorm1d(256),
            torch.nn.GELU(),
            torch.nn.Linear(256, 128),
            torch.nn.BatchNorm1d(128),
            torch.nn.GELU(),
            torch.nn.Linear(128, output_size)
        )
        self._init_weights()

    def _init_weights(self):

        for m in self.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.kaiming_normal_(m.weight)

    

    def forward(self, x):
        out=self.model(x)
        return out



In [110]:
config = {
    'epochs'        : 90,
    'batch_size'    : 4096,
    'context'       : 25,
    'init_lr'       : 1e-3,
    'architecture'  : 'InvertedPyramid'
    # Add more as you need them - e.g dropout values, weight decay, scheduler parameters
}

In [111]:
#TODO: Create a dataset object using the AudioDataset class for the training data
ROOT="/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2"
train_data = AudioDataset(root=ROOT, context=config['context'], phonemes=PHONEMES)

# TODO: Create a dataset object using the AudioDataset class for the validation data
val_data = AudioDataset(root=ROOT, context=config['context'], phonemes=PHONEMES, partition='dev-clean')

# TODO: Create a dataset object using the AudioTestDataset class for the test data
test_data = AudioTestDataset(root=ROOT, context=config['context'], partition='test-clean')

In [112]:
transcript_dir = '/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2/dev-clean/transcript'

phoneme_counts = Counter()

for transcript_file in os.listdir(transcript_dir):
    if transcript_file.endswith('.npy'):
        transcript_path = os.path.join(transcript_dir, transcript_file)
        transcript = np.load(transcript_path, allow_pickle=True)
        phoneme_counts.update(transcript)


least_common_phoneme = phoneme_counts.most_common()[-1] 
print(f"Least common phoneme: {least_common_phoneme[0]} with count {least_common_phoneme[1]}")

transcript_dir = '/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2/dev-clean/transcript'

sil_count = 0

for transcript_file in os.listdir(transcript_dir):
    if transcript_file.endswith('.npy'):
        transcript_path = os.path.join(transcript_dir, transcript_file)
        transcript = np.load(transcript_path, allow_pickle=True)
        sil_count += np.sum(transcript == '[SIL]')

print(f'Total number of "SIL" in the dev set: {sil_count}')

Least common phoneme: ZH with count 869
Total number of "SIL" in the dev set: 319908


In [113]:
# Define dataloaders for train, val and test datasets
# Dataloaders will yield a batch of frames and phonemes of given batch_size at every iteration
# We shuffle train dataloader but not val & test dataloader. Why?

train_loader = torch.utils.data.DataLoader(
    dataset     = train_data,
    num_workers = 4,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = True
)

val_loader = torch.utils.data.DataLoader(
    dataset     = val_data,
    num_workers = 2,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
)

test_loader = torch.utils.data.DataLoader(
    dataset     = test_data,
    num_workers = 2,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
)

print("Batch size     : ", config['batch_size'])
print("Context        : ", config['context'])
print("Input size     : ", (2*config['context']+1)*28)
print("Output symbols : ", len(PHONEMES))

print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))


Batch size     :  4096
Context        :  25
Input size     :  1428
Output symbols :  42
Train dataset samples = 36091157, batches = 8812
Validation dataset samples = 1928204, batches = 471
Test dataset samples = 1934138, batches = 473


In [114]:
# Testing code to check if your data loaders are working
for i, data in enumerate(train_loader):
    frames, phoneme = data
    print(frames.shape, phoneme.shape)
    break

torch.Size([4096, 1428]) torch.Size([4096])


In [115]:
INPUT_SIZE  = (2*config['context'] + 1) * 28 # Why is this the case?
model_3      = InvertedPyramid(INPUT_SIZE, len(train_data.phonemes)).to(device)
summary(model_3, frames.to(device))
# Check number of parameters of your network
# Remember, you are limited to 20 million parameters for HW1 (including ensembles)

----------------------------------------------------------------------------------------------------
Layer                   Kernel Shape         Output Shape         # Params (K)      # Mult-Adds (M)
0_Linear                [1428, 2048]         [4096, 2048]             2,926.59                 2.92
1_BatchNorm1d                 [2048]         [4096, 2048]                 4.10                 0.00
2_GELU                             -         [4096, 2048]                    -                    -
3_Dropout                          -         [4096, 2048]                    -                    -
4_Linear                [2048, 2048]         [4096, 2048]             4,196.35                 4.19
5_BatchNorm1d                 [2048]         [4096, 2048]                 4.10                 0.00
6_GELU                             -         [4096, 2048]                    -                    -
7_Dropout                          -         [4096, 2048]                    -                    -

In [116]:
criterion = torch.nn.CrossEntropyLoss() 

optimizer = torch.optim.AdamW(model.parameters(), lr=config['init_lr'], weight_decay=0.01)

scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[60, 80], gamma=0.1)

In [117]:
torch.cuda.empty_cache()
gc.collect()

1550

In [118]:
wandb.login(key="39e9c89279f6d046c7bae725e099c70ddf0fd98f") #API Key is in your wandb account, under settings (wandb.ai/settings)



True

In [119]:
# Create your wandb run
run = wandb.init(
    name    = "third-run", ### Wandb creates random run names if you skip this field, we recommend you give useful names
    reinit  = True, ### Allows reinitalizing runs when you re-run this cell
    #id     = "y28t31uz", ### Insert specific run id here if you want to resume a previous run
    #resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw1p2", ### Project should be created in your wandb account
    config  = config ### Wandb Config for your run
)

In [120]:
### Save your model architecture as a string with str(model)
model_arch  = str(model_3)

### Save it in a txt file
arch_file   = open("model_arch.txt", "w")
file_write  = arch_file.write(model_arch)
arch_file.close()

### log it in your wandb run with wandb.save()
wandb.save('model_arch.txt')

['/kaggle/working/wandb/run-20250615_062420-yd1r1er2/files/model_arch.txt']

In [124]:
torch.cuda.empty_cache()
gc.collect()

734

In [125]:

def train(model, dataloader, optimizer, criterion):

    model.train()
    tloss, tacc = 0, 0 # Monitoring loss and accuracy
    batch_bar   = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Move Data to Device (Ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        frames = apply_mask(frames)

        ### Forward Propagation
       
        logits  = model(frames)

            ### Loss Calculation
        loss    = criterion(logits, phonemes)

        loss.backward()
        optimizer.step()
            
        
        

       

        tloss   += loss.item()
        tacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        batch_bar.set_postfix(loss="{:.04f}".format(float(tloss / (i + 1))),
                              acc="{:.04f}%".format(float(tacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    batch_bar.close()
    tloss   /= len(train_loader)
    tacc    /= len(train_loader)

    return tloss, tacc

In [None]:
best_val_acc = 0.0

for epoch in range(config['epochs']):

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    curr_lr = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc = train(model_3, train_loader, optimizer, criterion)
    val_loss, val_acc = eval(model_3, val_loader)
    scheduler.step()
    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss,
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})
    
    if val_acc > best_val_acc:
        print("\tValidation accuracy improved from {:.04f}% to {:.04f}%! Saving checkpoint...".format(best_val_acc*100, val_acc*100))

        checkpoint_path = f"/kaggle/working/checkpoint_model_3_epoch_{epoch+1}.pth"
        
        torch.save({
            'epoch': epoch + 1,             
            'model_state_dict': model_3.state_dict(),  
            'optimizer_state_dict': optimizer.state_dict(),  
            'loss': val_loss,                
            'val_acc': val_acc               
        }, checkpoint_path)

        wandb.save(checkpoint_path)

        best_val_acc = val_acc



Epoch 1/90


Train:   0%|          | 0/8812 [00:00<?, ?it/s]

# Model 3- InvertedPyramidNet

In [9]:
config = {
    'epochs'        : 5,
    'batch_size'    : 2048,
    'context'       : 16,
    'init_lr'       : 1e-3,
    'architecture'  : 'InvertedPyramidNet'
    # Add more as you need them - e.g dropout values, weight decay, scheduler parameters
}

In [10]:
#TODO: Create a dataset object using the AudioDataset class for the training data
ROOT="/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2"
train_data = AudioDataset(root=ROOT, context=config['context'], phonemes=PHONEMES)

# TODO: Create a dataset object using the AudioDataset class for the validation data
val_data = AudioDataset(root=ROOT, context=config['context'], phonemes=PHONEMES, partition='dev-clean')

# TODO: Create a dataset object using the AudioTestDataset class for the test data
test_data = AudioTestDataset(root=ROOT, context=config['context'], partition='test-clean')

In [11]:
from collections import Counter

In [12]:
transcript_dir = '/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2/dev-clean/transcript'

phoneme_counts = Counter()

for transcript_file in os.listdir(transcript_dir):
    if transcript_file.endswith('.npy'):
        transcript_path = os.path.join(transcript_dir, transcript_file)
        transcript = np.load(transcript_path, allow_pickle=True)
        phoneme_counts.update(transcript)


least_common_phoneme = phoneme_counts.most_common()[-1] 
print(f"Least common phoneme: {least_common_phoneme[0]} with count {least_common_phoneme[1]}")

transcript_dir = '/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2/dev-clean/transcript'

sil_count = 0

for transcript_file in os.listdir(transcript_dir):
    if transcript_file.endswith('.npy'):
        transcript_path = os.path.join(transcript_dir, transcript_file)
        transcript = np.load(transcript_path, allow_pickle=True)
        sil_count += np.sum(transcript == '[SIL]')

print(f'Total number of "SIL" in the dev set: {sil_count}')

Least common phoneme: ZH with count 869
Total number of "SIL" in the dev set: 319908


In [13]:
# Define dataloaders for train, val and test datasets
# Dataloaders will yield a batch of frames and phonemes of given batch_size at every iteration
# We shuffle train dataloader but not val & test dataloader. Why?

train_loader = torch.utils.data.DataLoader(
    dataset     = train_data,
    num_workers = 4,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = True
)

val_loader = torch.utils.data.DataLoader(
    dataset     = val_data,
    num_workers = 2,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
)

test_loader = torch.utils.data.DataLoader(
    dataset     = test_data,
    num_workers = 2,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
)

print("Batch size     : ", config['batch_size'])
print("Context        : ", config['context'])
print("Input size     : ", (2*config['context']+1)*28)
print("Output symbols : ", len(PHONEMES))

print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))


Batch size     :  2048
Context        :  16
Input size     :  924
Output symbols :  42
Train dataset samples = 36091157, batches = 17623
Validation dataset samples = 1928204, batches = 942
Test dataset samples = 1934138, batches = 945


In [14]:
# Testing code to check if your data loaders are working
for i, data in enumerate(train_loader):
    frames, phoneme = data
    print(frames.shape, phoneme.shape)
    break

torch.Size([2048, 924]) torch.Size([2048])


In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class InvertedPyramidNet(nn.Module):
    def __init__(self, input_dim, output_dim, dropout=0.25):
        super(InvertedPyramidNet, self).__init__()

        self.model = nn.Sequential(
            nn.BatchNorm1d(input_dim),
            nn.Linear(input_dim, 2048),
            nn.BatchNorm1d(2048),
            nn.Softplus(),
            nn.Dropout(dropout),

            nn.Linear(2048, 1024),
            nn.BatchNorm1d(1024),
            nn.Softplus(),
            nn.Dropout(dropout),

            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Softplus(),
            nn.Dropout(dropout),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.Softplus(),
            nn.Dropout(dropout),

            nn.Linear(256, output_dim)
        )

        # Xavier Initialization
        for m in self.model:
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.model(x)


In [16]:
INPUT_SIZE  = (2*config['context'] + 1) * 28 # Why is this the case?
model_4      = InvertedPyramidNet(INPUT_SIZE, len(train_data.phonemes)).to(device)
summary(model_4, frames.to(device))
# Check number of parameters of your network
# Remember, you are limited to 20 million parameters for HW1 (including ensembles)

----------------------------------------------------------------------------------------------------
Layer                   Kernel Shape         Output Shape         # Params (K)      # Mult-Adds (M)
0_BatchNorm1d                  [924]          [2048, 924]                 1.85                 0.00
1_Linear                 [924, 2048]         [2048, 2048]             1,894.40                 1.89
2_BatchNorm1d                 [2048]         [2048, 2048]                 4.10                 0.00
3_Softplus                         -         [2048, 2048]                    -                    -
4_Dropout                          -         [2048, 2048]                    -                    -
5_Linear                [2048, 1024]         [2048, 1024]             2,098.18                 2.10
6_BatchNorm1d                 [1024]         [2048, 1024]                 2.05                 0.00
7_Softplus                         -         [2048, 1024]                    -                    -

In [17]:
criterion = torch.nn.CrossEntropyLoss() # Defining Loss function.
# We use CE because the task is multi-class classification

optimizer = torch.optim.Adam(model_4.parameters(), lr= config['init_lr']) #Defining Optimizer
# Recommended : Define Scheduler for Learning Rate,
# including but not limited to StepLR, MultiStep, CosineAnnealing, CosineAnnealingWithWarmRestarts, ReduceLROnPlateau, etc.
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max')
# You can refer to Pytorch documentation for more information on how to use them.
#scheduler=torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, )
# Is your training time very high?
from torch import autocast
from torch.amp import GradScaler

# Look into mixed precision training if your GPU (Tesla T4, V100, etc) can make use of it
import torchaudio.transforms as tat
time_mask=tat.TimeMasking(time_mask_param=15)
freq_mask=tat.FrequencyMasking(freq_mask_param=15)

def apply_mask(mfccs):
    mfccs=time_mask(mfccs)
    mfccs=freq_mask(mfccs)
    return mfccs
# Refer - https://pytorch.org/docs/stable/notes/amp_examples.html

In [18]:
torch.cuda.empty_cache()
gc.collect()

2931

In [25]:
scaler=GradScaler()
def train(model, dataloader, optimizer, criterion):

    model.train()
    tloss, tacc = 0, 0 # Monitoring loss and accuracy
    batch_bar   = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Move Data to Device (Ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        ### Forward Propagation
        with autocast(device_type='cuda', dtype=torch.float16):

            logits  = model(frames)

            ### Loss Calculation
            loss    = criterion(logits, phonemes)
            
        
        

        ### Backward Propagation
        scaler.scale(loss).backward()

        ### Gradient Descent
        scaler.step(optimizer)

        scaler.update()

        tloss   += loss.item()
        tacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        batch_bar.set_postfix(loss="{:.04f}".format(float(tloss / (i + 1))),
                              acc="{:.04f}%".format(float(tacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    batch_bar.close()
    tloss   /= len(train_loader)
    tacc    /= len(train_loader)

    return tloss, tacc

In [20]:
def eval(model, dataloader):

    model.eval() # set model in evaluation mode
    vloss, vacc = 0, 0 # Monitoring loss and accuracy
    batch_bar   = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Move data to device (ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        # makes sure that there are no gradients computed as we are not training the model now
        with torch.inference_mode():
            ### Forward Propagation
            logits  = model(frames)
            ### Loss Calculation
            loss    = criterion(logits, phonemes)

        vloss   += loss.item()
        vacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        # Do you think we need loss.backward() and optimizer.step() here?

        batch_bar.set_postfix(loss="{:.04f}".format(float(vloss / (i + 1))),
                              acc="{:.04f}%".format(float(vacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    batch_bar.close()
    vloss   /= len(val_loader)
    vacc    /= len(val_loader)

    return vloss, vacc

In [21]:
wandb.login(key="39e9c89279f6d046c7bae725e099c70ddf0fd98f") #API Key is in your wandb account, under settings (wandb.ai/settings)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrishitsaxena55[0m ([33mrishitsaxena55-indian-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [22]:
# Create your wandb run
run = wandb.init(
    name    = "fourth-run", ### Wandb creates random run names if you skip this field, we recommend you give useful names
    reinit  = True, ### Allows reinitalizing runs when you re-run this cell
    #id     = "y28t31uz", ### Insert specific run id here if you want to resume a previous run
    #resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw1p2", ### Project should be created in your wandb account
    config  = config ### Wandb Config for your run
)



In [23]:
### Save your model architecture as a string with str(model)
model_arch  = str(model_4)

### Save it in a txt file
arch_file   = open("model_arch.txt", "w")
file_write  = arch_file.write(model_arch)
arch_file.close()

### log it in your wandb run with wandb.save()
wandb.save('model_arch.txt')

['/kaggle/working/wandb/run-20250615_181707-mk43jq0n/files/model_arch.txt']

In [None]:
best_val_acc = 0.0

for epoch in range(config['epochs']):

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    curr_lr = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc = train(model_4, train_loader, optimizer, criterion)
    val_loss, val_acc = eval(model_4, val_loader)
    scheduler.step(val_acc)
    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss,
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})
    
    if val_acc > best_val_acc:
        print("\tValidation accuracy improved from {:.04f}% to {:.04f}%! Saving checkpoint...".format(best_val_acc*100, val_acc*100))

        checkpoint_path = f"/kaggle/working/checkpoint_model_4_epoch_{epoch+1}.pth"
        
        torch.save({
            'epoch': epoch + 1,             
            'model_state_dict': model_4.state_dict(),  
            'optimizer_state_dict': optimizer.state_dict(),  
            'loss': val_loss,                
            'val_acc': val_acc               
        }, checkpoint_path)

        wandb.save(checkpoint_path)

        best_val_acc = val_acc



Epoch 1/5


Train:   0%|          | 0/17623 [00:00<?, ?it/s]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class InvertedPyramidNet(nn.Module):
    def __init__(self, input_dim, output_dim, dropout=0.25):
        super(InvertedPyramidNet, self).__init__()

        self.model = nn.Sequential(
            nn.BatchNorm1d(input_dim),
            nn.Linear(input_dim, 2048),
            nn.BatchNorm1d(2048),
            nn.Softplus(),
            nn.Dropout(dropout),

            nn.Linear(2048, 1024),
            nn.BatchNorm1d(1024),
            nn.Softplus(),
            nn.Dropout(dropout),

            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Softplus(),
            nn.Dropout(dropout),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.Softplus(),
            nn.Dropout(dropout),

            nn.Linear(256, output_dim)
        )

        # Xavier Initialization
        for m in self.model:
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.model(x)


In [None]:
test_data = AudioTestDataset(root=ROOT, context=config['context'], partition='test-clean')

In [None]:
test_loader = torch.utils.data.DataLoader(
    dataset     = test_data,
    num_workers = 2,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
)

In [None]:
checkpoint_path="/kaggle/input/model_3/pytorch/default/1/checkpoint_model_3_epoch_5.pth"
checkpoint_dict=torch.load(checkpoint_path)

model_3.load_state_dict(checkpoint_dict['model_state_dict'])


In [None]:
def test(model, test_loader):
    ### What you call for model to perform inference?
    model.eval() # TODO train or eval?

    ### List to store predicted phonemes of test data
    test_predictions = []

    ### Which mode do you need to avoid gradients?
    with torch.no_grad(): # TODO

        for i, mfccs in enumerate(tqdm(test_loader)):

            mfccs   = mfccs.to(device)

            logits  = model(mfccs)

            ### Get most likely predicted phoneme with argmax
            predicted_phonemes = torch.argmax(logits, dim=1)

            ### How do you store predicted_phonemes with test_predictions? Hint, look at eval
            # TODO
            test_predictions.extend(predicted_phonemes.cpu().numpy())
    

    return test_predictions

In [None]:
predictions_3 = test(model_3, test_loader)

In [None]:
def map_id_to_phoneme(predicted_ids):
    return [PHONEMES[id] for id in predicted_ids]

In [None]:
phoneme_predictions_3=map_id_to_phoneme(predictions_3)

In [None]:
### Create CSV file with predictions
with open("./submission3.csv", "w+") as f:
    f.write("id,label\n")
    for i in range(len(phoneme_predictions_3)):
        f.write("{},{}\n".format(i, phoneme_predictions_3[i]))

In [27]:
run.finish()

NameError: name 'run' is not defined