 MLSP Project



Copy dataset files from the drive, decompress them and combine them into one folder. Each file must have maximum size of 5GB due to the google drive limit. Doing this is because reading each file in the google drive is extremely slow, so must put all the decompressed file locally in the VM to read and process. 


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp ./drive/MyDrive/mlsp_project/dataset/fix_size_mfcc/coeffs.zip ./coeffs.zip
!cp ./drive/MyDrive/mlsp_project/dataset/fix_size_mfcc/coeffs2.zip ./coeffs2.zip
!unzip -qq "coeffs.zip" -d "./"
!unzip -qq "coeffs2.zip" -d "./" 


Define loader for the dataset.


In [3]:
import torch
from torch.utils.data import Dataset
from numpy import genfromtxt


data_path = './coeffs/'
split_path = './drive/MyDrive/mlsp_project/iden_split.txt'
file_format = 'csv'

train_size = 145265
test_size = 8251

class VoxCeleb1_MFCC(Dataset):
    def __init__(self, is_train):
        f = open(split_path, "r")
        lines = f.readlines()
        self.is_train = is_train
        self.ids = []
        #very high memory usage to speed up load process
        if is_train:
            self.dataset = torch.empty((train_size,41,157))
            self.dataset_size = train_size
        else:
            self.dataset = torch.empty((test_size,41,157))
            self.dataset_size = test_size
        counter = 0
        for line in lines:
            if counter == self.dataset_size:
                break
            strings = line.split(' ')
            set_label = int(strings[0])
            if is_train:
                if set_label == 1 or set_label == 2:
                    self.ids.append(int(strings[1].split('/')[0].replace('id',''))-10001)
                    path = data_path + strings[1].replace('wav\n', file_format)
                    self.dataset[counter, :, :] = torch.from_numpy(genfromtxt(path, delimiter=',').transpose())
                    counter += 1
            else:
                if set_label == 3:
                    self.ids.append(int(strings[1].split('/')[0].replace('id',''))-10001)
                    path = data_path + strings[1].replace('wav\n', file_format)
                    self.dataset[counter, :, :] = torch.from_numpy(genfromtxt(path, delimiter=',').transpose())
                    counter += 1
        f.close()

    def __len__(self):
        return self.dataset_size

    def __getitem__(self, idx):
        #if load from disk, use this:
        # torch.from_numpy(np.expand_dims(genfromtxt(data_path + self.paths[idx], delimiter=','), axis=0)) #in one line to save memory
        return self.dataset[idx, :, :], self.ids[idx]


In [4]:
batch_size = 64
trainset = VoxCeleb1_MFCC(is_train=True) #about 6GB memory  
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

In [5]:
test_batch_size = 64
testset = VoxCeleb1_MFCC(is_train=False) 
testloader = torch.utils.data.DataLoader(testset, batch_size=test_batch_size, shuffle=True)

In [None]:
print(len(testloader.dataset))

8251


In [6]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import time
import random
import math
import os 
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import cv2
import re

In [7]:
# modle definition
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, batch_size, output_dim=1251, num_layers=4):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        # setup LSTM layer
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)

        # setup output layer
        self.linear = nn.Linear(self.hidden_dim, output_dim)

    def forward(self, input, hidden=None):
        # lstm step => then ONLY take the sequence's final timetep to pass into the linear/dense layer
        # Note: lstm_out contains outputs for every step of the sequence we are looping over (for BPTT)
        # but we just need the output of the last step of the sequence, aka lstm_out[-1]
        lstm_out, _ = self.lstm(input, hidden)
        logits = self.linear(lstm_out[-1])              # equivalent to return_sequences=False from Keras
        genre_scores = F.log_softmax(logits, dim=1)
        return genre_scores, hidden


In [8]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print ("running on the GPU")
else:
    device = torch.device("cpu")
    print ("running on the cpu")

running on the GPU


In [9]:
net = LSTM( input_dim=41, hidden_dim=128, batch_size=batch_size, output_dim=1251, num_layers=2).to(device)
print(net)

LSTM(
  (lstm): LSTM(41, 128, num_layers=2)
  (linear): Linear(in_features=128, out_features=1251, bias=True)
)


In [10]:
criterion = nn.NLLLoss()  # expects ouputs from LogSoftmax
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [11]:
#BATCH_SIZE = 32
EPOCHS = 40
train_loss = []
def train(net):
  #train_loss = 0

  for epoch in range (EPOCHS):
    hidden_state = None
    correct = 0
    total = 0
    net.train()

    for batch_idx, (batch_X, batch_Y) in enumerate(trainloader, 0):

        batch_X, batch_Y = batch_X.to(device, dtype=torch.float), batch_Y.to(device)
        # match the PyTorch expected input tensor format of (sequence_length, batch size, input_dim)
        # Reshape input & targets to "match" what the loss_function wants
        batch_X = batch_X.permute(2, 0, 1)
        #batch_X.shape

        # zero gradient
        optimizer.zero_grad()
        # pass through
        outputs,hidden_state = net(batch_X, hidden_state)  # forward pass
        hidden_state = None

        # compute loss and back propagate
    
        loss = criterion(outputs, batch_Y)
        loss.backward()
        # optimize
        optimizer.step()
        #scheduler.step()

        #train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += batch_Y.size(0)
        correct += predicted.eq(batch_Y).sum().item()
    

    net.eval()
    epoch_test_loss = 0
    test_correct = 0
    test_total = 0
    with torch.no_grad():
      for batch_idx, (test_batch_X, test_batch_Y) in enumerate(testloader, 0):
        test_batch_X, test_batch_Y = test_batch_X.to(device, dtype=torch.float), test_batch_Y.to(device)
        test_batch_X = test_batch_X.permute(2, 0, 1)


        test_outputs,hidden_state = net(test_batch_X, hidden_state)  # forward pass
        hidden_state = None
        #loss = criterion(outputs, test_y[i].to(device).long())
        _, predicted = test_outputs.max(1)

        test_total += test_batch_Y.size(0)
        test_correct += predicted.eq(test_batch_Y).sum().item()
    train_loss.append(loss.cpu().detach().numpy())
    print(f"Epoch: {epoch}, Loss: {loss} | Acc: {100.*correct/total} ({correct}/{total}) | Test Acc: {100.*test_correct/test_total} ({test_correct}/{test_total})")

In [12]:
train(net)

Epoch: 0, Loss: 5.933485507965088 | Acc: 1.1571954703472964 (1681/145265) | Test Acc: 0.4605502363349897 (38/8251)
Epoch: 1, Loss: 5.211328983306885 | Acc: 3.600316662651017 (5230/145265) | Test Acc: 1.8179614592170645 (150/8251)
Epoch: 2, Loss: 5.14376974105835 | Acc: 7.958558496540805 (11561/145265) | Test Acc: 3.126893709853351 (258/8251)
Epoch: 3, Loss: 4.159987926483154 | Acc: 13.350772725708188 (19394/145265) | Test Acc: 5.126651314992122 (423/8251)
Epoch: 4, Loss: 4.021945953369141 | Acc: 19.819639968333735 (28791/145265) | Test Acc: 8.447460913828627 (697/8251)
Epoch: 5, Loss: 3.2957100868225098 | Acc: 28.160258837297352 (40907/145265) | Test Acc: 12.253060235123016 (1011/8251)
Epoch: 6, Loss: 2.6253182888031006 | Acc: 36.12914328984959 (52483/145265) | Test Acc: 16.07077929947885 (1326/8251)
Epoch: 7, Loss: 2.2901430130004883 | Acc: 43.002099611055655 (62467/145265) | Test Acc: 19.379469155253908 (1599/8251)
Epoch: 8, Loss: 2.097196578979492 | Acc: 49.756651636664024 (72279/14

In [13]:
torch.save(net.state_dict(), \
               'drive/MyDrive/mlsp_project/backup/LSTM_file1_mfcc.pth')

In [None]:
plt.plot(train_loss)