 MLSP Project



Copy dataset files from the drive, decompress them and combine them into one folder. Each file must have maximum size of 5GB due to the google drive limit. Doing this is because reading each file in the google drive is extremely slow, so must put all the decompressed file locally in the VM to read and process. 


In [3]:
!mkdir wav
!cp ./drive/MyDrive/mlsp_project/dataset/wavzip/wav1.zip ./wav1.zip 
!cp ./drive/MyDrive/mlsp_project/dataset/wavzip/wav2.zip ./wav2.zip ; unzip -qq "./wav1.zip" -d "./wav"
!cp ./drive/MyDrive/mlsp_project/dataset/wavzip/wav3.zip ./wav3.zip ; unzip -qq "./wav2.zip" -d "./wav"
!cp ./drive/MyDrive/mlsp_project/dataset/wavzip/wav4.zip ./wav4.zip ; unzip -qq "./wav3.zip" -d "./wav"
!cp ./drive/MyDrive/mlsp_project/dataset/wavzip/wav5.zip ./wav5.zip ; unzip -qq "./wav4.zip" -d "./wav"
!cp ./drive/MyDrive/mlsp_project/dataset/wavzip/wav6.zip ./wav6.zip ; unzip -qq "./wav5.zip" -d "./wav"
!cp ./drive/MyDrive/mlsp_project/dataset/wavzip/wav7.zip ./wav7.zip ; unzip -qq "./wav6.zip" -d "./wav"
!cp ./drive/MyDrive/mlsp_project/dataset/wavzip/wav8.zip ./wav8.zip ; unzip -qq "./wav7.zip" -d "./wav"
!cp ./drive/MyDrive/mlsp_project/dataset/wavzip/wav9.zip ./wav9.zip ; unzip -qq "./wav8.zip" -d "./wav"
!unzip -qq "./wav9.zip" -d "./wav"


mkdir: cannot create directory ‘wav’: File exists


In [None]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 13418744667452612879
 xla_global_id: -1, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 15349841920
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 8037998058393040175
 physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0"
 xla_global_id: 416903419]

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Define loader for the dataset.


In [4]:
from scipy.io import wavfile
from scipy.signal import lfilter, stft
import numpy as np

#reference: https://github.com/Derpimort/VGGVox-PyTorch
#https://github.com/a-nagrani/VGGVox
def preprocess(audio, is_train):
    fs = 16000
    frame_len = int(20 * fs / 1000)
    hop_len = int(10 * fs / 1000)

    nfft = 512

    audio = lfilter([1, -1],[1,-0.99],audio)
    # dither add noise
    dither = np.random.uniform(low=-1, high=1, size=audio.shape)
    std_dev = np.std(audio)
    audio = audio + (1e-6 * std_dev) * dither

    # preemphasis
    audio = lfilter([1, -0.97], 1, audio)

    _, _, spectrogram = stft(
        audio,
        fs=fs,
        window=np.hamming(frame_len),
        nperseg=frame_len,
        noverlap=frame_len-hop_len,
        nfft=nfft,
        return_onesided=False,
        boundary=None,
        padded=False
        )
    spectrogram = np.abs(spectrogram)
    zero_mean_spectrogram = spectrogram-spectrogram.mean(1, keepdims=True)
    spectrogram_std_dev = np.clip(spectrogram.std(1, keepdims=True), 1e-12, None)
    spectrogram = zero_mean_spectrogram/spectrogram_std_dev

    return spectrogram

In [5]:
import torch
from torch.utils.data import Dataset

data_path = './wav/'
split_path = './drive/MyDrive/mlsp_project/iden_split.txt'

train_size = 145265
test_size = 8251
audio_len = 49000

class VoxCeleb1_Spectrogram(Dataset):
    def __init__(self, is_train):
        f = open(split_path, "r")
        lines = f.readlines()
        f.close()
        self.is_train = is_train
        self.ids = []
        self.data = []
        self.paths = []
        if is_train:
            self.dataset_size = train_size
        else:
            self.dataset_size = test_size
        counter = 0
        for line in lines:
            if counter == self.dataset_size:
                break
            strings = line.split(' ')
            set_label = int(strings[0])
            if is_train:
                if set_label == 1 or set_label == 2:
                    self.ids.append(int(strings[1].split('/')[0].replace('id',''))-10001)
                    path = data_path + strings[1].replace('\n', '')
                    self.paths.append(path)
                    #fs, audio = wavfile.read(path)
                    #self.data.append(audio)
                    counter += 1
            else:
                if set_label == 3:
                    self.ids.append(int(strings[1].split('/')[0].replace('id',''))-10001)
                    path = data_path + strings[1].replace('\n', '')
                    self.paths.append(path)
                    #fs, audio = wavfile.read(path)
                    #self.data.append(audio)
                    counter += 1

    def __len__(self):
        return self.dataset_size

    def __getitem__(self, idx):
        fs, audio = wavfile.read(self.paths[idx])
        #audio = self.data[idx]
        #for train set, randomly select a selected length fragment of the audio each time
        #then pass to the preprocess function to get a fix length spectrogram
        #for test set, pass full length audio to the preprocess function
        #then generate a rounded length spectrogram for testing
        if self.is_train:
            start = np.random.randint(0, audio.shape[0] - audio_len + 1)
            audio = audio[start:start + audio_len]
        spectrogram = preprocess(audio,self.is_train)
        return torch.from_numpy(spectrogram), self.ids[idx]

In [6]:
batch_size = 100
trainset = VoxCeleb1_Spectrogram(is_train=True)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

In [None]:
print(len(trainloader.dataset))

145265


In [7]:
test_batch_size = 1
testset = VoxCeleb1_Spectrogram(is_train=False) 
testloader = torch.utils.data.DataLoader(testset, batch_size=test_batch_size, shuffle=True)

In [9]:
print(len(testloader.dataset))

8251


In [8]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import time
import random
import math
import os 
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import cv2
import re

In [14]:
# modle definition
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, batch_size, output_dim=1251, num_layers=2):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        # setup LSTM layer
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)

        # setup output layer
        ###
        self.linear = nn.Linear(self.hidden_dim, output_dim)

    def forward(self, input, hidden=None):
      
        lstm_out, _ = self.lstm(input, hidden)
        logits = self.linear(lstm_out[-1])              # equivalent to return_sequences=False from Keras
        genre_scores = F.log_softmax(logits, dim=1)
        return genre_scores, hidden


In [10]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print ("running on the GPU")
else:
    device = torch.device("cpu")
    print ("running on the cpu")

running on the GPU


In [15]:
model = torch.load('drive/MyDrive/mlsp_project/backup/LSTM_file8.pth')
print(model)

OrderedDict([('lstm.weight_ih_l0', tensor([[-0.0199,  0.0577, -0.0983,  ...,  0.1482, -0.0786,  0.0457],
        [ 0.2987,  0.3267, -0.2696,  ..., -0.3419, -0.2421,  0.3726],
        [ 0.4322,  0.1689, -0.2227,  ...,  0.1659, -0.1489,  0.1919],
        ...,
        [ 0.0111,  0.1804,  0.6751,  ...,  0.4383,  0.5935,  0.2447],
        [ 0.0112, -0.0680,  0.3421,  ...,  0.0986,  0.3165, -0.0448],
        [-0.0720,  0.0823,  0.3539,  ..., -0.0912,  0.2665,  0.1235]],
       device='cuda:0')), ('lstm.weight_hh_l0', tensor([[-1.0561,  0.0456,  0.3047,  ..., -0.8490, -0.1819, -0.3186],
        [-0.6126,  0.2944,  0.1303,  ...,  0.0815,  0.0361,  0.4633],
        [ 0.2784, -0.0132,  0.1516,  ..., -0.3639,  0.5886,  0.3619],
        ...,
        [ 0.2084, -0.1493,  0.1215,  ..., -0.2811,  0.1379, -0.0874],
        [ 0.2507, -0.0683, -0.1598,  ..., -0.3053,  0.7805,  0.0757],
        [-0.0491,  0.3060,  0.0981,  ..., -0.1788, -0.2336, -0.1997]],
       device='cuda:0')), ('lstm.bias_ih_l0', ten

In [16]:
net = LSTM( input_dim=512, hidden_dim=256, batch_size=batch_size, output_dim=1251, num_layers=2).to(device)
print(net)

LSTM(
  (lstm): LSTM(512, 256, num_layers=2)
  (linear): Linear(in_features=256, out_features=1251, bias=True)
)


In [17]:
net.load_state_dict(torch.load('drive/MyDrive/mlsp_project/backup/LSTM_file8.pth'), strict=False)

<All keys matched successfully>

In [18]:
criterion = nn.NLLLoss()  # expects ouputs from LogSoftmax
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [19]:
#BATCH_SIZE = 32
EPOCHS = 20
train_loss = []
def train(net):
  #train_loss = 0

  for epoch in range (EPOCHS):
    hidden_state = None
    correct = 0
    total = 0
    net.train()

    for batch_idx, (batch_X, batch_Y) in enumerate(trainloader, 0):

        batch_X, batch_Y = batch_X.to(device, dtype=torch.float), batch_Y.to(device)
        # match the PyTorch expected input tensor format of (sequence_length, batch size, input_dim)
        # Reshape input & targets to "match" what the loss_function wants
        batch_X = batch_X.permute(2, 0, 1)
        #batch_X.shape

        # zero gradient
        optimizer.zero_grad()
        # pass through
        outputs,hidden_state = net(batch_X, hidden_state)  # forward pass
        hidden_state = None

        # compute loss and back propagate
    
        loss = criterion(outputs, batch_Y)
        loss.backward()
        # optimize
        optimizer.step()
        #scheduler.step()

        #train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += batch_Y.size(0)
        correct += predicted.eq(batch_Y).sum().item()
    #backup every 10 epoch
    if epoch % 10 == 0:
      net.eval()
      epoch_test_loss = 0
      test_correct = 0
      test_total = 0
      with torch.no_grad():
        for batch_idx, (test_batch_X, test_batch_Y) in enumerate(testloader, 0):
          test_batch_X, test_batch_Y = test_batch_X.to(device, dtype=torch.float), test_batch_Y.to(device)
          test_batch_X = test_batch_X.permute(2, 0, 1)


          test_outputs,hidden_state = net(test_batch_X, hidden_state)  # forward pass
          hidden_state = None
          #loss = criterion(outputs, test_y[i].to(device).long())
          _, predicted = test_outputs.max(1)

          test_total += test_batch_Y.size(0)
          test_correct += predicted.eq(test_batch_Y).sum().item()
      print('Test Accuracy: {}'.format(\
        test_correct/test_total))
    
      torch.save(net.state_dict(), \
               'drive/MyDrive/mlsp_project/backup/LSTM_file{}.pth'.format(str(epoch//5+8)))

    train_loss.append(loss.cpu().detach().numpy())
    print(f"Epoch: {epoch}, Loss: {loss} | Acc: {100.*correct/total} ({correct}/{total}) ")

In [20]:
train(net)

Test Accuracy: 0.7340928372318507
Epoch: 0, Loss: 0.3655150830745697 | Acc: 89.13709427597838 (129485/145265) 
Epoch: 1, Loss: 0.5774204134941101 | Acc: 89.57422641379547 (130120/145265) 
Epoch: 2, Loss: 0.5130208730697632 | Acc: 89.68574673871889 (130282/145265) 
Epoch: 3, Loss: 0.5253806710243225 | Acc: 89.8840050941383 (130570/145265) 
Epoch: 4, Loss: 0.44751957058906555 | Acc: 89.94527243313944 (130659/145265) 
Epoch: 5, Loss: 0.3424421548843384 | Acc: 90.2075517158297 (131040/145265) 
Epoch: 6, Loss: 0.3446221351623535 | Acc: 90.21168209823426 (131046/145265) 
Epoch: 7, Loss: 0.2598396837711334 | Acc: 90.39204212990053 (131308/145265) 
Epoch: 8, Loss: 0.4491550624370575 | Acc: 90.42577358620453 (131357/145265) 
Epoch: 9, Loss: 0.33052414655685425 | Acc: 90.77479089939077 (131864/145265) 
Test Accuracy: 0.735183614107381
Epoch: 10, Loss: 0.37836119532585144 | Acc: 90.86290572402162 (131992/145265) 
Epoch: 11, Loss: 0.18748335540294647 | Acc: 90.94826696038275 (132116/145265) 


KeyboardInterrupt: ignored

In [None]:
plt.plot(train_loss)

In [21]:
pytorch_total_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
print(pytorch_total_params)

1636323
