In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import os
import torch
import librosa

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from datasets import AudioDataset

In [2]:
class LSTMModel(nn.Module):
    def __init__(self, feat_dim, hidden=128, num_layers=2):
        super(LSTMModel, self).__init__()
        self.numlayers = num_layers
        self.hidden = hidden
        self.feat_dim = feat_dim
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(self.feat_dim, self.hidden, self.num_layers,batch_first=True)
        self.linear1 = nn.Linear(self.hidden, 64)
        self.linear2 = nn.Linear(64, 10)

        #self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x, length):
        """
        Inputs: 
            x: [batch, seq, feature]
        """
        batch_size, seq_len, _ = x.shape
        packed = nn.utils.rnn.pack_padded_sequence(x, length, batch_first=True, enforce_sorted=False)
        h0 = torch.randn(self.numlayers, batch_size, self.hidden).cuda()
        c0 = torch.randn(self.numlayers, batch_size, self.hidden).cuda()
        output, (hn, cn) = self.lstm(packed, (h0, c0))
        y = self.linear1(hn[0, :, :])
        y = self.linear2(y)
        return y

In [3]:
# path to urban sound 8k
data_root = "/home/tiz007/228/228_data/UrbanSound8K/"
# path to label
label_path = "/home/tiz007/228/228_data/UrbanSound8K/metadata/UrbanSound8K.csv"  

In [4]:
# initialize dataset (feature can be "mfcc", "spec", "mel_raw")
audio_dataset = AudioDataset(3, DataRoot=data_root, LabelPath=label_path, feature="mel_raw", mode="train")

verify mel_raw feature success


In [5]:
# define lstm model
lstm_model = LSTMModel(128)
# to gpu
lstm_model = lstm_model.cuda()

In [6]:
# initialize dataloader
data_loader = torch.utils.data.DataLoader(audio_dataset, batch_size=32, shuffle=True, num_workers=1)

# loss function
loss_fn = nn.CrossEntropyLoss()

# lr
learning_rate = 1e-3

# initialize optimizer
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=learning_rate)

In [7]:
MAX_EPOCH = 100
for epoch in range(MAX_EPOCH):
    # iterate through dataset
   
    # initialize epoch stat
    correct_num = 0
    total_num = 0
    loss_sum = 0
    
    for idx, data in enumerate(data_loader):
        
        train_data, labels, length = data
        
        # data to gpu
        train_data = train_data.cuda()
        labels = labels.cuda()
        length = length.cuda()
        
        # normalize to [0,1]
        #print(train_data.max(), train_data.min())
        
        train_data = train_data - train_data.min()
        
        prob = lstm_model(train_data, length)   
        #print(prob, labels)
        loss = loss_fn(prob, labels)
        
        output = prob.argmax(1)
        loss_sum += loss.item()
        
        correct_num += (output==labels).sum().double()
        total_num += float(labels.shape[0])
        
        optimizer.zero_grad()

        loss.backward()

        optimizer.step()
        
    #print(loss.item())
    #print(output, labels)

    print("epoch acc: {:.4} avg loss: {:.4f}".format(correct_num/total_num, loss_sum/total_num))

epoch acc: 0.2234 avg loss: 0.0648
epoch acc: 0.2699 avg loss: 0.0593
epoch acc: 0.3269 avg loss: 0.0560
epoch acc: 0.3507 avg loss: 0.0532
epoch acc: 0.3699 avg loss: 0.0519
epoch acc: 0.4145 avg loss: 0.0483
epoch acc: 0.4639 avg loss: 0.0455
epoch acc: 0.491 avg loss: 0.0434
epoch acc: 0.5277 avg loss: 0.0412
epoch acc: 0.5516 avg loss: 0.0396
epoch acc: 0.559 avg loss: 0.0389
epoch acc: 0.5913 avg loss: 0.0366
epoch acc: 0.5914 avg loss: 0.0366
epoch acc: 0.6171 avg loss: 0.0343
epoch acc: 0.6408 avg loss: 0.0327
epoch acc: 0.6512 avg loss: 0.0318
epoch acc: 0.6577 avg loss: 0.0309
epoch acc: 0.6759 avg loss: 0.0296
epoch acc: 0.6291 avg loss: 0.0334
epoch acc: 0.6515 avg loss: 0.0313
epoch acc: 0.6839 avg loss: 0.0289
epoch acc: 0.7012 avg loss: 0.0274
epoch acc: 0.715 avg loss: 0.0266
epoch acc: 0.717 avg loss: 0.0260
epoch acc: 0.7296 avg loss: 0.0248
epoch acc: 0.7368 avg loss: 0.0243
epoch acc: 0.735 avg loss: 0.0244
epoch acc: 0.7448 avg loss: 0.0234
epoch acc: 0.7514 avg los

In [8]:
test_dataset = AudioDataset(3, DataRoot=data_root, LabelPath=label_path, feature="mel_raw", mode="test")
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True, num_workers=1)

with torch.no_grad():
    for idx, data in enumerate(test_loader):
        test_data, labels, length = data
        test_data = test_data.cuda()
        labels = labels.cuda()
        length = length.cuda()


        test_data = test_data - test_data.min()

        prob = lstm_model(test_data, length)   
        #print(prob, labels)
        loss = loss_fn(prob, labels)

        output = prob.argmax(1)
        loss_sum += loss.item()

        correct_num += (output==labels).sum().double()
        total_num += float(labels.shape[0])


print("epoch acc: {:.4} avg loss: {:.4f}".format(correct_num/total_num, loss_sum/total_num))

verify mel_raw feature success
epoch acc: 0.8934 avg loss: 0.0126
