In [2]:
import numpy as np
import librosa
import os
import time
import h5py

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt
from IPython.display import Audio

In [11]:
# Root folder of the dataset; the label arrays and the audio clips are read from here
dir_path = 'ALL_data/AED_data'

# Read the array holding the names of all event types and display it
all_event = np.load('/'.join((dir_path, 'event_types.npy')))
print(all_event)

['beach' 'bus' 'cafe/restaurant' 'car' 'city_center' 'forest_path'
 'grocery_store' 'home' 'library' 'metro_station' 'office' 'park'
 'residential_area' 'train' 'tram']


In [None]:
# Training annotations; every entry is a (label, audio_file) pair
train_label = np.load('/'.join((dir_path, 'train_labels.npy')))
print(train_label[0])  # peek at the first pair

In [None]:
# Walk through the directory and collect the files with a .wav extension.
# endswith() is used so that names that merely contain '.wav' (e.g. 'clip.wav.bak') are skipped.
wav_files = [
    file
    for (dirpath, dirnames, filenames) in os.walk(dir_path)
    for file in filenames
    if file.endswith('.wav')
]

num_data = len(wav_files)

val_label = np.load(dir_path+'/validation_labels.npy')
test_label = np.load(dir_path+'/test_labels.npy')


def _event_index(event_name, event_types):
    """Return the index of the first entry of `event_types` equal to `event_name`.

    Raises:
        ValueError: if `event_name` does not occur in `event_types`. (Taking
            np.argmax of the all-False comparison mask would silently return
            class 0 instead.)
    """
    matches = np.flatnonzero(event_types == event_name)
    if matches.size == 0:
        raise ValueError('Unknown event type: {!r}'.format(event_name))
    return matches[0]


def _load_split(label_pairs, event_types, audio_dir, sr=16000, max_seconds=10):
    """Load the audio and the integer class target for every (label, audio_file) pair.

    Args:
        label_pairs: sequence of (label, audio_file) pairs.
        event_types: numpy array of event type names; the target is the position
            of the label in this array.
        audio_dir: directory that contains the audio files.
        sr: sampling rate passed to librosa.load.
        max_seconds: clips are cut to at most `sr * max_seconds` samples
            (shorter clips are kept as they are).

    Returns:
        (audio, target): two lists with one entry per pair.
    """
    audio, target = [], []
    for pair in label_pairs:
        y, _ = librosa.load(audio_dir + '/' + pair[1], sr=sr)
        audio.append(y[:sr * max_seconds])
        target.append(_event_index(pair[0], event_types))
    return audio, target


# Build the comparison array once instead of once per clip
event_types = np.array(all_event)
audio_dir = dir_path + '/audio'

train_audio, train_target = _load_split(train_label, event_types, audio_dir)
val_audio, val_target = _load_split(val_label, event_types, audio_dir)
test_audio, test_target = _load_split(test_label, event_types, audio_dir)


In [None]:
class AlexNet(nn.Module):
    """AlexNet-style CNN for single-channel 2-D inputs with reduced channel widths.

    Args:
        num_classes: size of the final linear layer, i.e. number of output scores.

    Input:  (batch, 1, height, width)
    Output: (batch, num_classes)
    """

    def __init__(self, num_classes=15):
        super().__init__()

        # Convolutional feature extractor. The first convolution takes 1 input
        # channel (an RGB image would need 3); the hidden channel counts are
        # kept small (16 -> 32 -> 32 -> 64 -> 64).
        conv_stack = [
            nn.Conv2d(1, 16, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(16, 32, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        # Adaptive mean pooling brings a feature map of any spatial size to 3x3,
        # so the classifier always receives 64 * 3 * 3 values per example.
        self.avgpool = nn.AdaptiveAvgPool2d((3, 3))

        # Fully connected head ending in one score per class
        head = [
            nn.Dropout(),
            nn.Linear(64 * 3 * 3, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(256, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Map a (batch, 1, height, width) tensor to (batch, num_classes) scores."""
        pooled = self.avgpool(self.features(x))  # (batch, 64, 3, 3)
        flat = torch.flatten(pooled, 1)          # keep the batch axis, merge the rest: (batch, 576)
        return self.classifier(flat)
    
# Smoke test: push a random batch through a freshly built network.
# Input layout is (batch_size, num_channel, freq_dim, time_step).
model = AlexNet()
sample_input = torch.randn((2, 1, 257, 626))
sample_output = model(sample_input)
print(sample_output.size())  # expected layout: (batch_size, num_classes)

In [None]:
from torch.utils.data import Dataset, DataLoader

batch_size = 8


class dataset_pipeline(Dataset):
    """Dataset that turns stored waveforms into magnitude spectrograms on the fly.

    Args:
        data: indexable collection of waveforms (each must provide `.astype`).
        label: indexable collection of class labels, aligned with `data`.
    """

    def __init__(self, data, label):
        super().__init__()

        self.data = data
        self.label = label

        # number of utterances
        self._len = len(data)

    def __len__(self):
        return self._len

    def __getitem__(self, index):
        """Return (magnitude spectrogram, label) for the utterance at `index`."""
        waveform = self.data[index].astype(np.float32)

        # The STFT is computed here, at access time; the phase is discarded
        # and only the magnitude is kept.
        magnitude = np.abs(librosa.stft(waveform, n_fft=512, hop_length=256))

        target = torch.from_numpy(np.array(self.label[index])).long()
        return torch.from_numpy(magnitude), target
    
# define data loaders
# Training batches are reshuffled every epoch; validation order is kept fixed.
train_loader = DataLoader(dataset_pipeline(train_audio, train_target),
                          batch_size=batch_size,
                          shuffle=True,
                         )

validation_loader = DataLoader(dataset_pipeline(val_audio, val_target),
                               batch_size=batch_size,
                               shuffle=False,
                              )

# len() of a DataLoader is the number of batches per epoch
dataset_len = len(train_loader)
# Report progress roughly four times per epoch. Clamp to at least 1 so that
# `(batch_idx+1) % log_step` cannot divide by zero when there are fewer than 4 batches.
log_step = max(1, dataset_len // 4)

# Cross-entropy objective
def CE(output, target):
    """Cross-entropy loss averaged over the batch.

    Args:
        output: class scores, shape (batch, num_classes).
        target: class indices, shape (batch,).

    Returns:
        Scalar loss tensor.
    """
    return F.cross_entropy(output, target)


def train(model, epoch, versatile=True):
    """Run one training epoch over `train_loader` and return the mean CE loss.

    Uses the module-level `train_loader`, `optimizer`, `CE` and `log_step`.

    Args:
        model: network to optimize; it is called on each batch after a channel
            axis has been inserted at dim 1.
        epoch: epoch number, only used in the printed messages.
        versatile: if True, print running statistics every `log_step` batches.

    Returns:
        Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: if `train_loader` yields no batches.
    """
    start_time = time.time()
    model = model.train()  # set the model to training mode. Always do this before you start training!
    train_loss = 0.
    num_batches = 0

    # load batch data
    for batch_idx, (spec, label) in enumerate(train_loader):
        # clear the gradients left over from the previous step
        optimizer.zero_grad()

        output = model(spec.unsqueeze(1))

        # CE as objective
        loss = CE(output, label)

        # backpropagation: compute the gradients of the loss
        loss.backward()
        # update the parameters using those gradients
        optimizer.step()

        train_loss += loss.item()
        num_batches = batch_idx + 1

        # OPTIONAL: print the training progress
        # (`log_step > 0` protects the modulo when log_step is 0)
        if versatile and log_step > 0 and num_batches % log_step == 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | CE {:5.4f} |'.format(
                epoch, num_batches, len(train_loader),
                elapsed * 1000 / num_batches,
                train_loss / num_batches
                ))

    if num_batches == 0:
        # Without this check the averaging below would fail on an undefined batch index
        raise ValueError('train_loader yielded no batches')

    train_loss /= num_batches
    print('-' * 99)
    print('    | end of training epoch {:3d} | time: {:5.2f}s | CE {:5.4f} |'.format(
            epoch, (time.time() - start_time), train_loss))

    return train_loss
        
def validate(model, epoch):
    """Evaluate `model` on the module-level `validation_loader`.

    Args:
        model: network to evaluate; it is called on each batch after a channel
            axis has been inserted at dim 1.
        epoch: epoch number, only used in the printed message.

    Returns:
        Fraction of validation examples whose highest-scoring class equals the label.
    """
    start_time = time.time()
    model = model.eval()  # evaluation mode. Always do this during validation or test phase!
    correct = 0
    total = 0

    # Only the forward pass is needed here, so gradient tracking is switched
    # off to save time and memory.
    with torch.no_grad():
        for spec, label in validation_loader:
            output = model(spec.unsqueeze(1))

            # predicted class = index of the largest score in each row
            predicted = output.max(dim=1).indices.numpy()
            reference = label.numpy()

            correct += np.sum(predicted == reference)
            total += len(reference)

    accuracy = correct / total
    print('    | end of validation epoch {:3d} | time: {:5.2f}s | Accuracy {:5.4f} |'.format(
            epoch, (time.time() - start_time), accuracy))
    print('-' * 99)

    return accuracy


# Training configuration
total_epoch = 100                  # number of training epochs
model_save = 'best_AlexNet.pt'     # where the best validation model is written
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Main loop: one training pass followed by one validation pass per epoch

training_loss = []    # mean CE loss of every training epoch
validation_loss = []  # NOTE: despite the name, this stores the validation *accuracy* returned by validate()

for epoch in range(1, total_epoch + 1):
    epoch_loss = train(model, epoch)
    training_loss.append(epoch_loss)

    epoch_accuracy = validate(model, epoch)
    validation_loss.append(epoch_accuracy)

    # lowest training loss so far
    if epoch_loss == np.min(training_loss):
        print('      Best training model found.')

    # highest validation accuracy so far: save current best model on validation set
    if epoch_accuracy == np.max(validation_loss):
        with open(model_save, 'wb') as f:
            torch.save(model.state_dict(), f)
            print('      Best validation model found and saved.')

    print('-' * 99)

In [None]:
# Restore the saved weights and switch to evaluation mode
model.load_state_dict(torch.load('best_AlexNet.pt'))
model.eval()

correct = []  # one entry per test clip: 1 if classified correctly, else 0
total = len(test_audio)

# Inference only: disable gradient tracking to save time and memory
with torch.no_grad():
    for i in range(len(test_audio)):
        this_spec = librosa.stft(test_audio[i].astype(np.float32), n_fft=512, hop_length=256)
        this_label = test_target[i]
        spec = torch.from_numpy(np.abs(this_spec))  # only use the magnitude spectrogram
        this_label = torch.from_numpy(np.array(this_label)).long()

        # add the batch and channel axes expected by the model
        output = model(spec.unsqueeze(0).unsqueeze(1))

        # calculate accuracy
        _, output_label = torch.max(output, 1)
        output_label = output_label.numpy()
        this_label = this_label.numpy()
        correct.append(np.sum(output_label == this_label))

print('Overall accuracy: {:.2f}%'.format(np.sum(correct) / total * 100))

# accuracy for each class
# Clips are grouped by their target index, so the result does not depend on the
# order of the test set or on how many clips each class has.
correct_per_clip = np.asarray(correct)
target_per_clip = np.asarray(test_target)
for class_idx, event_name in enumerate(all_event):
    in_class = target_per_clip == class_idx
    num_in_class = np.sum(in_class)
    if num_in_class == 0:
        continue  # no test clip for this class
    print('Accuracy for {:s}: {:.2f}%'.format(
        event_name, np.sum(correct_per_clip[in_class]) / num_in_class * 100))