In [2]:
# Import the packages we'll use

import numpy as np
import os, glob, csv

# librosa is a widely-used audio processing library
import librosa

import sklearn

import torch
import torch.nn as nn
import torch.nn.functional as nnF

# for plotting
%matplotlib inline
import matplotlib.pyplot as plt

import math

# for accuracy and confusion matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# for data normalization
from sklearn.preprocessing import StandardScaler

In [3]:
# USER CONFIGURATION
# Please alter the paths here to where the data are stored on your local filesystem
binarylabelcsv  = os.path.expanduser("~/shared_storage/ECS7013P/bird_audio_detection/warblrb10k_public_metadata_2018.csv")
audiofilefolder = os.path.expanduser("~/shared_storage/ECS7013P/warblrb10k_public_wav")

# we experiment with 100 files here. In practice, it depends on your actual training, validation, and test data
#maxfilestoload  = 1000      # limit, because loading the whole dataset is very slow
maxfilestoload  = 100      # limit, because loading the whole dataset is very slow


In [4]:
# here we load the metadata labels
binarylabels = {}
with open(binarylabelcsv, 'r') as infp:
        rdr = csv.DictReader(infp)
        for row in rdr:
                binarylabels[row['itemid']] = float(row['hasbird'])
                if len(binarylabels)==maxfilestoload:
                        break  # note, here we are restricting the maximum number of rows.

fkeys = sorted(binarylabels.keys())
# inspect:
for i, kv in enumerate(binarylabels.items()):
    print(kv)
    if i==10: break

('759808e5-f824-401e-9058', 1.0)
('1d94fc4a-1c63-4da0-9cac', 1.0)
('bb0099ce-3073-4613-8557', 1.0)
('c4c67e81-9aa8-4af4-8eb7', 1.0)
('ab322d4b-da69-4b06-a065', 0.0)
('519cfbe6-f804-4add-baa3', 0.0)
('6332d960-6f57-4ecc-8d1a', 1.0)
('db89b696-5ca0-4ca8-982a', 1.0)
('a02ac7bc-5a29-40a1-89e1', 1.0)
('6ce66c37-3a83-43b1-b0dd', 1.0)
('126160c6-cd85-41f7-a5e7', 1.0)


In [5]:
'''
- Load an example audio file, converting the audio data to mel spectrogram
- window length 50 ms, hop_len 25 ms
'''
def extract_melspectrogram(filename, win_len=0.05, hop_len=0.025, n_mels=64):
    audio, sr = librosa.load("%s/%s.wav" % (audiofilefolder, filename), sr=22050)
    win_len = int(win_len*sr)
    hop_len = int(hop_len*sr)
    spec = librosa.feature.melspectrogram(audio, sr, n_mels=n_mels, n_fft=2048, win_length=win_len, hop_length=hop_len)
    # return data format (time_len, n_mels)
    return spec.transpose((1,0))
'''
 - Load the data, 
 - Extract mel spectrograms
 - Annotation: one element corresponding to one audio file
'''
data = np.zeros((maxfilestoload, 400, 64)) # for storing mel spectrograms
label = np.zeros(maxfilestoload) # for storing the annotion
for i, kv in enumerate(binarylabels.items()):
    print(kv[0])
    # the number of the melspectrograms' time frames varies a bit (due to some small differences in audio length)
    # for simplicity, let's take a maximum of 400 time frames.
    melspec = extract_melspectrogram(kv[0])
    if(len(melspec) < 400):
        melspec = np.pad(melspec, ((400-len(melspec),0),(0,0)))
    data[i] = melspec[:400]
    label[i] = kv[1]

759808e5-f824-401e-9058
1d94fc4a-1c63-4da0-9cac
bb0099ce-3073-4613-8557
c4c67e81-9aa8-4af4-8eb7
ab322d4b-da69-4b06-a065
519cfbe6-f804-4add-baa3
6332d960-6f57-4ecc-8d1a
db89b696-5ca0-4ca8-982a
a02ac7bc-5a29-40a1-89e1
6ce66c37-3a83-43b1-b0dd
126160c6-cd85-41f7-a5e7
19d149c7-98a8-48d2-921d
4dd5046d-c962-4f02-a820
479b90e3-85bf-403c-8298
3661273c-19b9-4ea0-abc5
5e8976e1-c7bf-45b7-b22b
947e8e78-3f1d-4493-936f
b7a49bf2-f898-41ec-be40
d09bf6fc-6275-47b2-9a3e
960784f4-34aa-4235-9d9c
4c6d2568-17f6-4ca7-b347
1e1ade85-72d2-4c85-b428
efe349dd-319a-42bf-ae04
e30d7c93-f1bc-4ca5-af31
13562d9b-aa0b-42ca-9198
f08f8ca2-59dd-4065-9abb
b2952a5b-6e87-4b3d-b7a8
98fe674f-985a-4434-aa82
56239bc2-5af8-4586-b2c7
bfee5f94-ed1d-48da-9681
0a0b783d-f9a3-4652-a01d
031f0a9b-446c-496f-8997
1e860c93-dd38-4711-88de
026eceb5-3e21-4fdc-b6d3
9fe0ad8e-14bf-45f5-9429
7c503dce-d6ad-4394-97b1
345341a8-7d81-4ccb-ba4a
c56549cc-af7b-4269-8fc5
c95f2ba3-b863-439b-993a
3718760f-e8c5-40c8-87b8
42354fbc-fa9b-430d-b374
d509daa6-afad-4d

In [6]:
'''
- Split the data into 
    training (80%)
    validation (10%)
    test (10%)
'''
#print(label)
#print(data.shape)
#print(data[0])

# training data
train_data = data[:int(0.8*maxfilestoload)]
train_label = label[:int(0.8*maxfilestoload)]
print(train_data.shape)

# validation data
valid_data = data[int(0.8*maxfilestoload):int(0.9*maxfilestoload)]
valid_label = label[int(0.8*maxfilestoload):int(0.9*maxfilestoload)]
print(valid_data.shape)

# test data
test_data = data[int(0.9*maxfilestoload):]
test_label = label[int(0.9*maxfilestoload):]
print(test_data.shape)

#del data

(80, 400, 64)
(10, 400, 64)
(10, 400, 64)


In [7]:
# data normalisation
scaler = StandardScaler()
# compute normalisation parameters based on the training data 
# QUESTION: why do we reshape the data to (-1,64)?
scaler.fit(train_data.reshape((-1,64)))
print(scaler.mean_)

# normalise the training data with the computed parameters
train_data = scaler.transform(train_data.reshape((-1,64)))
train_data = train_data.reshape((-1, 400, 64)) # reverse back to the original shape
#print(train_data[0])

# normalise the validation data with the computed parameters
valid_data = scaler.transform(valid_data.reshape((-1,64)))
valid_data = valid_data.reshape((-1, 400, 64)) # reverse back to the original shape
#print(valid_data[0])

# normalise the test data with the computed parameters
test_data = scaler.transform(test_data.reshape((-1,64)))
test_data = test_data.reshape((-1, 400, 64)) # reverse back to the original shape
#print(test_data[0])

[8.20791494e-01 9.64391566e-01 2.89431562e+00 3.78157927e+00
 2.76027139e+00 1.78452657e+00 1.79662422e+00 1.43626160e+00
 1.53025706e+00 1.43725899e+00 1.34187642e+00 9.01580792e-01
 7.94863424e-01 9.08886755e-01 9.38719742e-01 9.98877398e-01
 1.16789356e+00 1.15067610e+00 1.01053602e+00 7.92294295e-01
 6.63644233e-01 5.78312027e-01 5.68667713e-01 4.96086765e-01
 5.65720270e-01 5.38950945e-01 4.06807184e-01 3.62360494e-01
 3.00438021e-01 3.19008672e-01 2.85847548e-01 3.35627510e-01
 3.63672546e-01 5.53592367e-01 3.74909727e-01 2.67980823e-01
 2.62757641e-01 2.55434710e-01 2.67096008e-01 1.97006626e-01
 1.85187636e-01 1.32342670e-01 1.42885041e-01 1.19854611e-01
 1.72176351e-01 1.62574243e-01 2.18439848e-01 1.76943503e-01
 1.59127307e-01 6.16644894e-02 7.83198373e-02 6.16663548e-02
 5.08809065e-02 6.14633521e-02 5.89586188e-02 6.33452124e-02
 4.55618710e-02 4.00422790e-02 3.46592746e-02 2.16795920e-02
 1.03177601e-02 7.68573427e-03 6.16053785e-03 1.94739536e-03]


In [8]:
def init_layer(layer):
    """Initialize a Linear or Convolutional layer. 
    Ref: He, Kaiming, et al. "Delving deep into rectifiers: Surpassing 
    human-level performance on imagenet classification." Proceedings of the 
    IEEE international conference on computer vision. 2015.
    """
    
    if layer.weight.ndimension() == 4:
        (n_out, n_in, height, width) = layer.weight.size()
        n = n_in * height * width
        
    elif layer.weight.ndimension() == 2:
        (n_out, n) = layer.weight.size()

    std = math.sqrt(2. / n)
    scale = std * math.sqrt(3.)
    layer.weight.data.uniform_(-scale, scale)

    if layer.bias is not None:
        layer.bias.data.fill_(0.)

In [9]:
def init_bn(bn):
    """Initialize a Batchnorm layer. """
    
    bn.weight.data.fill_(1.)

In [10]:
class CnnModel(nn.Module):
    """The CNN model"""
    def __init__(self):
        
        super(CnnModel, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=64,
                               kernel_size=(5, 5), bias=False)

        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128,
                               kernel_size=(5, 5), bias=False)

        self.conv3 = nn.Conv2d(in_channels=128, out_channels=128,
                               kernel_size=(3, 3), bias=False)

        self.fc1 = nn.Linear(128*2*1, 128, bias=True)
        self.fc2 = nn.Linear(128, 1, bias=True)

        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(128)
        self.bn3 = nn.BatchNorm2d(128)

        self.init_weights()

    def init_weights(self):

        init_layer(self.conv1)
        init_layer(self.conv2)
        init_layer(self.conv3)
        init_layer(self.fc1)

        init_bn(self.bn1)
        init_bn(self.bn2)
        init_bn(self.bn3)

    def forward(self, x):
        (_, time_len, mel_bins) = x.shape

        x = x.view(-1, 1, time_len, mel_bins)
        #print('Input')
        #print(x.size())

        x = nnF.relu(self.bn1(self.conv1(x)))
        #print('Conv1')
        #print(x.size())
        x = nnF.max_pool2d(x,kernel_size=(8,4),padding=(4,0))
        #print('Pool1')
        #print(x.size())
        
        x = nnF.relu(self.bn2(self.conv2(x)))
        #print('Conv2')
        #print(x.size())
        x = nnF.max_pool2d(x,kernel_size=(8,4),padding=(2,1))
        #print('Pool2')
        #print(x.size())
        
        x = nnF.relu(self.bn3(self.conv3(x)))
        #print('Conv3')
        #print(x.size())
        x = nnF.max_pool2d(x,kernel_size=(2,1))
        #print('Pool3')
        #print(x.size())
        
        # flatten
        x = x.view(-1, self.num_flat_features(x))
        x = nnF.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))

        return x

    def forward_and_convert(self, x):
        "Handles the torch<--->numpy tensor conversion, for convenience"
        x_torch = torch.FloatTensor(x)
        y_torch = self.forward(x_torch)
        return y_torch.detach().numpy()
        
    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [11]:
# create a model instance
net = CnnModel()
print(net)

# Binary-cross entropy loss, closely related to logistic regression loss
criterion = nn.BCELoss()

# Adam Optimizer, learning rate 0.001
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.)

CnnModel(
  (conv1): Conv2d(1, 64, kernel_size=(5, 5), stride=(1, 1), bias=False)
  (conv2): Conv2d(64, 128, kernel_size=(5, 5), stride=(1, 1), bias=False)
  (conv3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), bias=False)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [12]:
# minibatch size (remember stochastic gradient descent?)
batch_size = 4

# some helpful functions

'''
Evaluate a network "model" on the data "data" 
Predicted class labels will be returned
'''
def evaluate(model, data):
    pred = np.zeros(len(data)) # for storing predicted class labels, one for each data sample
    num_batch = len(data)//batch_size # number of batches in one data epoch
    # evaluate batch by batch and store the output to "pred"
    for i in range(num_batch):
        temp = model.forward_and_convert(data[i*num_batch : (i+1)*num_batch])
        # QUESTION: what does squeeze() function do?
        pred[i*num_batch : (i+1)*num_batch] = temp.squeeze()
    # some trailing data samples
    if(num_batch*batch_size < len(data)):
        temp = model.forward_and_convert(data[num_batch*batch_size :])
        pred[num_batch*batch_size :] = temp.squeeze()
    # each element in "pred" is the output after sigmoid function and has value in [0, 1].
    # to obtain the discrete label (0 or 1 in this case), we threshold the value by 0.5.
    pred[pred >= 0.5] = 1.
    pred[pred < 0.5] = 0.
    return pred

'''
Randomly shuffle the data. It will be used to shuffle the training data after every training epoch
'''
def shuffle_data(data, label):
    # permute the data indices
    rand_ind = np.random.permutation(len(data))
    # re-order the data with the pumuted indices
    return data[rand_ind], label[rand_ind]

In [None]:
'''The training loop'''

num_epochs = 100 # the number of training epoch (i.e. when you've gone through all samples of the training data, that's one epoch)
evaluate_every_epoch = 1 # how often you want to evaluate the network during training?
best_valid_acc = 0.0 # for keeping track of the best accuracy on the validation data
saved_model = './best_model' # path for saving the best model during training

for epoch in range(num_epochs):
    # shuffle training data
    train_data, train_label = shuffle_data(train_data, train_label)
    
    # the number of minibatch in one epoch
    num_batch = len(train_data) // batch_size
    for i in range(num_batch):
        # sample one minibatch
        batch_data = train_data[i*batch_size : (i+1)*batch_size]
        label_data = train_label[i*batch_size : (i+1)*batch_size]
    
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(torch.FloatTensor(batch_data))
        loss = criterion(outputs.squeeze(), torch.FloatTensor(label_data))
    
        loss.backward()
        optimizer.step()

    running_loss = loss.item()
    # print training loss
    print('[%d] loss: %.8f' %(epoch, running_loss))
    
    # evaluate the network on the validation data
    if((epoch+1) % evaluate_every_epoch == 0):
        valid_pred = evaluate(net, valid_data)
        valid_acc = accuracy_score(valid_pred, valid_label)
        print('Validation accuracy: %g' % valid_acc)
        
        # if the best validation performance so far, save the network to file 
        if(best_valid_acc < valid_acc):
            best_valid_acc = valid_acc
            print('Saving best model')
            torch.save(net.state_dict(), saved_model)

[0] loss: 0.21287803
Validation accuracy: 0.6
Saving best model
[1] loss: 0.38553396
Validation accuracy: 0.6
[2] loss: 0.41485715
Validation accuracy: 0.6
[3] loss: 0.20125622
Validation accuracy: 0.6


In [None]:
'''When you are here, we have the best model saved in file.'''
'''Then, load the saved model, and evaluate it on the test data'''
net = CnnModel()
net.load_state_dict(torch.load(saved_model)) # load the saved model

# evaluate on the test data
test_pred = evaluate(net, test_data) 
print(test_pred)

# test accuracy
test_acc = accuracy_score(test_pred, test_label)
print('Test accuracy: %g' % test_acc)

# confusion matrix
confusion_matrix(test_label, test_pred)
print('Confusion_matrix')