# Defining the neural network model

In [99]:
import soundfile, torch
import torchaudio
from torchaudio.transforms import Resample

In [100]:
from pytorch_model import SoundNet8_pytorch
from utils import vector_to_scenes,vector_to_obj

In [101]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [102]:
test = SoundNet8_pytorch()
#'conv5.0.weight', 'conv5.0.bias', 'conv5.1.weight', 'conv5.1.bias', 'conv5.1.running_mean', 'conv5.1.running_var', 'conv5.1.num_batches_tracked'
state_dict = torch.load('sound8.pth')

state_dict_5 = {}

state_dict_5['0.weight'] = state_dict['conv5.0.weight']
state_dict_5['0.bias'] = state_dict['conv5.0.bias']
state_dict_5['1.weight'] = state_dict['conv5.1.weight']
state_dict_5['1.bias'] = state_dict['conv5.1.bias']
state_dict_5['1.running_mean'] = state_dict['conv5.1.running_mean']
state_dict_5['1.running_var'] = state_dict['conv5.1.running_var']
state_dict_5['1.num_batches_tracked'] = state_dict['conv5.1.num_batches_tracked']

test.conv5.state_dict().keys()

odict_keys(['0.weight', '0.bias', '1.weight', '1.bias', '1.running_mean', '1.running_var', '1.num_batches_tracked'])

In [103]:
class fineTune_SoundNet(torch.nn.Module):
    def __init__(self):
        super(fineTune_SoundNet, self).__init__()

        self.model = SoundNet8_pytorch()
        self.model.conv5.load_state_dict(state_dict_5)

    def forward(self, x):
        for net in [self.model.conv1, self.model.conv2, self.model.conv3, self.model.conv4]:
            x = net(x)
        
        x = self.model.conv5(x)
        
        for net in [self.model.conv6, self.model.conv7]:
            x = net(x)

        object_pred = self.model.conv8(x)
        scene_pred = self.model.conv8_2(x)
        return object_pred, scene_pred

In [104]:
model = fineTune_SoundNet()

#Freeze the parameters of the 5th layer
for param in model.model.conv5.parameters():
    param.requires_grad = False

In [105]:
from torchsummary import summary

summary(model, input_size=(1,110250, 1), batch_size=20)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [20, 16, 55126, 1]           1,040
       BatchNorm2d-2         [20, 16, 55126, 1]              32
              ReLU-3         [20, 16, 55126, 1]               0
         MaxPool2d-4          [20, 16, 6890, 1]               0
            Conv2d-5          [20, 32, 3446, 1]          16,416
       BatchNorm2d-6          [20, 32, 3446, 1]              64
              ReLU-7          [20, 32, 3446, 1]               0
         MaxPool2d-8           [20, 32, 430, 1]               0
            Conv2d-9           [20, 64, 216, 1]          32,832
      BatchNorm2d-10           [20, 64, 216, 1]             128
             ReLU-11           [20, 64, 216, 1]               0
           Conv2d-12          [20, 128, 109, 1]          65,664
      BatchNorm2d-13          [20, 128, 109, 1]             256
             ReLU-14          [20, 128,

# Loading the Dataset

In [106]:
from torch.utils.data import Dataset
from torch.utils.data.sampler import SubsetRandomSampler
import pandas as pd
import os
import numpy as np
from utils import vector_to_scenes,vector_to_obj

In [107]:
df = pd.read_csv('meta\esc10.csv')
df = df.rename(columns={'Unnamed: 0': 'index'})
df = df.set_index('index')

df

Unnamed: 0_level_0,filename,fold,target,category,esc10,src_file,take
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-110389-A-0.wav,1,0,dog,True,110389,A
2,1-116765-A-41.wav,1,41,chainsaw,True,116765,A
3,1-17150-A-12.wav,1,12,crackling_fire,True,17150,A
4,1-172649-A-40.wav,1,40,helicopter,True,172649,A
...,...,...,...,...,...,...,...
395,5-233160-A-1.wav,5,1,rooster,True,233160,A
396,5-234879-A-1.wav,5,1,rooster,True,234879,A
397,5-234879-B-1.wav,5,1,rooster,True,234879,B
398,5-235671-A-38.wav,5,38,clock_tick,True,235671,A


In [108]:
class ESC10_Dataset(Dataset):

    def __init__(self, annotations_file, audio_dir):
        self.annotations = pd.read_csv(annotations_file).rename(columns={'Unnamed: 0': 'index'}).set_index('index')
        self.audio_dir = audio_dir

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        if sr != 22050:
            transform = Resample(sr,22050)
            signal = transform(signal)
        if signal.shape[0]>1:
            signal = torch.mean(signal,axis=0)
        signal = signal.view(1,-1,1)
        return signal,label

    def _get_audio_sample_path(self, index):
        return os.path.join(self.audio_dir, self.annotations.iloc[index, 0])

    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 3]

esc10 = ESC10_Dataset("meta\esc10.csv", "audio")

In [109]:
batch_size = 1 # how many samples per batch to load
valid_size = 0.2 # percentage of training set to use as validation

num_train = len(esc10)

# obtain training indices that will be used for validation
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_index, valid_index = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_index)
valid_sampler = SubsetRandomSampler(valid_index)

# prepare data loaders
train_loader = torch.utils.data.DataLoader(esc10, batch_size = batch_size, sampler = train_sampler)
valid_loader = torch.utils.data.DataLoader(esc10, batch_size = batch_size, sampler = valid_sampler)

# Train the dataset

In [110]:
criterion = torch.nn.KLDivLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [111]:
#remap the label into int so that pytorch stop bugging because it hates string apparently...
label_mapping_dataset = {'dog':0, 'chainsaw':1, 'crackling_fire':2, 'helicopter':3, 'rain':4, 'crying_baby':5, 'clock_tick':6, 'sneezing':7, 'rooster':8, 'sea_waves':9}

label_mapping_model = {'n02085620 Chihuahua':0, 'n03000684 chain saw, chainsaw':1, 'n03729826 matchstick':2, 'n03345487 fire engine, fire truck':3, 'n04049303 rain barrel':4, 'n03825788 nipple':5, 'n02708093 analog clock':6, 'n03424325 gasmask, respirator, gas helmet':7, 'n01514668 cock':8, 'n04557648 water bottle':9}

In [115]:
epochs = 10
def training(n_epochs, train_loader, valid_loader, model, criterion, optimizer):

    train_losses, valid_losses = [], []
    # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf  # set initial "min" to infinity

    for epoch in range(n_epochs):
        train_loss, valid_loss = 0, 0 # monitor losses
      
        # train the model
        model.train() # prep model for training
        for data, label in train_loader:
            label = torch.tensor([label_mapping_dataset[x] for x in label], dtype=float)
            optimizer.zero_grad() # clear the gradients of all optimized variables
            output = model(data) # forward pass: compute predicted outputs by passing inputs to the model
            pred = vector_to_obj(output[1].detach().numpy())
            if pred in label_mapping_model.keys():
                pred = label_mapping_model[pred] + 0.0
            else:
                pred = 11.
            loss = criterion(torch.tensor(pred, dtype=float), label) # calculate the loss
            loss.backward() # backward pass: compute gradient of the loss with respect to model parameters
            optimizer.step() # perform a single optimization step (parameter update)
            train_loss += loss.item() * data.size(0) # update running training loss
      
        # validate the model
        model.eval()
        for data, label in valid_loader:
            with torch.no_grad():
                output = model(data)
            pred = output[1]
            target = torch.tensor(label_mapping_dataset[label[0]]).to(torch.float)
            loss = criterion(torch.tensor(pred), target) # calculate the loss
            valid_loss += loss.item() * data.size(0)
      
        # calculate average loss over an epoch
        train_loss /= len(train_loader.sampler)
        valid_loss /= len(valid_loader.sampler)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
      
        print('epoch: {} \ttraining Loss: {:.6f} \tvalidation Loss: {:.6f}'.format(epoch+1, train_loss, valid_loss))

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
            torch.save(model.state_dict(), 'model.pt')
            valid_loss_min = valid_loss

    return train_losses, valid_losses      

In [116]:
train_losses_1, valid_losses_1 = training(epochs, train_loader, valid_loader, model, criterion, optimizer)



RuntimeError: kl_div: Integral inputs not supported.