# Defining the neural network model

In [2]:
import soundfile, torch
import torchaudio
import matplotlib.pyplot as plt
from torchaudio.transforms import Resample

In [3]:
from pytorch_model import SoundNet8_pytorch
from utils import vector_to_scenes,vector_to_obj

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
class fineTune_SoundNet(torch.nn.Module):
    def __init__(self):
        super(fineTune_SoundNet, self).__init__()

        self.pretrained = SoundNet8_pytorch().load_state_dict(torch.load("sound8.pth"))
        self.model = SoundNet8_pytorch()

    def forward(self, x):
        for net in [self.model.conv1, self.model.conv2, self.model.conv3, self.model.conv4]:
            x = net(x)
        
        x = self.pretrained.conv5(x)
        
        for net in [self.conv6, self.conv7]:
            x = net(x)

        object_pred = self.conv8(x)
        scene_pred = self.conv8_2(x) 
        return object_pred, scene_pred

In [11]:
model = fineTune_SoundNet()
model.eval()

fineTune_SoundNet(
  (model): SoundNet8_pytorch(
    (conv1): Sequential(
      (0): Conv2d(1, 16, kernel_size=(64, 1), stride=(2, 1), padding=(32, 0))
      (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=(8, 1), stride=(8, 1), padding=0, dilation=1, ceil_mode=False)
    )
    (conv2): Sequential(
      (0): Conv2d(16, 32, kernel_size=(32, 1), stride=(2, 1), padding=(16, 0))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=(8, 1), stride=(8, 1), padding=0, dilation=1, ceil_mode=False)
    )
    (conv3): Sequential(
      (0): Conv2d(32, 64, kernel_size=(16, 1), stride=(2, 1), padding=(8, 0))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (conv4): Sequential(
      (0): Conv2d(64, 128, kernel_size=(8, 

# Loading the Dataset

In [24]:
from torch.utils.data import Dataset
import pandas as pd
import os

In [66]:
df = pd.read_csv('meta\esc10.csv')
df = df.rename(columns={'Unnamed: 0': 'index'})
df = df.set_index('index')

df

Unnamed: 0_level_0,filename,fold,target,category,esc10,src_file,take
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-110389-A-0.wav,1,0,dog,True,110389,A
2,1-116765-A-41.wav,1,41,chainsaw,True,116765,A
3,1-17150-A-12.wav,1,12,crackling_fire,True,17150,A
4,1-172649-A-40.wav,1,40,helicopter,True,172649,A
...,...,...,...,...,...,...,...
395,5-233160-A-1.wav,5,1,rooster,True,233160,A
396,5-234879-A-1.wav,5,1,rooster,True,234879,A
397,5-234879-B-1.wav,5,1,rooster,True,234879,B
398,5-235671-A-38.wav,5,38,clock_tick,True,235671,A


In [67]:
class ESC10_Dataset(Dataset):

    def __init__(self, annotations_file, audio_dir):
        self.annotations = pd.read_csv(annotations_file).rename(columns={'Unnamed: 0': 'index'}).set_index('index')
        self.audio_dir = audio_dir

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        if sr != 22050:
            transform = Resample(sr,22050)
            signal = transform(signal)
        if signal.shape[0]>1:
            signal = torch.mean(signal,axis=0)
        signal = signal.view(1,1,-1,1)
        return signal,label

    def _get_audio_sample_path(self, index):
        return os.path.join(self.audio_dir, self.annotations.iloc[index, 0])

    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 3]

esc10 = ESC10_Dataset("meta\esc10.csv", "audio")