In [22]:
import torch
from torch.optim import Adam
from torch import nn
import librosa
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
from skimage.transform import resize

In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [24]:
data_df = pd.read_csv("/kaggle/input/environmental-sound-classification-50/esc50.csv")

In [25]:
data_df.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [26]:
data_df['filename'] = "/kaggle/input/environmental-sound-classification-50/audio/audio/" + data_df['filename']

In [27]:
data_df.shape

(2000, 7)

In [28]:
label_encoder = LabelEncoder()
data_df['category'] = label_encoder.fit_transform(data_df['category'])

In [29]:
data_df.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,/kaggle/input/environmental-sound-classificati...,1,0,18,True,100032,A
1,/kaggle/input/environmental-sound-classificati...,1,14,7,False,100038,A
2,/kaggle/input/environmental-sound-classificati...,1,36,46,False,100210,A
3,/kaggle/input/environmental-sound-classificati...,1,36,46,False,100210,B
4,/kaggle/input/environmental-sound-classificati...,1,19,43,False,101296,A


In [30]:
train_df = data_df.sample(frac= 0.7)
test_df = data_df.drop(train_df.index)
val_df = test_df.sample(frac=0.5)
test_df =test_df.drop(val_df.index)

In [31]:
train_df.shape

(1400, 7)

In [32]:
print(test_df.shape)
print(val_df.shape)

(300, 7)
(300, 7)


In [33]:



class custom_dataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.labels = torch.tensor(list(dataframe['category'])).type(torch.LongTensor).to(device)
        self.audio = [torch.tensor(self.get_spectogram(path)).type(torch.FloatTensor) for path in self.dataframe['filename']]
    def __len__(self):
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        audio = self.audio[idx].unsqueeze(0).to(device)
        label = self.labels[idx]
        return audio, label

    def get_spectogram(self, path):
        sr = 32000
        duration = 5

        image_height = 128
        image_width =  256

        signal, sr = librosa.load(path,sr=sr, duration = duration)

        spec = librosa.feature.melspectrogram(y=signal, sr=sr, n_fft= 2048, hop_length=512, n_mels=128)

        spec_db = librosa.power_to_db(spec, ref=np.max)

        spec_resized = librosa.util.fix_length(spec_db, size=(duration*sr)//512+1 )
        spec_resized = resize(spec_resized, (image_height,image_width), anti_aliasing=True)

        return spec_resized

        
        

In [34]:
train_dataset = custom_dataset(train_df)
val_dataset = custom_dataset(val_df)
test_dataset = custom_dataset(test_df)

In [35]:
lr = 1e-4
batch_size = 16
epochs = 20

In [36]:
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size = batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle=True)


In [37]:
class net(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.maxpooling = nn.MaxPool2d(2,2)
        self.relu = nn.ReLU()

        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear((16*32*64), 4096)
        self.linear2 = nn.Linear(4096, 1024)
        self.linear3 = nn.Linear(1024, 512)
        self.output = nn.Linear(512, len(data_df['category'].unique()))

        self.dropout = nn.Dropout(0.5)

    def forward(self,x):
        x = self.conv1(x)
        x = self.maxpooling(x)
        x = self.conv2(x)
        x= self.maxpooling(x)
        x = self.conv3(x)
        x = self.maxpooling(x)
        x = self.relu(x)

        x = x.view(x.size(0), -1)

        x = self.flatten(x)
        x = self.linear1(x)
        x = self.dropout(x)

        x = self.linear2(x)
        x= self.dropout(x)

        x = self.linear3(x)
        x = self.dropout(x)

        x = self.output(x)
        return x


In [38]:
model = net().to(device)

In [39]:
from torchsummary import summary
summary(model, (1, 128, 256))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 128, 256]             160
         MaxPool2d-2          [-1, 16, 64, 128]               0
            Conv2d-3          [-1, 32, 64, 128]           4,640
         MaxPool2d-4           [-1, 32, 32, 64]               0
            Conv2d-5           [-1, 64, 32, 64]          18,496
         MaxPool2d-6           [-1, 64, 16, 32]               0
              ReLU-7           [-1, 64, 16, 32]               0
           Flatten-8                [-1, 32768]               0
            Linear-9                 [-1, 4096]     134,221,824
          Dropout-10                 [-1, 4096]               0
           Linear-11                 [-1, 1024]       4,195,328
          Dropout-12                 [-1, 1024]               0
           Linear-13                  [-1, 512]         524,800
          Dropout-14                  [

In [40]:
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr = lr)

In [44]:
total_loss_train_plot = []
total_acc_train_plot = []
total_loss_val_plot = []
total_acc_val_plot = []

for epoch in range(epochs):
    total_acc_train = 0
    total_loss_train = 0
    total_loss_val = 0
    total_acc_val =0
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        
        output = model(inputs)
        loss = criterion(output, labels)

        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()
        
        total_acc_train += (torch.argmax(output, axis=1) == labels).sum().item()

    #val
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            
            output = model(inputs)
            loss = criterion(output, labels)
    
            total_loss_val += loss.item()
            
            total_acc_val += (torch.argmax(output, axis=1) == labels).sum().item()


    total_loss_train_plot.append(round(total_loss_train/1000, 4))
    total_acc_train_plot.append(round((total_acc_train/train_dataset.__len__()) * 100, 4))
    
    total_loss_val_plot.append(round(total_loss_val/1000, 4))
    total_acc_val_plot.append(round((total_acc_val/val_dataset.__len__()) * 100, 4))

    print(f"epoch:{epoch}, train_loss:{round(total_loss_train/1000, 4)}, train_acc:{round((total_acc_train/train_dataset.__len__()) * 100, 4)}")
    print(f"val_loss:{round(total_loss_val/1000, 4)}, val_acc:{round((total_acc_val/val_dataset.__len__()) * 100, 4)}")

epoch:0, train_loss:0.2444, train_acc:27.7857
val_loss:0.0559, val_acc:23.3333
epoch:1, train_loss:0.1948, train_acc:40.2143
val_loss:0.0525, val_acc:26.3333
epoch:2, train_loss:0.1563, train_acc:52.4286
val_loss:0.0515, val_acc:29.3333
epoch:3, train_loss:0.127, train_acc:61.2857
val_loss:0.0504, val_acc:34.0
epoch:4, train_loss:0.1018, train_acc:67.7857
val_loss:0.0478, val_acc:37.0
epoch:5, train_loss:0.0806, train_acc:74.4286
val_loss:0.0593, val_acc:30.6667
epoch:6, train_loss:0.0661, train_acc:78.5
val_loss:0.0494, val_acc:43.6667
epoch:7, train_loss:0.0542, train_acc:83.5
val_loss:0.0522, val_acc:40.3333
epoch:8, train_loss:0.0455, train_acc:84.7857
val_loss:0.0511, val_acc:43.3333
epoch:9, train_loss:0.0385, train_acc:87.9286
val_loss:0.0573, val_acc:42.6667
epoch:10, train_loss:0.0358, train_acc:88.7143
val_loss:0.0507, val_acc:46.0
epoch:11, train_loss:0.0309, train_acc:90.7857
val_loss:0.0563, val_acc:42.0
epoch:12, train_loss:0.0216, train_acc:93.7143
val_loss:0.0533, val_a

In [53]:
with torch.no_grad():
    total_acc_test = 0
    for inputs, labels in test_dataloader:
            
            output = model(inputs)
            loss = criterion(output, labels)
    
            
            total_acc_test += (torch.argmax(output, axis=1) == labels).sum().item()

    print( f"train_acc:{round((total_acc_test/test_dataset.__len__()) * 100, 4)}")


train_acc:37.0
