#### About
Urban sound8k classification in PyTorch.
* Dataset Link -https://www.kaggle.com/datasets/chrisfilo/urbansound8k 

In [25]:
#importing modules
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from torchaudio import transforms
import pandas as pd
import os
import torchaudio
from torchsummary import summary

In [10]:
# loading the dataframe
annotation_dir = '/home/suraj/ClickUp/Mar-Apr/data/UrbanSound8K.csv'
df = pd.read_csv(annotation_dir)
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [7]:
dataset_path = "/home/suraj/ClickUp/Mar-Apr/data/"

In [9]:
# creating dataset class
class UrbanSoundDataset(Dataset):
    def __init__(self,annotation_file,audio_dir, transforms, target_sample_rate,num_samples):
        self.annotations = pd.read_csv(annotation_file)
        self.audio_dir = audio_dir
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.transforms = transforms
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        return len(self.annotations) 
    
    def _get_audio_sample_label(self,index):
        return self.annotations.iloc[index,6]
    
    def _get_audio_sample_path(self,index):
        fold = f"fold{self.annotations.iloc[index,5]}"
        path = os.path.join(self.audio_dir, fold,self.annotations.iloc[index,0])
        return path
    
    def _right_pad(self, signal):
        length_signal = signal.shape[1]
        if length_signal< self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0,num_missing_samples)
            signal= F.pad(signal,last_dim_padding)
        return signal
    
    def resample(self,signal,sample_rate):
        if sample_rate!=self.target_sample_rate:
            resampler = transforms.Resample(sample_rate, self.target_sample_rate)
            signal = resampler(signal)
        
        return signal
    
    def clip_signal(self,signal):
        if signal.shape[1] > self.num_samples:
            signal = signal[:,:self.num_samples]
        return signal
    
    def mix_signal(self,signal):
        if signal.shape[0]>1:
            signal = torch.mean(signal,dim=0,keepdim=True)
        return signal
    
    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal,sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self.resample(signal,sr)
        signal = self.mix_signal(signal)
        signal = self.clip_signal(signal)
        signal = self._right_pad(signal)
        signal = self.transforms(signal)
        return signal,label


In [13]:
sample_rate = 22050 #22.5 KHz
num_samples = 22050
mel_spectogram = transforms.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=1024, hop_length=512, n_mels=64
)

data = UrbanSoundDataset(annotation_dir, dataset_path, mel_spectogram,sample_rate, num_samples)

In [22]:
signal,label = data.__getitem__(54)
print(signal.shape,label)


torch.Size([1, 64, 44]) 3


In [23]:
#Model
class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1,out_channels=16,kernel_size=3,stride=1,padding=2,bias=False),
        nn.BatchNorm2d(16),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2))

        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16,out_channels=2*16,kernel_size=3,stride=1,padding=2,bias=False),
        nn.BatchNorm2d(32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2))

        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32,out_channels=4*16,kernel_size=3,stride=1,padding=2,bias=False),
        nn.BatchNorm2d(2*32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2))

        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=2*32,out_channels=8*16,kernel_size=3,stride=1,padding=2,bias=False),
        nn.BatchNorm2d(4*32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2))
        
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128*5*4,10) # 10 classes
        self.softmax = nn.Softmax(dim=1)

    
    def forward(self,input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        preds = self.softmax(logits)

        return preds


In [26]:
model = Classifier()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(summary(model,(1,64,44)))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             144
       BatchNorm2d-2           [-1, 16, 66, 46]              32
              ReLU-3           [-1, 16, 66, 46]               0
         MaxPool2d-4           [-1, 16, 33, 23]               0
            Conv2d-5           [-1, 32, 35, 25]           4,608
       BatchNorm2d-6           [-1, 32, 35, 25]              64
              ReLU-7           [-1, 32, 35, 25]               0
         MaxPool2d-8           [-1, 32, 17, 12]               0
            Conv2d-9           [-1, 64, 19, 14]          18,432
      BatchNorm2d-10           [-1, 64, 19, 14]             128
             ReLU-11           [-1, 64, 19, 14]               0
        MaxPool2d-12             [-1, 64, 9, 7]               0
           Conv2d-13           [-1, 128, 11, 9]          73,728
      BatchNorm2d-14           [-1, 128

#### Training 

In [27]:
#creating dataloader
batch_size=1
num_epochs=1
learning_rate = 1e-4

train_loader = DataLoader(data,batch_size=batch_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [31]:
len(train_loader)

8732

In [32]:
for i in range(num_epochs):
    for step,data in enumerate(train_loader):
        sample,label = data
        sample = sample.to(device)
        label =  label.to(device)

        pred = model(sample)
        #calculate loss
        loss = criterion(pred,label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print("Epoch-{}/{},Step-{}/{}, Loss- {}".format(i,num_epochs,step,len(train_loader), loss.item()))

Epoch-0/1,Step-0, Loss- 1.46124267578125
Epoch-0/1,Step-1, Loss- 2.4607274532318115
Epoch-0/1,Step-2, Loss- 2.46048641204834
Epoch-0/1,Step-3, Loss- 2.4604403972625732
Epoch-0/1,Step-4, Loss- 2.4391653537750244
Epoch-0/1,Step-5, Loss- 2.45102858543396
Epoch-0/1,Step-6, Loss- 2.4604196548461914
Epoch-0/1,Step-7, Loss- 2.46001935005188
Epoch-0/1,Step-8, Loss- 2.45855975151062
Epoch-0/1,Step-9, Loss- 2.4611501693725586
Epoch-0/1,Step-10, Loss- 2.4611477851867676
Epoch-0/1,Step-11, Loss- 2.4597115516662598
Epoch-0/1,Step-12, Loss- 2.4607889652252197
Epoch-0/1,Step-13, Loss- 2.4611499309539795
Epoch-0/1,Step-14, Loss- 1.6186721324920654
Epoch-0/1,Step-15, Loss- 1.4825078248977661
Epoch-0/1,Step-16, Loss- 1.4714611768722534
Epoch-0/1,Step-17, Loss- 1.4612092971801758
Epoch-0/1,Step-18, Loss- 1.4611539840698242
Epoch-0/1,Step-19, Loss- 1.461150884628296
Epoch-0/1,Step-20, Loss- 1.4650858640670776
Epoch-0/1,Step-21, Loss- 1.4623161554336548
Epoch-0/1,Step-22, Loss- 2.438324451446533
Epoch-0/1,

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(),"classifier.pth")