In [5]:
# Detect whether the notebook is running on Google Colab so that Google Drive
# can be mounted there. `get_ipython` only exists inside an IPython kernel, so
# the lookup is guarded: run as a plain Python script, this cell simply
# reports "not in Colab" instead of raising NameError.
try:
    _ipython_shell = get_ipython()
except NameError:
    _ipython_shell = None

RunningInColab = 'google.colab' in str(_ipython_shell)
if RunningInColab:
    # Imported lazily: the package is only available on Colab.
    from google.colab import drive
    drive.mount('/content/drive')

In [6]:
# Imports
from torch.utils.data import Dataset, DataLoader
import os
import torch
import torchaudio
import torchaudio.transforms as transforms

from math import ceil
from collections import OrderedDict

In [7]:
# Hyperparameters
SEED = 42                    # fixes weight initialisation, dropout and shuffling
TARGET_SAMPLE_RATE = 16000   # every clip is resampled to this rate (Hz)
TARGET_LENGTH_SECONDS = 4    # every clip is cropped / padded to this duration
BATCH_SIZE = 128
EPOCHS = 20
LEARNING_RATE = 3e-4
DROPOUT_PROB = 0.4           # dropout before the fully connected head
DROPOUT_PROB_2D = 0.2        # channel dropout after the convolutional stages

# Mel-spectrogram settings
N_FFT = 1024
HOP_LENGTH = 512
N_MELS = 64

# FREQ_MASK_PARAM = 15
# TIME_MASK_PARAM = 25

# Number of samples in one fixed-length clip.
NUM_SAMPLES = TARGET_LENGTH_SECONDS * TARGET_SAMPLE_RATE
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's random number generators so a "Restart & Run All" reproduces
# the same initial weights and batch order.
torch.manual_seed(SEED)

# Define transformations
transformations = [
    transforms.MelSpectrogram(
            sample_rate=TARGET_SAMPLE_RATE,
            n_fft = N_FFT,
            hop_length = HOP_LENGTH,
            n_mels = N_MELS
        ),
    # transforms.TimeMasking(time_mask_param=TIME_MASK_PARAM),
    # transforms.FrequencyMasking(freq_mask_param=FREQ_MASK_PARAM),
]

In [8]:
# Shared class-name -> integer-label mapping. It is filled by the first
# AudioDataset that is constructed and reused by every later one, so labels
# stay consistent between the training and validation splits.
class_mapping = {}

class AudioDataset(Dataset):
  """Folder-per-class audio dataset that yields fixed-length features.

  `data_dir` must contain one sub-directory per class, each holding the audio
  files of that class. Every item is loaded with torchaudio, resampled to
  `target_sample_rate`, mixed down to one channel, centre-cropped or
  right-padded to `num_samples`, passed through `transformations`, and finally
  log-compressed and rescaled.

  Args:
    data_dir: Root directory with one sub-directory per class.
    transformations: Iterable of modules applied in order to the waveform
      (may be empty or None to skip this step).
    target_sample_rate: Sample rate every clip is resampled to.
    num_samples: Exact number of samples kept per clip.
    device: Device on which the audio processing is performed.

  Raises:
    KeyError: If `class_mapping` is already populated and `data_dir` contains
      a class directory that is not part of it.
  """

  def __init__(self,
               data_dir,
               transformations = transformations,
               target_sample_rate = TARGET_SAMPLE_RATE,
               num_samples = NUM_SAMPLES,
               device = device
               ):
    self.data_dir = data_dir
    # Only real, non-hidden sub-directories are classes. Stray entries such as
    # `.DS_Store` or `.ipynb_checkpoints` would otherwise crash the listing
    # below or turn into bogus labels.
    self.classes = sorted(
        name for name in os.listdir(data_dir)
        if not name.startswith('.') and os.path.isdir(os.path.join(data_dir, name))
    )
    self.file_paths = []
    self.targets = []
    self.target_sample_rate = target_sample_rate
    self.num_samples = num_samples
    self.device = device
    # Move the transforms to the target device once instead of on every item.
    if transformations:
      self.transformations = [t.to(device) for t in transformations]
    else:
      self.transformations = transformations
    # Resampling modules keyed by source sample rate, built on first use.
    self._resamplers = {}

    # The first dataset defines the mapping; later ones must agree with it.
    create_mapping = len(class_mapping) == 0

    for i, class_name in enumerate(self.classes):
      if create_mapping:
        class_mapping[class_name] = i
      elif class_name not in class_mapping:
        raise KeyError(
            f"Class '{class_name}' found in '{data_dir}' is missing from the existing class mapping"
        )
      class_dir = os.path.join(data_dir, class_name)

      # Sorted so that index -> file is the same on every machine and run.
      for filename in sorted(os.listdir(class_dir)):
        filepath = os.path.join(class_dir, filename)
        if filename.startswith('.') or not os.path.isfile(filepath):
          continue
        self.file_paths.append(filepath)
        self.targets.append(class_mapping[class_name])

  def __len__(self):
    """Return the number of audio files found under `data_dir`."""
    return len(self.file_paths)

  def __getitem__(self, idx):
    """Load item `idx` and return `(features, label)`.

    `features` is the processed clip after the transformations, log
    compression and rescaling; `label` is the integer from `class_mapping`.
    """
    audio_path = self.file_paths[idx]
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = waveform.to(self.device)
    waveform = self._resample_if_necessary(waveform, sample_rate)
    waveform = self._mix_down_if_necessary(waveform)
    waveform = self._cut_if_necessary(waveform)
    waveform = self._right_pad_if_necessary(waveform)

    if self.transformations:
      for transformation in self.transformations:
        waveform = transformation(waveform)

    # Normalisation: log-compress, then scale by 255 / dynamic range.
    waveform = torch.log1p(waveform)
    value_range = waveform.max() - waveform.min()
    # A silent or otherwise flat clip has zero range; dividing by it would
    # fill the sample with NaN and poison the whole batch, so it is left
    # unscaled instead.
    if value_range > 0:
      waveform = waveform * 255 / value_range

    label = self.targets[idx]
    return waveform, label

  def _resample_if_necessary(self, waveform, sample_rate):
    """Resample `waveform` to the target rate when its rate differs."""
    if sample_rate != self.target_sample_rate:
      resampler = self._resamplers.get(sample_rate)
      if resampler is None:
        # Building the resampler is relatively costly, so it is cached per
        # source sample rate rather than rebuilt for every item.
        resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate).to(self.device)
        self._resamplers[sample_rate] = resampler
      waveform = resampler(waveform)
    return waveform

  def _mix_down_if_necessary(self, waveform):
    """Average multi-channel audio into a single channel."""
    if waveform.shape[0] > 1:
      waveform = torch.mean(waveform, dim = 0, keepdim = True)
    return waveform

  # If the audio is longer than TARGET_LENGTH_SECONDS, crop it to that length
  # by removing (roughly) equal amounts from the start and the end.
  def _cut_if_necessary(self, waveform):
    """Centre-crop `waveform` to exactly `num_samples` samples if it is longer."""
    if waveform.shape[1] > self.num_samples:
      mid = (waveform.shape[1] - 1)//2
      nsby2 = self.num_samples//2
      start = mid - nsby2 + 1
      waveform = waveform[:, start: start + self.num_samples]
    return waveform

  def _right_pad_if_necessary(self, waveform):
    """Zero-pad `waveform` on the right up to `num_samples` samples if it is shorter."""
    num_samples = waveform.shape[1]
    if num_samples < self.num_samples:
      num_missing_samples = self.num_samples - num_samples
      last_dim_padding = (0, num_missing_samples)
      waveform = torch.nn.functional.pad(waveform, last_dim_padding)
    return waveform



# Define data directories: the dataset lives on the mounted Drive when running
# in Colab, and next to the notebook otherwise.
train_dir: str = (
    "/content/drive/MyDrive/audio_dataset/train" if RunningInColab
    else "audio_dataset/train"
)
val_dir: str = (
    "/content/drive/MyDrive/audio_dataset/val" if RunningInColab
    else "audio_dataset/val"
)

# The training split is built first so that it is the one that populates the
# shared class mapping; the validation split then reuses it.
train_dataset = AudioDataset(data_dir=train_dir)
val_dataset = AudioDataset(data_dir=val_dir)

# Only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
from torchsummary import summary

# model = models.resnet50(weights = None).to(device)
# model.conv1 = nn.Conv2d(1,64,kernel_size = 7,stride = 2,padding = 3,bias = False).to(device)

# num_classes = len(class_mapping)
# num_features = model.fc.in_features

# model.fc = nn.Sequential(
#     nn.Linear(num_features, num_classes),
#     nn.Softmax(dim = 1)
# ).to(device)

class CustomResNet18(nn.Module):
    """ResNet-18 for single-channel inputs with extra dropout and a dense head.

    Changes relative to the stock torchvision ResNet-18 (randomly initialised):
      * the first convolution takes 1 input channel instead of 3;
      * channel dropout (`DROPOUT_PROB_2D`) follows the first convolution and
        each of the four residual stages;
      * the final `fc` layer outputs 2048 features, which are then passed
        through dropout and three further linear layers down to `num_classes`
        logits.

    Args:
        num_classes: Number of output logits. Defaults to the number of
            entries in the global `class_mapping`.

    Raises:
        ValueError: If the resulting number of classes is smaller than 1
            (for example when the datasets have not been built yet).
    """

    # Residual stages that get a Dropout2d appended.
    _STAGES_WITH_DROPOUT = ("layer1", "layer2", "layer3", "layer4")

    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = len(class_mapping)
        if num_classes < 1:
            raise ValueError(
                "CustomResNet18 needs at least one class; build the datasets first "
                "or pass num_classes explicitly"
            )
        self.num_classes = num_classes

        self.resnet = models.resnet18(weights=None).to(device)

        # Single-channel replacement for the 3-channel stem convolution.
        first_conv = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False).to(device)
        num_features = self.resnet.fc.in_features
        self.resnet.fc = nn.Linear(num_features, 2048).to(device)
        # self.replace_batchnorm_layers(self.resnet)

        # Add dropout after the first convolutional layer
        self.resnet.conv1 = nn.Sequential(first_conv, nn.Dropout2d(DROPOUT_PROB_2D))
        # Add dropout after each residual stage
        for stage_name in self._STAGES_WITH_DROPOUT:
            stage = getattr(self.resnet, stage_name)
            setattr(self.resnet, stage_name, nn.Sequential(stage, nn.Dropout2d(DROPOUT_PROB_2D)))

        # Add additional linear layers.
        # NOTE(review): there is no activation between resnet.fc, the 2048->1024
        # layer and the 1024->512 layer, so those three are mathematically one
        # linear map. Left as is so that the architecture and saved checkpoints
        # stay compatible - confirm whether this is intended.
        self.additional_layers = nn.ModuleList([
            nn.Dropout(DROPOUT_PROB),
            nn.Linear(2048, 1024),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, self.num_classes),
        ]).to(device)

    # def replace_batchnorm_layers(self,model, replace_with=nn.Identity):
    #     for name, module in model.named_children():
    #         if isinstance(module, nn.BatchNorm2d):
    #             setattr(model, name, replace_with())
    #         else:
    #             self.replace_batchnorm_layers(module, replace_with)

    def forward(self, x):
        """Run the backbone and then the dense head; returns the class logits."""
        x = self.resnet(x)
        for layer in self.additional_layers:
            x = layer(x)
        return x

# Build the network and print a per-layer summary for one input sample.
model = CustomResNet18()
# (channels, mel bins, time frames): one channel and the 64 mel bins configured
# for the spectrogram; 126 is presumably the frame count of a 4 s clip at the
# configured hop length - TODO confirm if the audio settings change.
summary_input_shape = (1, 64, 126)
summary(model, summary_input_shape)
    


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 63]           3,136
         Dropout2d-2           [-1, 64, 32, 63]               0
       BatchNorm2d-3           [-1, 64, 32, 63]             128
              ReLU-4           [-1, 64, 32, 63]               0
         MaxPool2d-5           [-1, 64, 16, 32]               0
            Conv2d-6           [-1, 64, 16, 32]          36,864
       BatchNorm2d-7           [-1, 64, 16, 32]             128
              ReLU-8           [-1, 64, 16, 32]               0
            Conv2d-9           [-1, 64, 16, 32]          36,864
      BatchNorm2d-10           [-1, 64, 16, 32]             128
             ReLU-11           [-1, 64, 16, 32]               0
       BasicBlock-12           [-1, 64, 16, 32]               0
           Conv2d-13           [-1, 64, 16, 32]          36,864
      BatchNorm2d-14           [-1, 64,

In [10]:
# Cross-entropy on the raw logits, optimised with Adam over every parameter
# of the model.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


def train_model(model, train_loader, criterion, optimizer, epoch):
  """Train `model` for one epoch and report loss and accuracy.

  Args:
    model: Network to train; switched to training mode.
    train_loader: Iterable of `(inputs, labels)` batches that supports `len()`.
    criterion: Loss function called as `criterion(outputs, labels)`.
    optimizer: Optimizer that updates the parameters of `model`.
    epoch: Zero-based epoch index, used only for the progress message.

  Returns:
    Tuple `(epoch_loss, train_accuracy)`: the per-sample mean loss and the
    fraction of correctly classified samples over the batches seen in this
    epoch. Both are 0.0 when the loader yields no samples.
  """
  model.train()  # Set model to training mode
  running_loss = 0.0
  correct = 0
  total = 0

  # Ask the loader itself for the batch count rather than recomputing it from
  # the global dataset and batch size, so any loader can be passed in.
  num_batches = len(train_loader)

  for batch_itr, (inputs, labels) in enumerate(train_loader, start=1):
    inputs = inputs.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()

    # Forward pass
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    # Backward pass and optimize
    loss.backward()
    optimizer.step()

    # Read the scalar loss once; every `.item()` call forces a device sync.
    batch_loss = loss.item()
    predicted = outputs.argmax(dim=1)
    batch_size = labels.size(0)
    total += batch_size
    correct += (predicted == labels).sum().item()
    running_loss += batch_loss * batch_size

    print(batch_loss)
    print(f"Batch {batch_itr} of {num_batches}")

  # Average over the samples actually seen, which also stays correct when the
  # loader drops the last incomplete batch; an empty loader gives 0.0 instead
  # of a ZeroDivisionError.
  epoch_loss = running_loss / total if total else 0.0
  print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {epoch_loss:.4f}")
  train_accuracy = correct / total if total else 0.0
  print(f'Training Accuracy: {train_accuracy:.4f}')
  return epoch_loss, train_accuracy

# Validation function
def validate_model(model, val_loader):
  """Evaluate `model` on `val_loader` and return its accuracy.

  Args:
    model: Network to evaluate; switched to evaluation mode.
    val_loader: Iterable of `(inputs, labels)` batches.

  Returns:
    Fraction of correctly classified samples, or 0.0 when the loader yields
    no samples.
  """
  model.eval()  # Set model to evaluation mode
  correct = 0
  total = 0

  # No gradients are needed for evaluation.
  with torch.no_grad():
    for inputs, labels in val_loader:
      inputs = inputs.to(device)
      labels = labels.to(device)
      predicted = model(inputs).argmax(dim=1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

  # Guard against an empty validation set instead of raising ZeroDivisionError.
  val_accuracy = correct / total if total else 0.0
  print(f'Validation Accuracy: {val_accuracy:.4f}')
  return val_accuracy

In [28]:
# Training Loop: one training pass followed by one validation pass per epoch,
# keeping track of the best validation accuracy seen so far.
best_accuracy = 0.0

for epoch in range(EPOCHS):
  train_model(
      model=model,
      train_loader=train_loader,
      criterion=criterion,
      optimizer=optimizer,
      epoch=epoch,
  )
  accuracy = validate_model(model=model, val_loader=val_loader)

  improved = accuracy > best_accuracy
  if improved:
    best_accuracy = accuracy
    # torch.save(model.state_dict(),'best_model.pth')

  print(f"Best Validation Accuracy: {best_accuracy:.4f}")

0.057988740503787994
Batch 1 of 38
0.17925816774368286
Batch 2 of 38
0.07590349018573761
Batch 3 of 38
0.06926144659519196
Batch 4 of 38
0.05452224612236023
Batch 5 of 38
0.04531101882457733
Batch 6 of 38
0.13861696422100067
Batch 7 of 38
0.0883622020483017
Batch 8 of 38
0.012878292240202427
Batch 9 of 38
0.1098610907793045
Batch 10 of 38
0.06530620157718658
Batch 11 of 38
0.03174947574734688
Batch 12 of 38
0.10873483866453171
Batch 13 of 38
0.11338510364294052
Batch 14 of 38
0.026243267580866814
Batch 15 of 38
0.05841178447008133
Batch 16 of 38
0.0928352028131485
Batch 17 of 38
0.09884463995695114
Batch 18 of 38
0.13446098566055298
Batch 19 of 38
0.11314146220684052
Batch 20 of 38
0.014937084168195724
Batch 21 of 38
0.052648887038230896
Batch 22 of 38
0.2118903398513794
Batch 23 of 38
0.23520921170711517
Batch 24 of 38
0.04456484317779541
Batch 25 of 38
0.07342937588691711
Batch 26 of 38
0.0487155057489872
Batch 27 of 38
0.06649240851402283
Batch 28 of 38
0.07889424264431
Batch 29 of 

In [29]:
# torch.save(model.state_dict(),'epoch50_x_9107')