In [None]:
import kagglehub

# Resolve the newest published version of the ESC-50 dataset through
# kagglehub and keep the directory it reports in `path`.
path = kagglehub.dataset_download(
    "mmoreaux/environmental-sound-classification-50"
)
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/environmental-sound-classification-50


In [None]:
!pip install torchsummary



In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd
import torchaudio
import os
import torch
from torchsummary import summary
import torch.nn as nn

In [None]:
# Dataset locations are derived from `path`, the dataset root returned by
# kagglehub.dataset_download(...) in the first cell, instead of a hard-coded
# "/kaggle/input/..." prefix. On Kaggle this resolves to the same files; on
# other machines it follows wherever kagglehub stored the download.
DATASET_DIR = path
ANNOTATION_FILE = os.path.join(DATASET_DIR, "esc50.csv")
AUDIO_DIR = os.path.join(DATASET_DIR, "audio", "audio")

# Audio front-end settings: every clip is resampled to SAMPLE_RATE and then
# cut / zero-padded to NUM_SAMPLES samples, i.e. exactly one second of audio.
SAMPLE_RATE = 22050
NUM_SAMPLES = SAMPLE_RATE

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 128

# Fix the PyTorch RNG so weight initialisation and dropout masks are
# repeatable between runs of the notebook.
SEED = 42
torch.manual_seed(SEED)

In [None]:
class CustomDataset(Dataset):
  """Audio dataset that yields ``(features, label)`` pairs from a CSV index.

  Each item is loaded with ``torchaudio.load``, moved to ``device``, resampled
  to ``target_sample_rate``, averaged down to one channel, truncated or
  right-padded with zeros to exactly ``num_samples`` samples and finally
  passed through ``transformation``.

  Args:
    annotation_file: Path of the CSV read with ``pandas.read_csv``. Column 0
      is used as the audio file name and column 2 as the label.
    audio_dir: Directory that the file names are joined onto.
    transformation: Module applied to the fixed-length waveform. ``.to(device)``
      is called on it, so it must support that method.
    target_sample_rate: Sample rate every clip is converted to.
    num_samples: Number of samples every returned waveform has before the
      transformation is applied.
    device: Device the waveform, resamplers and transformation live on.
  """

  def __init__(self, annotation_file, audio_dir, transformation, target_sample_rate, num_samples, device):
    self.annotations = pd.read_csv(annotation_file)
    self.audio_dir = audio_dir
    self.device = device
    self.transformation = transformation.to(self.device)
    self.target_sample_rate = target_sample_rate
    self.num_samples = num_samples
    # One resampler per source sample rate, built on first use. Creating a
    # Resample transform for every item would recompute its kernel and move
    # it to the device on each __getitem__ call.
    self._resamplers = {}

  def __len__(self):
    """Return the number of rows in the annotation table."""
    return len(self.annotations)

  def __getitem__(self, idx):
    """Load item ``idx`` and return ``(transformed_signal, label)``."""
    audio_sample_path = self._get_audio_sample_path(idx)
    label = self._get_audio_sample_label(idx)
    signal, sr = torchaudio.load(audio_sample_path)
    signal = signal.to(self.device)
    # signal: (num_channels, num_frames) -> (1, self.num_samples) after the
    # resample / mix-down / cut / pad steps below.
    signal = self._resample(signal, sr)
    signal = self._mix_down(signal)
    signal = self._cut(signal)
    signal = self._right_pad(signal)
    signal = self.transformation(signal)

    return signal, label

  def _cut(self, signal):
    """Drop everything after the first ``num_samples`` samples, if longer."""
    if signal.shape[1]>self.num_samples:
      signal = signal[:, :self.num_samples]
    return signal

  def _right_pad(self, signal):
    """Append zeros along the last dimension up to ``num_samples`` samples."""
    if signal.shape[1]<self.num_samples:
      num_missing_samples = self.num_samples - signal.shape[1]
      # (left, right) padding of the last dimension: pad on the right only.
      last_dim_padding = (0, num_missing_samples)
      signal = torch.nn.functional.pad(signal, last_dim_padding)
    return signal

  def _resample(self, signal, sample_rate):
    """Convert ``signal`` from ``sample_rate`` to ``target_sample_rate``.

    The signal is returned untouched when the rates already match.
    """
    if sample_rate == self.target_sample_rate:
      return signal
    resampler = self._resamplers.get(sample_rate)
    if resampler is None:
      resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate).to(self.device)
      self._resamplers[sample_rate] = resampler
    return resampler(signal)

  def _mix_down(self, signal):
    """Average multi-channel audio into a single channel (dim 0 kept)."""
    if signal.shape[0]>1:
      signal = torch.mean(signal, dim=0, keepdim=True)
    return signal

  def _get_audio_sample_path(self, idx):
    """Join ``audio_dir`` with the file name stored in column 0 of row ``idx``."""
    path = os.path.join(self.audio_dir, self.annotations.iloc[idx, 0])
    return path

  def _get_audio_sample_label(self, idx):
    """Return the label stored in column 2 of row ``idx``."""
    return self.annotations.iloc[idx, 2]

In [None]:
# Feature extractor handed to the dataset: a mel spectrogram with 64 mel
# bands, a 1024-point FFT and a hop of 512 samples.
mel_spectogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64,
)

In [None]:
# Dataset instance used for training; every argument comes from the
# configuration cell or the transform defined above.
esc50 = CustomDataset(
    annotation_file=ANNOTATION_FILE,
    audio_dir=AUDIO_DIR,
    transformation=mel_spectogram,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=DEVICE,
)

In [None]:
import torch
import torch.nn as nn

class CNNNetwork(nn.Module):
    """Four-block 2-D CNN classifier for single-channel spectrogram inputs.

    Every block is Conv2d(k=3, s=1, p=2) -> BatchNorm2d -> ReLU ->
    MaxPool2d(2) -> Dropout. The flattened output of the last block feeds a
    two-layer classifier that returns raw logits (no softmax).

    Args:
        num_classes: Number of output logits. Defaults to 50.
        input_shape: ``(height, width)`` of one input feature map, used to
            size the first linear layer. The default ``(64, 44)`` gives
            ``128 * 5 * 4`` flattened features.

    Submodule names and ordering are fixed (``conv1`` .. ``conv4``,
    ``flatten``, ``linear``), so ``state_dict`` keys do not depend on the
    arguments.
    """

    # Geometry shared by every convolutional block.
    _KERNEL_SIZE = 3
    _PADDING = 2
    _POOL_SIZE = 2
    _NUM_BLOCKS = 4

    def __init__(self, num_classes=50, input_shape=(64, 44)):
        super().__init__()

        self.conv1 = self._conv_block(1, 16, dropout=0.3)
        self.conv2 = self._conv_block(16, 32, dropout=0.3)
        self.conv3 = self._conv_block(32, 64, dropout=0.4)
        self.conv4 = self._conv_block(64, 128, dropout=0.4)

        # Track how the spatial size shrinks through the blocks so the
        # classifier input size follows from input_shape instead of being a
        # hand-computed constant.
        height, width = input_shape
        for _ in range(self._NUM_BLOCKS):
            height = self._block_output_size(height)
            width = self._block_output_size(width)

        self.flatten = nn.Flatten()
        self.linear = nn.Sequential(
            nn.Linear(128 * height * width, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    @classmethod
    def _conv_block(cls, in_channels, out_channels, dropout):
        """Build one Conv -> BatchNorm -> ReLU -> MaxPool -> Dropout stage."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=cls._KERNEL_SIZE,
                stride=1,
                padding=cls._PADDING,
            ),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=cls._POOL_SIZE),
            nn.Dropout(dropout)
        )

    @classmethod
    def _block_output_size(cls, size):
        """Return the length of one spatial dimension after a single block."""
        # Stride-1 convolution: size + 2 * padding - kernel + 1.
        after_conv = size + 2 * cls._PADDING - cls._KERNEL_SIZE + 1
        # Max pooling with stride == kernel floors the division.
        return after_conv // cls._POOL_SIZE

    def forward(self, input_data):
        """Map a ``(batch, 1, height, width)`` tensor to ``(batch, num_classes)`` logits."""
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        return logits


In [None]:
# Build the network directly on the selected device, then print a
# layer-by-layer summary for a single (1, 64, 44) input.
model = CNNNetwork().to(DEVICE)
summary(model, input_size=(1, 64, 44))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
       BatchNorm2d-2           [-1, 16, 66, 46]              32
              ReLU-3           [-1, 16, 66, 46]               0
         MaxPool2d-4           [-1, 16, 33, 23]               0
           Dropout-5           [-1, 16, 33, 23]               0
            Conv2d-6           [-1, 32, 35, 25]           4,640
       BatchNorm2d-7           [-1, 32, 35, 25]              64
              ReLU-8           [-1, 32, 35, 25]               0
         MaxPool2d-9           [-1, 32, 17, 12]               0
          Dropout-10           [-1, 32, 17, 12]               0
           Conv2d-11           [-1, 64, 19, 14]          18,496
      BatchNorm2d-12           [-1, 64, 19, 14]             128
             ReLU-13           [-1, 64, 19, 14]               0
        MaxPool2d-14             [-1, 6

In [None]:
# Reshuffle the samples at the start of every epoch so the model does not see
# the same fixed sequence of mini-batches on each pass.
# num_workers is left at its default (0): the dataset moves tensors to DEVICE
# inside __getitem__, so loading stays in the main process.
train_dataloader = DataLoader(esc50, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Optimisation hyper-parameters: number of passes over the training data and
# the step size handed to the optimizer.
epochs = 50
learning_rate = 1e-3

In [None]:
# Adam updates every parameter of `model`; the objective is cross-entropy
# computed on the raw logits the network returns.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [None]:
for epoch in range(epochs):
  # Make sure Dropout and BatchNorm run in training mode, even if an earlier
  # cell switched the model to eval().
  model.train()

  # Sample-weighted running sum of the batch losses, kept on DEVICE so that
  # the only host synchronisation happens once per epoch.
  loss_sum = torch.zeros((), device=DEVICE)
  samples_seen = 0

  for X, y in train_dataloader:
    X = X.to(DEVICE)
    y = y.to(DEVICE)

    output = model(X)
    loss = loss_fn(output, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # loss is the mean over the batch; weight it by the batch size so the
    # smaller final batch does not skew the epoch average.
    n_in_batch = X.size(0)
    loss_sum += loss.detach() * n_in_batch
    samples_seen += n_in_batch

  if samples_seen == 0:
    raise RuntimeError("train_dataloader yielded no batches")

  # Report the mean loss over the whole epoch rather than the loss of
  # whichever mini-batch happened to come last.
  print(f"Epoch: {epoch} | Loss: {loss_sum.item() / samples_seen}")

Epoch: 0 | Loss: 2.612142562866211
Epoch: 1 | Loss: 2.4953036308288574
Epoch: 2 | Loss: 2.4880001544952393
Epoch: 3 | Loss: 2.3570199012756348
Epoch: 4 | Loss: 2.2784740924835205
Epoch: 5 | Loss: 2.2443394660949707
Epoch: 6 | Loss: 2.2194249629974365
Epoch: 7 | Loss: 2.2883286476135254
Epoch: 8 | Loss: 2.355990409851074
Epoch: 9 | Loss: 2.055131196975708
Epoch: 10 | Loss: 2.056495189666748
Epoch: 11 | Loss: 1.8995975255966187
Epoch: 12 | Loss: 2.0249929428100586
Epoch: 13 | Loss: 1.8518203496932983
Epoch: 14 | Loss: 1.9143178462982178
Epoch: 15 | Loss: 1.8892837762832642
Epoch: 16 | Loss: 1.8667023181915283
Epoch: 17 | Loss: 1.8538926839828491
Epoch: 18 | Loss: 1.9100711345672607
Epoch: 19 | Loss: 1.7310912609100342
Epoch: 20 | Loss: 1.6848903894424438
Epoch: 21 | Loss: 1.7430782318115234
Epoch: 22 | Loss: 1.9179519414901733
Epoch: 23 | Loss: 1.6599435806274414
Epoch: 24 | Loss: 1.4533237218856812
Epoch: 25 | Loss: 1.689745545387268
Epoch: 26 | Loss: 1.7405128479003906
Epoch: 27 | Loss

In [None]:
# Persist only the learned parameters and buffers (the state_dict), not the
# module object itself.
torch.save(obj=model.state_dict(), f="audioModelDL.pth")

In [None]:
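# Optional (disabled): build a {target id -> category name} dictionary from
# the annotation CSV. Uncomment the lines below to run it.
# df = pd.read_csv(ANNOTATION_FILE)
# unique_mapping_df = df.drop_duplicates(subset=['target', 'category'])

# mapping_series = unique_mapping_df.set_index('target')['category']

# mapping_dict = mapping_series.to_dict()

# print(mapping_dict)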

In [None]:
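# Optional (disabled): write `mapping_dict` (the target -> category dictionary
# built in the commented-out cell above) to mapping.json. Uncomment both cells
# together, since this one needs `mapping_dict` to exist.
# import json

# with open("mapping.json", "w") as f:
#   json.dump(mapping_dict, f)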