In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mmoreaux/environmental-sound-classification-50")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/environmental-sound-classification-50


In [3]:
!pip install torchsummary



In [4]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd
import torchaudio
import os
import torch
from torchsummary import summary
import torch.nn as nn

In [5]:
ANNOTATION_FILE = "/kaggle/input/environmental-sound-classification-50/esc50.csv"
AUDIO_DIR = "/kaggle/input/environmental-sound-classification-50/audio/audio/"
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 128

In [6]:
class CustomDataset(Dataset):
  def __init__(self, annotation_file, audio_dir, transformation, target_sample_rate, num_samples, device):
    self.annotations = pd.read_csv(annotation_file)
    self.audio_dir = audio_dir
    self.device = device
    self.transformation = transformation.to(self.device)
    self.target_sample_rate = target_sample_rate
    self.num_samples = num_samples

  def __len__(self):
    return len(self.annotations)

  def __getitem__(self, idx):
    audio_sample_path = self._get_audio_sample_path(idx)
    label = self._get_audio_sample_label(idx)
    signal, sr = torchaudio.load(audio_sample_path)
    signal = signal.to(self.device)
    # signal -> (num_channels, sr) = (2, 16000) -> (1, 16000)
    signal = self._resample(signal, sr)
    signal = self._mix_down(signal)
    signal = self._cut(signal)
    signal = self._right_pad(signal)
    signal = self.transformation(signal)

    return signal, label

  def _cut(self, signal):
    if signal.shape[1]>self.num_samples:
      signal = signal[:, :self.num_samples]
    return signal

  def _right_pad(self, signal):
    if signal.shape[1]<self.num_samples:
      num_missing_samples = self.num_samples - signal.shape[1]
      last_dim_padding = (0, num_missing_samples)
      signal = torch.nn.functional.pad(signal, last_dim_padding)
    return signal

  def _resample(self, signal, sample_rate):
    if sample_rate != self.target_sample_rate:
      resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate).to(self.device)
      signal = resampler(signal)
    return signal

  def _mix_down(self, signal):
    if signal.shape[0]>1:
      signal = torch.mean(signal, dim=0, keepdim=True)
    return signal

  def _get_audio_sample_path(self, idx):
    path = os.path.join(self.audio_dir, self.annotations.iloc[idx, 0])
    return path

  def _get_audio_sample_label(self, idx):
    return self.annotations.iloc[idx, 2]

In [7]:
mel_spectogram = torchaudio.transforms.MelSpectrogram(SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64)

In [8]:
esc50 = CustomDataset(ANNOTATION_FILE, AUDIO_DIR, mel_spectogram, SAMPLE_RATE, NUM_SAMPLES, DEVICE)

In [15]:
import torch
import torch.nn as nn
import torchvision.models as models

class EfficientNetCustom(nn.Module):
    def __init__(self, num_classes=50, pretrained=True):
        super().__init__()

        # Load EfficientNet-B0
        effnet = models.efficientnet_b0(pretrained=pretrained)

        # Keep only the convolutional feature extractor (exclude classifier)
        self.features = effnet.features

        # Modify the first convolutional layer to accept 1 input channel
        # EfficientNet-B0's first layer is features[0][0]
        original_first_conv = self.features[0][0]
        new_first_conv = nn.Conv2d(
            1,  # Change input channels from 3 to 1
            original_first_conv.out_channels,
            kernel_size=original_first_conv.kernel_size,
            stride=original_first_conv.stride,
            padding=original_first_conv.padding,
            bias=original_first_conv.bias
        )
        # Copy weights from the original first layer (optional, but good practice if applicable)
        # For 1 input channel, we can average the weights across the original 3 input channels
        new_first_conv.weight.data = original_first_conv.weight.data.mean(dim=1, keepdim=True)

        self.features[0][0] = new_first_conv


        # Get the output channels of EfficientNet-B0
        self.feature_dim = 1280

        # Custom fully connected head (your design)
        self.classifier = nn.Sequential(
            nn.Linear(self.feature_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        # Extract features
        x = self.features(x)         # shape: [B, 1280, H, W]
        x = nn.functional.adaptive_avg_pool2d(x, 1)  # [B, 1280, 1, 1]
        x = torch.flatten(x, 1)      # [B, 1280]

        # Pass through custom head
        out = self.classifier(x)
        return out

In [16]:
model = EfficientNetCustom(num_classes=50, pretrained=True)
model.to(DEVICE)
# summary(model, (1, 64, 44))



Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth


100%|██████████| 20.5M/20.5M [00:00<00:00, 68.8MB/s]


EfficientNetCustom(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormA

In [17]:
esc50
train_dataloader = DataLoader(esc50, batch_size=BATCH_SIZE)

In [21]:
learning_rate = 0.001
epochs = 50

In [22]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [23]:
for epoch in range(epochs):
  for X, y in train_dataloader:
    X = X.to(DEVICE)
    y = y.to(DEVICE)

    output = model(X)

    optimizer.zero_grad()

    loss = loss_fn(output, y)

    loss.backward()

    optimizer.step()

  print(f"Epoch: {epoch} | Loss: {loss.item()}")

Epoch: 0 | Loss: 3.6562423706054688
Epoch: 1 | Loss: 3.0316274166107178
Epoch: 2 | Loss: 2.276737689971924
Epoch: 3 | Loss: 1.6158339977264404
Epoch: 4 | Loss: 1.4108726978302002
Epoch: 5 | Loss: 1.1619783639907837
Epoch: 6 | Loss: 0.9182340502738953
Epoch: 7 | Loss: 0.8434804677963257
Epoch: 8 | Loss: 0.9099575281143188
Epoch: 9 | Loss: 0.6499221920967102
Epoch: 10 | Loss: 0.4950626790523529
Epoch: 11 | Loss: 0.6669813394546509
Epoch: 12 | Loss: 0.41633859276771545
Epoch: 13 | Loss: 0.685961127281189
Epoch: 14 | Loss: 0.41623735427856445
Epoch: 15 | Loss: 0.41942015290260315
Epoch: 16 | Loss: 0.3357165455818176
Epoch: 17 | Loss: 0.3633340895175934
Epoch: 18 | Loss: 0.2722107768058777
Epoch: 19 | Loss: 0.29117363691329956
Epoch: 20 | Loss: 0.22582504153251648
Epoch: 21 | Loss: 0.21664953231811523
Epoch: 22 | Loss: 0.27495867013931274
Epoch: 23 | Loss: 0.34807515144348145
Epoch: 24 | Loss: 0.21794600784778595
Epoch: 25 | Loss: 0.2585553526878357
Epoch: 26 | Loss: 0.23078863322734833
Epo

In [24]:
torch.save(model.state_dict(), "audioModelDLNew3.pth")

In [29]:
import pandas as pd
from torch.utils.data import DataLoader, Subset

annotations = pd.read_csv("/kaggle/input/environmental-sound-classification-50/esc50.csv")

# Pick one fold for testing, rest for training
test_fold = 1 # Changed test_fold from 0 to 1
train_idx = annotations[annotations['fold'] != test_fold].index
test_idx = annotations[annotations['fold'] == test_fold].index

train_dataset = Subset(esc50, train_idx)
test_dataset = Subset(esc50, test_idx)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [30]:
from torch.utils.data import DataLoader

def evaluate_model(model, dataloader, device="cpu"):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            _, predicted = torch.max(outputs, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()

    accuracy = 100 * correct / total
    return accuracy


acc = evaluate_model(model, test_loader, DEVICE)
print(f"Model Accuracy: {acc:.2f}%")


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Model Accuracy: 52.50%
