In [24]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd
import torchaudio
import os
import torch
import torch.nn as nn

In [25]:
# --- Inference configuration -------------------------------------------------
# Sample rate every clip is resampled to, and that the mel transform is built for.
SAMPLE_RATE = 22_050
# Fixed clip length in samples; clips are cut or zero-padded to this length
# (22050 samples at 22050 Hz is one second of audio).
NUM_SAMPLES = 22_050
# NOTE(review): not used by the inference cells visible here - presumably left
# over from the training notebook; confirm before removing.
BATCH_SIZE = 128
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [26]:
import torch
import torch.nn as nn
import torchvision.models as models

class EfficientNetCustom(nn.Module):
    """EfficientNet-B0 backbone with a 1-channel stem and a custom MLP head.

    The torchvision EfficientNet-B0 convolutional trunk is kept, its first
    convolution is replaced by one that accepts a single input channel, and
    the stock classifier is replaced by a two-layer fully connected head.

    Args:
        num_classes: Number of output logits produced by the head.
        pretrained: If True, start from the ImageNet weights shipped with
            torchvision; if False, use randomly initialised weights.
    """

    def __init__(self, num_classes=50, pretrained=True):
        super().__init__()

        # Load EfficientNet-B0 and keep only the convolutional feature
        # extractor (the stock classifier is discarded).
        effnet = self._build_backbone(pretrained)
        self.features = effnet.features

        # EfficientNet-B0's first convolution lives at features[0][0].
        original_first_conv = self.features[0][0]
        # nn.Conv2d expects a bool for `bias`; passing the bias tensor itself
        # raises as soon as the original layer actually has one.
        has_bias = original_first_conv.bias is not None
        new_first_conv = nn.Conv2d(
            1,  # single input channel instead of 3
            original_first_conv.out_channels,
            kernel_size=original_first_conv.kernel_size,
            stride=original_first_conv.stride,
            padding=original_first_conv.padding,
            bias=has_bias,
        )
        # Seed the new stem from the original one: average the kernels over
        # the 3 input channels, and carry the bias over when there is one.
        with torch.no_grad():
            new_first_conv.weight.copy_(
                original_first_conv.weight.mean(dim=1, keepdim=True)
            )
            if has_bias:
                new_first_conv.bias.copy_(original_first_conv.bias)

        self.features[0][0] = new_first_conv

        # Width of the pooled feature vector, read from the input size of the
        # stock classifier's final Linear layer (1280 for EfficientNet-B0).
        self.feature_dim = effnet.classifier[-1].in_features

        # Custom fully connected head.
        self.classifier = nn.Sequential(
            nn.Linear(self.feature_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    @staticmethod
    def _build_backbone(pretrained):
        """Create EfficientNet-B0, using the `weights=` API when torchvision has it."""
        weights_enum = getattr(models, "EfficientNet_B0_Weights", None)
        if weights_enum is None:
            # Older torchvision releases only understand the boolean flag.
            return models.efficientnet_b0(pretrained=pretrained)
        # IMAGENET1K_V1 is what the legacy `pretrained=True` flag resolves to.
        weights = weights_enum.IMAGENET1K_V1 if pretrained else None
        return models.efficientnet_b0(weights=weights)

    def forward(self, x):
        """Return class logits of shape [B, num_classes] for input [B, 1, H, W]."""
        x = self.features(x)                         # [B, feature_dim, h, w]
        x = nn.functional.adaptive_avg_pool2d(x, 1)  # [B, feature_dim, 1, 1]
        x = torch.flatten(x, 1)                      # [B, feature_dim]

        # Pass through custom head
        return self.classifier(x)

In [27]:
import torch
import torchaudio
import torch.nn.functional as F
import os

def preprocess_audio(file_path, transformation, target_sample_rate, num_samples, device):
    """Load an audio file and turn it into a fixed-length model input.

    Steps, in order: load, move to `device`, resample to `target_sample_rate`
    if needed, mix down to mono, cut or right-pad with zeros to exactly
    `num_samples` samples, then apply `transformation`.

    Args:
        file_path: Path handed to `torchaudio.load`.
        transformation: Callable applied to the [1, num_samples] waveform
            (e.g. a mel spectrogram transform). If it is an `nn.Module` it is
            moved to `device` first; note that `Module.to` moves the caller's
            module in place.
        target_sample_rate: Sample rate the waveform is resampled to.
        num_samples: Exact number of samples kept per clip.
        device: Torch device the signal (and module transforms) are placed on.

    Returns:
        Whatever `transformation` returns for the prepared waveform.
    """
    # load audio
    signal, sr = torchaudio.load(file_path)
    signal = signal.to(device)

    # resample
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate).to(device)
        signal = resampler(signal)

    # mix down to mono
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)

    # cut or pad (zeros are appended on the right)
    length = signal.shape[1]
    if length > num_samples:
        signal = signal[:, :num_samples]
    elif length < num_samples:
        signal = F.pad(signal, (0, num_samples - length))

    # A module transform keeps its own buffers; they must sit on the same
    # device as the signal or the call below fails with a device mismatch.
    if isinstance(transformation, torch.nn.Module):
        transformation = transformation.to(device)

    # apply transformation (e.g., mel spectrogram)
    return transformation(signal)


In [32]:
def predict_single_audio(model, file_path, transformation, target_sample_rate, num_samples, device, class_mapping=None):
    """Classify one audio file and return its predicted class.

    The model is switched to eval mode (and left there), the file is prepared
    with `preprocess_audio`, and the arg-max over the model's outputs is taken
    with gradients disabled.

    Args:
        model: Module mapping a batched input to per-class scores [1, C].
        file_path: Path of the audio file to classify.
        transformation: Feature transform forwarded to `preprocess_audio`.
        target_sample_rate: Sample rate forwarded to `preprocess_audio`.
        num_samples: Clip length forwarded to `preprocess_audio`.
        device: Torch device the input is placed on.
        class_mapping: Optional index -> label lookup. May be a dict keyed by
            the index as a string (as produced by `json.load`), a dict keyed
            by int, or a sequence indexed by position.

    Returns:
        The mapped label when a non-empty `class_mapping` is given, otherwise
        the predicted class index as an int.

    Raises:
        KeyError / IndexError: if the predicted index is not in `class_mapping`.
    """
    model.eval()
    with torch.no_grad():
        signal = preprocess_audio(file_path, transformation, target_sample_rate, num_samples, device)
        signal = signal.unsqueeze(0).to(device)   # add batch dimension
        outputs = model(signal)
        predicted_idx = torch.argmax(outputs, dim=1).item()

    if not class_mapping:
        return predicted_idx

    # JSON object keys are always strings, so try the string form first and
    # fall back to the raw int for int-keyed dicts and plain sequences.
    try:
        return class_mapping[str(predicted_idx)]
    except (KeyError, TypeError):
        return class_mapping[predicted_idx]

In [29]:
# Mel spectrogram front-end. It is placed on DEVICE because the waveform is
# moved to DEVICE before the transform is applied; leaving the transform on the
# CPU fails with a device mismatch whenever DEVICE is a GPU.
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64,
).to(DEVICE)

In [30]:
# Build the network without the ImageNet weights: the strict load_state_dict
# call below replaces every parameter and buffer, so fetching the pretrained
# weights first would only cost a download.
model = EfficientNetCustom(num_classes=50, pretrained=False).to(DEVICE)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (consider weights_only=True where the installed torch has it).
model.load_state_dict(torch.load("/content/audioModelDLNew3.pth", map_location=DEVICE))



<All keys matched successfully>

In [33]:
import json

# Index -> label lookup. JSON text is UTF-8, so the encoding is given
# explicitly instead of relying on the platform default.
with open("mapping.json", "r", encoding="utf-8") as f:
    class_mapping = json.load(f)

# Classify a single clip with the loaded model and the shared front-end settings.
prediction = predict_single_audio(
    model=model,
    file_path="/content/1-100038-A-14.wav",
    transformation=mel_spectrogram,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=DEVICE,
    class_mapping=class_mapping,
)

print("Predicted class:", prediction)


Predicted class: chirping_birds
