In [5]:
import torch
import librosa
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as Models

class AudioDenseNet(nn.Module):
    """DenseNet-121 classifier adapted to single-channel spectrogram input.

    The torchvision DenseNet-121 feature extractor is reused, but its first
    convolution is replaced by a freshly initialised 1-input-channel layer.
    The backbone's own ``classifier`` layer is left in place but is not used
    by ``forward``; predictions come from the separate ``self.classifier``
    head. (The unused layer is kept on purpose: its parameters are part of
    this module's ``state_dict``, so dropping it would change the keys
    expected by ``load_state_dict``.)

    Args:
        numClasses: Number of output logits.
        dropoutProb: Dropout probability applied to the pooled features.
            Values ``<= 0`` disable dropout (an ``nn.Identity`` is used).
        pretrained: When true (the default, matching the previous
            behaviour) the backbone starts from torchvision's default
            DenseNet-121 weights. Pass ``False`` to skip loading them, e.g.
            when a full checkpoint is loaded right after construction.
    """

    def __init__(
        self,
        numClasses: int = 50,
        dropoutProb: float = 0.0,
        pretrained: bool = True,
    ) -> None:
        super().__init__()
        weights = Models.DenseNet121_Weights.DEFAULT if pretrained else None
        self.backbone = Models.densenet121(weights=weights)

        # The stock stem expects 3 input channels; spectrograms have one, so
        # the stem is rebuilt and re-initialised rather than reused.
        self.backbone.features.conv0 = nn.Conv2d(
            in_channels=1,
            out_channels=64,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False,
        )
        nn.init.kaiming_normal_(
            self.backbone.features.conv0.weight,
            mode='fan_out',
            nonlinearity='relu',
        )

        # Size the new head from the width of the backbone's original head.
        nfeats = self.backbone.classifier.in_features
        self.dropout = nn.Dropout(p=dropoutProb) if dropoutProb > 0 else nn.Identity()
        self.classifier = nn.Linear(nfeats, numClasses)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of single-channel inputs.

        Args:
            x: Input batch of shape ``(N, 1, H, W)`` (one channel, as
                required by the replaced first convolution).

        Returns:
            Logits of shape ``(N, numClasses)``.
        """
        x = self.backbone.features(x)
        x = F.relu(x, inplace=True)
        # Global average pooling makes the head independent of H and W.
        x = F.adaptive_avg_pool2d(x, (1, 1))
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = self.classifier(x)
        return x

def classify_audio(audio_path, model, classNames, device=None, target_sr=None):
    """Classify a single audio file from its log-mel spectrogram.

    The file is loaded with librosa, converted to a 128-band mel power
    spectrogram (``n_fft=2048``, ``hop_length=512``), expressed in dB
    relative to the clip's own peak, and passed through ``model`` as a
    batch of one single-channel image.

    Side effect: ``model`` is switched to eval mode and left that way.

    Args:
        audio_path: Path of the audio file to classify.
        model: Network mapping a ``(1, 1, n_mels, frames)`` float tensor to
            ``(1, num_classes)`` logits.
        classNames: Sequence of labels indexed by the model's output index.
        device: Device to run inference on. When ``None`` (default), the
            device of the model's first parameter is used, so the input
            always lands where the model lives; if the model has no
            parameters, CUDA is used when available, else the CPU.
        target_sr: Sample rate passed to ``librosa.load``; ``None`` keeps
            the file's native rate.
            NOTE(review): presumably this should match the rate used when
            the model was trained - confirm.

    Returns:
        A tuple ``(pred_class, pred_prob, probs)``: the predicted label, its
        softmax probability, and the 1-D array of all class probabilities.
    """
    if device is None:
        # Follow the model instead of a device fixed at import time, which
        # could disagree with where the model's weights actually are.
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device
        else:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    signal, sr = librosa.load(audio_path, sr=target_sr)
    mel = librosa.feature.melspectrogram(
        y=signal,
        sr=sr,
        n_fft=2048,
        hop_length=512,
        n_mels=128
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # Add batch and channel axes: (n_mels, frames) -> (1, 1, n_mels, frames).
    mel_tensor = torch.tensor(mel_db, dtype=torch.float).unsqueeze(0).unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(mel_tensor)
        # Index the batch axis rather than squeeze(): squeeze() would also
        # drop the class axis for a single-class model and break indexing.
        probs = F.softmax(logits, dim=1)[0].cpu().numpy()
        pred_idx = int(np.argmax(probs))

    pred_class = classNames[pred_idx]
    return pred_class, probs[pred_idx], probs

# Labels indexed by the model's output index (50 entries, listed
# alphabetically).
# NOTE(review): presumably this order matches the label encoding used when
# the checkpoint was trained - confirm against the training pipeline.
classNames = [
    "airplane",
    "breathing", "brushing_teeth",
    "can_opening", "car_horn", "cat", "chainsaw", "chirping_birds",
    "church_bells", "clapping", "clock_alarm", "clock_tick", "coughing",
    "cow", "crackling_fire", "crickets", "crow", "crying_baby",
    "dog", "door_wood_creaks", "door_wood_knock", "drinking_sipping",
    "engine",
    "fireworks", "footsteps", "frog",
    "glass_breaking",
    "hand_saw", "helicopter", "hen",
    "insects",
    "keyboard_typing",
    "laughing",
    "mouse_click",
    "pig", "pouring_water",
    "rain", "rooster",
    "sea_waves", "sheep", "siren", "sneezing", "snoring",
    "thunderstorm", "toilet_flush", "train",
    "vacuum_cleaner",
    "washing_machine", "water_drops", "wind",
]

# Fall back to the CPU when CUDA is unavailable so this cell also runs on
# CPU-only machines instead of failing when the model is moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioDenseNet(numClasses=len(classNames), dropoutProb=0.2).to(device)
weights_path = r"C:\Users\clash\Downloads\mkc\Sequence Match\SequenceMatch_ESC50_17_PerClass_SeqMatch.pth"
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source (or pass weights_only=True on PyTorch versions that
# support it). map_location keeps the load working when the checkpoint was
# saved from a different device.
model.load_state_dict(torch.load(weights_path, map_location=device))
audio_path = r"C:\Users\clash\Downloads\mkc\1.wav"
pred_class, prob, all_probs = classify_audio(audio_path, model, classNames, device)
print("Predicted Class:", pred_class)

Predicted Class: cat
