In [None]:
import torch
import torchaudio
import pandas as pd
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import os


# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Root folder that holds the per-fold audio sub-directories, and the metadata CSV
# that lists every clip.
audio_dir = r"C:\Users\Nikhil Pathak\OneDrive\Desktop\Deep Learning\Urban-sound-model\UrbanSound8K\audio"
annotation = r"C:\Users\Nikhil Pathak\OneDrive\Desktop\Deep Learning\Urban-sound-model\UrbanSound8K.csv"

ValueError: mount failed

In [3]:
class AudioDataset(Dataset):
    """Dataset that turns clips listed in an annotation CSV into ``(features, label)`` pairs.

    Each item is loaded with ``torchaudio.load``, moved to ``device``, resampled to
    ``target_sampling_rate`` when its native rate differs, averaged down to a single
    channel, truncated or zero-padded on the right to exactly ``num_samples`` samples,
    and finally passed through ``transformation``.

    The annotation table is addressed by column position: column 0 is the file name,
    column 5 the fold number and column 6 the label.
    """

    def __init__(self, annotation, audio_dir, transformation, target_sampling_rate, num_samples, device):
        """Read the annotation CSV and store the preprocessing configuration.

        Args:
            annotation: Path to the CSV file describing the clips.
            audio_dir: Directory containing the ``fold<N>`` sub-directories.
            transformation: Module applied to the fixed-length waveform; it is moved
                to ``device`` here.
            target_sampling_rate: Sampling rate every clip is resampled to.
            num_samples: Number of samples every clip is cut / padded to.
            device: Device on which the waveform is processed.
        """
        self.annotation = pd.read_csv(annotation)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sampling_rate = target_sampling_rate
        self.num_samples = num_samples
        # One resampler per source rate, built lazily. Constructing a Resample module
        # (and moving it to the device) for every single item is wasted work because
        # it only depends on the (source rate, target rate) pair.
        self._resamplers = {}

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return len(self.annotation)

    def __getitem__(self, index):
        """Load, normalise and transform the clip at row ``index``.

        Returns:
            A ``(signal, label)`` tuple where ``signal`` is the output of
            ``self.transformation`` and ``label`` is the value in column 6.
        """
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        return signal, label

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to the target rate; no-op when they already match."""
        if sr != self.target_sampling_rate:
            resampler = self._resamplers.get(sr)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(sr, self.target_sampling_rate)
                resampler = resampler.to(self.device)
                self._resamplers[sr] = resampler
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average over dimension 0 when it holds more than one entry, keeping the dimension."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _cut_if_necessary(self, signal):
        """Keep only the first ``num_samples`` entries along dimension 1 when the clip is longer."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad the end of the last dimension up to ``num_samples`` when the clip is shorter."""
        if signal.shape[1] < self.num_samples:
            num_missing_samples = self.num_samples - signal.shape[1]
            # (left, right) padding for the last dimension only.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _get_audio_sample_path(self, index):
        """Build ``<audio_dir>/fold<column 5>/<column 0>`` for row ``index``."""
        fold = f"fold{self.annotation.iloc[index, 5]}"
        path = os.path.join(self.audio_dir, fold, self.annotation.iloc[index, 0])
        return path

    def _get_audio_sample_label(self, index):
        """Return the label stored in column 6 of row ``index``."""
        return self.annotation.iloc[index, 6]


In [4]:
class SpecAugment(nn.Module):
    """Apply repeated frequency and time masking to a spectrogram.

    Args:
        freq_mask_param: Argument forwarded to ``torchaudio.transforms.FrequencyMasking``.
        time_mask_param: Argument forwarded to ``torchaudio.transforms.TimeMasking``.
        num_masks: How many (frequency mask, time mask) rounds ``forward`` performs.
    """

    def __init__(self, freq_mask_param=15, time_mask_param=35, num_masks=2):
        super().__init__()
        self.num_masks = num_masks
        self.freq_masking = torchaudio.transforms.FrequencyMasking(freq_mask_param)
        self.time_masking = torchaudio.transforms.TimeMasking(time_mask_param)

    def forward(self, spec):
        """Return ``spec`` after ``num_masks`` rounds of frequency-then-time masking."""
        masked = spec
        for _round in range(self.num_masks):
            # Frequency mask first, then time mask, on the result of the previous round.
            masked = self.time_masking(self.freq_masking(masked))
        return masked

In [7]:
# Every clip is resampled to `sample_rate` and cut / padded to `num_samples` samples
# (22050 samples at 22050 Hz, i.e. one second of audio).
sample_rate = 22050
num_samples = 22050

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)

usd = AudioDataset(annotation, audio_dir, mel_spectrogram, sample_rate, num_samples, device)

# 87 % / 13 % train / validation split.
train_size = int(0.87 * len(usd))
val_size = len(usd) - train_size

# Seed the split so that a kernel restart reproduces the same partition. Without a
# fixed generator, a checkpoint reloaded in a later session would be validated on a
# different subset that may include clips it was trained on.
split_seed = 42
train_dataset, val_dataset = random_split(
    usd,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

batch_size = 64
# num_workers=0 keeps loading in the main process; AudioDataset moves tensors to
# `device` inside __getitem__.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
augment = SpecAugment(freq_mask_param=15, time_mask_param=35, num_masks=2).to(device)

print(f"Total samples: {len(usd)}")
print(f"Train samples: {train_size}")
print(f"Validation samples: {val_size}")

Total samples: 8732
Train samples: 7596
Validation samples: 1136


In [9]:
from Base_model import AttentionResNet, LightAttentionResNet

# Optimisation hyper-parameters.
num_epochs = 25
learning_rate = 0.001
weight_decay = 1e-4

# Lightweight variant is used here; the commented line swaps in the larger network.
# model = AttentionResNet(num_classes=10).to(device)  # Use this for better accuracy
model = LightAttentionResNet(num_classes=10).to(device)

param_count = sum(weights.numel() for weights in model.parameters())
print(f"\nModel Parameters: {param_count:,}")

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

# Halve the learning rate once the monitored value (stepped with the validation loss
# in the training loop) has not decreased for 5 consecutive epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
)


Model Parameters: 2,814,134


In [None]:
def train_epoch(model, dataloader, loss_fn, optimizer, augment):
    """Run one optimisation pass over ``dataloader`` and report loss and accuracy.

    Args:
        model: Network to train; switched to training mode.
        dataloader: Iterable of ``(inputs, targets)`` batches that also supports ``len``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
            The epoch loss assumes it is the mean over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        optimizer: Optimizer stepping ``model``'s parameters.
        augment: Callable applied to each input batch before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: the mean per-sample loss and the accuracy in percent.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()

    # Run on the device the model lives on instead of depending on a notebook-level
    # global, and move the inputs too so the function does not rely on the dataset
    # having placed them there already.
    first_param = next(iter(model.parameters()), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum = 0.0
    correct = 0
    total = 0

    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(run_device)
        targets = targets.to(run_device)

        # Apply augmentation
        inputs = augment(inputs)

        # Forward pass
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Statistics. Weight each batch loss by its size so that a short final batch
        # does not count as much as a full one.
        batch_loss = loss.item()
        batch_count = targets.size(0)
        loss_sum += batch_loss * batch_count
        predicted = outputs.detach().argmax(dim=1)
        total += batch_count
        correct += (predicted == targets).sum().item()

        # Print progress every 50 batches
        if (batch_idx + 1) % 50 == 0:
            print(f"  Batch [{batch_idx+1}/{len(dataloader)}] Loss: {batch_loss:.4f}")

    if total == 0:
        raise ValueError("train_epoch received a dataloader that yielded no samples")

    epoch_loss = loss_sum / total
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc

In [None]:

def validate(model, dataloader, loss_fn):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        dataloader: Iterable of ``(inputs, targets)`` batches.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
            The epoch loss assumes it is the mean over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).

    Returns:
        ``(epoch_loss, epoch_acc)``: the mean per-sample loss and the accuracy in percent.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()

    # Run on the device the model lives on instead of depending on a notebook-level
    # global, and move the inputs too so the function does not rely on the dataset
    # having placed them there already.
    first_param = next(iter(model.parameters()), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(run_device)
            targets = targets.to(run_device)
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            # Weight each batch loss by its size so that a short final batch does not
            # count as much as a full one.
            batch_count = targets.size(0)
            loss_sum += loss.item() * batch_count
            predicted = outputs.argmax(dim=1)
            total += batch_count
            correct += (predicted == targets).sum().item()

    if total == 0:
        raise ValueError("validate received a dataloader that yielded no samples")

    epoch_loss = loss_sum / total
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc


In [None]:
print("\n" + "="*60)
print("Starting Training...")
print("="*60 + "\n")

# Keep the Windows path in raw strings and interpolate the variable when printing.
# Writing the path directly inside a non-raw f-string makes Python parse "\U..." and
# "\N..." as unicode escapes, which is a SyntaxError that stops the whole cell.
model_dir = r"C:\Users\Nikhil Pathak\OneDrive\Desktop\Deep Learning\Urban-sound-model"
checkpoint_path = model_dir + "/best_attention_resnet.pth"

best_val_acc = 0.0
patience_counter = 0          # epochs since the validation accuracy last improved
early_stop_patience = 15      # stop after this many epochs without improvement

for epoch in range(num_epochs):
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print("-" * 60)

    # Train
    train_loss, train_acc = train_epoch(model, train_loader, loss_fn, optimizer, augment)
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")

    # Validate
    val_loss, val_acc = validate(model, val_loader, loss_fn)
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    # Learning rate scheduling (the plateau scheduler is driven by the validation loss)
    scheduler.step(val_loss)
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Learning Rate: {current_lr:.6f}")

    # Save best model: checkpoint whenever the validation accuracy sets a new record.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_acc,
            'val_loss': val_loss,
        }, checkpoint_path)
        print(f"✓ Best model saved! (Val Acc: {val_acc:.2f}%)")
        patience_counter = 0
    else:
        patience_counter += 1

    # Early stopping
    if patience_counter >= early_stop_patience:
        print(f"\nEarly stopping triggered after {epoch+1} epochs")
        break

    print()

print("\n" + "="*60)
print("Training Completed!")
print("="*60)
print(f"Best Validation Accuracy: {best_val_acc:.2f}%")
print(f"Model saved to: {model_dir}")

In [None]:
print("\nLoading best model for final evaluation...")

# Load the checkpoint file that the training loop writes (best_attention_resnet.pth),
# so that "best model" really is the one saved during training.
# map_location lets a checkpoint saved on a GPU be restored on whatever `device`
# this session selected.
best_checkpoint_path = r"C:\Users\Nikhil Pathak\OneDrive\Desktop\Deep Learning\Urban-sound-model/best_attention_resnet.pth"
checkpoint = torch.load(best_checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

final_val_loss, final_val_acc = validate(model, val_loader, loss_fn)
print("\nFinal Validation Results:")
print(f"Loss: {final_val_loss:.4f} | Accuracy: {final_val_acc:.2f}%")