In [None]:
import os
import random
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torchaudio.transforms as T

import timm
from torch_audiomentations import Compose, Gain, AddColoredNoise, PitchShift, Shift

# --- Plotting defaults ---
# Seaborn grid theme first, then a wide/short default canvas for every figure.
sns.set_style("whitegrid")
plt.rc('figure', figsize=(10, 4))

class CFG:
    """Single namespace holding every experiment setting as a class attribute."""

    # ---- Reproducibility / hardware ----
    SEED = 42
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ---- Dataset layout ----
    # IMPORTANT: point DATA_PATH at your local copy of the dataset.
    DATA_PATH = r"C:\Users\sinha\Downloads\tsda_dataset-20251015T052020Z-1-001\tsda_dataset"
    AUDIO_PATH = os.path.join(DATA_PATH, "audio")
    META_PATH = os.path.join(DATA_PATH, "meta/sound_50.csv")

    # ---- Audio front-end ----
    SAMPLING_RATE = 44100
    N_FFT = 2048
    WIN_LENGTH = 1024
    HOP_LENGTH = 512
    N_MELS = 128  # spectrograms are resized to TARGET_SIZE afterwards

    # ---- Backbone ----
    MODEL_NAME = 'efficientnet_b2'
    TARGET_SIZE = 260  # side length of the square image fed to the model
    NUM_CLASSES = 50

    # ---- Optimisation ----
    BATCH_SIZE = 32  # reduce this if you hit out-of-memory errors
    EPOCHS = 50
    LEARNING_RATE = 3e-4
    WEIGHT_DECAY = 1e-2

    # ---- Mixup ----
    MIXUP_ALPHA = 0.4  # Beta(alpha, alpha) parameter; 0.0 disables mixing

    # ---- Diagnostics ----
    # When True, an extra forward pass per batch measures accuracy on the
    # un-mixed inputs (slower).
    CLEAN_TRAIN_ACC = False

def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator below.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # CPU-side generators all share the same seed.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # GPU generators only exist when CUDA is usable.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Trade cuDNN autotuning for repeatable kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed every RNG before any stochastic object is created.
set_seed(CFG.SEED)
print(f"Using device: {CFG.DEVICE}")
print(f"Using model: {CFG.MODEL_NAME}")

# --- Spectrogram Transforms ---

# Stage 1: waveform -> mel spectrogram.
mel_transform = T.MelSpectrogram(
    n_mels=CFG.N_MELS,
    hop_length=CFG.HOP_LENGTH,
    win_length=CFG.WIN_LENGTH,
    n_fft=CFG.N_FFT,
    sample_rate=CFG.SAMPLING_RATE,
)
# Stage 2: mel spectrogram -> decibel scale.
db_transform = T.AmplitudeToDB()

# Base feature pipeline (stage 1 followed by stage 2).
spec_transform = nn.Sequential(mel_transform, db_transform).to(CFG.DEVICE)

# --- Delta Transform ---
# Applied once for the delta channel and a second time for delta-delta.
delta_transform = T.ComputeDeltas().to(CFG.DEVICE)

# --- SpecAugment ---
# The mask parameters are upper bounds on the masked width:
# up to 24 of the 128 mel bins and up to 50 time frames.
spec_augmenter = nn.Sequential(
    T.FrequencyMasking(freq_mask_param=24),
    T.TimeMasking(time_mask_param=50),
).to(CFG.DEVICE)


# --- Check Dataset Paths ---
try:
    audio_files = os.listdir(CFG.AUDIO_PATH)
    print("Found {} audio files in {}".format(len(audio_files), CFG.AUDIO_PATH))
    meta_files = os.listdir(os.path.join(CFG.DATA_PATH, "meta"))
    print("Found metadata files: {}".format(meta_files))
except FileNotFoundError:
    # Report only; execution continues to the metadata load below.
    print("Error: Dataset not found. Please check the DATA_PATH in the CFG class.")
    print("Current DATA_PATH: {}".format(CFG.DATA_PATH))

# Metadata table: the rest of the notebook reads its fold / filename / target columns.
df = pd.read_csv(CFG.META_PATH)
print("Metadata DataFrame Head:", df.head(), sep="\n")


def get_model(num_classes=CFG.NUM_CLASSES, pretrained=True):
    """
    Build the timm backbone named by ``CFG.MODEL_NAME``.

    The input-channel count is left at timm's default because the dataset
    feeds a 3-channel image (Mel, Delta, Delta-Delta).

    Args:
        num_classes: Size of the classification head.
        pretrained: Forwarded to ``timm.create_model``; pass False to skip
            loading pre-trained weights.

    Returns:
        The model object created by timm.
    """
    create_kwargs = {"pretrained": pretrained, "num_classes": num_classes}
    return timm.create_model(CFG.MODEL_NAME, **create_kwargs)

# Smoke test: build the model once so a bad model name fails before training starts.
model_check = get_model(CFG.NUM_CLASSES)
print("\nUsing Pre-trained {} Model Architecture.".format(CFG.MODEL_NAME))
# print(model_check.default_cfg) # Uncomment to check default config


# --- UPDATED: Dataset Class ---
class ESC50Dataset(Dataset):
    """Fold-based audio dataset yielding 3-channel spectrogram images.

    Each item is ``(features, label)`` where ``features`` stacks the
    spectrogram, its delta and its delta-delta, resized to
    ``(CFG.TARGET_SIZE, CFG.TARGET_SIZE)``, and ``label`` is a long tensor
    built from the row's ``target`` column.

    Args:
        df: Metadata frame; the ``fold``, ``filename`` and ``target`` columns are read.
        data_path: Directory that each ``filename`` is joined onto.
        fold_to_exclude: Held-out fold. With ``is_train=True`` every other
            fold is kept; with ``is_train=False`` only this fold is kept.
        is_train: Selects the split and enables both augmentation stages.
        transform: Module turning a 1-D waveform into a 2-D spectrogram.
            Required by ``__getitem__`` despite the ``None`` default.
        delta_transform: Module computing deltas of a spectrogram.
            Required by ``__getitem__`` despite the ``None`` default.
        waveform_augmentations: Optional callable applied to the raw
            waveform (training only).
        spec_augmenter: Optional module applied to the spectrogram
            (training only).

    Note:
        ``nn.Module.to`` modifies a module in place, so the transform objects
        passed in end up on the CPU for every holder of the same object.
    """

    def __init__(self, df, data_path, fold_to_exclude, is_train=True,
                 transform=None,
                 delta_transform=None,
                 waveform_augmentations=None,
                 spec_augmenter=None):

        self.data_path = data_path
        if is_train:
            fold_mask = df['fold'] != fold_to_exclude
        else:
            fold_mask = df['fold'] == fold_to_exclude
        self.df = df[fold_mask].reset_index(drop=True)

        # Feature extraction happens on the CPU inside __getitem__.
        self.transform = transform.to("cpu") if transform is not None else None
        self.delta_transform = delta_transform.to("cpu") if delta_transform is not None else None
        self.waveform_augmentations = waveform_augmentations
        # Explicit None test (not truthiness): nn.Sequential defines __len__,
        # so an empty pipeline would otherwise be treated as "no augmenter".
        self.spec_augmenter = spec_augmenter.to("cpu") if spec_augmenter is not None else None
        self.is_train = is_train
        self.target_length = 5 * CFG.SAMPLING_RATE  # 5 seconds
        self.target_size = (CFG.TARGET_SIZE, CFG.TARGET_SIZE)
        # One resampler per source rate, built lazily and reused across items.
        self._resamplers = {}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        waveform = self._load_mono(os.path.join(self.data_path, row['filename']))

        # 1. Waveform augmentations (training only)
        if self.is_train and self.waveform_augmentations is not None:
            waveform = self._augment_waveform(waveform)

        # 2. Pad or truncate to the fixed clip length
        waveform = self._fix_length(waveform)

        # 3-7. Spectrogram, SpecAugment, deltas, stacking and resizing
        features = self._featurize(waveform)

        label = torch.tensor(row['target'], dtype=torch.long)
        return features, label

    def _load_mono(self, file_path):
        """Load a file as a float32 ``[1, time]`` waveform at ``CFG.SAMPLING_RATE``."""
        waveform, sr = torchaudio.load(file_path)  # waveform: [channels, time]
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        if sr != CFG.SAMPLING_RATE:
            resampler = self._resamplers.get(sr)
            if resampler is None:
                resampler = T.Resample(sr, CFG.SAMPLING_RATE)
                self._resamplers[sr] = resampler
            waveform = resampler(waveform)

        # Down-mix multi-channel audio by averaging the channels.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        return waveform.to(dtype=torch.float32)

    def _augment_waveform(self, waveform):
        """Apply ``waveform_augmentations`` and return a float32 waveform."""
        added_batch = False
        if waveform.dim() == 2:
            # The augmenter is called on a batched [1, channels, time] tensor.
            waveform = waveform.unsqueeze(0)
            added_batch = True

        # Keyword call first; fall back to positional arguments for callables
        # that do not accept these keyword names.
        try:
            augmented = self.waveform_augmentations(samples=waveform, sample_rate=CFG.SAMPLING_RATE)
        except TypeError:
            augmented = self.waveform_augmentations(waveform, CFG.SAMPLING_RATE)

        # Some configurations return a dict-like object instead of a tensor.
        if isinstance(augmented, dict):
            augmented = augmented.get('samples', augmented.get('augmented_samples'))

        if added_batch:
            augmented = augmented.squeeze(0)

        return augmented.to(dtype=torch.float32)

    def _fix_length(self, waveform):
        """Right-pad with zeros or truncate so the clip has ``target_length`` samples."""
        n_samples = waveform.shape[1]
        if n_samples < self.target_length:
            pad_amount = self.target_length - n_samples
            return torch.nn.functional.pad(waveform, (0, pad_amount))
        return waveform[:, : self.target_length]

    def _featurize(self, waveform):
        """Turn a ``[1, time]`` waveform into a ``[3, H, W]`` feature image."""
        # Channel 1: spectrogram of the squeezed (1-D) waveform -> [n_mels, n_frames]
        spectrogram = self.transform(waveform.squeeze(0))

        # SpecAugment runs before the deltas, so the deltas see the masked spectrogram.
        if self.is_train and self.spec_augmenter is not None:
            spectrogram = self.spec_augmenter(spectrogram)

        # Channels 2 & 3: delta and delta-delta
        spec_delta = self.delta_transform(spectrogram)
        spec_delta_delta = self.delta_transform(spec_delta)

        # torch.stack creates the new leading channel axis -> [3, n_mels, n_frames]
        stacked_spec = torch.stack([spectrogram, spec_delta, spec_delta_delta], dim=0)

        # F.interpolate needs a batch axis; add it, resize, then drop it again.
        resized_spec = F.interpolate(
            stacked_spec.unsqueeze(0), size=self.target_size, mode='bilinear', align_corners=False
        )
        return resized_spec.squeeze(0)


# --- Waveform Augmentations ---
# Waveform-level augmentation chain; each transform fires independently with
# its own probability `p`.
waveform_augmenter = Compose(
    transforms=[
        Gain(min_gain_in_db=-12.0, max_gain_in_db=12.0, p=0.5, output_type='tensor'),
        AddColoredNoise(min_snr_in_db=6.0, max_snr_in_db=20.0, p=0.45, output_type='tensor'),
        PitchShift(
            min_transpose_semitones=-2,
            max_transpose_semitones=2,
            p=0.35,
            sample_rate=CFG.SAMPLING_RATE,
            output_type='tensor'
        ),
        Shift(min_shift=-0.2, max_shift=0.2, p=0.35, output_type='tensor'),
    ],
    # Set on the Compose itself as well: leaving it out triggers the library's
    # output_type deprecation warning and leaves the return type dependent on
    # the installed version's default.
    output_type='tensor',
)

# --- NEW: Mixup Functions ---

def mixup_data(x, y, alpha=0.4, device=None):
    '''Blend each sample with a randomly chosen partner from the same batch.

    Args:
        x: Batch of inputs; mixing happens along dimension 0.
        y: Targets aligned with ``x``.
        alpha: Beta(alpha, alpha) parameter; values <= 0 give ``lam = 1.0``.
        device: Device for the permutation index (defaults to ``x.device``).

    Returns:
        ``(mixed_x, y_a, y_b, lam)``: blended inputs, original targets,
        partner targets and the mixing coefficient.
    '''
    target_device = x.device if device is None else device

    # One coefficient is drawn per batch, not per sample.
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0

    perm = torch.randperm(x.size(0)).to(target_device)
    partner_x = x[perm, :]

    mixed_x = lam * x + (1.0 - lam) * partner_x
    return mixed_x, y, y[perm], lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    '''Return the lam-weighted blend of the loss against both target sets.

    ``criterion`` is called as ``criterion(pred, target)`` once per target set.
    '''
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1.0 - lam) * loss_b


# --- UPDATED: Training & Validation Functions ---

def train_one_epoch(model, train_loader, criterion, optimizer, device, spec_augmenter=None):
    """Run one mixup training epoch.

    Args:
        model: Network to optimise.
        train_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss callable, invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        spec_augmenter: Unused here; kept so existing call sites that pass it
            keep working.

    Returns:
        ``(avg_loss, accuracy, clean_acc)``. ``accuracy`` is mixup-aware: a
        prediction earns ``lam`` for matching the first target and
        ``1 - lam`` for matching the second. ``clean_acc`` is the plain
        accuracy on un-mixed inputs, or ``None`` unless
        ``CFG.CLEAN_TRAIN_ACC`` is enabled.
    """
    model.train()
    total_loss = 0.0
    correct_predictions = 0.0  # float because we accumulate weighted counts
    total_samples = 0
    clean_correct = 0
    clean_total = 0

    for inputs, labels in tqdm(train_loader, desc="Training", leave=False):
        inputs, labels = inputs.to(device), labels.to(device)

        # The mixed batch gets its own name so the un-mixed `inputs` remain
        # available for the optional clean-accuracy pass without a copy.
        mixed_inputs, labels_a, labels_b, lam = mixup_data(
            inputs, labels, CFG.MIXUP_ALPHA, device=device
        )

        optimizer.zero_grad()
        outputs = model(mixed_inputs)
        loss = mixup_criterion(criterion, outputs, labels_a, labels_b, lam)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size

        # Weighted correctness: lam*match(y_a) + (1-lam)*match(y_b)
        _, predicted = torch.max(outputs.detach(), 1)
        correct_predictions += (lam * (predicted == labels_a).float()).sum().item()
        correct_predictions += ((1.0 - lam) * (predicted == labels_b).float()).sum().item()

        if CFG.CLEAN_TRAIN_ACC:
            # Measure in eval mode: a train-mode forward would update
            # BatchNorm running statistics and keep dropout active, so a
            # metric-only pass would alter the model being trained.
            model.eval()
            try:
                with torch.no_grad():
                    out_clean = model(inputs)
                    _, pred_clean = out_clean.max(1)
                    clean_correct += (pred_clean == labels).sum().item()
                    clean_total += labels.size(0)
            finally:
                model.train()

    avg_loss = total_loss / total_samples
    accuracy = correct_predictions / total_samples
    clean_acc = (clean_correct / clean_total) if (CFG.CLEAN_TRAIN_ACC and clean_total > 0) else None
    return avg_loss, accuracy, clean_acc

def validate_one_epoch(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without gradient tracking.

    Args:
        model: Network to evaluate; switched to eval mode here.
        val_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss callable, invoked as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, accuracy)`` averaged over all samples seen.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for batch_x, batch_y in tqdm(val_loader, desc="Validation", leave=False):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)  # plain loss, no mixup

            # Weight the batch loss by batch size so the final average is per sample.
            loss_sum += batch_loss.item() * batch_x.size(0)
            _, predicted = logits.max(1)
            n_seen += batch_y.size(0)
            n_correct += (predicted == batch_y).sum().item()

    return loss_sum / n_seen, n_correct / n_seen


# --- Main Cross-Validation Loop ---
# Best validation accuracy reached in each fold.
fold_accuracies = []

for fold in range(1, 6):
    print(f"\n===== FOLD {fold} =====")

    # Training split: every fold except `fold`, with both augmentation stages.
    train_dataset = ESC50Dataset(
        df, CFG.AUDIO_PATH, fold_to_exclude=fold, is_train=True,
        transform=spec_transform,
        delta_transform=delta_transform,
        waveform_augmentations=waveform_augmenter,
        spec_augmenter=spec_augmenter,
    )

    # Validation split: only `fold`, no augmentation.
    val_dataset = ESC50Dataset(
        df, CFG.AUDIO_PATH, fold_to_exclude=fold, is_train=False,
        transform=spec_transform,
        delta_transform=delta_transform,
        spec_augmenter=None,
    )

    loader_kwargs = dict(batch_size=CFG.BATCH_SIZE, num_workers=0, pin_memory=True)
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

    # Fresh model, loss, optimizer and schedule for every fold.
    model = get_model(num_classes=CFG.NUM_CLASSES).to(CFG.DEVICE)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=CFG.LEARNING_RATE, weight_decay=CFG.WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=CFG.EPOCHS, eta_min=1e-6)

    best_val_accuracy = 0.0
    best_model_path = f"best_model_fold_{fold}.pth"

    for epoch in range(CFG.EPOCHS):
        # SpecAugment is applied inside the Dataset, so no augmenter is passed here.
        train_loss, train_acc, train_clean_acc = train_one_epoch(model, train_loader, criterion, optimizer, CFG.DEVICE)
        val_loss, val_acc = validate_one_epoch(model, val_loader, criterion, CFG.DEVICE)

        scheduler.step()

        # The clean-accuracy segment appears only when that metric was computed.
        clean_part = ""
        if CFG.CLEAN_TRAIN_ACC and train_clean_acc is not None:
            clean_part = f", Train Clean Acc: {train_clean_acc:.4f}"
        print(
            f"Epoch {epoch+1}/{CFG.EPOCHS} | Train Loss: {train_loss:.4f}, "
            f"Train Acc (mixup-aware): {train_acc:.4f}{clean_part} | "
            f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"
        )

        # Checkpoint only on a strict improvement in validation accuracy.
        if val_acc > best_val_accuracy:
            best_val_accuracy = val_acc
            torch.save(model.state_dict(), best_model_path)
            print(f"  -> New best validation accuracy: {best_val_accuracy:.4f}. Model saved.")

    fold_accuracies.append(best_val_accuracy)
    print(f"Best validation accuracy for fold {fold}: {best_val_accuracy:.4f}")

print("\n===================================")
print("Cross-Validation Complete.")
print(f"All fold accuracies: {fold_accuracies}")
print(f"Average CV Accuracy: {np.mean(fold_accuracies):.4f} +/- {np.std(fold_accuracies):.4f}")
print("===================================")


Using device: cuda
Using model: efficientnet_b2
Found 2000 audio files in C:\Users\sinha\Downloads\tsda_dataset-20251015T052020Z-1-001\tsda_dataset\audio
Found metadata files: ['sound_50.csv', 'sound_human_annotations.xlsx']
Metadata DataFrame Head:
            filename  fold  target        category  esc10  src_file take
0   1-100032-A-0.wav     1       0             dog   True    100032    A
1  1-100038-A-14.wav     1      14  chirping_birds  False    100038    A
2  1-100210-A-36.wav     1      36  vacuum_cleaner  False    100210    A
3  1-100210-B-36.wav     1      36  vacuum_cleaner  False    100210    B
4  1-101296-A-19.wav     1      19    thunderstorm  False    101296    A

Using Pre-trained efficientnet_b2 Model Architecture.

===== FOLD 1 =====


  >>> augment = Compose(..., output_type='dict')
  >>> augmented_samples = augment(samples).samples


Training:   0%|          | 0/50 [00:00<?, ?it/s]



Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 1/50 | Train Loss: 3.4015, Train Acc (mixup-aware): 0.1987 | Val Loss: 2.1569, Val Acc: 0.5325
  -> New best validation accuracy: 0.5325. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 2/50 | Train Loss: 2.3147, Train Acc (mixup-aware): 0.5193 | Val Loss: 1.6882, Val Acc: 0.6600
  -> New best validation accuracy: 0.6600. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 3/50 | Train Loss: 2.1318, Train Acc (mixup-aware): 0.5907 | Val Loss: 1.5104, Val Acc: 0.7125
  -> New best validation accuracy: 0.7125. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 4/50 | Train Loss: 1.7110, Train Acc (mixup-aware): 0.7323 | Val Loss: 1.3901, Val Acc: 0.7850
  -> New best validation accuracy: 0.7850. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 5/50 | Train Loss: 1.9469, Train Acc (mixup-aware): 0.6852 | Val Loss: 1.3817, Val Acc: 0.8050
  -> New best validation accuracy: 0.8050. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 6/50 | Train Loss: 1.7724, Train Acc (mixup-aware): 0.7386 | Val Loss: 1.2712, Val Acc: 0.8250
  -> New best validation accuracy: 0.8250. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 7/50 | Train Loss: 1.8247, Train Acc (mixup-aware): 0.7192 | Val Loss: 1.3269, Val Acc: 0.8200


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 8/50 | Train Loss: 1.5666, Train Acc (mixup-aware): 0.7966 | Val Loss: 1.2728, Val Acc: 0.8375
  -> New best validation accuracy: 0.8375. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 9/50 | Train Loss: 1.6808, Train Acc (mixup-aware): 0.7723 | Val Loss: 1.2858, Val Acc: 0.8375


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 10/50 | Train Loss: 1.6130, Train Acc (mixup-aware): 0.7851 | Val Loss: 1.2364, Val Acc: 0.8525
  -> New best validation accuracy: 0.8525. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 11/50 | Train Loss: 1.5209, Train Acc (mixup-aware): 0.8167 | Val Loss: 1.2178, Val Acc: 0.8625
  -> New best validation accuracy: 0.8625. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 12/50 | Train Loss: 1.7199, Train Acc (mixup-aware): 0.7590 | Val Loss: 1.2829, Val Acc: 0.8525


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 13/50 | Train Loss: 1.6392, Train Acc (mixup-aware): 0.7749 | Val Loss: 1.2743, Val Acc: 0.8575


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 14/50 | Train Loss: 1.6398, Train Acc (mixup-aware): 0.7758 | Val Loss: 1.2589, Val Acc: 0.8500


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 15/50 | Train Loss: 1.5011, Train Acc (mixup-aware): 0.8079 | Val Loss: 1.2440, Val Acc: 0.8650
  -> New best validation accuracy: 0.8650. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 16/50 | Train Loss: 1.5511, Train Acc (mixup-aware): 0.7951 | Val Loss: 1.2091, Val Acc: 0.8575


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 17/50 | Train Loss: 1.6193, Train Acc (mixup-aware): 0.7693 | Val Loss: 1.2124, Val Acc: 0.8725
  -> New best validation accuracy: 0.8725. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 18/50 | Train Loss: 1.5033, Train Acc (mixup-aware): 0.8130 | Val Loss: 1.2133, Val Acc: 0.8650


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 19/50 | Train Loss: 1.4615, Train Acc (mixup-aware): 0.8254 | Val Loss: 1.1919, Val Acc: 0.8800
  -> New best validation accuracy: 0.8800. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 20/50 | Train Loss: 1.5092, Train Acc (mixup-aware): 0.8159 | Val Loss: 1.1710, Val Acc: 0.8625


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 21/50 | Train Loss: 1.5014, Train Acc (mixup-aware): 0.8035 | Val Loss: 1.1850, Val Acc: 0.8825
  -> New best validation accuracy: 0.8825. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 22/50 | Train Loss: 1.5190, Train Acc (mixup-aware): 0.7949 | Val Loss: 1.1749, Val Acc: 0.8725


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 23/50 | Train Loss: 1.4188, Train Acc (mixup-aware): 0.8338 | Val Loss: 1.1697, Val Acc: 0.8675


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 24/50 | Train Loss: 1.4093, Train Acc (mixup-aware): 0.8405 | Val Loss: 1.1520, Val Acc: 0.8875
  -> New best validation accuracy: 0.8875. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 25/50 | Train Loss: 1.3739, Train Acc (mixup-aware): 0.8520 | Val Loss: 1.1682, Val Acc: 0.8925
  -> New best validation accuracy: 0.8925. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 26/50 | Train Loss: 1.4581, Train Acc (mixup-aware): 0.8091 | Val Loss: 1.1613, Val Acc: 0.8725


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 27/50 | Train Loss: 1.4780, Train Acc (mixup-aware): 0.7978 | Val Loss: 1.1729, Val Acc: 0.8700


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 28/50 | Train Loss: 1.4196, Train Acc (mixup-aware): 0.8243 | Val Loss: 1.1565, Val Acc: 0.8950
  -> New best validation accuracy: 0.8950. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 29/50 | Train Loss: 1.5003, Train Acc (mixup-aware): 0.8138 | Val Loss: 1.1944, Val Acc: 0.8875


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 30/50 | Train Loss: 1.3987, Train Acc (mixup-aware): 0.8273 | Val Loss: 1.1726, Val Acc: 0.8875


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 31/50 | Train Loss: 1.4714, Train Acc (mixup-aware): 0.8075 | Val Loss: 1.1692, Val Acc: 0.8850


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 32/50 | Train Loss: 1.4363, Train Acc (mixup-aware): 0.8263 | Val Loss: 1.1575, Val Acc: 0.8925


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 33/50 | Train Loss: 1.4346, Train Acc (mixup-aware): 0.8261 | Val Loss: 1.1749, Val Acc: 0.9000
  -> New best validation accuracy: 0.9000. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 34/50 | Train Loss: 1.4016, Train Acc (mixup-aware): 0.8202 | Val Loss: 1.1757, Val Acc: 0.8975


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 35/50 | Train Loss: 1.4438, Train Acc (mixup-aware): 0.8150 | Val Loss: 1.1901, Val Acc: 0.8900


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 36/50 | Train Loss: 1.4147, Train Acc (mixup-aware): 0.8271 | Val Loss: 1.1687, Val Acc: 0.9025
  -> New best validation accuracy: 0.9025. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 37/50 | Train Loss: 1.4763, Train Acc (mixup-aware): 0.7979 | Val Loss: 1.1667, Val Acc: 0.8950


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 38/50 | Train Loss: 1.4927, Train Acc (mixup-aware): 0.7910 | Val Loss: 1.1653, Val Acc: 0.9000


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 39/50 | Train Loss: 1.5098, Train Acc (mixup-aware): 0.7926 | Val Loss: 1.1720, Val Acc: 0.8850


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 40/50 | Train Loss: 1.4504, Train Acc (mixup-aware): 0.8054 | Val Loss: 1.1864, Val Acc: 0.8975


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 41/50 | Train Loss: 1.2601, Train Acc (mixup-aware): 0.8629 | Val Loss: 1.1368, Val Acc: 0.8975


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 42/50 | Train Loss: 1.3387, Train Acc (mixup-aware): 0.8516 | Val Loss: 1.1641, Val Acc: 0.8975


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 43/50 | Train Loss: 1.5209, Train Acc (mixup-aware): 0.7850 | Val Loss: 1.1787, Val Acc: 0.8950


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 44/50 | Train Loss: 1.3265, Train Acc (mixup-aware): 0.8551 | Val Loss: 1.1730, Val Acc: 0.8950


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 45/50 | Train Loss: 1.4052, Train Acc (mixup-aware): 0.8311 | Val Loss: 1.1683, Val Acc: 0.8900


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 46/50 | Train Loss: 1.4371, Train Acc (mixup-aware): 0.8164 | Val Loss: 1.1599, Val Acc: 0.8975


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 47/50 | Train Loss: 1.3526, Train Acc (mixup-aware): 0.8498 | Val Loss: 1.1515, Val Acc: 0.8975


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 48/50 | Train Loss: 1.4477, Train Acc (mixup-aware): 0.7941 | Val Loss: 1.1543, Val Acc: 0.8950


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 49/50 | Train Loss: 1.2801, Train Acc (mixup-aware): 0.8677 | Val Loss: 1.1239, Val Acc: 0.8975


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 50/50 | Train Loss: 1.4573, Train Acc (mixup-aware): 0.8071 | Val Loss: 1.1627, Val Acc: 0.8950
Best validation accuracy for fold 1: 0.9025

===== FOLD 2 =====


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /timm/efficientnet_b2.ra_in1k/resolve/main/model.safetensors (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001A6F66C4050>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 58669c99-0914-4275-a1d6-78d966e6e6ee)')' thrown while requesting HEAD https://huggingface.co/timm/efficientnet_b2.ra_in1k/resolve/main/model.safetensors
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /timm/efficientnet_b2.ra_in1k/resolve/main/model.safetensors (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001A867D03110>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 8bb69ae1-3eda-43d8-ba4e-8f38d7f690aa)')' thrown while requesting HEAD https://huggingface.co/timm/efficient

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 1/50 | Train Loss: 3.3795, Train Acc (mixup-aware): 0.2078 | Val Loss: 2.3526, Val Acc: 0.4500
  -> New best validation accuracy: 0.4500. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 2/50 | Train Loss: 2.4173, Train Acc (mixup-aware): 0.4983 | Val Loss: 1.6877, Val Acc: 0.6950
  -> New best validation accuracy: 0.6950. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 3/50 | Train Loss: 2.0310, Train Acc (mixup-aware): 0.6121 | Val Loss: 1.5717, Val Acc: 0.7175
  -> New best validation accuracy: 0.7175. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 4/50 | Train Loss: 2.0912, Train Acc (mixup-aware): 0.6239 | Val Loss: 1.4210, Val Acc: 0.7825
  -> New best validation accuracy: 0.7825. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 5/50 | Train Loss: 1.7505, Train Acc (mixup-aware): 0.7280 | Val Loss: 1.4171, Val Acc: 0.7750


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 6/50 | Train Loss: 1.6764, Train Acc (mixup-aware): 0.7584 | Val Loss: 1.3557, Val Acc: 0.7950
  -> New best validation accuracy: 0.7950. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 7/50 | Train Loss: 1.8046, Train Acc (mixup-aware): 0.7338 | Val Loss: 1.2901, Val Acc: 0.8550
  -> New best validation accuracy: 0.8550. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 8/50 | Train Loss: 1.6175, Train Acc (mixup-aware): 0.7863 | Val Loss: 1.2642, Val Acc: 0.8275


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 9/50 | Train Loss: 1.7120, Train Acc (mixup-aware): 0.7592 | Val Loss: 1.2940, Val Acc: 0.8375


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 10/50 | Train Loss: 1.6580, Train Acc (mixup-aware): 0.7665 | Val Loss: 1.2881, Val Acc: 0.8500


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 11/50 | Train Loss: 1.6063, Train Acc (mixup-aware): 0.7945 | Val Loss: 1.2958, Val Acc: 0.8250


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 12/50 | Train Loss: 1.4265, Train Acc (mixup-aware): 0.8413 | Val Loss: 1.2350, Val Acc: 0.8650
  -> New best validation accuracy: 0.8650. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 13/50 | Train Loss: 1.6672, Train Acc (mixup-aware): 0.7594 | Val Loss: 1.2834, Val Acc: 0.8500


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 14/50 | Train Loss: 1.7531, Train Acc (mixup-aware): 0.7396 | Val Loss: 1.3007, Val Acc: 0.8600


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 15/50 | Train Loss: 1.6199, Train Acc (mixup-aware): 0.7812 | Val Loss: 1.2755, Val Acc: 0.8550


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 16/50 | Train Loss: 1.4902, Train Acc (mixup-aware): 0.8195 | Val Loss: 1.2188, Val Acc: 0.8600


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 17/50 | Train Loss: 1.4550, Train Acc (mixup-aware): 0.8205 | Val Loss: 1.2044, Val Acc: 0.8700
  -> New best validation accuracy: 0.8700. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 18/50 | Train Loss: 1.4490, Train Acc (mixup-aware): 0.8253 | Val Loss: 1.2248, Val Acc: 0.8550


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 19/50 | Train Loss: 1.6316, Train Acc (mixup-aware): 0.7783 | Val Loss: 1.2646, Val Acc: 0.8500


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 20/50 | Train Loss: 1.4211, Train Acc (mixup-aware): 0.8447 | Val Loss: 1.2185, Val Acc: 0.8600


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 21/50 | Train Loss: 1.4298, Train Acc (mixup-aware): 0.8259 | Val Loss: 1.2077, Val Acc: 0.8625


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 22/50 | Train Loss: 1.5390, Train Acc (mixup-aware): 0.8053 | Val Loss: 1.1983, Val Acc: 0.8750
  -> New best validation accuracy: 0.8750. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 23/50 | Train Loss: 1.5257, Train Acc (mixup-aware): 0.7940 | Val Loss: 1.2221, Val Acc: 0.8700


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 24/50 | Train Loss: 1.4842, Train Acc (mixup-aware): 0.8048 | Val Loss: 1.1682, Val Acc: 0.8725


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Validation:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 25/50 | Train Loss: 1.5313, Train Acc (mixup-aware): 0.8041 | Val Loss: 1.1908, Val Acc: 0.8850
  -> New best validation accuracy: 0.8850. Model saved.


Training:   0%|          | 0/50 [00:00<?, ?it/s]

In [10]:
import os
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torchaudio
import numpy as np
from tqdm.auto import tqdm

# --- CONFIG: update these paths if needed ---
# Raw strings keep Windows backslashes literal, so every separator is written
# exactly once (a raw string with doubled backslashes would embed "\\" pairs
# in the path itself).
TEST_AUDIO_PATH = r"C:\Users\sinha\Downloads\tsda_test_dataset-20251030T122450Z-1-001\tsda_test_dataset\test_set"   # <- change this
MODEL_PATH = r"C:\Users\sinha\notebook\best_model_fold_2.pth"               # <- change if file in different folder
OUTPUT_CSV = "test_predictions2.csv"

# Use same CFG, transforms, and get_model defined earlier in your notebook.
# To avoid re-downloading weights, set pretrained=False when constructing model for inference.
def load_model_for_inference(model_path, device):
    """Build the classifier, restore its weights from disk and return it in eval mode.

    Args:
        model_path: Checkpoint file. Either a bare state dict, or a dict that
            wraps the state dict under the ``"state_dict"`` key.
        device: Device the model is moved to and the checkpoint tensors are
            mapped onto.

    Returns:
        The model produced by ``get_model`` with the checkpoint weights loaded
        and ``eval()`` already applied.
    """
    # pretrained=False: every weight is overwritten by the checkpoint below.
    net = get_model(num_classes=CFG.NUM_CLASSES, pretrained=False).to(device)

    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    checkpoint = torch.load(model_path, map_location=device)
    is_wrapped = isinstance(checkpoint, dict) and "state_dict" in checkpoint
    weights = checkpoint["state_dict"] if is_wrapped else checkpoint

    # Keys saved from a wrapped model (DataParallel or similar) start with
    # "module."; strip it so they match the bare model's parameter names.
    prefix = "module."
    cleaned = {
        (name[len(prefix):] if name.startswith(prefix) else name): tensor
        for name, tensor in weights.items()
    }

    net.load_state_dict(cleaned)
    net.eval()
    return net

# --- Test Dataset ---
class TestDataset(Dataset):
    """Unlabelled audio folder served as fixed-size, 3-channel feature tensors.

    Each item is ``(features, file_id)``: ``features`` stacks the spectrogram
    with its delta and delta-delta and is resized to ``target_size``;
    ``file_id`` is the file name without its extension.

    Args:
        audio_dir: Directory scanned (non-recursively) for audio files.
        transform: Module applied to the 1-D waveform to produce the
            spectrogram. Moved to CPU when given. ``__getitem__`` calls it
            unconditionally, so it must not be ``None`` in practice.
        delta_transform: Module applied to the spectrogram (and again to its
            output) to obtain delta and delta-delta. Same ``None`` caveat.
        spec_augmenter: Stored (on CPU) but never applied by this dataset.
        target_length: Number of samples every clip is padded/cropped to.
            Defaults to ``5 * sr``.
        target_size: ``(height, width)`` of the returned tensor. Defaults to
            ``(CFG.TARGET_SIZE, CFG.TARGET_SIZE)``.
        sr: Sampling rate every clip is resampled to.
    """

    # Compared against the lower-cased file name, so ".WAV", ".Mp3", ... match too.
    _AUDIO_EXTENSIONS = (".wav", ".flac", ".mp3")

    def __init__(self, audio_dir, transform, delta_transform, spec_augmenter=None, target_length=None, target_size=None, sr=CFG.SAMPLING_RATE):
        self.audio_dir = audio_dir
        # Sorted so item order (and therefore the output CSV order) is deterministic.
        self.files = sorted(
            f for f in os.listdir(audio_dir)
            if f.lower().endswith(self._AUDIO_EXTENSIONS)
        )
        self.transform = transform.to("cpu") if transform is not None else None
        self.delta_transform = delta_transform.to("cpu") if delta_transform is not None else None
        self.spec_augmenter = spec_augmenter.to("cpu") if spec_augmenter is not None else None
        self.target_length = target_length if target_length is not None else 5 * sr
        self.target_size = target_size if target_size is not None else (CFG.TARGET_SIZE, CFG.TARGET_SIZE)
        self.sr = sr
        # One Resample module per source rate, built on first use and then reused
        # instead of recomputing the resampling kernel for every file.
        self._resamplers = {}

    def __len__(self):
        """Return the number of audio files found in ``audio_dir``."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load file ``idx`` and return ``(features, file_id)``."""
        fname = self.files[idx]
        file_path = os.path.join(self.audio_dir, fname)

        waveform, sr = torchaudio.load(file_path)  # [channels, time]
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        if sr != self.sr:
            resampler = self._resamplers.get(sr)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(sr, self.sr)
                self._resamplers[sr] = resampler
            waveform = resampler(waveform)

        # Down-mix multi-channel audio to mono.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        waveform = waveform.to(dtype=torch.float32)

        # Zero-pad on the right, or truncate, to exactly target_length samples.
        n_samples = waveform.shape[1]
        if n_samples < self.target_length:
            waveform = F.pad(waveform, (0, self.target_length - n_samples))
        else:
            waveform = waveform[:, : self.target_length]

        # Spectrogram of the mono signal (expected [n_mels, n_frames]); no
        # augmentation is applied at test time.
        spectrogram = self.transform(waveform.squeeze(0))

        # First- and second-order deltas become the 2nd and 3rd channels.
        spec_delta = self.delta_transform(spectrogram)
        spec_delta_delta = self.delta_transform(spec_delta)

        stacked_spec = torch.stack([spectrogram, spec_delta, spec_delta_delta], dim=0)
        # Bilinear interpolate needs a 4-D input, so add and then drop a batch dim.
        resized_spec = F.interpolate(
            stacked_spec.unsqueeze(0),
            size=self.target_size,
            mode='bilinear',
            align_corners=False,
        ).squeeze(0)

        # id is the file name without its extension
        file_id = os.path.splitext(fname)[0]
        return resized_spec, file_id

# --- Create test dataset & dataloader ---
# Un-augmented dataset over every audio file found in TEST_AUDIO_PATH.
test_ds = TestDataset(
    audio_dir=TEST_AUDIO_PATH,
    transform=spec_transform,
    delta_transform=delta_transform,
    spec_augmenter=None,
    target_length=5 * CFG.SAMPLING_RATE,
    target_size=(CFG.TARGET_SIZE, CFG.TARGET_SIZE),
    sr=CFG.SAMPLING_RATE
)

# shuffle=False keeps batches in the dataset's own (sorted filename) order.
# Pinned host memory only speeds up host->GPU copies; on a CPU-only run it is
# useless and makes DataLoader emit a warning, so enable it only for CUDA.
test_loader = DataLoader(
    test_ds,
    batch_size=CFG.BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=(CFG.DEVICE.type == "cuda"),
)

# --- Load model ---
device = CFG.DEVICE
model = load_model_for_inference(MODEL_PATH, device)

# --- Inference loop ---
# Parallel lists: all_preds[i] is the predicted class index for all_ids[i].
all_ids, all_preds = [], []

# Forward passes only, so gradient tracking is switched off.
with torch.no_grad():
    for inputs, ids in tqdm(test_loader, desc="Testing"):
        inputs = inputs.to(device)
        outputs = model(inputs)  # expected [B, C] class scores
        # Highest-scoring class per sample, moved to host memory as ints.
        preds = outputs.argmax(dim=1).detach().cpu().numpy().astype(int)

        all_ids += ids
        all_preds += preds.tolist()

# --- Save CSV (format: id, prediction) ---
# Rows stay in the order the batches were produced; the loader is unshuffled
# and the dataset lists its files sorted by name.
df_out = pd.DataFrame(
    data={"id": all_ids, "prediction": all_preds},
    columns=["id", "prediction"],
)
df_out.to_csv(path_or_buf=OUTPUT_CSV, index=False)

print(f"Saved predictions to {OUTPUT_CSV}. Example rows:")
print(df_out.head())

Testing:   0%|          | 0/24 [00:00<?, ?it/s]



Saved predictions to test_predictions2.csv. Example rows:
               id  prediction
0  7-280602-A-001          37
1  7-280602-A-002          37
2  7-280602-A-003          37
3  7-280602-A-004          37
4  7-280602-A-005          37
