In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import time
import copy

In [2]:
# ==========================================
# 1. CONFIGURATION
# ==========================================
# UPDATE THIS PATH based on your Kaggle Data tab
DATA_PATH = "/kaggle/input/amlfif/Dataset" 

BATCH_SIZE = 64
LEARNING_RATE = 0.001
EPOCHS = 50 
NUM_CLASSES = 200
IMAGE_SIZE = 224
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using Device: {DEVICE}")

Using Device: cuda


In [3]:
# # ==========================================
# # 2. DATASET CLASS
# # ==========================================
# class BirdDataset(Dataset):
#     def __init__(self, csv_file, root_dir, transform=None):
#         self.data = pd.read_csv(csv_file)
#         self.root_dir = root_dir
#         self.transform = transform
        
#         # Adjust label to be 0-199 (PyTorch starts at 0)
#         # Only do this if 'label' column exists (Training data)
#         if 'label' in self.data.columns:
#             self.data['label'] = self.data['label'] - 1

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         # Handle paths
#         img_path = self.data.iloc[idx, 0] if 'label' in self.data.columns else self.data.iloc[idx, 1]
        
#         if img_path.startswith("/"):
#             img_path = img_path[1:]
            
#         full_path = os.path.join(self.root_dir, img_path)
        
#         try:
#             image = Image.open(full_path).convert("RGB")
#         except FileNotFoundError:
#             image = Image.new('RGB', (IMAGE_SIZE, IMAGE_SIZE))
            
#         if self.transform:
#             image = self.transform(image)
            
#         # Return image + label (if training) or image + id (if testing)
#         if 'label' in self.data.columns:
#             label = self.data.iloc[idx, 1]
#             return image, torch.tensor(label, dtype=torch.long)
#         else:
#             image_id = self.data.iloc[idx, 0]
#             return image, image_id


# ==========================================
# 2. DATASET CLASS (Fixed)
# ==========================================
# class BirdDataset(Dataset):
#     def __init__(self, csv_file, root_dir, transform=None):
#         self.data = pd.read_csv(csv_file)
#         self.root_dir = root_dir
#         self.transform = transform
        
#         # Adjust label to be 0-199 (PyTorch starts at 0)
#         # Only do this if 'label' column exists (Training data)
#         if 'label' in self.data.columns:
#             self.data['label'] = self.data['label'] - 1

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         # Get the row
#         row = self.data.iloc[idx]
        
#         # --- SMART DETECTION START ---
#         # We need to find which column has the image path (string) and which has the label/id (int)
#         # We check the first column (index 0). Is it a string ending in .jpg?
#         val0 = row[0]
#         val1 = row[1]
        
#         if isinstance(val0, str) and (val0.endswith('.jpg') or '/' in val0):
#             img_path = val0
#             label_or_id = val1
#         else:
#             # If col 0 isn't the path, then col 1 must be the path
#             img_path = val1
#             label_or_id = val0
#         # --- SMART DETECTION END ---
        
#         # Handle leading slashes if present
#         if str(img_path).startswith("/"):
#             img_path = img_path[1:]
            
#         full_path = os.path.join(self.root_dir, img_path)
        
#         try:
#             image = Image.open(full_path).convert("RGB")
#         except FileNotFoundError:
#             # Fallback for missing images to prevent crash
#             image = Image.new('RGB', (224, 224))
            
#         if self.transform:
#             image = self.transform(image)
            
#         # Return proper pair based on Training vs Testing
#         if 'label' in self.data.columns:
#             # For Training: Return (Image, Label)
#             # Ensure label is a tensor
#             return image, torch.tensor(label_or_id, dtype=torch.long)
#         else:
#             # For Testing: Return (Image, ID)
#             return image, label_or_id

# ==========================================
# 2. DATASET CLASS (Fixed & Clean)
# ==========================================
class BirdDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        
        # Adjust label to be 0-199 (PyTorch starts at 0)
        if 'label' in self.data.columns:
            self.data['label'] = self.data['label'] - 1

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get the row by position
        row = self.data.iloc[idx]
        
        # --- SMART DETECTION START ---
        # We use .iloc[] to avoid the FutureWarning
        val0 = row.iloc[0]
        val1 = row.iloc[1]
        
        # Check which one is the image path (string ending in .jpg)
        if isinstance(val0, str) and (val0.endswith('.jpg') or '/' in val0):
            img_path = val0
            label_or_id = val1
        else:
            img_path = val1
            label_or_id = val0
        # --- SMART DETECTION END ---
        
        # Handle leading slashes
        if str(img_path).startswith("/"):
            img_path = img_path[1:]
            
        full_path = os.path.join(self.root_dir, img_path)
        
        try:
            image = Image.open(full_path).convert("RGB")
        except FileNotFoundError:
            image = Image.new('RGB', (224, 224))
            
        if self.transform:
            image = self.transform(image)
            
        # Return proper pair
        if 'label' in self.data.columns:
            # Training: Return (Image, Label)
            return image, torch.tensor(label_or_id, dtype=torch.long)
        else:
            # Testing: Return (Image, ID)
            return image, label_or_id

In [4]:
# ==========================================
# 3. TRANSFORMS (Data Augmentation)
# ==========================================
# Innovation: Adding augmentation helps the model generalize better
train_transforms = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.RandomHorizontalFlip(p=0.5),  # Flip left/right
    transforms.RandomRotation(degrees=15),   # Rotate slightly
    transforms.ColorJitter(brightness=0.1, contrast=0.1), # vary lighting
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_transforms = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

#### Simple CNN

In [5]:
# ==========================================
# 4. CUSTOM MODEL (Built from Scratch)
# ==========================================
class SimpleBirdCNN(nn.Module):
    def __init__(self, num_classes=200):
        super(SimpleBirdCNN, self).__init__()
        
        # Block 1
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2) # Reduces size / 2
        
        # Block 2
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        
        # Block 3
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        
        # Block 4
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)
        self.bn4 = nn.BatchNorm2d(256)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5) # Prevents overfitting
        
        # Fully Connected Layers
        # After 4 pools (224 -> 112 -> 56 -> 28 -> 14), image is 14x14
        self.fc1 = nn.Linear(256 * 14 * 14, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        # Pass through convolutions
        x = self.pool(self.relu(self.bn1(self.conv1(x))))
        x = self.pool(self.relu(self.bn2(self.conv2(x))))
        x = self.pool(self.relu(self.bn3(self.conv3(x))))
        x = self.pool(self.relu(self.bn4(self.conv4(x))))
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # Classifier
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


#### Improvised Resnet

In [6]:
# ==========================================
# IMPROVED MODEL: Mini-ResNet
# ==========================================

# 1. The Building Block (The Innovation)
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        
        # First convolution
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        
        # Second convolution
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        
        # The "Skip Connection" logic
        # If the input size changes (due to stride), we need to resize the shortcut too
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = x # Save the original input (the "jump")
        
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        
        out = self.conv2(out)
        out = self.bn2(out)
        
        out += self.shortcut(identity) # ADD the original input back here
        out = self.relu(out)
        
        return out

# 2. The Full Network
class ResNetFromScratch(nn.Module):
    def __init__(self, num_classes=200):
        super(ResNetFromScratch, self).__init__()
        
        # Initial processing (Entry point)
        self.in_channels = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        # Stack the Residual Blocks (Deeper Architecture)
        self.layer1 = self._make_layer(64, 2, stride=1)  # 2 blocks
        self.layer2 = self._make_layer(128, 2, stride=2) # 2 blocks
        self.layer3 = self._make_layer(256, 2, stride=2) # 2 blocks
        self.layer4 = self._make_layer(512, 2, stride=2) # 2 blocks
        
        # Classifier
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, out_channels, blocks, stride):
        layers = []
        # First block handles the stride (downsampling)
        layers.append(ResidualBlock(self.in_channels, out_channels, stride))
        self.in_channels = out_channels
        
        # Remaining blocks just process features
        for _ in range(1, blocks):
            layers.append(ResidualBlock(out_channels, out_channels))
            
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

In [7]:
# ==========================================
# IMPROVED MODEL: ResNet with Dropout
# ==========================================
class ResNetFromScratch(nn.Module):
    def __init__(self, num_classes=200):
        super(ResNetFromScratch, self).__init__()
        
        # Initial processing (Entry point)
        self.in_channels = 64
        # Standard ResNet Start
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        # Stack the Residual Blocks
        self.layer1 = self._make_layer(64, 2, stride=1)
        self.layer2 = self._make_layer(128, 2, stride=2)
        self.layer3 = self._make_layer(256, 2, stride=2)
        self.layer4 = self._make_layer(512, 2, stride=2)
        
        # Classifier
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        
        # --- INNOVATION: Dropout ---
        # Dropping 50% of neurons prevents overfitting on small datasets
        self.dropout = nn.Dropout(p=0.5) 
        
        self.fc = nn.Linear(512, num_classes)

        # Initialize weights (Helps training start better)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, out_channels, blocks, stride):
        layers = []
        layers.append(ResidualBlock(self.in_channels, out_channels, stride))
        self.in_channels = out_channels
        for _ in range(1, blocks):
            layers.append(ResidualBlock(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        
        # Apply Dropout before the final classification
        x = self.dropout(x)
        
        x = self.fc(x)
        return x

#### Data loader

In [8]:
# ==========================================
# 5. SETUP LOADERS & MODEL
# ==========================================
# Load Data
full_dataset = BirdDataset(
    csv_file=f'{DATA_PATH}/train_images.csv', 
    root_dir=f'{DATA_PATH}',
    transform=train_transforms
)

# Split 80/20
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

# IMPORTANT: Validation set should NOT use augmentation (just resize)
val_dataset.dataset.transform = val_transforms 

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)


#### CNN Model

In [9]:
# Init CNN Model
model = SimpleBirdCNN(num_classes=NUM_CLASSES).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print("Model Architecture Created from Scratch!")

# ==========================================
# 6. TRAINING LOOP
# ==========================================
best_acc = 0.0
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(EPOCHS):
    print(f'Epoch {epoch+1}/{EPOCHS}')
    
    # Train
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Train Loss: {epoch_loss:.4f}")
    
    # Validate
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
    val_acc = correct / total
    print(f"Val Acc: {val_acc:.4f}")
    
    if val_acc > best_acc:
        best_acc = val_acc
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), 'best_custom_model.pth')

print(f"Best Validation Accuracy: {best_acc:.4f}")

Model Architecture Created from Scratch!
Epoch 1/50
Train Loss: 7.0816
Val Acc: 0.0089
Epoch 2/50
Train Loss: 5.2926
Val Acc: 0.0089
Epoch 3/50
Train Loss: 5.2868
Val Acc: 0.0102
Epoch 4/50
Train Loss: 5.2859
Val Acc: 0.0064
Epoch 5/50
Train Loss: 5.2802
Val Acc: 0.0064
Epoch 6/50
Train Loss: 5.2737
Val Acc: 0.0064
Epoch 7/50
Train Loss: 5.2708
Val Acc: 0.0064
Epoch 8/50
Train Loss: 5.2646
Val Acc: 0.0089
Epoch 9/50
Train Loss: 5.2670
Val Acc: 0.0064
Epoch 10/50
Train Loss: 5.2590
Val Acc: 0.0064
Epoch 11/50
Train Loss: 5.2573
Val Acc: 0.0064
Epoch 12/50
Train Loss: 5.2529
Val Acc: 0.0064
Epoch 13/50
Train Loss: 5.2485
Val Acc: 0.0064
Epoch 14/50
Train Loss: 5.2456
Val Acc: 0.0064
Epoch 15/50
Train Loss: 5.2454
Val Acc: 0.0064
Epoch 16/50
Train Loss: 5.2423
Val Acc: 0.0064
Epoch 17/50
Train Loss: 5.2390
Val Acc: 0.0064
Epoch 18/50
Train Loss: 5.2375
Val Acc: 0.0064
Epoch 19/50
Train Loss: 5.2391
Val Acc: 0.0064
Epoch 20/50
Train Loss: 5.2338
Val Acc: 0.0064
Epoch 21/50
Train Loss: 5.23

#### Restnet Model

In [10]:
# ==========================================
# 6. TRAINING LOOP (With ResNet & Scheduler)
# ==========================================
import time

# 1. SETUP
# -----------------------------
print("Initializing Custom ResNet...")
model = ResNetFromScratch(num_classes=NUM_CLASSES).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# SCHEDULER: Checks 'val_loss' every epoch. 
# If it doesn't improve for 3 epochs ('patience'), it lowers LR by 10x ('factor').
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)

# 2. TRAINING ENGINE
# -----------------------------
best_acc = 0.0
best_model_wts = copy.deepcopy(model.state_dict())
start_time = time.time()

print(f"Starting training on {DEVICE} for {EPOCHS} epochs...")

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch+1}/{EPOCHS}')
    print('-' * 10)
    
    # --- TRAIN PHASE ---
    model.train()
    running_loss = 0.0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        
        optimizer.zero_grad()
        
        # Forward
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Train Loss: {epoch_loss:.4f}")
    
    # --- VALIDATION PHASE ---
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            _, predicted = torch.max(outputs, 1)
            
            val_loss += loss.item() * inputs.size(0)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
    # Calculate Metrics
    val_loss = val_loss / len(val_loader.dataset)
    val_acc = correct / total
    
    print(f"Val Loss:   {val_loss:.4f}")
    print(f"Val Acc:    {val_acc:.4f}")
    
    # --- SCHEDULER STEP ---
    # This is where the magic happens. We feed the validation loss to the scheduler.
    scheduler.step(val_loss)
    
    # Print current Learning Rate (To verify scheduler is working)
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Current LR: {current_lr}")
    
    # --- SAVE BEST MODEL ---
    if val_acc > best_acc:
        print(f"--> Validation Accuracy Improved ({best_acc:.4f} -> {val_acc:.4f}). Saving model...")
        best_acc = val_acc
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), 'best_custom_model.pth')

# End of training
time_elapsed = time.time() - start_time
print(f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
print(f'Best Validation Accuracy: {best_acc:.4f}')

# Load best weights for final inference
model.load_state_dict(best_model_wts)

Initializing Custom ResNet...
Starting training on cuda for 50 epochs...

Epoch 1/50
----------




Train Loss: 5.4420
Val Loss:   5.4547
Val Acc:    0.0191
Current LR: 0.001
--> Validation Accuracy Improved (0.0000 -> 0.0191). Saving model...

Epoch 2/50
----------
Train Loss: 5.1057
Val Loss:   5.0663
Val Acc:    0.0293
Current LR: 0.001
--> Validation Accuracy Improved (0.0191 -> 0.0293). Saving model...

Epoch 3/50
----------
Train Loss: 4.9573
Val Loss:   6.6049
Val Acc:    0.0127
Current LR: 0.001

Epoch 4/50
----------
Train Loss: 4.8217
Val Loss:   4.8753
Val Acc:    0.0267
Current LR: 0.001

Epoch 5/50
----------
Train Loss: 4.7318
Val Loss:   4.8403
Val Acc:    0.0483
Current LR: 0.001
--> Validation Accuracy Improved (0.0293 -> 0.0483). Saving model...

Epoch 6/50
----------
Train Loss: 4.6374
Val Loss:   4.7381
Val Acc:    0.0445
Current LR: 0.001

Epoch 7/50
----------
Train Loss: 4.5809
Val Loss:   4.8679
Val Acc:    0.0369
Current LR: 0.001

Epoch 8/50
----------
Train Loss: 4.5080
Val Loss:   4.8386
Val Acc:    0.0496
Current LR: 0.001
--> Validation Accuracy Improved

<All keys matched successfully>

In [11]:
# ==========================================
# 6. SUPER-CONVERGENCE TRAINING LOOP
# ==========================================
import time

# 1. SETUP
# -----------------------------
EPOCHS = 25 # Increased to give OneCycleLR room to work

print("Initializing ResNet with Dropout & OneCycleLR...")
model = ResNetFromScratch(num_classes=NUM_CLASSES).to(DEVICE)

criterion = nn.CrossEntropyLoss()

# OPTIMIZER: AdamW (Better weight decay handling)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

# SCHEDULER: OneCycleLR
# This ramps the LR up to 'max_lr' then down to 0. 
# It is extremely effective for training from scratch.
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer, 
    max_lr=0.003,              # Peak learning rate
    epochs=EPOCHS, 
    steps_per_epoch=len(train_loader)
)

# 2. TRAINING ENGINE
# -----------------------------
best_acc = 0.0
best_model_wts = copy.deepcopy(model.state_dict())
start_time = time.time()

print(f"Starting training on {DEVICE} for {EPOCHS} epochs...")

for epoch in range(EPOCHS):
    
    # --- TRAIN PHASE ---
    model.train()
    running_loss = 0.0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        
        optimizer.zero_grad()
        
        # Forward
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward
        loss.backward()
        optimizer.step()
        
        # STEP SCHEDULER (OneCycleLR updates every BATCH, not every EPOCH)
        scheduler.step()
        
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    
    # --- VALIDATION PHASE ---
    model.eval()
    correct = 0
    total = 0
    val_loss = 0.0
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels) # Calc val loss too
            
            _, predicted = torch.max(outputs, 1)
            
            val_loss += loss.item() * inputs.size(0)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
    val_acc = correct / total
    val_loss = val_loss / len(val_loader.dataset)
    
    # Get current LR for printout
    current_lr = optimizer.param_groups[0]['lr']
    
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {epoch_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | LR: {current_lr:.6f}")
    
    # --- SAVE BEST MODEL ---
    if val_acc > best_acc:
        best_acc = val_acc
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), 'best_custom_model.pth')

# End of training
time_elapsed = time.time() - start_time
print(f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
print(f'Best Validation Accuracy: {best_acc:.4f}')

# Load best weights
model.load_state_dict(best_model_wts)

Initializing ResNet with Dropout & OneCycleLR...
Starting training on cuda for 25 epochs...
Epoch 1/25 | Train Loss: 5.4069 | Val Loss: 5.1509 | Val Acc: 0.0229 | LR: 0.000245
Epoch 2/25 | Train Loss: 5.1032 | Val Loss: 5.6120 | Val Acc: 0.0153 | LR: 0.000599
Epoch 3/25 | Train Loss: 5.0328 | Val Loss: 6.4262 | Val Acc: 0.0165 | LR: 0.001120
Epoch 4/25 | Train Loss: 4.9076 | Val Loss: 5.0520 | Val Acc: 0.0293 | LR: 0.001717
Epoch 5/25 | Train Loss: 4.7700 | Val Loss: 5.0397 | Val Acc: 0.0369 | LR: 0.002287
Epoch 6/25 | Train Loss: 4.7493 | Val Loss: 8.0671 | Val Acc: 0.0140 | LR: 0.002731
Epoch 7/25 | Train Loss: 4.6103 | Val Loss: 5.1493 | Val Acc: 0.0433 | LR: 0.002971
Epoch 8/25 | Train Loss: 4.4736 | Val Loss: 4.9540 | Val Acc: 0.0394 | LR: 0.002993
Epoch 9/25 | Train Loss: 4.3446 | Val Loss: 4.7210 | Val Acc: 0.0547 | LR: 0.002945
Epoch 10/25 | Train Loss: 4.1688 | Val Loss: 4.6990 | Val Acc: 0.0585 | LR: 0.002849
Epoch 11/25 | Train Loss: 4.0666 | Val Loss: 4.8127 | Val Acc: 0.05

<All keys matched successfully>

In [12]:
# ==========================================
# 7. GENERATE SUBMISSION
# ==========================================
# Load Best Model
model.load_state_dict(torch.load('best_custom_model.pth'))
model.eval()

test_dataset = BirdDataset(
    csv_file=f'{DATA_PATH}/test_images_path.csv',
    root_dir=f'{DATA_PATH}',
    transform=val_transforms # No augmentation for testing
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

all_preds = []
all_ids = []

print("Generating predictions...")
with torch.no_grad():
    for inputs, ids in test_loader:
        inputs = inputs.to(DEVICE)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        
        # Convert back to 1-200 range
        predicted = predicted.cpu().numpy() + 1 
        
        all_preds.extend(predicted)
        all_ids.extend(ids.numpy())

# Save CSV
submission = pd.DataFrame({'id': all_ids, 'label': all_preds})
submission.to_csv('submission_scratch4.csv', index=False)
print("Saved submission_scratch.csv!")

Generating predictions...
Saved submission_scratch.csv!
