# Transfer Learning with ResNet50

This experiment implements transfer learning using ResNet50 pretrained on ImageNet to address the issues from the baseline:
- Severe overfitting (98% train vs 86% val)
- Numerical instability (NaN log loss)
- No data augmentation
- Low resolution (128x128)

## Strategy
1. Use ResNet50 backbone with frozen initial layers
2. Add aggressive data augmentation
3. Use BCEWithLogitsLoss for numerical stability
4. Two-phase training: freeze backbone first, then fine-tune
5. Increase resolution to 224x224

In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.cuda.amp import autocast, GradScaler
import warnings
warnings.filterwarnings('ignore')

# Check GPU
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

GPU available: True
GPU: NVIDIA A100-SXM4-80GB
GPU Memory: 85.10 GB


In [5]:
# Load data information
train_dir = '/home/data/train'
test_dir = '/home/data/test'

# Get all training files
train_files = os.listdir(train_dir)
train_files = [f for f in train_files if f.endswith('.jpg')]

# Create labels (dog=1, cat=0)
train_labels = []
for f in train_files:
    if f.startswith('dog'):
        train_labels.append(1)
    else:
        train_labels.append(0)

print(f"Total training images: {len(train_files)}")
print(f"Dogs: {sum(train_labels)}")
print(f"Cats: {len(train_labels) - sum(train_labels)}")
print(f"Class distribution: {np.mean(train_labels):.2%} dogs, {1-np.mean(train_labels):.2%} cats")

Total training images: 22500
Dogs: 11258
Cats: 11242
Class distribution: 50.04% dogs, 49.96% cats


In [6]:
# Create stratified split (since split file doesn't exist or is corrupted)
from sklearn.model_selection import train_test_split

# Create stratified split (80% train, 20% val)
train_files_split, val_files_split, train_labels_split, val_labels_split = train_test_split(
    train_files, train_labels, test_size=0.2, stratify=train_labels, random_state=42
)

print(f"Train set: {len(train_files_split)} images")
print(f"Val set: {len(val_files_split)} images")
print(f"Train dogs: {sum(train_labels_split)}, cats: {len(train_labels_split) - sum(train_labels_split)}")
print(f"Val dogs: {sum(val_labels_split)}, cats: {len(val_labels_split) - sum(val_labels_split)}")

# Save split for future use
os.makedirs('/home/code/splits', exist_ok=True)
split_data = {
    'train_files': train_files_split,
    'val_files': val_files_split,
    'train_labels': train_labels_split,
    'val_labels': val_labels_split
}
with open('/home/code/splits/stratified_split_20pct.pkl', 'wb') as f:
    pickle.dump(split_data, f)
print("Split saved to /home/code/splits/stratified_split_20pct.pkl")

Train set: 18000 images
Val set: 4500 images
Train dogs: 9006, cats: 8994
Val dogs: 2252, cats: 2248
Split saved to /home/code/splits/stratified_split_20pct.pkl


In [7]:
# Define dataset class
class DogsCatsDataset(Dataset):
    def __init__(self, file_list, labels, transform=None, is_test=False):
        self.file_list = file_list
        self.labels = labels
        self.transform = transform
        self.is_test = is_test
        self.train_dir = '/home/data/train'
        self.test_dir = '/home/data/test'
    
    def __len__(self):
        return len(self.file_list)
    
    def __getitem__(self, idx):
        if self.is_test:
            img_path = os.path.join(self.test_dir, self.file_list[idx])
        else:
            img_path = os.path.join(self.train_dir, self.file_list[idx])
        
        image = Image.open(img_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)
        
        if self.is_test:
            return image
        else:
            label = self.labels[idx]
            return image, torch.tensor(label, dtype=torch.float32)

In [8]:
# Define data transforms with aggressive augmentation
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create datasets and dataloaders
train_dataset = DogsCatsDataset(train_files_split, train_labels_split, transform=train_transform)
val_dataset = DogsCatsDataset(val_files_split, val_labels_split, transform=val_transform)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

Train batches: 563
Val batches: 141


In [9]:
# Define model with ResNet50 backbone
class DogsCatsResNet50(nn.Module):
    def __init__(self, pretrained=True):
        super(DogsCatsResNet50, self).__init__()
        # Load pretrained ResNet50
        self.backbone = models.resnet50(pretrained=pretrained)
        
        # Get number of features from the backbone
        num_features = self.backbone.fc.in_features
        
        # Replace the final layer with our classification head
        self.backbone.fc = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(num_features, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1)  # Output single logit
        )
    
    def forward(self, x):
        return self.backbone(x)

# Create model
model = DogsCatsResNet50(pretrained=True)
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters (before freezing): {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

# Freeze backbone for phase 1 training
for param in model.backbone.parameters():
    param.requires_grad = False

print(f"Trainable parameters (after freezing): {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/nonroot/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

  1%|          | 656k/97.8M [00:00<00:15, 6.67MB/s]

  9%|▉         | 9.28M/97.8M [00:00<00:01, 55.9MB/s]

 25%|██▍       | 24.0M/97.8M [00:00<00:00, 101MB/s] 

 40%|████      | 39.4M/97.8M [00:00<00:00, 125MB/s]

 57%|█████▋    | 55.2M/97.8M [00:00<00:00, 140MB/s]

 72%|███████▏  | 70.5M/97.8M [00:00<00:00, 147MB/s]

 88%|████████▊ | 86.1M/97.8M [00:00<00:00, 152MB/s]

100%|██████████| 97.8M/97.8M [00:00<00:00, 132MB/s]




Total parameters: 24,032,833
Trainable parameters (before freezing): 24,032,833
Trainable parameters (after freezing): 0


In [None]:
# Define loss function (numerically stable) and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Use BCEWithLogitsLoss for numerical stability
criterion = nn.BCEWithLogitsLoss()

# Optimizer for phase 1 (only classification head)
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3, weight_decay=1e-4)

# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)

# Gradient scaler for mixed precision training
scaler = GradScaler()

print(f"Device: {device}")
print(f"Loss function: {criterion}")
print(f"Optimizer: {optimizer}")

In [None]:
# Training functions with gradient clipping and mixed precision
def train_epoch(model, loader, criterion, optimizer, device, scaler):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    pbar = tqdm(loader, desc='Training')
    for images, labels in pbar:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True).unsqueeze(1)
        
        optimizer.zero_grad()
        
        # Mixed precision training
        with autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)
        
        # Backward pass with gradient scaling
        scaler.scale(loss).backward()
        
        # Gradient clipping
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        # Optimizer step
        scaler.step(optimizer)
        scaler.update()
        
        # Statistics
        total_loss += loss.item()
        probs = torch.sigmoid(outputs)
        predicted = (probs > 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
        
        # Update progress bar
        pbar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'acc': f'{100.*correct/total:.2f}%'
        })
    
    return total_loss / len(loader), correct / total

def validate_epoch(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    all_probs = []
    all_labels = []
    
    with torch.no_grad():
        pbar = tqdm(loader, desc='Validation')
        for images, labels in pbar:
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True).unsqueeze(1)
            
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            probs = torch.sigmoid(outputs)
            predicted = (probs > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            
            all_probs.extend(probs.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())
            
            pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{100.*correct/total:.2f}%'
            })
    
    # Calculate log loss with numerical stability
    all_probs = np.array(all_probs)
    all_labels = np.array(all_labels)
    
    # Clamp probabilities to avoid log(0)
    epsilon = 1e-7
    all_probs_clamped = np.clip(all_probs, epsilon, 1 - epsilon)
    log_loss = -np.mean(all_labels * np.log(all_probs_clamped) + (1 - all_labels) * np.log(1 - all_probs_clamped))
    
    return total_loss / len(loader), correct / total, log_loss, all_probs, all_labels

In [None]:
# Training loop with early stopping and checkpointing
import os
os.makedirs('/home/code/models', exist_ok=True)

num_epochs_phase1 = 10  # Train only classification head
num_epochs_phase2 = 15  # Fine-tune entire network
best_val_loss = float('inf')
best_val_log_loss = float('inf')
best_val_acc = 0
early_stopping_patience = 5
early_stopping_counter = 0

train_losses = []
val_losses = []
val_accs = []
val_log_losses = []

print("="*60)
print("PHASE 1: Training Classification Head (Frozen Backbone)")
print("="*60)

for epoch in range(num_epochs_phase1):
    print(f"\nEpoch {epoch+1}/{num_epochs_phase1}")
    
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device, scaler)
    val_loss, val_acc, val_log_loss, val_probs, val_labels = validate_epoch(model, val_loader, criterion, device)
    
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_accs.append(val_acc)
    val_log_losses.append(val_log_loss)
    
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val LogLoss: {val_log_loss:.4f}")
    
    # Learning rate scheduling
    scheduler.step(val_loss)
    
    # Save best model
    if val_log_loss < best_val_log_loss:
        best_val_log_loss = val_log_loss
        best_val_loss = val_loss
        best_val_acc = val_acc
        torch.save(model.state_dict(), '/home/code/models/resnet50_phase1_best.pth')
        print(f"✓ New best model saved! Val LogLoss: {val_log_loss:.4f}")
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        print(f"Early stopping counter: {early_stopping_counter}/{early_stopping_patience}")
    
    # Early stopping
    if early_stopping_counter >= early_stopping_patience:
        print("Early stopping triggered!")
        break

print(f"\nPhase 1 completed. Best Val LogLoss: {best_val_log_loss:.4f}")