# ResNet50 + Mixup Training

## Objective
Train ResNet50 with Mixup regularization using the proven optimization recipe from exp_007.
This creates a diverse model for ensembling with EfficientNet-B4 (exp_008).

## Expected Results
- Target CV log loss: 0.055-0.057 (improvement over the ResNet50 baseline of 0.0590)
- Creates diverse architecture for two-model ensemble
- Ensemble target: 0.032-0.034 (additional 5-10% improvement)

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Verify GPU: report the name and total memory of CUDA device 0
gpu_name = torch.cuda.get_device_name(0)
print(f"GPU: {gpu_name}")
gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"Memory: {gpu_memory_gb:.1f} GB")

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported locally so this cell does not rely on a top-level import.
    import random

    # Python's own generator was previously left unseeded, so any code
    # drawing from `random` differed from run to run.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

## Data Loading and Preprocessing

In [None]:
# Load data
train_dir = '/home/data/train'
test_dir = '/home/data/test'

train_files = []
train_labels = []

# The label is the position in this list: 0 = cat, 1 = dog.
for label, category in enumerate(['cat', 'dog']):
    category_path = os.path.join(train_dir, category)
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any fixed-seed split of it) stable across runs.
    for img_file in sorted(os.listdir(category_path)):
        if img_file.endswith('.jpg'):
            train_files.append(os.path.join(category_path, img_file))
            train_labels.append(label)

print(f"Total training images: {len(train_files)}")
print(f"Cats: {train_labels.count(0)}")
print(f"Dogs: {train_labels.count(1)}")

# Test files (sorted for a deterministic prediction order)
test_files = [
    os.path.join(test_dir, f)
    for f in sorted(os.listdir(test_dir))
    if f.endswith('.jpg')
]
print(f"Total test images: {len(test_files)}")

In [None]:
# Mixup augmentation (callable)
class Mixup:
    """Blend every sample of a batch with a randomly chosen partner sample.

    Args:
        alpha: Parameter of the Beta(alpha, alpha) distribution the mixing
            weight is drawn from. A value <= 0 disables mixing (weight 1).
    """

    def __init__(self, alpha=0.2):
        self.alpha = alpha

    def __call__(self, batch_x, batch_y):
        """Mix a batch.

        Args:
            batch_x: Input tensor whose first dimension is the batch.
            batch_y: Targets, indexable along the same first dimension.

        Returns:
            Tuple ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x`` is
            ``lam * batch_x + (1 - lam) * batch_x[index]``, ``y_a`` is the
            original targets, ``y_b`` the targets of the partner samples and
            ``lam`` the mixing weight.
        """
        if self.alpha > 0:
            # Plain float avoids mixing NumPy scalars into tensor arithmetic.
            lam = float(np.random.beta(self.alpha, self.alpha))
        else:
            lam = 1

        batch_size = batch_x.size(0)
        # Place the permutation on the batch's own device; `.cuda()` would
        # use the default CUDA device, which may not be where batch_x lives.
        index = torch.randperm(batch_size).to(batch_x.device)

        mixed_x = lam * batch_x + (1 - lam) * batch_x[index]
        y_a, y_b = batch_y, batch_y[index]

        return mixed_x, y_a, y_b, lam

# Loss for Mixup-blended batches
def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Return the lam-weighted combination of the loss against both targets.

    Args:
        criterion: Callable ``criterion(pred, target)`` returning a loss.
        pred: Model output for the mixed batch.
        y_a: Targets weighted by ``lam``.
        y_b: Targets weighted by ``1 - lam``.
        lam: Mixing weight.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

In [None]:
# Image dataset backed by a list of file paths (labels optional)
class DogCatDataset(Dataset):
    """Load RGB images from disk, optionally paired with float labels.

    Args:
        file_paths: Sequence of image paths.
        labels: Sequence of labels aligned with ``file_paths``, or ``None``
            for unlabeled data.
        transform: Optional callable applied to each loaded PIL image.
        is_training: Stored on the instance; not read by this class.
    """

    def __init__(self, file_paths, labels, transform=None, is_training=True):
        self.file_paths = file_paths
        self.labels = labels
        self.transform = transform
        self.is_training = is_training

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` when labels exist, else just ``image``."""
        image = Image.open(self.file_paths[idx]).convert('RGB')

        if self.transform:
            image = self.transform(image)

        # Unlabeled data yields the bare image.
        if self.labels is None:
            return image

        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return image, target

# Transforms
# Training pipeline: geometric and colour augmentation on the PIL image,
# then conversion to a normalized tensor, then random erasing.
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    # RandomErasing works on tensors only, so it must come after ToTensor;
    # placed before it, the transform fails on the PIL image whenever it fires.
    transforms.RandomErasing(p=0.25, scale=(0.02, 0.15)),
])

# Validation pipeline: deterministic resize + center crop.
val_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Test-time-augmentation pipeline: a milder random variant of the training
# augmentation (no rotation, no erasing, weaker colour jitter).
tta_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

## Model Definition

In [None]:
class DogCatClassifier(nn.Module):
    """ImageNet-pretrained ResNet50 with a dropout + linear head.

    The pretrained backbone starts fully frozen; only the freshly created
    head is trainable until ``unfreeze_layers`` is called.

    Args:
        num_classes: Number of output units of the head.
    """

    def __init__(self, num_classes=1):
        super().__init__()
        self.backbone = models.resnet50(pretrained=True)

        # Freeze every pretrained weight before the new head is attached.
        for weight in self.backbone.parameters():
            weight.requires_grad_(False)

        # Swap the original classifier for dropout + a new linear layer.
        in_features = self.backbone.fc.in_features
        classifier = nn.Linear(in_features, num_classes)
        self.backbone.fc = nn.Sequential(nn.Dropout(0.3), classifier)

        # Xavier-normal weights and zero bias for the new linear layer.
        nn.init.xavier_normal_(classifier.weight)
        nn.init.constant_(classifier.bias, 0)

    def forward(self, x):
        """Run ``x`` through the backbone, including the replaced head."""
        return self.backbone(x)

    def unfreeze_layers(self, layer_names):
        """Enable gradients for backbone parameters matching ``layer_names``.

        A parameter is unfrozen when any entry of ``layer_names`` occurs as a
        substring of its qualified name.
        """
        for param_name, param in self.backbone.named_parameters():
            if any(layer_name in param_name for layer_name in layer_names):
                param.requires_grad = True

# Per-group learning rates for the optimizer
def get_optimizer_params(model, backbone_lr, head_lr):
    """Build two optimizer parameter groups with separate learning rates.

    Args:
        model: Object exposing ``backbone`` with a ``fc`` head submodule.
        backbone_lr: Learning rate for all non-head backbone parameters.
        head_lr: Learning rate for the parameters of ``backbone.fc``.

    Returns:
        A list ``[backbone_group, head_group]`` of dicts with ``params`` and
        ``lr`` keys.
    """
    # Parameters of the final classifier.
    head_group = [param for _, param in model.backbone.fc.named_parameters()]

    # Everything else: any backbone parameter whose name lacks 'fc'.
    body_group = [
        param
        for name, param in model.backbone.named_parameters()
        if 'fc' not in name
    ]

    return [
        {'params': body_group, 'lr': backbone_lr},
        {'params': head_group, 'lr': head_lr},
    ]

## Training Functions

In [None]:
def train_epoch(model, loader, optimizer, criterion, device, mixup_fn, scheduler=None):
    """Train ``model`` for one pass over ``loader`` using Mixup.

    Args:
        model: Network to train; expected to emit one logit per sample with
            shape ``(batch, 1)``.
        loader: Iterable of ``(data, target)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable ``criterion(output, target)``.
        device: Device the batches are moved to.
        mixup_fn: Callable returning ``(mixed_data, target_a, target_b, lam)``.
        scheduler: Optional LR scheduler, stepped once per batch.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    total_loss = 0

    for data, target in loader:
        data, target = data.to(device), target.to(device)

        # Apply Mixup
        data, target_a, target_b, lam = mixup_fn(data, target)

        optimizer.zero_grad()
        # Drop only the trailing logit dimension. A bare squeeze() would also
        # remove the batch dimension of a final batch of size 1 and make the
        # output shape disagree with the target shape.
        output = model(data).squeeze(-1)
        loss = mixup_criterion(criterion, output, target_a, target_b, lam)
        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item()

    return total_loss / len(loader)

def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Network to evaluate; expected to emit one logit per sample
            with shape ``(batch, 1)``.
        loader: Iterable of ``(data, target)`` batches.
        criterion: Loss callable ``criterion(output, target)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(avg_loss, logloss, all_preds)``: the mean per-batch loss,
        the log loss over all samples, and the list of sigmoid probabilities.
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            # Drop only the trailing logit dimension so a batch of size 1
            # stays 1-D; a bare squeeze() would yield a 0-dim tensor that
            # neither matches the target shape nor can be iterated below.
            output = model(data).squeeze(-1)
            loss = criterion(output, target)

            total_loss += loss.item()
            all_preds.extend(torch.sigmoid(output).cpu().numpy())
            all_targets.extend(target.cpu().numpy())

    avg_loss = total_loss / len(loader)
    logloss = log_loss(all_targets, all_preds)

    return avg_loss, logloss, all_preds

def predict_with_tta(model, test_loader, device, n_augmentations=5):
    """Average sigmoid predictions over several passes through ``test_loader``.

    This function applies no augmentation itself. Each pass re-iterates
    ``test_loader``, so a loader whose dataset uses a random transform
    produces a fresh view of every image per pass. Running the same batch
    tensor through an eval-mode model repeatedly would only reproduce
    identical outputs.

    Args:
        model: Network to evaluate; expected to emit one logit per sample
            with shape ``(batch, 1)``.
        test_loader: Re-iterable yielding batches of inputs in the same
            order on every pass (i.e. no shuffling).
        device: Device the batches are moved to.
        n_augmentations: Number of passes to average; must be >= 1.

    Returns:
        1-D NumPy array with one averaged probability per sample.

    Raises:
        ValueError: If ``n_augmentations`` is smaller than 1.
    """
    if n_augmentations < 1:
        raise ValueError("n_augmentations must be at least 1")

    model.eval()
    summed_preds = None

    with torch.no_grad():
        for _ in range(n_augmentations):
            pass_preds = []
            for data in test_loader:
                data = data.to(device)
                # Drop only the trailing logit dimension so a batch of
                # size 1 stays 1-D and can be concatenated.
                output = model(data).squeeze(-1)
                pass_preds.append(torch.sigmoid(output).cpu().numpy())

            if pass_preds:
                pass_array = np.concatenate(pass_preds)
            else:
                pass_array = np.empty(0, dtype=np.float32)

            # Accumulate per-sample sums; alignment relies on a stable order.
            if summed_preds is None:
                summed_preds = pass_array
            else:
                summed_preds = summed_preds + pass_array

    return summed_preds / n_augmentations

## Cross-Validation Training

In [None]:
# 5-fold stratified CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
fold_models = []

# Training configuration
EPOCHS_HEAD = 3
EPOCHS_FINETUNE = 12
BATCH_SIZE = 64
BACKBONE_LR = 0.00002
HEAD_LR = 0.0002
WARMUP_EPOCHS = 2

print(f"Training configuration:")
print(f"- Epochs: {EPOCHS_HEAD} (head) + {EPOCHS_FINETUNE} (fine-tune) = {EPOCHS_HEAD + EPOCHS_FINETUNE} total")
print(f"- Batch size: {BATCH_SIZE}")
print(f"- Learning rates: backbone={BACKBONE_LR}, head={HEAD_LR}")
print(f"- Mixup alpha: 0.2")
print(f"- TTA augmentations: 5")
print("="*60)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_files, train_labels)):
    print(f"\nFold {fold + 1}/5")
    print("="*40)

    # Prepare data
    train_files_fold = [train_files[i] for i in train_idx]
    train_labels_fold = [train_labels[i] for i in train_idx]
    val_files_fold = [train_files[i] for i in val_idx]
    val_labels_fold = [train_labels[i] for i in val_idx]

    train_dataset = DogCatDataset(train_files_fold, train_labels_fold, train_transform, is_training=True)
    val_dataset = DogCatDataset(val_files_fold, val_labels_fold, val_transform, is_training=False)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

    # Initialize model
    model = DogCatClassifier().cuda()

    # Loss and optimizer
    criterion = nn.BCEWithLogitsLoss()
    mixup_fn = Mixup(alpha=0.2)

    # Phase 1: Train head only
    print("Phase 1: Training head only...")
    head_params = model.backbone.fc.parameters()
    optimizer_head = optim.AdamW(head_params, lr=HEAD_LR, weight_decay=0.05)

    for epoch in range(EPOCHS_HEAD):
        train_loss = train_epoch(model, train_loader, optimizer_head, criterion, 'cuda', mixup_fn)
        val_loss, val_logloss, _ = validate(model, val_loader, criterion, 'cuda')
        print(f"  Epoch {epoch+1}/{EPOCHS_HEAD} - Train Loss: {train_loss:.4f}, Val LogLoss: {val_logloss:.4f}")

    # Phase 2: Fine-tune the head together with layer3 and layer4
    # (earlier backbone layers stay frozen).
    print("Phase 2: Fine-tuning all layers...")
    model.unfreeze_layers(['layer3', 'layer4'])

    optimizer_params = get_optimizer_params(model, BACKBONE_LR, HEAD_LR)
    optimizer_ft = optim.AdamW(optimizer_params, weight_decay=0.05)

    # Cosine annealing with warmup; the schedule is stepped once per batch,
    # so both horizons are measured in optimizer steps.
    total_steps = EPOCHS_FINETUNE * len(train_loader)
    warmup_steps = WARMUP_EPOCHS * len(train_loader)

    def lr_lambda(current_step):
        """LR multiplier: linear warmup, then cosine decay towards zero."""
        if current_step < warmup_steps:
            return float(current_step) / float(max(1, warmup_steps))
        progress = float(current_step - warmup_steps) / float(max(1, total_steps - warmup_steps))
        return 0.5 * (1.0 + np.cos(np.pi * progress))

    scheduler = optim.lr_scheduler.LambdaLR(optimizer_ft, lr_lambda)

    best_logloss = float('inf')
    best_model_state = None

    for epoch in range(EPOCHS_FINETUNE):
        train_loss = train_epoch(model, train_loader, optimizer_ft, criterion, 'cuda', mixup_fn, scheduler)
        val_loss, val_logloss, _ = validate(model, val_loader, criterion, 'cuda')

        print(f"  Epoch {epoch+1}/{EPOCHS_FINETUNE} - Train Loss: {train_loss:.4f}, Val LogLoss: {val_logloss:.4f}")

        if val_logloss < best_logloss:
            best_logloss = val_logloss
            # Clone every tensor: state_dict() hands back references to the
            # live weights, so a shallow dict copy would silently track later
            # epochs instead of preserving this snapshot.
            best_model_state = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }

    # Load best model (no snapshot exists if no epoch produced a finite,
    # improving validation log loss).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    fold_scores.append(best_logloss)
    fold_models.append(model)

    print(f"Fold {fold + 1} Best LogLoss: {best_logloss:.4f}")

print("\n" + "="*60)
print(f"Cross-validation completed!")
print(f"Mean LogLoss: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in fold_scores]}")