# Experiment 002: Longer Training with Early Stopping

## Goal: Address the primary bottleneck of undertraining

This experiment increases training duration from 3 to 15 epochs with early stopping (patience=3) to allow models to converge properly. Based on the baseline analysis, validation loss was still decreasing at epoch 3, indicating massive undertraining.

### Changes from Baseline:
- Training epochs: 3 → 15 (with early stopping)
- Added model checkpointing to save best model per fold
- Added EarlyStopping callback with patience=3
- Keep all other parameters the same as the baseline for an isolated comparison

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from PIL import Image
import warnings
# Suppress library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Seed NumPy, PyTorch (CPU) and PyTorch (CUDA) with the same fixed value
for _seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(42)

# Report the PyTorch build and which accelerator (if any) is visible
_cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {_cuda_ok}")
if not _cuda_ok:
    print("WARNING: No GPU available, using CPU")
else:
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {_gpu_props.total_memory / 1e9:.1f} GB")

PyTorch version: 2.2.0+cu118
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB
GPU Memory: 85.1 GB


In [2]:
# Locations of the dataset on disk
DATA_DIR = '/home/data'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
TEST_DIR = os.path.join(DATA_DIR, 'test')

# Confirm both image folders are present before doing any work
_train_dir_found = os.path.exists(TRAIN_DIR)
_test_dir_found = os.path.exists(TEST_DIR)
print(f"Train directory exists: {_train_dir_found}")
print(f"Test directory exists: {_test_dir_found}")

# Show the first five entries of the training folder as a quick sanity check
_train_listing = os.listdir(TRAIN_DIR)
train_files = _train_listing[:5]
print(f"Sample training files: {train_files}")

Train directory exists: True
Test directory exists: True
Sample training files: ['dog.5.jpg', 'cat.8112.jpg', 'cat.1197.jpg', 'dog.8491.jpg', 'dog.9129.jpg']


In [3]:
# Create dataset class
class DogsCatsDataset(Dataset):
    """Map-style dataset that loads images from disk by file path.

    Args:
        file_paths: Indexable collection of image file paths.
        labels: Optional indexable collection of labels, indexed in parallel
            with ``file_paths``. When ``None`` items are returned without a
            label (used for the test set in this notebook).
        transform: Optional callable applied to each loaded RGB PIL image.
    """

    def __init__(self, file_paths, labels=None, transform=None):
        self.file_paths = file_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of image paths in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load item ``idx``.

        Returns:
            ``(image, label)`` when labels were supplied, otherwise ``image``.
        """
        # Use a context manager so the underlying file handle is released
        # deterministically, even if decoding/conversion raises.
        with Image.open(self.file_paths[idx]) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        if self.labels is None:
            return image
        return image, self.labels[idx]

# Data transforms (same as baseline)
def _tensor_and_normalize():
    """Return fresh tensor-conversion and per-channel normalisation steps.

    These two steps close every pipeline below.
    """
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]


# Training pipeline: resize, light random augmentation, then normalise
train_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.1, contrast=0.1),
    ]
    + _tensor_and_normalize()
)

# Validation pipeline: deterministic resize and normalise only
val_transform = transforms.Compose(
    [transforms.Resize((224, 224))] + _tensor_and_normalize()
)

# Test images go through exactly the same pipeline object as validation images
test_transform = val_transform

In [4]:
# Prepare training data
print("Loading training data...")
# Sort the listing: os.listdir returns entries in arbitrary, filesystem-dependent
# order, which would make the seeded cross-validation folds differ between
# machines even though the split itself uses a fixed random_state.
train_files = [
    os.path.join(TRAIN_DIR, name)
    for name in sorted(os.listdir(TRAIN_DIR))
    if name.endswith('.jpg')
]
# The class is encoded as the filename prefix (e.g. 'dog.5.jpg' -> 1,
# 'cat.8112.jpg' -> 0); match the prefix rather than any substring.
train_labels = [1 if os.path.basename(path).startswith('dog') else 0 for path in train_files]

print(f"Total training images: {len(train_files)}")
print(f"Dog images: {sum(train_labels)}")
print(f"Cat images: {len(train_labels) - sum(train_labels)}")

# Convert to numpy arrays so the fold index arrays can be used for fancy indexing
train_files = np.array(train_files)
train_labels = np.array(train_labels)

Loading training data...
Total training images: 22500
Dog images: 11258
Cat images: 11242


In [5]:
# Create model function (same architecture as baseline)
def create_model():
    """Build a frozen, ImageNet-pretrained ResNet50 with a new binary head.

    All pretrained parameters are frozen; only the replacement ``fc`` head
    (dropout -> linear -> sigmoid) has trainable parameters.

    Returns:
        The model, whose output is one sigmoid probability per input image.
    """
    # `pretrained=True` is deprecated in torchvision >= 0.13. IMAGENET1K_V1 is
    # the checkpoint that flag used to load, so the weights are unchanged.
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

    # Freeze every pretrained parameter (not just the early layers)
    model.requires_grad_(False)

    # Replace the final layer for binary classification; the new layers are
    # created after freezing and therefore remain trainable.
    num_features = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(num_features, 1),
        nn.Sigmoid()
    )

    return model

# Training function with early stopping and checkpointing
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device, max_epochs=15, patience=3):
    """Train ``model`` with early stopping and restore its best weights.

    After every epoch the model is evaluated on ``val_loader``. A snapshot of
    the weights is kept whenever the mean per-batch validation loss improves,
    and training stops once ``patience`` consecutive epochs bring no
    improvement. The best snapshot is loaded back before returning.

    Args:
        model: Network to train. Its output is flattened to 1-D and passed to
            ``log_loss``, so it is assumed to be one probability per sample.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        criterion: Loss called as ``criterion(output, target)``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Scheduler stepped once per epoch with the validation loss.
        device: Device that every batch is moved to.
        max_epochs: Upper bound on the number of epochs.
        patience: Number of consecutive non-improving epochs tolerated.

    Returns:
        Tuple ``(model, best_val_loss, training_history)``. The history holds
        one dict per completed epoch with keys ``epoch``, ``train_loss``,
        ``val_loss`` and ``val_log_loss``.
    """
    import copy  # local import so this definition is self-contained

    best_val_loss = float('inf')
    best_model_state = None
    epochs_no_improve = 0
    training_history = []

    print(f"Training for up to {max_epochs} epochs with early stopping (patience={patience})...\n")

    for epoch in range(max_epochs):
        # Training phase
        model.train()
        train_loss = 0

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device).float()

            optimizer.zero_grad()
            # reshape(-1) rather than squeeze(): the output stays 1-D even
            # when a batch contains a single sample.
            output = model(data).reshape(-1)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            if batch_idx % 100 == 0:
                print(f'Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}')

        # Validation phase
        model.eval()
        val_loss = 0
        val_preds = []
        val_targets = []

        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device).float()
                output = model(data).reshape(-1)
                loss = criterion(output, target)
                val_loss += loss.item()

                val_preds.extend(output.cpu().numpy())
                val_targets.extend(target.cpu().numpy())

        # Calculate metrics (losses are averaged over batches)
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_log_loss = log_loss(val_targets, val_preds)

        # Record the epoch before the early-stopping check so that the final
        # epoch is not dropped from the history when training stops early.
        training_history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'val_log_loss': val_log_loss
        })

        print(f'Epoch {epoch+1}/{max_epochs}:')
        print(f'  Train Loss: {train_loss:.4f}')
        print(f'  Val Loss: {val_loss:.4f}')
        print(f'  Val Log Loss: {val_log_loss:.4f}')

        scheduler.step(val_loss)

        # Check for improvement
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Deep copy: state_dict() returns references to the live tensors,
            # so a shallow .copy() would keep changing as training continues.
            best_model_state = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
            print(f'  ✓ New best validation loss: {best_val_loss:.4f}')
        else:
            epochs_no_improve += 1
            print(f'  ✗ No improvement for {epochs_no_improve} epoch(s)')

        # Early stopping
        if epochs_no_improve >= patience:
            print(f'\nEarly stopping triggered after {epoch+1} epochs')
            break

        print()

    # Load best model state
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f"Loaded best model from epoch with val loss: {best_val_loss:.4f}")

    return model, best_val_loss, training_history

In [None]:
# Cross-validation setup: stratified, shuffled, fixed seed
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Train on the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Containers filled in by the cross-validation loop:
# per-fold scores, per-fold training histories, and one out-of-fold
# prediction slot per training image.
cv_scores, all_training_histories = [], []
oof_predictions = np.zeros(len(train_files))

print(f"\nStarting {n_splits}-fold cross-validation...\n")

In [None]:
# Run cross-validation
# Persist each fold's best weights to disk (the per-fold checkpointing
# described in the experiment notes at the top of this notebook).
CHECKPOINT_DIR = 'checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_files, train_labels), start=1):
    print(f"\n{'='*60}")
    print(f"FOLD {fold}/{n_splits}")
    print(f"{'='*60}")

    # Split data
    X_train, X_val = train_files[train_idx], train_files[val_idx]
    y_train, y_val = train_labels[train_idx], train_labels[val_idx]

    print(f"Training samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")

    # Create datasets (augmentation on the training split only)
    train_dataset = DogsCatsDataset(X_train, y_train, transform=train_transform)
    val_dataset = DogsCatsDataset(X_val, y_val, transform=val_transform)

    # Create data loaders; validation is not shuffled so that predictions
    # line up with val_idx when written into oof_predictions below.
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)

    # Create a fresh model for this fold
    model = create_model()
    model = model.to(device)

    # Loss and optimizer (only the new head is optimised)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

    # Train model with early stopping
    model, best_val_loss, fold_history = train_model(
        model, train_loader, val_loader, criterion, optimizer, scheduler,
        device, max_epochs=15, patience=3
    )

    # Store training history
    all_training_histories.append(fold_history)

    # Save the weights returned by train_model for this fold
    checkpoint_path = os.path.join(CHECKPOINT_DIR, f'exp002_fold{fold}_best.pth')
    torch.save(model.state_dict(), checkpoint_path)
    print(f"Saved fold {fold} checkpoint to: {checkpoint_path}")

    # Calculate final validation log loss with the returned model
    model.eval()
    val_preds = []
    val_targets = []

    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.to(device), target.to(device).float()
            # reshape(-1) rather than squeeze(): stays 1-D for a batch of one
            output = model(data).reshape(-1)
            val_preds.extend(output.cpu().numpy())
            val_targets.extend(target.cpu().numpy())

    final_val_log_loss = log_loss(val_targets, val_preds)
    oof_predictions[val_idx] = val_preds
    cv_scores.append(final_val_log_loss)

    print(f"Fold {fold} Final Log Loss: {final_val_log_loss:.4f}")
    print(f"Fold {fold} Best Val Loss: {best_val_loss:.4f}")

# Overall CV score
print(f"\n{'='*60}")
print(f"CROSS-VALIDATION RESULTS")
print(f"{'='*60}")
print(f"Mean Log Loss: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"Individual folds: {cv_scores}")

# Show training history summary
print(f"\n{'='*60}")
print(f"TRAINING HISTORY SUMMARY")
print(f"{'='*60}")
for i, history in enumerate(all_training_histories):
    epochs = len(history)
    best_epoch = min(history, key=lambda x: x['val_log_loss'])
    print(f"Fold {i+1}: {epochs} epochs, best val log loss: {best_epoch['val_log_loss']:.4f} at epoch {best_epoch['epoch']}")

In [None]:
# Generate predictions on test set
print("\nGenerating predictions on test set...")

# Load test files (sorted so ids and predictions stay aligned deterministically)
test_files = [os.path.join(TEST_DIR, f) for f in sorted(os.listdir(TEST_DIR)) if f.endswith('.jpg')]
test_ids = [int(os.path.splitext(os.path.basename(f))[0]) for f in test_files]

print(f"Total test images: {len(test_files)}")

# Create test dataset and loader (no labels, no shuffling)
test_dataset = DogsCatsDataset(test_files, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

# Running sum of the per-fold predictions; divided by n_splits afterwards
test_predictions = np.zeros(len(test_files))

# Folder that is searched for per-fold weight files
CHECKPOINT_DIR = 'checkpoints'

for fold, (train_idx, val_idx) in enumerate(skf.split(train_files, train_labels), start=1):
    print(f"Generating predictions from fold {fold}...")

    # Recreate the model for this fold
    model = create_model()
    model = model.to(device)

    checkpoint_path = os.path.join(CHECKPOINT_DIR, f'exp002_fold{fold}_best.pth')
    if os.path.exists(checkpoint_path):
        # Reuse saved weights for this fold instead of spending a second full
        # training run on it.
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        print(f"Loaded fold {fold} weights from: {checkpoint_path}")
    else:
        # No saved weights for this fold: retrain it on the same split
        X_train, X_val = train_files[train_idx], train_files[val_idx]
        y_train, y_val = train_labels[train_idx], train_labels[val_idx]

        train_dataset = DogsCatsDataset(X_train, y_train, transform=train_transform)
        val_dataset = DogsCatsDataset(X_val, y_val, transform=val_transform)

        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)

        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

        # Train model with early stopping
        model, _, _ = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler,
                                  device, max_epochs=15, patience=3)

    # Generate predictions
    model.eval()
    fold_preds = []
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            # reshape(-1) rather than squeeze(): a final batch holding a single
            # image would otherwise become 0-d and break extend().
            output = model(data).reshape(-1)
            fold_preds.extend(output.cpu().numpy())

    test_predictions += np.array(fold_preds)

# Average predictions across folds
test_predictions /= n_splits

print(f"Test predictions shape: {test_predictions.shape}")
print(f"Test predictions range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")

In [None]:
# Create submission file
print("\nCreating submission file...")

# Build the id/label table and order its rows by id
submission = (
    pd.DataFrame({'id': test_ids, 'label': test_predictions})
    .sort_values('id')
    .reset_index(drop=True)
)

print(f"Submission shape: {submission.shape}")
print(f"Sample predictions:")
print(submission.head(10))

# Write the CSV, creating the output folder if it is missing
SUBMISSION_DIR = '/home/submission'
os.makedirs(SUBMISSION_DIR, exist_ok=True)
submission_path = os.path.join(SUBMISSION_DIR, 'submission.csv')
submission.to_csv(submission_path, index=False)

print(f"Submission saved to: {submission_path}")

# Compare column names and the min/max id against the sample submission
sample_sub = pd.read_csv('/home/data/sample_submission.csv')
_columns_match = list(submission.columns) == list(sample_sub.columns)
_ids_match = (
    submission['id'].min() == sample_sub['id'].min()
    and submission['id'].max() == sample_sub['id'].max()
)
print(f"\nSubmission format matches sample: {_columns_match}")
print(f"ID ranges match: {_ids_match}")