# Multi-Architecture Ensemble for Footprint Classification

This notebook trains an ensemble of 3 different architectures (ResNet-34, EfficientNet-B0, ConvNeXt-Tiny) and generates a Kaggle submission.

In [26]:
# Cell 1: Imports & Setup
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torchvision.transforms.functional as TF
import timm
from pathlib import Path
from PIL import Image
import numpy as np
import pandas as pd
import random
import copy
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')

Using device: cuda
GPU: NVIDIA GeForce RTX 3080


In [27]:
# Cell 2: Configuration

# Ensemble of 3 ResNet-34 models with different seeds for diversity
MODEL_CONFIGS = [
    {'name': 'ResNet-34-seed42', 'type': 'resnet34', 'seed': 42},
    {'name': 'ResNet-34-seed123', 'type': 'resnet34', 'seed': 123},
    {'name': 'ResNet-34-seed456', 'type': 'resnet34', 'seed': 456},
]

IMG_SIZE = 224
BATCH_SIZE = 128
EPOCHS = 25
LR = 0.0005
WEIGHT_DECAY = 1e-4
DROPOUT = 0.5  # Changed from 0.3 to match main notebook
EARLY_STOP_PATIENCE = 7
VAL_SPLIT = 0.2

# Data paths
DATA_DIR = Path('./data')
TRAIN_DIR = DATA_DIR / 'train'
TEST_DIR = DATA_DIR / 'test'

print(f'Ensemble Configuration:')
print(f'  Model: ResNet-34 x 3 (different seeds)')
print(f'  Seeds: {[m["seed"] for m in MODEL_CONFIGS]}')
print(f'  Image size: {IMG_SIZE}')
print(f'  Batch size: {BATCH_SIZE}')
print(f'  Epochs: {EPOCHS}')
print(f'  Learning rate: {LR}')
print(f'  Dropout: {DROPOUT}')
print(f'  Early stopping patience: {EARLY_STOP_PATIENCE}')

Ensemble Configuration:
  Model: ResNet-34 x 3 (different seeds)
  Seeds: [42, 123, 456]
  Image size: 224
  Batch size: 128
  Epochs: 25
  Learning rate: 0.0005
  Dropout: 0.5
  Early stopping patience: 7


In [28]:
# Cell 3: Data Loading

def set_seed(seed):
    """Set all random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def load_data(data_dir):
    """Load image paths and labels from directory structure"""
    paths, labels = [], []
    # Folders are named '0' (Female) and '1' (Male)
    for class_idx in [0, 1]:
        class_dir = data_dir / str(class_idx)
        if class_dir.exists():
            for img_path in class_dir.glob('*'):
                if img_path.suffix.lower() in ['.png', '.jpg', '.jpeg']:
                    paths.append(img_path)
                    labels.append(class_idx)
    return paths, labels

# Check if grayscale
sample_img = Image.open(list(TRAIN_DIR.glob('*/*'))[0])
IS_GRAYSCALE = sample_img.mode == 'L'
INPUT_CHANNELS = 1 if IS_GRAYSCALE else 3
print(f'Image mode: {"Grayscale" if IS_GRAYSCALE else "RGB"}')

# Normalization
mean_std = ([0.5], [0.5]) if IS_GRAYSCALE else ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

# Training transforms - simpler augmentation (matches main notebook's best config)
train_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomHorizontalFlip(p=0.5),  # Only HorizontalFlip - best from main notebook
    transforms.ToTensor(),
    transforms.Normalize(*mean_std),
])

# Validation/test transforms
val_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(*mean_std)
])

class FootprintDataset(Dataset):
    def __init__(self, paths, labels, transform=None):
        self.paths = paths
        self.labels = labels
        self.transform = transform
    
    def __len__(self):
        return len(self.paths)
    
    def __getitem__(self, idx):
        img = Image.open(self.paths[idx]).convert('L' if IS_GRAYSCALE else 'RGB')
        if self.transform:
            img = self.transform(img)
        return img, self.labels[idx]

# Load and split data
train_paths, train_labels = load_data(TRAIN_DIR)
print(f'Total training images: {len(train_paths)}')
print(f'Class distribution: 0(Female)={sum(1 for l in train_labels if l==0)}, 1(Male)={sum(1 for l in train_labels if l==1)}')

Image mode: RGB
Total training images: 1573
Class distribution: 0(Female)=845, 1(Male)=728


In [29]:
# Cell 4: Model Factory

def create_model(model_type):
    """Create a model based on the specified type"""
    
    if model_type == 'resnet34':
        model = models.resnet34(weights=models.ResNet34_Weights.IMAGENET1K_V1)
        if INPUT_CHANNELS == 1:
            model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        model.fc = nn.Sequential(
            nn.Dropout(DROPOUT),
            nn.Linear(model.fc.in_features, 2)
        )
        
    elif model_type == 'efficientnet_b0':
        model = timm.create_model('efficientnet_b0', pretrained=True, 
                                   num_classes=2, in_chans=INPUT_CHANNELS, drop_rate=DROPOUT)
        
    elif model_type == 'mobilenetv3':
        model = models.mobilenet_v3_large(weights=models.MobileNet_V3_Large_Weights.IMAGENET1K_V1)
        if INPUT_CHANNELS == 1:
            model.features[0][0] = nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1, bias=False)
        model.classifier[3] = nn.Linear(model.classifier[3].in_features, 2)
        
    elif model_type == 'convnext_tiny':
        model = timm.create_model('convnext_tiny', pretrained=True, 
                                   num_classes=2, in_chans=INPUT_CHANNELS, drop_rate=DROPOUT)
    else:
        raise ValueError(f'Unknown model type: {model_type}')
    
    return model.to(device)

# Test model creation for each architecture
print('Testing model creation:')
for config in MODEL_CONFIGS:
    test_model = create_model(config['type'])
    params = sum(p.numel() for p in test_model.parameters())
    print(f"  {config['name']}: {params:,} parameters")
    del test_model
    torch.cuda.empty_cache() if torch.cuda.is_available() else None

Testing model creation:
  ResNet-34-seed42: 21,285,698 parameters
  ResNet-34-seed123: 21,285,698 parameters
  ResNet-34-seed456: 21,285,698 parameters


In [30]:
# Cell 5: Training Function

def train_single_model(model_config, train_paths, train_labels):
    """Train a single model with the given configuration"""
    model_name = model_config['name']
    model_type = model_config['type']
    seed = model_config['seed']
    
    print(f'\n{"="*60}')
    print(f'Training {model_name} (seed={seed})')
    print(f'{"="*60}')
    
    # Set seed for reproducibility
    set_seed(seed)
    
    # Split data with this seed
    t_paths, v_paths, t_labels, v_labels = train_test_split(
        train_paths, train_labels, test_size=VAL_SPLIT, 
        stratify=train_labels, random_state=seed
    )
    
    # Create datasets and loaders
    train_dataset = FootprintDataset(t_paths, t_labels, train_transform)
    val_dataset = FootprintDataset(v_paths, v_labels, val_transform)
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    
    # Create model
    model = create_model(model_type)
    
    # Class weights for imbalanced data
    n_female = sum(1 for l in t_labels if l == 0)
    n_male = sum(1 for l in t_labels if l == 1)
    total = n_female + n_male
    class_weights = torch.tensor([
        total / (2 * n_female),
        total / (2 * n_male)
    ]).to(device)
    
    # Loss, optimizer, scheduler (StepLR - matches main notebook)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)  # StepLR like main notebook
    
    # Training loop with early stopping
    best_val_acc = 0.0
    best_model_wts = None
    epochs_no_improve = 0
    
    for epoch in range(EPOCHS):
        # Training phase
        model.train()
        train_loss, train_correct, train_total = 0.0, 0, 0
        
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * images.size(0)
            _, predicted = outputs.max(1)
            train_correct += predicted.eq(labels).sum().item()
            train_total += labels.size(0)
        
        train_acc = train_correct / train_total
        
        # Validation phase
        model.eval()
        val_correct, val_total = 0, 0
        
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = outputs.max(1)
                val_correct += predicted.eq(labels).sum().item()
                val_total += labels.size(0)
        
        val_acc = val_correct / val_total
        scheduler.step()
        
        # Save best model and check early stopping
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_wts = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
        
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f'Epoch [{epoch+1}/{EPOCHS}] - Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')
        
        # Early stopping
        if epochs_no_improve >= EARLY_STOP_PATIENCE:
            print(f'Early stopping at epoch {epoch+1} (no improvement for {EARLY_STOP_PATIENCE} epochs)')
            break
    
    # Load best weights
    model.load_state_dict(best_model_wts)
    print(f'Best validation accuracy: {best_val_acc:.4f}')
    
    return model, best_val_acc

In [31]:
# Cell 6: Train Ensemble

print(f'Training ensemble of {len(MODEL_CONFIGS)} different architectures...')
print(f'Models: {[m["name"] for m in MODEL_CONFIGS]}')

ensemble_models = []
ensemble_accuracies = []
ensemble_names = []

for i, config in enumerate(MODEL_CONFIGS):
    print(f'\n[Model {i+1}/{len(MODEL_CONFIGS)}]')
    
    model, val_acc = train_single_model(config, train_paths, train_labels)
    ensemble_models.append(model)
    ensemble_accuracies.append(val_acc)
    ensemble_names.append(config['name'])
    
    # Save checkpoint
    safe_name = config['name'].replace('-', '_').replace(' ', '_')
    torch.save(model.state_dict(), f'ensemble_{safe_name}.pth')
    print(f'Saved: ensemble_{safe_name}.pth')

print(f'\n{"="*60}')
print('ENSEMBLE TRAINING COMPLETE')
print(f'{"="*60}')
for name, acc in zip(ensemble_names, ensemble_accuracies):
    print(f'  {name}: {acc:.4f}')
print(f'\nMean accuracy: {np.mean(ensemble_accuracies):.4f}')

Training ensemble of 3 different architectures...
Models: ['ResNet-34-seed42', 'ResNet-34-seed123', 'ResNet-34-seed456']

[Model 1/3]

Training ResNet-34-seed42 (seed=42)
Epoch [1/25] - Train Acc: 0.6661, Val Acc: 0.5079
Epoch [5/25] - Train Acc: 0.9245, Val Acc: 0.6635
Epoch [10/25] - Train Acc: 0.9897, Val Acc: 0.9016
Epoch [15/25] - Train Acc: 0.9992, Val Acc: 0.8921
Early stopping at epoch 17 (no improvement for 7 epochs)
Best validation accuracy: 0.9016
Saved: ensemble_ResNet_34_seed42.pth

[Model 2/3]

Training ResNet-34-seed123 (seed=123)
Epoch [1/25] - Train Acc: 0.6479, Val Acc: 0.5683
Epoch [5/25] - Train Acc: 0.9237, Val Acc: 0.7556
Epoch [10/25] - Train Acc: 0.9905, Val Acc: 0.8698
Epoch [15/25] - Train Acc: 0.9992, Val Acc: 0.8825
Epoch [20/25] - Train Acc: 1.0000, Val Acc: 0.8730
Early stopping at epoch 21 (no improvement for 7 epochs)
Best validation accuracy: 0.8889
Saved: ensemble_ResNet_34_seed123.pth

[Model 3/3]

Training ResNet-34-seed456 (seed=456)
Epoch [1/25] - 

In [None]:
# Cell 7: TTA Prediction Functions (HorizontalFlip only - matches main notebook)

class TestDataset(Dataset):
    """Dataset for test images (no labels)"""
    def __init__(self, paths, transform):
        self.paths = paths
        self.transform = transform
    
    def __len__(self):
        return len(self.paths)
    
    def __getitem__(self, idx):
        img = Image.open(self.paths[idx]).convert('L' if IS_GRAYSCALE else 'RGB')
        img_transformed = self.transform(img)
        return img_transformed, self.paths[idx].stem  # .stem = filename WITHOUT extension

def apply_tta(model, image_tensor):
    """Apply test-time augmentation (horizontal flip only)
    
    Returns averaged probabilities across:
    - Original image
    - Horizontal flip
    
    Note: Rotations removed because footprints are NOT rotationally invariant
    """
    model.eval()
    predictions = []
    
    with torch.no_grad():
        # Original
        pred = F.softmax(model(image_tensor), dim=1)
        predictions.append(pred)
        
        # Horizontal flip only
        flipped = torch.flip(image_tensor, dims=[3])
        pred = F.softmax(model(flipped), dim=1)
        predictions.append(pred)
    
    # Average predictions
    return torch.stack(predictions).mean(dim=0)

def ensemble_predict_with_tta(models, image_tensor):
    """Get ensemble prediction with TTA for each model"""
    all_predictions = []
    
    for model in models:
        model_pred = apply_tta(model, image_tensor)
        all_predictions.append(model_pred)
    
    # Average across all models
    ensemble_pred = torch.stack(all_predictions).mean(dim=0)
    return ensemble_pred

print('TTA functions defined:')
print('  - 2 augmentations per model (original + horizontal flip)')
print(f'  - {len(MODEL_CONFIGS)} models in ensemble')
print(f'  - Total predictions averaged: {2 * len(MODEL_CONFIGS)}')

In [33]:
# Cell 8: Generate Submission

# Load test images
test_paths = sorted(list(TEST_DIR.glob('*.png')) + list(TEST_DIR.glob('*.jpg')) + list(TEST_DIR.glob('*.jpeg')))
print(f'Found {len(test_paths)} test images')

# Create test dataset and loader
test_dataset = TestDataset(test_paths, val_transform)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0)

# Set all models to eval mode
for model in ensemble_models:
    model.eval()

# Generate predictions
filenames = []
predictions = []

print('\nGenerating predictions with ensemble + TTA...')
for image, filename in tqdm(test_loader):
    image = image.to(device)
    
    # Get ensemble prediction with TTA
    probs = ensemble_predict_with_tta(ensemble_models, image)
    pred_class = probs.argmax(dim=1).item()
    
    filenames.append(filename[0])
    predictions.append(pred_class)

print(f'Generated {len(predictions)} predictions')

Found 1055 test images

Generating predictions with ensemble + TTA...


100%|██████████| 1055/1055 [01:17<00:00, 13.62it/s]

Generated 1055 predictions





In [34]:
# Cell 9: Save Submission

# Create submission DataFrame
submission_df = pd.DataFrame({
    'filename': filenames,
    'label': predictions
})

# Sort by filename for consistency
submission_df = submission_df.sort_values('filename').reset_index(drop=True)

# Save to CSV
submission_df.to_csv('submission.csv', index=False)

print('Submission saved to submission.csv')
print(f'\nTotal predictions: {len(submission_df)}')
print(f'\nClass distribution (0=Female, 1=Male):')
print(submission_df['label'].value_counts())
print(f'\nFirst 10 rows:')
print(submission_df.head(10))

Submission saved to submission.csv

Total predictions: 1055

Class distribution (0=Female, 1=Male):
label
0    632
1    423
Name: count, dtype: int64

First 10 rows:
   filename  label
0  img_0003      1
1  img_0004      1
2  img_0005      0
3  img_0006      0
4  img_0009      0
5  img_0010      0
6  img_0011      0
7  img_0018      0
8  img_0019      0
9  img_0022      1
