In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle

def load_cifar_batch(file):
    """Unpickle one CIFAR batch file and return its contents.

    Args:
        file: Path of the pickle file to read.

    Returns:
        The unpickled object. The file is read with ``encoding='bytes'``,
        which is why callers in this notebook index the result with bytes
        keys such as ``b'data'`` and ``b'labels'``.
    """
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    with open(file, 'rb') as fo:
        # Named `batch` (not `dict`) so the builtin is not shadowed.
        batch = pickle.load(fo, encoding='bytes')
    return batch

# Specify the folder where the CIFAR-10 batch files are
cifar10_dir = '/kaggle/input/deep-learning-spring-2025-project-1/cifar-10-python/cifar-10-batches-py'

# Load the label names (stored as bytes, hence the decode when plotting)
meta_data_dict = load_cifar_batch(os.path.join(cifar10_dir, 'batches.meta'))
label_names = meta_data_dict[b'label_names']

# Load one batch for demonstration (e.g., data_batch_1)
batch_1_dict = load_cifar_batch(os.path.join(cifar10_dir, 'data_batch_1'))
train_images = batch_1_dict[b'data']
train_labels = batch_1_dict[b'labels']

# Reshape the flat rows to (N, 3, 32, 32) and move channels last for imshow.
# -1 lets NumPy infer N instead of hard-coding the batch size.
train_images = train_images.reshape((-1, 3, 32, 32)).transpose(0, 2, 3, 1)

# Display the first 10 images and labels
plt.figure(figsize=(20, 4))
for i in range(10):
    plt.subplot(1, 10, i+1)
    plt.imshow(train_images[i])
    plt.title(label_names[train_labels[i]].decode('utf-8'))  # Decoding from bytes to string
    plt.axis('off')
plt.show()

In [3]:
# Read the test file. It has no labels; run your trained model on it to predict the outputs.

def load_cifar_batch(file):
    """Open `file` in binary mode and return the object pickled inside it.

    The file is unpickled with ``encoding='bytes'``.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

# Load the unlabeled test batch
test_pickle_path = '/kaggle/input/deep-learning-spring-2025-project-1/cifar_test_nolabel.pkl'
cifar10_batch = load_cifar_batch(test_pickle_path)

# Pull out the image array.
# Unlike the train images, no reshape to (number of images, width, height, channels)
# is required: the test data is already stored in (N x W x H x C) format.
images = cifar10_batch[b'data']

In [None]:
# Show the first 10 test images side by side (no titles: the test file has no labels)
fig, axes = plt.subplots(1, 10, figsize=(20, 4))
for idx in range(10):
    axes[idx].imshow(images[idx])
    axes[idx].axis('off')
plt.show()

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, OneCycleLR
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision
import torchvision.transforms as transforms
import time
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
from PIL import Image

# Fix the PyTorch seeds so repeated runs start from the same random state
SEED = 128
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    # Request deterministic cuDNN kernels and turn off its auto-tuner
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [6]:
class ResidualBlock(nn.Module):
    """Residual block: two 3x3 conv/BN layers plus a shortcut, summed and passed through ReLU.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution (and of the shortcut projection).
        activation: Accepted but not used by this implementation; ReLU is
            always applied.
    """

    def __init__(self, in_channels, out_channels, stride=1, activation=nn.ReLU):
        super().__init__()

        # Residual branch: conv3x3 -> BN -> ReLU -> conv3x3 -> BN.
        # Only the first convolution carries the stride.
        residual_layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3,
                      stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3,
                      stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
        ]
        self.main_path = nn.Sequential(*residual_layers)

        # Shortcut branch (identity or 1x1 projection)
        self.skip_connection = self._create_skip_connection(
            in_channels, out_channels, stride)

        # Non-linearity applied after the two branches are added
        self.activation = nn.ReLU(inplace=True)

    def _create_skip_connection(self, in_channels, out_channels, stride):
        """Return the shortcut module: a 1x1 conv + BN projection, or identity when shapes already match."""
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            projection = nn.Conv2d(in_channels, out_channels, kernel_size=1,
                                   stride=stride, bias=False)
            return nn.Sequential(projection, nn.BatchNorm2d(out_channels))
        return nn.Identity()

    def forward(self, x):
        """Return ReLU(main_path(x) + skip_connection(x))."""
        residual = self.main_path(x)
        shortcut = self.skip_connection(x)
        return self.activation(residual + shortcut)

In [7]:
class CustomResNet(nn.Module):
    """ResNet-style classifier: 3x3 stem, four residual stages, pooling, dropout, linear head.

    Args:
        block: Residual block class. It is instantiated as
            ``block(in_channels, out_channels, stride)`` for the first block of
            each stage and ``block(in_channels, out_channels)`` for the rest,
            and is expected to build its own shortcut projection.
        layers: Sequence of four ints, the number of blocks in each stage.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability applied to the pooled features.
    """

    def __init__(self, block, layers, num_classes=10, dropout_rate=0.1):
        super(CustomResNet, self).__init__()
        # Running channel count; _make_layer updates it after each stage
        self.in_channels = 48
        self.conv1 = nn.Conv2d(3, 48, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(48)
        self.relu = nn.ReLU(inplace=True)

        # 4 layers
        self.channels = [48, 96, 128, 364]
        self.layer1 = self._make_layer(block, self.channels[0], layers[0])
        self.layer2 = self._make_layer(block, self.channels[1], layers[1], stride=2)
        self.layer3 = self._make_layer(block, self.channels[2], layers[2], stride=2)
        self.layer4 = self._make_layer(block, self.channels[3], layers[3], stride=2)

        # Fixed 4x4 average pooling. The linear head takes channels[3] features,
        # so the map entering the pool must be 4x4 (e.g. 32x32 inputs when each
        # strided stage halves the resolution).
        self.avgpool = nn.AvgPool2d(kernel_size=4, stride=1, padding=0)

        # Add dropout before final layer
        self.dropout = nn.Dropout(dropout_rate)

        self.fc = nn.Linear(self.channels[3], num_classes)

        # Initialize weights: Kaiming-normal for convolutions, unit scale and
        # zero shift for batch norm. The linear layer keeps its default init.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage of `blocks` residual blocks as an nn.Sequential.

        Only the first block receives `stride` and the channel change; the
        remaining blocks map `out_channels` to `out_channels` at stride 1.
        """
        # The block builds its own shortcut, so no downsample module is created
        # here. A fourth positional argument would land in the block's
        # `activation` parameter (see ResidualBlock's signature).
        stage = [block(self.in_channels, out_channels, stride)]
        self.in_channels = out_channels

        for _ in range(1, blocks):
            stage.append(block(self.in_channels, out_channels))

        return nn.Sequential(*stage)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for image batch `x`."""
        # Stem
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        # Residual stages
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Pool, flatten everything except the batch dimension, then classify
        x = self.avgpool(x)
        x = torch.flatten(x, 1)

        x = self.dropout(x)

        x = self.fc(x)

        return x

def custom_resnet18():
    """Build a CustomResNet from ResidualBlock with two blocks in each of the four stages."""
    blocks_per_stage = [2, 2, 2, 2]
    return CustomResNet(ResidualBlock, blocks_per_stage)

In [8]:
def count_parameters(model):
    """Print a model summary and return the number of trainable parameters.

    The layer-by-layer table comes from the optional ``torchsummary`` package
    and is produced for a (3, 32, 32) input. If the package is missing the
    table is skipped, but the count is still printed and returned.

    Args:
        model: The ``nn.Module`` to inspect.

    Returns:
        int: Total number of elements in parameters with ``requires_grad=True``.
    """
    try:
        from torchsummary import summary
    except ImportError:
        summary = None
        print("torchsummary is not installed; skipping the layer-by-layer summary.")

    if summary is not None:
        # Run the summary on the device the model lives on; torchsummary
        # otherwise defaults to CUDA, which fails for a CPU model on a GPU host.
        first_param = next(model.parameters(), None)
        device_type = first_param.device.type if first_param is not None else "cpu"

        # Display the detailed model summary for CIFAR-10 input size (3, 32, 32)
        summary(model, (3, 32, 32), device=device_type)

    # Also return the total parameter count for convenience
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nTotal trainable parameters: {total_params:,}")
    return total_params

In [9]:
def train_epoch(model, dataloader, criterion, optimizer, device, scheduler=None):
    """Train `model` for one pass over `dataloader`.

    Args:
        model: Network to train; it is put into training mode.
        dataloader: Sized iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        scheduler: Optional LR scheduler. It is stepped per batch only when it
            is a ``OneCycleLR``; other schedulers are left to the caller.

    Returns:
        tuple: ``(mean loss per batch as float, accuracy in percent,
        elapsed seconds)``. Loss and accuracy are 0.0 for an empty dataloader.
    """
    model.train()
    # Kept separate from the per-batch loss tensor so the sum is not overwritten
    running_loss = 0.0
    correct = 0
    total = 0

    start_time = time.time()

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        batch_loss = criterion(outputs, labels)
        batch_loss.backward()
        optimizer.step()

        if scheduler is not None and isinstance(scheduler, OneCycleLR):
            scheduler.step()

        # .item() yields a Python float, so no autograd graph is retained
        running_loss += batch_loss.item()

        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    epoch_time = time.time() - start_time

    num_batches = len(dataloader)
    avg_loss = running_loss / num_batches if num_batches else 0.0
    accuracy = 100 * correct / total if total else 0.0
    return avg_loss, accuracy, epoch_time

def validate(model, dataloader, criterion, device, classes=None):
    """Evaluate `model` on `dataloader` without gradient tracking.

    Args:
        model: Network to evaluate; it is put into eval mode.
        dataloader: Sized iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.
        classes: Accepted for call compatibility; not used by this function.

    Returns:
        dict with keys ``'loss'`` (mean loss per batch, float), ``'accuracy'``
        (percent), ``'time'`` (seconds), ``'predictions'`` and ``'targets'``
        (flat lists of class indices in dataloader order). Loss and accuracy
        are 0.0 for an empty dataloader.
    """
    model.eval()
    # Kept separate from the per-batch loss tensor so the sum is not overwritten
    running_loss = 0.0
    correct = 0
    total = 0

    # For confusion matrix
    all_preds = []
    all_targets = []

    start_time = time.time()

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            running_loss += criterion(outputs, labels).item()

            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            # Collect for confusion matrix
            all_preds.extend(predicted.cpu().tolist())
            all_targets.extend(labels.cpu().tolist())

    epoch_time = time.time() - start_time

    num_batches = len(dataloader)
    results = {
        'loss': running_loss / num_batches if num_batches else 0.0,
        'accuracy': 100 * correct / total if total else 0.0,
        'time': epoch_time,
        'predictions': all_preds,
        'targets': all_targets
    }

    return results

In [10]:
def generate_test_predictions(model, test_images, device, transform_test, batch_size=256):
    """Predict a class index for every image in `test_images`.

    Images are transformed one by one, then stacked and run through the model
    in mini-batches instead of one forward pass per image.

    Args:
        model: Trained network; it is put into eval mode.
        test_images: Indexable, sized collection of images accepted by
            `transform_test`.
        device: Device used for inference.
        transform_test: Callable mapping one image to a tensor; all resulting
            tensors must share a shape so they can be stacked.
        batch_size: Number of images per forward pass (must be >= 1).

    Returns:
        list[int]: Predicted class index per image, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model.eval()
    predictions = []
    num_images = len(test_images)

    with torch.no_grad():
        for start in range(0, num_images, batch_size):
            stop = min(start + batch_size, num_images)
            batch = torch.stack(
                [transform_test(test_images[i]) for i in range(start, stop)]
            ).to(device)
            outputs = model(batch)
            _, predicted = outputs.max(1)
            predictions.extend(predicted.tolist())

    return predictions

def plot_confusion_matrix(y_true, y_pred, classes):
    """Draw the confusion matrix of `y_pred` against `y_true` as an annotated heatmap.

    `classes` supplies the tick labels on both axes. The figure is written to
    'confusion_matrix.png' in the working directory and then shown.
    """
    matrix = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')

    fig.tight_layout()
    fig.savefig('confusion_matrix.png')
    plt.show()

def plot_training_history(train_losses, val_losses, train_accs, val_accs, lr_history=None):
    """Plot loss curves, accuracy curves and, optionally, the learning-rate schedule.

    Args:
        train_losses, val_losses: Per-epoch loss values.
        train_accs, val_accs: Per-epoch accuracy values.
        lr_history: Optional per-epoch learning rates. The third panel is drawn
            only when this is given and non-empty.

    Each argument may be a sequence of numbers, a sequence containing tensors,
    or a tensor; tensors are detached and moved to the CPU before plotting.
    """
    def convert_to_numpy(data):
        """Return `data` with any tensors replaced by CPU NumPy values."""
        if isinstance(data, torch.Tensor):
            return data.detach().cpu().numpy()
        if isinstance(data, (list, tuple)):
            # Check every element, not only the first: histories can mix
            # floats and tensors.
            return [
                x.detach().cpu().numpy() if isinstance(x, torch.Tensor) else x
                for x in data
            ]
        return data

    train_losses = convert_to_numpy(train_losses)
    val_losses = convert_to_numpy(val_losses)
    train_accs = convert_to_numpy(train_accs)
    val_accs = convert_to_numpy(val_accs)
    lr_history = convert_to_numpy(lr_history)

    # Explicit emptiness test: the truth value of a multi-element NumPy array
    # is ambiguous and raises ValueError.
    has_lr = lr_history is not None and len(lr_history) > 0

    # Determine the number of plots
    num_plots = 3 if has_lr else 2
    plt.figure(figsize=(12, 4*num_plots))

    # Plot losses
    plt.subplot(num_plots, 1, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Loss Curves')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    # Plot accuracies
    plt.subplot(num_plots, 1, 2)
    plt.plot(train_accs, label='Train Accuracy')
    plt.plot(val_accs, label='Validation Accuracy')
    plt.title('Accuracy Curves')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)

    # Plot learning rate if available
    if has_lr:
        plt.subplot(num_plots, 1, 3)
        plt.plot(lr_history)
        plt.title('Learning Rate Schedule')
        plt.xlabel('Epoch')
        plt.ylabel('Learning Rate')
        plt.grid(True)

    plt.tight_layout()
    plt.show()

In [11]:
def get_lr(optimizer):
    """Return the 'lr' of the optimizer's first parameter group, or None if it has no groups."""
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

def create_checkpoint(model, optimizer, scheduler, epoch, val_acc, val_loss, filename):
    """Write the training state to `filename` with torch.save.

    The saved dict holds the epoch, the model and optimizer state dicts, the
    scheduler state dict (None when `scheduler` is falsy) and the validation
    accuracy and loss passed in.
    """
    scheduler_state = None
    if scheduler:
        scheduler_state = scheduler.state_dict()

    state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        scheduler_state_dict=scheduler_state,
        val_acc=val_acc,
        val_loss=val_loss,
    )
    torch.save(state, filename)
    print(f"Checkpoint saved to {filename}")

def load_checkpoint(checkpoint_path, model, optimizer=None, scheduler=None, device=None):
    """Restore training state from a checkpoint file.

    Args:
        checkpoint_path: File to read with torch.load.
        model: Module whose weights are overwritten from the checkpoint.
        optimizer: If given, its state is restored as well.
        scheduler: If given and the checkpoint holds a non-empty scheduler
            state, that state is restored.
        device: Passed as map_location; defaults to CUDA when available,
            otherwise CPU.

    Returns:
        tuple: ``(epoch, val_acc, val_loss)`` as stored in the checkpoint.
    """
    target_device = device
    if target_device is None:
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    checkpoint = torch.load(checkpoint_path, map_location=target_device)
    model.load_state_dict(checkpoint['model_state_dict'])

    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    if scheduler is not None:
        scheduler_state = checkpoint['scheduler_state_dict']
        if scheduler_state:
            scheduler.load_state_dict(scheduler_state)

    epoch, val_acc, val_loss = (
        checkpoint[key] for key in ('epoch', 'val_acc', 'val_loss')
    )

    print(f"Loaded checkpoint from epoch {epoch} with validation accuracy: {val_acc:.2f}%")
    return epoch, val_acc, val_loss

def check_gpu_memory():
    """Return a one-line description of CUDA memory use, or a notice when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return "GPU not available"

    scale = 1024**2  # bytes are divided by this before being reported as "MB"
    memory_allocated = torch.cuda.memory_allocated() / scale
    memory_cached = torch.cuda.memory_reserved() / scale
    return f"GPU Memory: Allocated {memory_allocated:.2f} MB, Cached {memory_cached:.2f} MB"

In [None]:
# Pick the first CUDA device when one is present, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Directory that receives the checkpoint files (created if missing)
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

In [None]:
# Dataset wrapper around the pickle-loaded CIFAR-10 arrays
class CIFAR10(Dataset):
    """Map-style dataset over in-memory images and labels.

    Args:
        images: Indexable collection of image arrays accepted by
            ``PIL.Image.fromarray``.
        labels: Indexable collection of labels aligned with `images`.
        transform: Optional callable applied to the PIL image.
    """

    def __init__(self, images, labels, transform=None):
        self.images, self.labels = images, labels
        self.transform = transform
        # Human-readable class names exposed as `.classes`
        self.classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                        'dog', 'frog', 'horse', 'ship', 'truck']

    def __len__(self):
        """Number of images held by the dataset."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position `idx`, with the transform applied if one is set."""
        raw_image = self.images[idx]
        label = self.labels[idx]

        # The image is handed to the transform as a PIL image
        pil_image = Image.fromarray(raw_image)
        if self.transform:
            return self.transform(pil_image), label
        return pil_image, label

# Load all training batches
print("Loading CIFAR-10 dataset from pickle files...")
train_images = []
train_labels = []

for i in range(1, 6):  # CIFAR-10 has 5 training batches
    batch_file = os.path.join(cifar10_dir, f'data_batch_{i}')
    batch_dict = load_cifar_batch(batch_file)
    batch_images = batch_dict[b'data']
    # Flat rows -> (N, 3, 32, 32) -> channels-last (N, 32, 32, 3).
    # -1 infers the per-file image count instead of hard-coding 10000.
    batch_images = batch_images.reshape((-1, 3, 32, 32)).transpose(0, 2, 3, 1)
    train_images.append(batch_images)
    train_labels.extend(batch_dict[b'labels'])

# Combine all batches into single numpy arrays
train_images = np.concatenate(train_images, axis=0)
train_labels = np.array(train_labels)

# Fail fast if a batch file delivered mismatched images and labels
assert len(train_images) == len(train_labels), "Image/label count mismatch"

print(f"Training data shape: {train_images.shape}")
print(f"Training labels shape: {train_labels.shape}")

def calculate_mean_std(images):
    """Per-channel mean and standard deviation of images shaped [N, H, W, C].

    Pixel values are cast to float32 and divided by 255 first; the statistics
    are then taken over every axis except the last.

    Returns:
        tuple: ``(means, stds)``, each with one entry per channel.
    """
    scaled = images.astype(np.float32) / 255.0
    non_channel_axes = (0, 1, 2)
    return scaled.mean(axis=non_channel_axes), scaled.std(axis=non_channel_axes)

# Channel statistics of the training images, used for normalization below
means, stds = calculate_mean_std(train_images)
print(f"Dataset mean: {means}, std: {stds}")

# Both pipelines end with the same tensor conversion + normalization
normalize = transforms.Normalize(means, stds)

# Training pipeline: random padded crop and horizontal flip, then normalize
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Evaluation pipeline: no augmentation
transform_test = transforms.Compose([
    transforms.ToTensor(),
    normalize,
])

# Two views of the same arrays: augmented for training, deterministic for evaluation
trainset = CIFAR10(train_images, train_labels, transform=transform_train)
evalset = CIFAR10(train_images, train_labels, transform=transform_test)

# Get class names for later visualization
classes = trainset.classes

# Create a validation set by splitting the training data
train_size = int(0.9 * len(trainset))
val_size = len(trainset) - train_size
train_dataset, val_split = random_split(trainset, [train_size, val_size])

# Keep the held-out indices chosen above, but read them through the
# non-augmenting dataset so validation is not run on randomly cropped and
# flipped images.
val_dataset = torch.utils.data.Subset(evalset, val_split.indices)

# More efficient data loading with prefetching and multiple workers
num_workers = 4  # Adjust based on your system
prefetch_factor = 2  # Prefetch 2 batches per worker

train_loader = DataLoader(
    train_dataset, 
    batch_size=128, 
    shuffle=True, 
    num_workers=num_workers, 
    pin_memory=True,
    prefetch_factor=prefetch_factor
)

val_loader = DataLoader(
    val_dataset, 
    batch_size=128, 
    shuffle=False, 
    num_workers=num_workers, 
    pin_memory=True,
    prefetch_factor=prefetch_factor
)

# Read the unlabeled test images from the pkl file
test_pickle = '/kaggle/input/deep-learning-spring-2025-project-1/cifar_test_nolabel.pkl'
test_dict = load_cifar_batch(test_pickle)
test_images = test_dict[b'data']

# The dataset class needs labels, so supply an all-zero placeholder array
test_labels = np.zeros(len(test_images), dtype=np.int64)

testset = CIFAR10(test_images, test_labels, transform=transform_test)

# Same loader settings as validation: fixed order, worker prefetching
test_loader_options = dict(
    batch_size=128,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True,
    prefetch_factor=prefetch_factor,
)
test_loader = DataLoader(testset, **test_loader_options)

In [None]:
# Build the network and move it to the selected device
print("Creating model...")
model = custom_resnet18()
model = model.to(device)

# Print the summary and enforce the parameter budget
PARAM_LIMIT = 5_000_000
param_count = count_parameters(model)
assert param_count < PARAM_LIMIT, "Model exceeds parameter limit of 5 million!"

In [None]:
# Number of epochs; defined first so the scheduler and the training loop
# share a single value.
num_epochs = 100

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

# Learning rate scheduler with warmup
scheduler = OneCycleLR(
    optimizer, 
    max_lr=0.1,
    steps_per_epoch=len(train_loader),
    epochs=num_epochs,
    pct_start=0.1
)

# Training loop
print("Starting training...")

# For early stopping
patience = 10
patience_counter = 0
best_val_loss = float('inf')
best_val_acc = 0

# For plotting
train_losses = []
val_losses = []
train_accs = []
val_accs = []
lr_history = []

# Resume from checkpoint if exists
start_epoch = 0
resumed = False
checkpoint_path = os.path.join(checkpoint_dir, "latest_checkpoint.pth")
if os.path.exists(checkpoint_path):
    print(f"Found checkpoint at {checkpoint_path}")
    try:
        start_epoch, best_val_acc, best_val_loss = load_checkpoint(
            checkpoint_path, model, optimizer, scheduler, device
        )
        start_epoch += 1  # Start from the next epoch
        resumed = True
        print(f"Resuming from epoch {start_epoch}")
    except Exception as e:
        print(f"Error loading checkpoint: {e}")
        print("Starting training from scratch")

# The latest checkpoint stores the metrics of the last epoch, not of the best
# one. Take the best-so-far values from best_model.pth when it is better, so
# a resumed run cannot overwrite the best model with a worse one.
best_checkpoint_path = os.path.join(checkpoint_dir, "best_model.pth")
if resumed and os.path.exists(best_checkpoint_path):
    try:
        best_state = torch.load(best_checkpoint_path, map_location=device)
        if best_state['val_acc'] > best_val_acc:
            best_val_acc = best_state['val_acc']
            best_val_loss = best_state['val_loss']
    except Exception as e:
        print(f"Could not read best-model metrics: {e}")

print(f"Initial model status: {check_gpu_memory()}")

In [None]:
# Main training loop: train, validate, record metrics, checkpoint, early-stop
for epoch in range(start_epoch, num_epochs):
    epoch_start_time = time.time()
    
    # Train
    train_loss, train_acc, train_time = train_epoch(
        model, train_loader, criterion, optimizer, device, scheduler
    )
    
    # Validate
    val_results = validate(model, val_loader, criterion, device, classes)
    val_acc = val_results['accuracy']
    val_time = val_results['time']
    
    # Keep plain Python floats in the histories and checkpoints, even if a
    # loss arrives as a 0-dim tensor.
    train_loss = float(train_loss)
    val_loss = float(val_results['loss'])
    
    # Update learning rate for non-OneCycleLR schedulers
    # (OneCycleLR is stepped per batch inside train_epoch)
    if scheduler is not None and not isinstance(scheduler, OneCycleLR):
        scheduler.step(val_loss)
    
    # Record metrics
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
    lr_history.append(get_lr(optimizer))
    
    epoch_time = time.time() - epoch_start_time
    
    print(f"Epoch {epoch+1}/{num_epochs}: "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%, "
          f"Time: {epoch_time:.2f}s, "
          f"LR: {get_lr(optimizer):.6f}")
    
    print(check_gpu_memory())
    
    # Check for best model
    is_best = val_acc > best_val_acc
    if is_best:
        best_val_acc = val_acc
        best_val_loss = val_loss
        create_checkpoint(
            model, optimizer, scheduler, epoch, val_acc, val_loss,
            os.path.join(checkpoint_dir, "best_model.pth")
        )
        print(f"New best model saved with validation accuracy: {val_acc:.2f}%")
        patience_counter = 0
    else:
        patience_counter += 1
        
    # Save regular checkpoint
    create_checkpoint(
        model, optimizer, scheduler, epoch, val_acc, val_loss,
        os.path.join(checkpoint_dir, "latest_checkpoint.pth")
    )
    
    stop_early = patience_counter >= patience
    
    # Show the confusion matrix every 50 epochs and on the final epoch of the
    # run, including when the run ends through early stopping.
    if (epoch + 1) % 50 == 0 or epoch == num_epochs - 1 or stop_early:
        plot_confusion_matrix(
            val_results['targets'],
            val_results['predictions'],
            classes
        )
    
    # Early stopping
    if stop_early:
        print(f"Early stopping at epoch {epoch+1}")
        break

In [None]:
# Draw the loss, accuracy and learning-rate curves collected during training
plot_training_history(
    train_losses=train_losses,
    val_losses=val_losses,
    train_accs=train_accs,
    val_accs=val_accs,
    lr_history=lr_history,
)

In [None]:
# Restore the best checkpoint's weights into the model
print("Loading best model for inference...")
best_model_path = os.path.join(checkpoint_dir, "best_model.pth")
load_checkpoint(best_model_path, model, device=device)

# Per-class precision/recall on the validation split
val_results = validate(model, val_loader, criterion, device, classes)
print("\nValidation Classification Report:")
validation_report = classification_report(
    val_results['targets'],
    val_results['predictions'],
    target_names=classes,
)
print(validation_report)

print("Generating predictions on test set...")

# Predict a class for every unlabeled test image
predictions = generate_test_predictions(model, images, device, transform_test)

# Write the submission file: one row per test image, ids counted from 0
submission_columns = {
    'id': range(len(predictions)),
    'label': predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

print("Predictions saved to 'submission.csv'")