# Data Processing


In [1]:
#reproducability
import random, os
import numpy as np
import torch
import pickle

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:" , device)

Device: cuda


In [2]:

train_path = os.path.join('Data', 'train-70_.pkl')
val_path   = os.path.join('Data', 'validation-10_.pkl')

#----Load training data-----#
with open(train_path, 'rb') as f:
    train_data = pickle.load(f)

images_train = train_data['images']
labels_train = train_data['labels']
all_classes = sorted(list(set(labels_train)))  # 15 classes


print("Train images shape:", images_train.shape)
print("Train lables shape:", len(labels_train))
print("Number of classes:", len(all_classes))
print("class names:", all_classes)


with open(val_path, 'rb') as f:
    val_data = pickle.load(f)

images_val = val_data['images']
labels_val = val_data['labels']

print("Val images shape:", images_val.shape)
print("val labels shape:", len(labels_val))
print("Number of classes:", len(all_classes))



Train images shape: (5775, 64, 64, 3)
Train lables shape: 5775
Number of classes: 15
class names: [6, 22, 26, 28, 35, 57, 62, 70, 108, 139, 151, 163, 173, 188, 189]
Val images shape: (825, 64, 64, 3)
val labels shape: 825
Number of classes: 15


In [3]:
#convert data to pyTorch tensors for manipulation and modeling
from torch.utils.data import TensorDataset, DataLoader

# ---- Map original labels to 0..num_classes-1 ----
class_to_idx = {cls: i for i, cls in enumerate(all_classes)}

train_labels_mapped = torch.tensor([class_to_idx[l] for l in labels_train]).long()
val_labels_mapped   = torch.tensor([class_to_idx[l] for l in labels_val]).long()

# Now you can use these in your dataset
train_labels_tensor = train_labels_mapped
val_labels_tensor   = val_labels_mapped

print("Mapped train labels range:", train_labels_tensor.min(), train_labels_tensor.max())
print("Mapped val labels range:", val_labels_tensor.min(), val_labels_tensor.max())

Mapped train labels range: tensor(0) tensor(14)
Mapped val labels range: tensor(0) tensor(14)


In [4]:
#standardization of data
#uses mean and std for RGB channels across training set
train_images_float = images_train.astype(np.float32) / 255.0

mean = train_images_float.mean((0,1,2))
std = train_images_float.std((0,1,2))

print("Channel-wise mean:", mean)
print("Channel-wise std:", std)


Channel-wise mean: [0.47475025 0.4356516  0.38977522]
Channel-wise std: [0.26744732 0.2618989  0.2737367 ]


In [5]:
#data set class to wrap tensor
#applies image augmentations

import torchvision.transforms as transforms
from torch.utils.data import Dataset

train_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomCrop(64, padding = 4),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(8),
   
    transforms.ColorJitter(
        brightness=0.15, 
        contrast=0.15, 
        saturation=0.15, 
        hue=0.03
    ),
    #transforms.RandomAffine(
       # degrees=0,
      #  translate=(0.15,0.15),
     #   scale=(0.85, 1.15)
    #),
    transforms.RandomGrayscale(p=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean.tolist(), std.tolist())
])


val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=mean.tolist(), std=std.tolist())
])

class TinyImageNetDataset(Dataset):
    def __init__(self, images, labels, transform=None):
        self.images = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

# Create datasets with transforms
train_dataset = TinyImageNetDataset(images_train, train_labels_tensor, train_transform)
val_dataset = TinyImageNetDataset(images_val, val_labels_tensor, val_transform)



In [6]:
# Data Loaders
batch_size = 64

train_loader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    shuffle = True, #training data must be shuffled
    num_workers = 0, #number of parallel data loaders
    pin_memory = True
)

val_loader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    shuffle = False, #validation data does not need shuffing
    num_workers = 0,
    pin_memory = True
)

for images, labels in train_loader:
    print("Train batch images shape:", images.shape)
    print("Train batch labels shape:", labels.shape)
    break


Train batch images shape: torch.Size([64, 3, 64, 64])
Train batch labels shape: torch.Size([64])


# Model Helpers


In [7]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import AdamW
import numpy as np
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, SequentialLR, LinearLR, OneCycleLR, CosineAnnealingLR



In [8]:

class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, pool=True, dropout=0.0, kernel_size=3):
        super(ConvBlock, self).__init__()
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=kernel_size // 2  # keeps spatial size same
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout2d(dropout) if dropout > 0 else nn.Identity()
        self.pool = nn.MaxPool2d(2, 2) if pool else nn.Identity()

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.pool(x)
        return x


# Model

In [9]:
class theBestCNN(nn.Module):
    def __init__(self, num_classes):
        super(theBestCNN, self).__init__()

        # --- Convolutional blocks ---
        self.conv1 = ConvBlock(3, 128, dropout=0.0, kernel_size=3) 
        self.conv2 = ConvBlock(128, 128, dropout=0.05, kernel_size=3, pool=False) 

        self.conv3 = ConvBlock(128, 256, dropout=0.1) 
        self.conv4 = ConvBlock(256, 256, dropout=0.1, pool=False) 

        self.conv5 = ConvBlock(256, 512, dropout=0.15) 
        self.conv6 = ConvBlock(512, 512, dropout=0.15, pool=False) 

        self.conv7 = ConvBlock(512, 1024, dropout = 0.2)  
        self.conv8 = ConvBlock(1024, 1024, dropout = 0.2, pool=False) 

        # --- Global Average Pooling --#
        self.gap = nn.AdaptiveAvgPool2d((1, 1))

      
        self.classifier = nn.Sequential(
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(512,256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(256,num_classes)
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.conv7(x)
        x = self.conv8(x)

        x = self.gap(x)
        x = x.view(x.size(0), -1)

        x = self.classifier(x)

        return x

In [56]:
#instantiate model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = theBestCNN(num_classes=len(all_classes)).to(device)
print(model)

theBestCNN(
  (conv1): ConvBlock(
    (conv): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (dropout): Identity()
    (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): ConvBlock(
    (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (dropout): Dropout2d(p=0.05, inplace=False)
    (pool): Identity()
  )
  (conv3): ConvBlock(
    (conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (dropout): Dropout2d(p=0.1, inplace=False)
    (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode

In [57]:

# --- Loss Function ---
#loss function
criterion = nn.CrossEntropyLoss(label_smoothing=0.04)

#optimizer with l2 regularization (weight decay)
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.121,
    momentum=0.9,
    weight_decay=1.5e-4,
    nesterov=True
)

scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.121,
    epochs=150,
    steps_per_epoch=len(train_loader),
    pct_start=0.1,
    anneal_strategy='cos',
    div_factor=25.0,
    final_div_factor=10000.0
)


In [58]:
num_epochs = 150
best_val_acc = 0.0
patience = 18
patience_counter = 0

def mixup_data(x, y, alpha=0.15):
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(x.device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


for epoch in range(num_epochs):
  model.train()
  running_train_loss = 0.0
  correct_train = 0
  total_train = 0

  #------ Training Loop -----#
  for i, (images, labels) in enumerate(train_loader):
    images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True)

    optimizer.zero_grad()

    images, labels_a, labels_b, lam = mixup_data(images, labels, alpha=0.15)
    outputs = model(images)
    loss = lam * criterion(outputs, labels_a) + (1 - lam) * criterion(outputs, labels_b)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)

    optimizer.step()
    scheduler.step()

    running_train_loss += loss.item() * images.size(0)
    _, predicted = outputs.max(1)
    total_train += labels.size(0)
    correct_train += predicted.eq(labels).sum().item()

  train_loss = running_train_loss / len(train_loader.dataset)
  train_acc = correct_train / total_train

  # --- Validation Loop ---
  model.eval()
  running_val_loss = 0.0
  correct_val = 0
  total_val = 0

  with torch.no_grad():
      for images, labels in val_loader:
          images, labels = images.to(device), labels.to(device)
          outputs = model(images)
          loss = criterion(outputs, labels)

          running_val_loss += loss.item() * images.size(0)
          _, predicted = outputs.max(1)
          total_val += labels.size(0)
          correct_val += predicted.eq(labels).sum().item()

  val_loss = running_val_loss / len(val_loader.dataset)
  val_acc = correct_val / total_val

  print(f"Epoch [{epoch+1}/{num_epochs}] "
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


  # Save best model and check early stopping
  if val_acc > best_val_acc:
      best_val_acc = val_acc
      patience_counter = 0
      torch.save(model.state_dict(), "model.pth")
      print(f"‚úì Saved Best Model! Best Val Acc: {best_val_acc:.4f}")
  else:
      patience_counter += 1
      print(f"No improvement. Patience: {patience_counter}/{patience}")

      if patience_counter >= patience:
          print(f"\nEarly stopping triggered at epoch {epoch+1}")
          print(f"Best validation accuracy: {best_val_acc:.4f}")
          break

print(f"\n{'='*50}")
print(f"Training Complete!")
print(f"Best Validation Accuracy: {best_val_acc:.4f}")
print(f"{'='*50}")

Epoch [1/150] Train Loss: 2.7078, Train Acc: 0.0852 | Val Loss: 2.4913, Val Acc: 0.2073
‚úì Saved Best Model! Best Val Acc: 0.2073
Epoch [2/150] Train Loss: 2.5311, Train Acc: 0.1231 | Val Loss: 2.3091, Val Acc: 0.2752
‚úì Saved Best Model! Best Val Acc: 0.2752
Epoch [3/150] Train Loss: 2.3883, Train Acc: 0.1683 | Val Loss: 2.0386, Val Acc: 0.3709
‚úì Saved Best Model! Best Val Acc: 0.3709
Epoch [4/150] Train Loss: 2.2317, Train Acc: 0.1829 | Val Loss: 2.1138, Val Acc: 0.3418
No improvement. Patience: 1/18
Epoch [5/150] Train Loss: 2.2100, Train Acc: 0.2062 | Val Loss: 2.0312, Val Acc: 0.3345
No improvement. Patience: 2/18
Epoch [6/150] Train Loss: 2.1331, Train Acc: 0.2256 | Val Loss: 1.8414, Val Acc: 0.4315
‚úì Saved Best Model! Best Val Acc: 0.4315
Epoch [7/150] Train Loss: 2.1222, Train Acc: 0.2675 | Val Loss: 2.0427, Val Acc: 0.3648
No improvement. Patience: 1/18
Epoch [8/150] Train Loss: 2.0548, Train Acc: 0.2331 | Val Loss: 1.9458, Val Acc: 0.3964
No improvement. Patience: 2/18


Helper Functions

In [59]:
# CORRECTED VERSION BASED ON ACTUAL REQUIREMENTS

def load_model(path="model.pth", num_classes=15):
    """
    Load PyTorch model weights from a file.
    Returns the model on the specified device.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Create model with same architecture as trained
    model = theBestCNN(num_classes=num_classes)
    
    # Load the saved weights
    model.load_state_dict(torch.load(path, map_location=device))
    
    # Move to appropriate device and set to evaluation mode
    model = model.to(device)
    model.eval()
    
    print(f"Model loaded from {path}")
    return model
        

In [60]:
def predict(model, images, device=device):
    """
    Predict classes for a batch of images.
    
    Args:
        model: the trained PyTorch model
        images: torch.Tensor of shape (N, C, H, W)
    Returns:
        preds: torch.Tensor of predicted class indices (N,)
    """
    model.eval()
    images = images.to(device)
    with torch.no_grad():
        outputs = model(images)
        _, predictions = torch.max(outputs, 1)
    return predictions

# Test it correctly:
test_model = load_model("model.pth")

# Get a batch from the DataLoader
test_images, test_labels = next(iter(val_loader))
est_images = test_images.to(device)  # Move to GPU
test_labels = test_labels.to(device)  # Move labels to GPU too!

# Pass the IMAGES tensor, not the DataLoader
preds = predict(test_model, test_images)

print(f"‚úÖ Input shape: {test_images.shape}")
print(f"‚úÖ Output shape: {preds.shape}")
print(f"‚úÖ Sample predictions: {preds[:5]}")
print(f"‚úÖ Accuracy check: {(preds == test_labels).float().mean():.4f}")
print("üéâ Predict function works! Ready for submission.")

Model loaded from model.pth
‚úÖ Input shape: torch.Size([64, 3, 64, 64])
‚úÖ Output shape: torch.Size([64])
‚úÖ Sample predictions: tensor([11, 11, 11, 11, 11], device='cuda:0')
‚úÖ Accuracy check: 0.8125
üéâ Predict function works! Ready for submission.


In [61]:
# === FIXED ULTIMATE MODEL HEALTH CHECK ===

def comprehensive_model_test(model_path="model.pth"):
    model = load_model(model_path)
    model.eval()
    
    print("üß™ COMPREHENSIVE MODEL DIAGNOSTICS")
    print("=" * 50)
    
    # 1. Full validation accuracy
    correct = 0
    total = 0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in val_loader:
            # Move everything to GPU
            images = images.to(device)
            labels = labels.to(device)
            
            preds = predict(model, images)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            all_predictions.extend(preds.cpu().numpy())  # Move to CPU for storage
            all_labels.extend(labels.cpu().numpy())      # Move to CPU for storage
    
    val_acc = correct / total
    print(f"1. Full Validation Accuracy: {val_acc:.4f} ({correct}/{total})")
    
    # 2. Per-class accuracy (critical!)
    from collections import Counter
    class_correct = [0] * 15
    class_total = [0] * 15
    
    for pred, label in zip(all_predictions, all_labels):
        class_total[label] += 1
        if pred == label:
            class_correct[label] += 1
    
    print("\n2. Per-Class Accuracy:")
    weak_classes = []
    for i in range(15):
        acc = class_correct[i] / class_total[i] if class_total[i] > 0 else 0
        print(f"   Class {i:2d}: {acc:.3f} ({class_correct[i]}/{class_total[i]})")
        if acc < 0.6:  # Classes under 60% are concerning
            weak_classes.append(i)
    
    # 3. Prediction diversity test
    unique_preds = len(Counter(all_predictions))
    print(f"\n3. Prediction Diversity: {unique_preds}/15 classes predicted")
    
    # 4. Batch-wise consistency test
    print("\n4. Batch Consistency Test:")
    for i in range(3):
        test_images, test_labels = next(iter(val_loader))
        test_images = test_images.to(device)
        test_labels = test_labels.to(device)
        preds = predict(model, test_images)
        unique_in_batch = len(torch.unique(preds))
        print(f"   Batch {i}: {unique_in_batch} unique classes predicted")

        # Let's see what's actually in the batches
        print("üîç Let's examine the actual class distribution in batches:")

        for i in range(3):
            test_images, test_labels = next(iter(val_loader))
            unique_actual_classes = len(torch.unique(test_labels))
            print(f"Batch {i}: {unique_actual_classes} actual classes in the batch")
            
    # 5. Final Assessment
    print("\n5. FINAL ASSESSMENT:")
    if val_acc >= 0.78 and unique_preds >= 12 and len(weak_classes) <= 3:
        print("   ‚úÖ EXCELLENT - High chance of test set success!")
        print("   Model is balanced, accurate, and predicts diverse classes")
    elif val_acc >= 0.75 and unique_preds >= 10:
        print("   ‚úÖ GOOD - Competitive model")
        print("   Should perform well on test set")
    else:
        print("   ‚ö†Ô∏è  NEEDS IMPROVEMENT - Test set performance uncertain")
        if weak_classes:
            print(f"   Weak classes: {weak_classes}")
        if unique_preds < 10:
            print(f"   Only predicting {unique_preds}/15 classes")
    
    return val_acc, weak_classes

# Run the comprehensive test
final_accuracy, weak_classes = comprehensive_model_test()
    

Model loaded from model.pth
üß™ COMPREHENSIVE MODEL DIAGNOSTICS
1. Full Validation Accuracy: 0.8158 (673/825)

2. Per-Class Accuracy:
   Class  0: 0.909 (50/55)
   Class  1: 0.891 (49/55)
   Class  2: 0.727 (40/55)
   Class  3: 0.855 (47/55)
   Class  4: 0.818 (45/55)
   Class  5: 0.818 (45/55)
   Class  6: 0.891 (49/55)
   Class  7: 0.764 (42/55)
   Class  8: 0.891 (49/55)
   Class  9: 0.709 (39/55)
   Class 10: 0.800 (44/55)
   Class 11: 0.800 (44/55)
   Class 12: 0.818 (45/55)
   Class 13: 0.891 (49/55)
   Class 14: 0.655 (36/55)

3. Prediction Diversity: 15/15 classes predicted

4. Batch Consistency Test:
   Batch 0: 9 unique classes predicted
üîç Let's examine the actual class distribution in batches:
Batch 0: 2 actual classes in the batch
Batch 1: 2 actual classes in the batch
Batch 2: 2 actual classes in the batch
   Batch 1: 9 unique classes predicted
üîç Let's examine the actual class distribution in batches:
Batch 0: 2 actual classes in the batch
Batch 1: 2 actual classes 

implement Stochastic Gradient descent optimizer for lack of generaliztion: need momentum