In [None]:
# Install required packages
# Run this cell first if you don't have these packages installed
!pip install numpy matplotlib scikit-learn datasets torch


# Week 7 — Convolutional Neural Networks (Image Classification) - PyTorch Version

**Setup Instructions:**
1. Run the cell above to install required packages (if not already installed)
2. **Important:** If you get import errors after installing packages, restart the Jupyter kernel:
   - Go to `Kernel` → `Restart Kernel` in the menu
   - Or use the restart button in the toolbar
3. Then re-run the import cell below

**Note:** This notebook uses PyTorch instead of NumPy for CNN implementation. PyTorch provides automatic differentiation and GPU support, making it easier to build and train neural networks.

**Objectives**

- Load and visualize the CIFAR-100 dataset from HuggingFace.
- Build a PCA + Logistic Regression baseline classifier; compute evaluation metrics (e.g., accuracy).
- Implement a simple CNN classifier using PyTorch that barely beats or fails to beat the baseline.
- Build a deeper CNN model that achieves better performance.
- Train a CNN model on data‑augmented images and visualize augmentations.
- Explore an advanced CNN feature (e.g., global average pooling) and observe its impact.


In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from utils import (
    show_result,
    load_cifar100_dataset,
    pca_logistic_baseline,
    test_exercise_7_pca,
    test_exercise_7_simple_cnn,
    test_exercise_7_proper_cnn,
    test_exercise_7_data_aug_cnn,
    test_exercise_7_advanced_cnn,
    accuracy
)

# Seed every random number generator used in this notebook (Python, NumPy, PyTorch)
# so that repeated runs from a fresh kernel start from the same state.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


## 1. CIFAR-100 Image Dataset

In this exercise, we'll use the CIFAR-100 dataset, a well-known benchmark dataset for image classification. CIFAR-100 contains 60,000 32×32 color images in 100 fine-grained classes, with 600 images per class (500 for training and 100 for testing).

We'll load the dataset from HuggingFace and keep the images in RGB (`grayscale=False`), because the CNNs in this notebook take 3-channel inputs.

**Task:** Use the `load_cifar100_dataset` function from `utils.py` to load CIFAR-100, then visualize a few random samples from the training set. Report the number of training and test examples.

**Hint:** Call `load_cifar100_dataset(n_train=50000, n_test=10000, seed=0, grayscale=False)` to load the full training and test splits. The function returns 5 values: train images, train labels, test images, test labels, and class names.


In [None]:
# Load the CIFAR-100 training and test splits as RGB images.
# The full dataset (50,000 train / 10,000 test) is requested for high performance.
X_train, y_train, X_test, y_test, class_names = load_cifar100_dataset(
    n_train=50000,
    n_test=10000,
    seed=0,
    grayscale=False,
)
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")
print(f"Image shape: {X_train.shape[1:]}")
print(f"Classes: {class_names}")

# Show five randomly chosen training images, each titled with its class name and label
fig, axes = plt.subplots(1, 5, figsize=(12, 2.5))
for ax in axes:
    idx = random.randrange(len(X_train))  # uniform index in [0, len(X_train) - 1]
    label = y_train[idx]
    ax.imshow(X_train[idx])  # RGB image, so no colormap is needed
    ax.set_title(f"{class_names[label]}\n(label {label})")
    ax.axis('off')
plt.tight_layout()
plt.show()

## 2. PCA + Logistic Regression Baseline

A simple yet strong baseline for image classification is to flatten each image into a vector, project it onto a lower‑dimensional subspace using **Principal Component Analysis (PCA)**, and then train a multinomial logistic regression classifier.

1. Flatten the training and test images into feature vectors (shape `(N, H*W)` for grayscale images, `(N, H*W*C)` for RGB images).
2. Fit a PCA model on the training data and project both the training and test data into a lower‑dimensional space (e.g., 20 components).
3. Train a `LogisticRegression` classifier on the reduced features.
4. Evaluate the classifier using **accuracy** (the fraction of correct predictions).

**Task:** Complete the function `student_pca_baseline(...)` below to implement this baseline. It should return the test accuracy as a float in `[0,1]`.

**Hints:**
- Use `train_images.reshape(train_images.shape[0], -1)` to flatten each image into a vector: `(N, H, W)` → `(N, H*W)`, or `(N, H, W, C)` → `(N, H*W*C)` for RGB images
- Import `PCA` from `sklearn.decomposition` and `LogisticRegression` from `sklearn.linear_model`
- Make sure `n_components` doesn't exceed the number of features (use `min(n_components, n_features)`)
- Use `pca.fit_transform()` for training data and `pca.transform()` for test data
- The `accuracy` function from `utils` computes the classification accuracy


In [None]:
def student_pca_baseline(train_images, train_labels, test_images, test_labels, n_components=20):
    '''
    PCA + Logistic Regression baseline classifier (exercise stub to be completed).

    Intended pipeline: flatten every image into a feature vector, project the
    vectors onto the leading principal components, fit a logistic regression
    classifier on the projected training data, and score it on the test data.

    Parameters:
        train_images: numpy array of training images, shape (N_train, H, W),
            float32 values in [0,1].
            NOTE(review): the dataset cell loads RGB data (grayscale=False), so a
            trailing channel axis may be present - confirm against the caller.
        train_labels: numpy array of shape (N_train,) holding integer labels.
        test_images: numpy array of test images, shape (N_test, H, W).
        test_labels: numpy array of shape (N_test,).
        n_components: how many principal components to keep.

    Returns:
        Test accuracy as a float in [0,1].

    Raises:
        NotImplementedError: always, until the TODO below is implemented.
    '''
    # TODO: flatten images, fit PCA, train LogisticRegression, compute accuracy
    raise NotImplementedError


In [None]:
# Evaluate the PCA baseline implementation with test_exercise_7_pca from utils
res = test_exercise_7_pca(student_pca_baseline)
show_result("Exercise 1 – PCA Baseline", res)

# If implemented, also evaluate on the CIFAR-100 data loaded above.
# The unfinished stub raises NotImplementedError, which is caught here so the
# notebook keeps running and prints a reminder instead.
try:
    acc = student_pca_baseline(X_train, y_train, X_test, y_test, 20)
    # The data passed in is the CIFAR-100 set loaded earlier, not a synthetic one
    print(f"PCA baseline accuracy on the CIFAR-100 dataset: {acc:.3f}")
except NotImplementedError:
    print("Implement student_pca_baseline above.")


## 3. Simple Convolutional Neural Network (PyTorch)

Convolutional neural networks (CNNs) process images by learning **filters** that extract local patterns. We'll start with a very small CNN using PyTorch:

- One convolutional layer with a few filters (e.g., 4 filters, each $3\times3$).
- Apply a non‑linear activation such as ReLU.
- Flatten the result and feed it into a linear layer to produce class logits.

For training, use cross‑entropy loss and an optimizer (e.g., SGD) for a few epochs. Because this network is very shallow and the dataset is small, it may perform worse than the PCA baseline.

**Task:** Complete the function `student_simple_cnn(...)` below. It should construct the described network using PyTorch's `nn.Module`, train it for a few epochs on the training set, and return the test accuracy.

**Hints:**
- Use `nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3)` for the convolutional layer
- Use `nn.ReLU()` for activation
- Use `nn.Flatten()` or `.view()` to flatten the feature maps
- Use `nn.Linear()` for the final classification layer
- Convert NumPy arrays to PyTorch tensors: `torch.from_numpy(arr).float()`
- Move tensors to device: `.to(device)`
- Use `nn.CrossEntropyLoss()` for loss and `optim.SGD()` for optimizer
- Remember to call `model.train()` during training and `model.eval()` during evaluation


In [None]:
def student_simple_cnn(train_images, train_labels, test_images, test_labels, num_epochs=5, learning_rate=0.01):
    '''
    Simple one-convolution CNN classifier in PyTorch (exercise stub to be completed).

    Intended model: a single convolutional layer with small filters (e.g., 3x3)
    and few of them (e.g., 4), followed by a linear classifier. It is to be
    trained on the training set and scored on the test set.

    Parameters:
        train_images: numpy array of training images, shape (N_train, H, W).
            NOTE(review): the dataset cell loads RGB data (grayscale=False), so a
            trailing channel axis may be present - confirm against the caller.
        train_labels: numpy array of shape (N_train,).
        test_images: numpy array of test images, shape (N_test, H, W).
        test_labels: numpy array of shape (N_test,).
        num_epochs: how many passes over the training data to run.
        learning_rate: step size for gradient descent.

    Returns:
        Test accuracy as a float.

    Raises:
        NotImplementedError: always, until the TODO below is implemented.
    '''
    # TODO: implement PyTorch CNN model, training loop, and evaluation
    raise NotImplementedError


In [None]:
# Run test_exercise_7_simple_cnn (from utils) on the implementation and show its result
res = test_exercise_7_simple_cnn(student_simple_cnn)
show_result("Exercise 2 – Simple CNN", res)

# Optional: also train and score the simple CNN on the dataset loaded above.
# An unfinished stub raises NotImplementedError; print a reminder in that case.
try:
    acc = student_simple_cnn(X_train, y_train, X_test, y_test)
except NotImplementedError:
    print("Implement student_simple_cnn above.")
else:
    print(f"Simple CNN accuracy: {acc:.3f}")


## 4. Deeper CNN (Improved Model)

Now extend your network to be **deeper** with multiple convolutional blocks. Your goal is to achieve **>50% accuracy on CIFAR-100**.

**Suggested Architecture** (you can modify this):
- **Block 1:** [Conv(3→64) → BN → ReLU → Conv(64→64) → BN → ReLU] → MaxPool(2×2) → Dropout(0.1)
- **Block 2:** [Conv(64→128) → BN → ReLU → Conv(128→128) → BN → ReLU] → MaxPool(2×2) → Dropout(0.2)
- **Block 3:** [Conv(128→256) → BN → ReLU → Conv(256→256) → BN → ReLU] → MaxPool(2×2) → Dropout(0.3)
- **Classifier:** Flatten → Linear(256×4×4 → 512) → BN → ReLU → Dropout(0.5) → Linear(512 → 100)

**Key Components:**
1. **Batch Normalization** (`nn.BatchNorm2d`): Normalizes activations, speeds up training
2. **Dropout** (`nn.Dropout2d` for conv layers, `nn.Dropout` for fc layers): Prevents overfitting
3. **Data Augmentation**: Apply random transformations during training
4. **Mini-batch Training**: Use `DataLoader` to train in batches

**Task:** Complete the function `student_proper_cnn(...)` below.

**Hints:**
- Use `padding=1` in `Conv2d` to preserve spatial dimensions (32×32 → 32×32)
- Use `nn.MaxPool2d(2, 2)` to downsample by 2x (32×32 → 16×16 → 8×8 → 4×4)
- After 3 pooling layers: 32÷2÷2÷2 = 4, so final feature map is 4×4
- Progressive dropout: 0.1 → 0.2 → 0.3 → 0.5 (less in early layers, more in later layers)
- **Data Augmentation** (apply during training):
  - Random horizontal flip: `torch.flip(img, dims=[3])`
  - Random crop: `F.pad(img, (4,4,4,4), mode='reflect')` then crop 32×32
- **Weight Initialization**: Use Kaiming (He) initialization for ReLU networks
  ```python
  nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
  ```
- **Optimizer**: `optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)`
- **Scheduler**: `CosineAnnealingLR` for smooth learning rate decay
- **Loss**: `nn.CrossEntropyLoss(label_smoothing=0.1)` prevents overconfidence
- **Gradient Clipping**: `torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)`
- **Training**: 50 epochs with batch_size=64 should achieve >50% accuracy
- **Mini-batches**: Use `TensorDataset` and `DataLoader` for efficient batch training

**Example Structure:**
```python
class ProperCNN(nn.Module):
    def __init__(self):
        super(ProperCNN, self).__init__()
        # Block 1
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.dropout1 = nn.Dropout2d(0.1)
        # ... continue with blocks 2, 3, and classifier
```


### Implementation Guide

**What's PROVIDED for you:**
- ✅ Complete `ProperCNN` model architecture (3 conv blocks + classifier)
- ✅ `augment_batch()` function for data augmentation
- ✅ Data preparation (tensors, DataLoader)
- ✅ Model initialization with Kaiming weights
- ✅ Loss function, optimizer, and scheduler setup

**What YOU need to implement:**
- 🔨 **Training Loop**: Iterate over epochs and batches, apply augmentation, forward/backward pass, gradient clipping
- 🔨 **Evaluation**: Set model to eval mode, make predictions on test set, compute accuracy

**Your tasks are clearly marked with `# TODO:` and `# YOUR CODE HERE` comments in the function below.**


In [None]:
def student_proper_cnn(train_images, train_labels, test_images, test_labels, num_epochs=50, learning_rate=0.001, batch_size=64):
    '''
    Build and train a moderately deep CNN optimized for CIFAR-100.
    Goal: Achieve >50% test accuracy.

    The model, the augmentation helper, the data tensors, the weight
    initialization, the loss, the optimizer and the scheduler are set up below.
    The training loop and the evaluation step are left as TODOs and currently
    raise NotImplementedError.

    Parameters:
        train_images: numpy array of training images. It is permuted with
            (0, 3, 1, 2) below, so it must be 4-D with channels last (N, H, W, C).
            The network hard-codes 3 input channels and a 256*4*4 flattened
            feature size, which matches 32x32 inputs (as do the 32-pixel crops
            taken in augment_batch).
        train_labels: numpy array of integer class labels for train_images.
        test_images: numpy array of test images, same layout as train_images.
        test_labels: numpy array of test labels (for the evaluation TODO).
        num_epochs: number of training epochs; also used as T_max of the
            cosine annealing scheduler.
        learning_rate: initial learning rate passed to AdamW.
        batch_size: mini-batch size of the training DataLoader.

    Returns:
        Test accuracy (once the evaluation TODO is implemented).

    Raises:
        NotImplementedError: until the training loop and evaluation are implemented.
    '''
    # NOTE(review): this counts the distinct labels, so it presumably relies on labels
    # being 0..n_classes-1 with every class present in train_labels - confirm for
    # small subsets, where a missing class would shrink the output layer.
    n_classes = len(np.unique(train_labels))
    from torch.utils.data import TensorDataset, DataLoader
    
    print(f"Training on {len(train_images)} samples with batch size {batch_size} for {num_epochs} epochs")
    print(f"Number of classes: {n_classes}")
    
    # ============================================================================
    # MODEL ARCHITECTURE (PROVIDED)
    # ============================================================================
    class ProperCNN(nn.Module):
        # Three conv blocks (two 3x3 convs with BatchNorm + ReLU, then 2x2 max-pool and
        # channel dropout) followed by a two-layer fully connected classifier.
        def __init__(self):
            super(ProperCNN, self).__init__()
            # Block 1: 3 -> 64 -> 64
            self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
            self.bn1 = nn.BatchNorm2d(64)
            self.conv2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
            self.bn2 = nn.BatchNorm2d(64)
            self.pool1 = nn.MaxPool2d(2, 2)  # 32x32 -> 16x16
            self.dropout1 = nn.Dropout2d(0.1)
            
            # Block 2: 64 -> 128 -> 128
            self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
            self.bn3 = nn.BatchNorm2d(128)
            self.conv4 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
            self.bn4 = nn.BatchNorm2d(128)
            self.pool2 = nn.MaxPool2d(2, 2)  # 16x16 -> 8x8
            self.dropout2 = nn.Dropout2d(0.2)
            
            # Block 3: 128 -> 256 -> 256
            self.conv5 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
            self.bn5 = nn.BatchNorm2d(256)
            self.conv6 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
            self.bn6 = nn.BatchNorm2d(256)
            self.pool3 = nn.MaxPool2d(2, 2)  # 8x8 -> 4x4
            self.dropout3 = nn.Dropout2d(0.3)
            
            # Classifier: 256 channels x 4 x 4 spatial positions -> 512 -> n_classes
            self.flatten = nn.Flatten()
            self.fc1 = nn.Linear(256 * 4 * 4, 512)
            self.bn_fc = nn.BatchNorm1d(512)
            self.dropout4 = nn.Dropout(0.5)
            self.fc2 = nn.Linear(512, n_classes)
        
        def forward(self, x):
            # Returns the raw output of fc2 (no softmax is applied here).
            # Block 1
            x = F.relu(self.bn1(self.conv1(x)))
            x = F.relu(self.bn2(self.conv2(x)))
            x = self.pool1(x)
            x = self.dropout1(x)
            
            # Block 2
            x = F.relu(self.bn3(self.conv3(x)))
            x = F.relu(self.bn4(self.conv4(x)))
            x = self.pool2(x)
            x = self.dropout2(x)
            
            # Block 3
            x = F.relu(self.bn5(self.conv5(x)))
            x = F.relu(self.bn6(self.conv6(x)))
            x = self.pool3(x)
            x = self.dropout3(x)
            
            # Classifier
            x = self.flatten(x)
            x = F.relu(self.bn_fc(self.fc1(x)))
            x = self.dropout4(x)
            x = self.fc2(x)
            return x
    
    # ============================================================================
    # DATA AUGMENTATION FUNCTION (PROVIDED)
    # ============================================================================
    def augment_batch(images_tensor):
        '''
        Apply random horizontal flip and random crop to a batch of images.

        Expects a 4-D tensor (flips along dim 3 and pads the last two dims).
        Works on a clone, so the input tensor is not modified; the augmented
        clone is returned.
        '''
        aug = images_tensor.clone()
        # Number of images in this batch (local name; shadows the outer batch_size argument)
        batch_size = aug.shape[0]
        
        # Random horizontal flip (50% probability)
        flip_mask = torch.rand(batch_size, device=aug.device) > 0.5
        aug[flip_mask] = torch.flip(aug[flip_mask], dims=[3])
        
        # Random crop with padding (shift by up to 4 pixels)
        # Reflect-pad 4 pixels on every side, then cut a 32x32 window per image at a
        # random offset in [0, 8] (randint's upper bound 9 is exclusive).
        padded = F.pad(aug, (4, 4, 4, 4), mode='reflect')
        for i in range(batch_size):
            h_start = torch.randint(0, 9, (1,)).item()
            w_start = torch.randint(0, 9, (1,)).item()
            aug[i] = padded[i, :, h_start:h_start+32, w_start:w_start+32]
        
        return aug
    
    # ============================================================================
    # DATA PREPARATION (PROVIDED)
    # ============================================================================
    # Convert numpy arrays to PyTorch tensors and move to device
    # permute(0, 3, 1, 2) reorders channels-last (N, H, W, C) into (N, C, H, W).
    # The whole training set is placed on `device` (defined in the imports cell) up front.
    X_train_tensor = torch.from_numpy(train_images).float().permute(0, 3, 1, 2).to(device)
    y_train_tensor = torch.from_numpy(train_labels).long().to(device)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    X_test_tensor = torch.from_numpy(test_images).float().permute(0, 3, 1, 2).to(device)
    
    # ============================================================================
    # MODEL INITIALIZATION (PROVIDED)
    # ============================================================================
    model = ProperCNN().to(device)
    
    # Initialize weights using Kaiming initialization
    # Conv and Linear weights: Kaiming normal (fan_out, relu) with zero biases;
    # BatchNorm layers: weight 1 and bias 0.
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            nn.init.constant_(m.bias, 0)
    
    # Define loss function, optimizer, and scheduler
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    
    # ============================================================================
    # TODO: TRAINING LOOP (YOUR TASK)
    # ============================================================================
    # Implement the training loop here. Follow these steps:
    # 1. Set model to training mode: model.train()
    # 2. Loop over epochs
    # 3. For each epoch, loop over batches from train_loader
    # 4. For each batch:
    #    a. Apply data augmentation: X_batch_aug = augment_batch(X_batch)
    #    b. Zero gradients: optimizer.zero_grad()
    #    c. Forward pass: outputs = model(X_batch_aug)
    #    d. Compute loss: loss = criterion(outputs, y_batch)
    #    e. Backward pass: loss.backward()
    #    f. Clip gradients: torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    #    g. Update weights: optimizer.step()
    # 5. After each epoch, update learning rate: scheduler.step()
    # 6. (Optional) Print training progress every 10 epochs
    
    print("Starting training...")
    
    # YOUR CODE HERE
    # Replace the raise below with the training loop; while it is in place, nothing
    # after this line is reached.
    raise NotImplementedError("Implement the training loop")
    
    # ============================================================================
    # TODO: EVALUATION (YOUR TASK)
    # ============================================================================
    # After training, evaluate the model on the test set:
    # 1. Set model to evaluation mode: model.eval()
    # 2. Use torch.no_grad() context
    # 3. Create a DataLoader for test data (no shuffling needed)
    # 4. Loop over test batches and collect predictions
    # 5. Compute accuracy using the accuracy() function from utils
    # 6. Return the test accuracy
    
    # YOUR CODE HERE
    # Replace the raise below with the evaluation code and return the test accuracy.
    raise NotImplementedError("Implement the evaluation")


In [None]:
# Run test_exercise_7_proper_cnn (from utils) on the implementation and show its result
res = test_exercise_7_proper_cnn(student_proper_cnn)
show_result("Exercise 3 – Proper CNN", res)

# Optional: also train and score the deeper CNN on the dataset loaded above.
# An unfinished implementation raises NotImplementedError; print a reminder in that case.
try:
    acc = student_proper_cnn(X_train, y_train, X_test, y_test)
except NotImplementedError:
    print("Implement student_proper_cnn above.")
else:
    print(f"Proper CNN accuracy: {acc:.3f}")


## 5. Discussion

Briefly reflect on your results:

- Did the deeper CNN outperform the baseline models? By how much?
- What were the key techniques that helped achieve >50% accuracy?
- Why is it important to compare against simple baselines?
- How does PyTorch compare to implementing CNNs from scratch with NumPy?

_Provide your answers here._
