# Day 28: CS231n -- CNNs for Visual Recognition

> Stanford CS231n Course Notes -- Andrej Karpathy, Fei-Fei Li

This notebook walks through core CNN components from scratch:
1. Convolution layer (naive + im2col)
2. Max pooling layer
3. ReLU activation
4. SimpleCNN end-to-end
5. VGGNet-16 parameter analysis

**Reference:** https://cs231n.github.io/convolutional-networks/

In [None]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
# Notebook-wide plot defaults: a roomy figure and a readable font size.
plt.rcParams.update({
    "figure.figsize": (10, 6),
    "font.size": 12,
})

## 1. The Output Size Formula

The most important equation in CNNs (from CS231n):

$$\text{output size} = \frac{W - F + 2P}{S} + 1$$

Where:
- **W** = input size
- **F** = filter size
- **P** = padding
- **S** = stride

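If `(W - F + 2P)` is not evenly divisible by `S`, the formula produces a non-integer and the hyperparameters are invalid: the filter cannot be placed neatly and symmetrically across the input.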

In [None]:
def output_size(W, F, P, S):
    """Return the spatial output size given by the CS231n formula.

    Computes ``(W - F + 2P) / S + 1`` along one spatial dimension.

    Args:
        W: Input size.
        F: Filter (receptive field) size.
        P: Zero-padding applied to each side.
        S: Stride.

    Returns:
        The output size, ``(W - F + 2*P) // S + 1``.

    Raises:
        ValueError: If ``W - F + 2*P`` is not evenly divisible by ``S``,
            i.e. the formula would produce a non-integer and the
            hyperparameters are invalid.
    """
    steps, remainder = divmod(W - F + 2 * P, S)
    if remainder:
        # Flooring here would silently hide an invalid configuration.
        raise ValueError(
            f"Invalid hyperparameters: (W - F + 2P) = {W - F + 2 * P} "
            f"is not divisible by stride S = {S}"
        )
    return steps + 1

# Worked examples from CS231n; each size is computed once and reused for
# both the height and the width of the square output.
same_pad = output_size(32, 3, 1, 1)
print("Case 1: 32x32 input, 3x3 filter, pad=1, stride=1")
print(f"  Output: {same_pad}x{same_pad}  (preserved)")

alexnet_conv1 = output_size(227, 11, 0, 4)
print("\nCase 2: AlexNet first layer - 227x227, 11x11 filter, stride=4")
print(f"  Output: {alexnet_conv1}x{alexnet_conv1}")

pooled = output_size(32, 2, 0, 2)
print("\nCase 3: 2x2 max pooling stride 2 on 32x32")
print(f"  Output: {pooled}x{pooled}  (halved)")

## 2. Naive Convolution Forward Pass

The convolution slides each filter across the input volume,
computing a dot product at every spatial position.

In [None]:
def conv_forward_naive(x, w, b, stride=1, pad=0):
    """Compute a convolution forward pass with four explicit nested loops.

    Args:
        x: Input array of shape (N, C, H, W).
        w: Filter array of shape (K, C, FH, FW).
        b: Per-filter biases, indexed as ``b[k]`` for ``k`` in ``range(K)``.
        stride: Step between neighbouring receptive fields.
        pad: Number of zeros added on each side of both spatial axes.

    Returns:
        Array of shape (N, K, H_out, W_out) where
        ``H_out = (H - FH + 2*pad) // stride + 1`` and likewise for W_out.

    Raises:
        ValueError: If the filter depth does not equal the number of input
            channels.
    """
    N, C, H, W = x.shape
    K, C_w, FH, FW = w.shape
    if C_w != C:
        # Without this check a depth of 1 on either side would broadcast
        # silently in `region * w[k]` and yield a wrong result.
        raise ValueError(
            f"Filter depth {C_w} does not match input channels {C}"
        )
    H_out = (H - FH + 2 * pad) // stride + 1
    W_out = (W - FW + 2 * pad) // stride + 1

    # Zero-pad only the two spatial axes; batch and channel axes are untouched.
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode="constant")
    out = np.zeros((N, K, H_out, W_out))

    for n in range(N):
        for k in range(K):
            for i in range(H_out):
                for j in range(W_out):
                    h_s = i * stride
                    w_s = j * stride
                    # Receptive field spans every channel of sample n.
                    region = x_pad[n, :, h_s:h_s + FH, w_s:w_s + FW]
                    out[n, k, i, j] = np.sum(region * w[k]) + b[k]
    return out

# Smoke test: a 3x3 convolution with pad=1 should keep the 8x8 spatial size.
np.random.seed(42)  # fixed seed so reruns draw the same tensors
x = np.random.randn(1, 3, 8, 8)
w = np.random.randn(4, 3, 3, 3)
b = np.zeros(4)

out = conv_forward_naive(x, w, b, stride=1, pad=1)
print(
    f"Input shape:  {x.shape}",
    f"Filter shape: {w.shape}",
    f"Output shape: {out.shape}",
    "Expected:     (1, 4, 8, 8)  -- pad=1 preserves spatial size",
    sep="\n",
)

## 3. im2col: The Fast Convolution Trick

The **im2col** trick reshapes overlapping input patches into columns,
then convolution becomes a single matrix multiply.

In [None]:
def im2col(x, FH, FW, stride=1, pad=0):
    """Unfold the receptive fields of ``x`` into columns.

    ``x`` has shape (N, C, H, W). The result has shape
    (N, C*FH*FW, H_out*W_out): one column per output position, holding the
    flattened (channel, filter-row, filter-col) patch seen at that position.
    """
    n_batch, n_chan, height, width = x.shape
    out_h = (height + 2 * pad - FH) // stride + 1
    out_w = (width + 2 * pad - FW) // stride + 1

    spatial = (pad, pad)
    padded = np.pad(x, ((0, 0), (0, 0), spatial, spatial), mode="constant")

    patches = np.zeros((n_batch, n_chan, FH, FW, out_h, out_w))
    # For each offset inside the filter window, grab in one strided slice
    # every input value that lands on that offset across all output positions.
    for dy, dx in np.ndindex(FH, FW):
        rows = slice(dy, dy + stride * out_h, stride)
        cols = slice(dx, dx + stride * out_w, stride)
        patches[:, :, dy, dx, :, :] = padded[:, :, rows, cols]
    return patches.reshape(n_batch, n_chan * FH * FW, out_h * out_w)

def conv_forward_im2col(x, w, b, stride=1, pad=0):
    """Convolve by unfolding patches with im2col and doing a matrix multiply.

    ``x`` has shape (N, C, H, W) and ``w`` has shape (K, C, FH, FW); the
    result has shape (N, K, H_out, W_out).
    """
    n_batch, _, height, width = x.shape
    n_filters, _, fh, fw = w.shape
    out_h = (height - fh + 2 * pad) // stride + 1
    out_w = (width - fw + 2 * pad) // stride + 1

    patches = im2col(x, fh, fw, stride, pad)
    # Each filter becomes one row, so every output position is a dot product.
    kernel_rows = w.reshape(n_filters, -1)

    flat_out = np.zeros((n_batch, n_filters, out_h * out_w))
    for idx, sample_cols in enumerate(patches):
        flat_out[idx] = kernel_rows @ sample_cols + b.reshape(-1, 1)
    return flat_out.reshape(n_batch, n_filters, out_h, out_w)

# Compare naive vs im2col on the same inputs
out_naive = conv_forward_naive(x, w, b, stride=1, pad=1)
out_fast = conv_forward_im2col(x, w, b, stride=1, pad=1)
print(f"Max difference: {np.max(np.abs(out_naive - out_fast)):.2e}")
# Only claim agreement after checking it; the two paths sum in different
# orders, so compare within floating-point tolerance rather than bit-for-bit.
if np.allclose(out_naive, out_fast):
    print("Both methods produce identical results.")
else:
    print("WARNING: naive and im2col outputs disagree beyond floating-point tolerance.")

## 4. Max Pooling

CS231n: *"The most common form is a pooling layer with filters of size 2x2
applied with a stride of 2, which discards exactly 75% of the activations."*

In [None]:
def max_pool(x, pool_size=2, stride=2):
    """Max-pool each channel of ``x`` (shape (N, C, H, W)) over square windows.

    With the defaults (2x2 window, stride 2) the spatial dimensions are
    halved. Returns an array of shape (N, C, H_out, W_out) where
    ``H_out = (H - pool_size) // stride + 1`` and likewise for W_out.
    """
    n_batch, n_chan, height, width = x.shape
    out_h = (height - pool_size) // stride + 1
    out_w = (width - pool_size) // stride + 1

    pooled = np.zeros((n_batch, n_chan, out_h, out_w))
    for row, col in np.ndindex(out_h, out_w):
        top = row * stride
        left = col * stride
        window = x[:, :, top:top + pool_size, left:left + pool_size]
        # Reduce over the two spatial axes, for all samples and channels at once.
        pooled[:, :, row, col] = window.max(axis=(2, 3))
    return pooled

# Demo: pool a random 4-channel 8x8 volume with the default 2x2 / stride-2 window.
pool_in = np.random.randn(1, 4, 8, 8)
pool_out = max_pool(pool_in)
print(
    f"Input:  {pool_in.shape}",
    f"Output: {pool_out.shape}  -- halved spatial dimensions",
    sep="\n",
)

## 5. SimpleCNN: Full Forward Pass

Architecture: CONV(3x3, 8) -> ReLU -> Pool -> CONV(3x3, 16) -> ReLU -> Pool -> FC(10)

A minimal CNN for CIFAR-10 (32x32x3 input, 10 classes).

In [None]:
def relu(x):
    """Rectified linear unit: elementwise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

class SimpleCNN:
    """Minimal CNN: CONV(3x3, 8) -> ReLU -> Pool -> CONV(3x3, 16) -> ReLU -> Pool -> FC.

    Args:
        input_shape: ``(channels, height, width)`` of one input sample.
            Defaults to CIFAR-10's ``(3, 32, 32)``.
        num_classes: Number of output scores per sample (default 10).
        weight_scale: Multiplier applied to the standard-normal weight
            initialisation (default 0.01).

    Attributes:
        params: Dict holding the weights ``W1``/``W2``/``W3`` and biases
            ``b1``/``b2``/``b3``.
    """

    def __init__(self, input_shape=(3, 32, 32), num_classes=10, weight_scale=0.01):
        channels, height, width = input_shape
        # Both convolutions are 3x3 with pad=1 and stride=1, so they preserve
        # the spatial size; each 2x2 / stride-2 pool floor-halves it. The FC
        # layer therefore sees 16 maps of size (H // 4, W // 4).
        pooled_h = height // 2 // 2
        pooled_w = width // 2 // 2
        # Draw order (W1, W2, W3) is fixed so seeded runs stay reproducible.
        self.params = {
            "W1": np.random.randn(8, channels, 3, 3) * weight_scale,
            "b1": np.zeros(8),
            "W2": np.random.randn(16, 8, 3, 3) * weight_scale,
            "b2": np.zeros(16),
            "W3": np.random.randn(16 * pooled_h * pooled_w, num_classes) * weight_scale,
            "b3": np.zeros(num_classes),
        }

    def forward(self, x):
        """Return class scores of shape (N, num_classes) for a batch ``x``.

        ``x`` must have shape (N, channels, height, width), matching the
        ``input_shape`` the network was built with.
        """
        p = self.params
        # CONV1 -> ReLU -> Pool (spatial size halves)
        h1 = conv_forward_im2col(x, p["W1"], p["b1"], stride=1, pad=1)
        h1 = relu(h1)
        h1 = max_pool(h1)
        # CONV2 -> ReLU -> Pool (spatial size halves again)
        h2 = conv_forward_im2col(h1, p["W2"], p["b2"], stride=1, pad=1)
        h2 = relu(h2)
        h2 = max_pool(h2)
        # FC: flatten each sample to one row before the affine layer
        h2_flat = h2.reshape(x.shape[0], -1)
        scores = h2_flat @ p["W3"] + p["b3"]
        return scores

# Smoke-test the network on a random batch of two 3x32x32 inputs.
model = SimpleCNN()
dummy = np.random.randn(2, 3, 32, 32)
scores = model.forward(dummy)

predicted = np.argmax(scores, axis=1)
total = sum(tensor.size for tensor in model.params.values())
print(
    f"Input:       {dummy.shape}",
    f"Scores:      {scores.shape}",
    f"Predictions: {predicted}",
    f"Total params: {total:,}",
    sep="\n",
)

## 6. Visualizing Filters and Activations

First-layer filters operate on RGB pixels and can be visualized directly.
Deeper filters operate on activation maps and are harder to interpret.

In [None]:
# Show each first-layer kernel as a tiny RGB image.
filters = model.params["W1"]  # (8, 3, 3, 3)

fig, axes = plt.subplots(1, 8, figsize=(16, 2))
fig.suptitle("First-Layer Filters (random init)", fontweight="bold")
for idx, ax in enumerate(axes):
    kernel = filters[idx].transpose(1, 2, 0)  # CHW -> HWC, the layout imshow expects
    lo, hi = kernel.min(), kernel.max()
    # Rescale to [0, 1]; the epsilon guards against a constant kernel.
    ax.imshow((kernel - lo) / (hi - lo + 1e-8))
    ax.set_title(f"F{idx}", fontsize=9)
    ax.axis("off")
plt.tight_layout()
plt.show()

# Feed one random image through CONV1 + ReLU and plot the 8 resulting maps.
image = np.random.randn(1, 3, 32, 32)
conv1_out = conv_forward_im2col(
    image, model.params["W1"], model.params["b1"], stride=1, pad=1
)
act = relu(conv1_out)

fig, axes = plt.subplots(1, 8, figsize=(16, 2))
fig.suptitle("Activation Maps After CONV1 + ReLU", fontweight="bold")
for idx, ax in enumerate(axes):
    ax.imshow(act[0, idx], cmap="viridis")
    ax.set_title(f"Map {idx}", fontsize=9)
    ax.axis("off")
plt.tight_layout()
plt.show()

## 7. VGGNet-16 Parameter Analysis

CS231n's key insight: **89% of VGGNet's parameters are in the FC layers**, but
most of the activation memory (and compute time) is consumed by the early CONV layers.

In [None]:
# VGGNet-16 conv layers as (name, in_depth, out_depth, filter_size).
vgg = [
    ("CONV3-64", 3, 64, 3),
    ("CONV3-64", 64, 64, 3),
    ("CONV3-128", 64, 128, 3),
    ("CONV3-128", 128, 128, 3),
    ("CONV3-256", 128, 256, 3),
    ("CONV3-256", 256, 256, 3),
    ("CONV3-256", 256, 256, 3),
    ("CONV3-512", 256, 512, 3),
    ("CONV3-512", 512, 512, 3),
    ("CONV3-512", 512, 512, 3),
    ("CONV3-512", 512, 512, 3),
    ("CONV3-512", 512, 512, 3),
    ("CONV3-512", 512, 512, 3),
]
# Fully connected layers as (name, parameter_count); the +1 is the bias.
fc = [
    ("FC-4096", (7*7*512+1)*4096),
    ("FC-4096", (4096+1)*4096),
    ("FC-1000", (4096+1)*1000),
]

# A conv layer has (F*F*D_in weights + 1 bias) per output filter.
conv_counts = [(k * k * depth_in + 1) * depth_out for _, depth_in, depth_out, k in vgg]
names = [spec[0] for spec in vgg] + [label for label, _ in fc]
params = conv_counts + [count for _, count in fc]

total = sum(params)
print(f"VGGNet-16 Total: {total:,} parameters\n")
for layer, count in zip(names, params):
    print(f"  {layer:15s} {count:>12,}  ({count/total*100:5.1f}%)")

# Subtotals per layer family, selected by name prefix.
subtotals = {"CONV": 0, "FC": 0}
for layer, count in zip(names, params):
    for prefix in subtotals:
        if layer.startswith(prefix):
            subtotals[prefix] += count
conv_p = subtotals["CONV"]
fc_p = subtotals["FC"]
print(f"\n  CONV total: {conv_p:>12,}  ({conv_p/total*100:.1f}%)")
print(f"  FC total:   {fc_p:>12,}  ({fc_p/total*100:.1f}%)")

In [None]:
# Left panel: parameters per layer. Right panel: spatial size after each pool.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle("VGGNet-16: Where Do Parameters Live? (from CS231n)", fontweight="bold")

# Colour-code by layer family so the FC bars stand out.
colors = ["steelblue" if layer.startswith("CONV") else "coral" for layer in names]
layer_pos = range(len(names))
millions = [count/1e6 for count in params]
ax1.barh(layer_pos, millions, color=colors, alpha=0.8)
ax1.set_yticks(layer_pos)
ax1.set_yticklabels(names, fontsize=9)
ax1.set_xlabel("Parameters (Millions)")
ax1.set_title("FC layers dominate")
ax1.invert_yaxis()  # first layer at the top

pool_sizes = [224, 112, 56, 28, 14, 7]
stage_pos = range(len(pool_sizes))
ax2.bar(stage_pos, pool_sizes, color="seagreen", alpha=0.7)
ax2.set_xticks(stage_pos)
ax2.set_xticklabels(["Input", "Pool1", "Pool2", "Pool3", "Pool4", "Pool5"])
ax2.set_ylabel("Spatial Size")
ax2.set_title("Spatial dims halve at each pool")
for pos, size in enumerate(pool_sizes):
    # Value label just above each bar.
    ax2.text(pos, size+3, str(size), ha="center", fontsize=10)
plt.tight_layout()
plt.show()

## Key Takeaways

1. **Output size formula**: `(W - F + 2P) / S + 1` -- memorize this
2. **Parameter sharing**: Each filter uses the same weights across the entire input
3. **im2col trick**: Reshapes convolution into matrix multiplication for speed
4. **VGGNet insight**: 89% of params in FC layers; most memory in early CONV layers
5. **Architecture pattern**: Spatial dims shrink while depth grows

---

Next: Try the exercises in `exercises/` to implement these from scratch yourself.