# Task 7.1 Solution: CNN Architecture Study

**Module:** 7 - Computer Vision  
**Type:** Solution Notebook

---

This notebook contains solutions for the CNN architecture exercises.

In [None]:
# Common imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, List

# Prefer a CUDA GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")

## Exercise 1 Solution: LeNet with MaxPooling

MaxPooling typically works better for object recognition because it keeps the strongest activations ("Was there an edge here?") rather than averaging ("How much edge on average?").

In [None]:
class LeNet5_MaxPool(nn.Module):
    """
    LeNet-5 with MaxPooling instead of AvgPooling.

    MaxPooling typically works better for object recognition because
    it keeps the strongest activations ("Was there an edge here?")
    rather than averaging ("How much edge on average?").

    The first fully connected layer expects 16 feature maps of size 5x5,
    which is what the two (5x5 conv -> 2x2 pool) stages produce from a
    32x32 input: 32 -> 28 -> 14 -> 10 -> 5. Other input sizes will not
    match ``fc1``.

    Args:
        num_classes: Number of output logits produced by the last layer.
        in_channels: Number of channels in the input image
            (3 for RGB, 1 for grayscale). Defaults to 3.
    """

    def __init__(self, num_classes: int = 10, in_channels: int = 3):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels, 6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # Changed from AvgPool
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of images to a batch of ``num_classes`` logits."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # flatten() keeps the batch dimension and, unlike view(), also works
        # when the feature map is not laid out contiguously in memory.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Test: push one random 32x32 RGB image through the network and report the
# output shape and the total parameter count.
model = LeNet5_MaxPool()
x = torch.randn(1, 3, 32, 32)
# Only the output shape is inspected, so skip autograd tracking for this pass.
with torch.no_grad():
    print(f"LeNet5_MaxPool output shape: {model(x).shape}")
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

## Exercise 2 Solution: Compare Gradient Flow in ResNet vs Plain Network

As depth increases, gradients in plain networks tend to vanish, while residual networks maintain gradient flow.

In [None]:
class PlainBlock(nn.Module):
    """Two conv + batch-norm stages applied back to back, without a shortcut."""

    def __init__(self, channels):
        super().__init__()
        # 3x3 kernels with padding=1 keep height and width unchanged.
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        hidden = self.conv1(x)
        hidden = self.bn1(hidden)
        hidden = F.relu(hidden)
        hidden = self.conv2(hidden)
        hidden = self.bn2(hidden)
        # No skip connection: the input is not added back before the last ReLU.
        return F.relu(hidden)


class ResidualBlock(nn.Module):
    """Two conv + batch-norm stages whose result is added to the block input."""

    def __init__(self, channels):
        super().__init__()
        # 3x3 kernels with padding=1 keep height and width unchanged, so the
        # branch output can be added to the input element-wise.
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        branch = self.conv1(x)
        branch = self.bn1(branch)
        branch = F.relu(branch)
        branch = self.conv2(branch)
        branch = self.bn2(branch)
        # Skip connection: add the untouched input before the final ReLU.
        return F.relu(branch + x)


def compare_gradient_flow(num_blocks: int = 10):
    """
    Measure how much gradient reaches the input of a plain vs. a residual stack.

    Builds ``num_blocks`` PlainBlock modules and ``num_blocks`` ResidualBlock
    modules (64 channels each), feeds each stack its own random 1x64x32x32
    input, back-propagates the sum of the stack output, and measures the mean
    absolute gradient with respect to that input.

    As depth increases, gradients in plain networks tend to vanish,
    while residual networks maintain gradient flow.

    Returns:
        A ``(grad_plain, grad_res)`` tuple of Python floats.
    """
    channels = 64

    def mean_abs_input_grad(stack, inputs):
        # Reduce the output to a scalar so backward() fills in inputs.grad.
        stack(inputs).sum().backward()
        return inputs.grad.abs().mean().item()

    # Both stacks are built before any input is sampled.
    plain_stack = nn.Sequential(*(PlainBlock(channels) for _ in range(num_blocks)))
    residual_stack = nn.Sequential(*(ResidualBlock(channels) for _ in range(num_blocks)))

    # Separate leaf tensors so each stack gets its own .grad buffer.
    input_shape = (1, channels, 32, 32)
    plain_input = torch.randn(input_shape, requires_grad=True)
    residual_input = torch.randn(input_shape, requires_grad=True)

    grad_plain = mean_abs_input_grad(plain_stack, plain_input)
    grad_res = mean_abs_input_grad(residual_stack, residual_input)
    return grad_plain, grad_res

# Tabulate the input-gradient magnitude of both stacks at several depths.
print("Gradient Magnitude Comparison (Higher is Better)")
print("=" * 50)
print(f"{'Depth':<10} {'Plain Network':<20} {'ResNet':<20}")
print("-" * 50)

for depth in (5, 10, 20, 30):
    # Each call builds new stacks and samples new random inputs.
    grad_plain, grad_res = compare_gradient_flow(depth)
    print(f"{depth:<10} {grad_plain:<20.6f} {grad_res:<20.6f}")

print("\nConclusion: ResNet maintains gradient flow even at great depth!")

## Challenge Solution: Squeeze-and-Excitation ResNet

SE blocks learn to weight channels by their importance.

Paper: "Squeeze-and-Excitation Networks" (Hu et al., 2018)

In [None]:
class SEBlock(nn.Module):
    """
    Squeeze-and-Excitation block.

    Learns to weight channels by their importance: each channel is averaged
    over its spatial positions ("squeeze"), the resulting per-channel vector
    is passed through a two-layer bottleneck ending in a sigmoid ("excite"),
    and the input is multiplied channel-wise by the resulting gates.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Bottleneck ratio. The hidden layer has
            ``channels // reduction`` units, but never fewer than one.
    """

    def __init__(self, channels: int, reduction: int = 16):
        super().__init__()
        # channels // reduction is 0 whenever channels < reduction, which would
        # leave the bottleneck without any units; keep at least one.
        hidden = max(1, channels // reduction)
        self.squeeze = nn.AdaptiveAvgPool2d(1)
        self.excite = nn.Sequential(
            nn.Linear(channels, hidden, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, channels, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` rescaled per channel; the output has the shape of ``x``."""
        b, c, _, _ = x.size()
        # Squeeze: global average pooling, (b, c, h, w) -> (b, c)
        y = self.squeeze(x).view(b, c)
        # Excite: learn channel weights, reshaped so they broadcast over h and w
        y = self.excite(y).view(b, c, 1, 1)
        # Scale: multiply input by channel weights
        return x * y


class SEBasicBlock(nn.Module):
    """ResNet BasicBlock whose residual branch is re-weighted by an SE block."""
    expansion = 1

    def __init__(self, in_channels: int, out_channels: int, stride: int = 1, reduction: int = 16):
        super().__init__()

        # Residual branch: two 3x3 convolutions; only the first may downsample.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Channel attention applied to the residual branch output.
        self.se = SEBlock(out_channels, reduction)

        # An empty Sequential passes the input through unchanged; a 1x1
        # projection is used when the branch changes resolution or width.
        changes_shape = stride != 1 or in_channels != out_channels
        if changes_shape:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        identity = self.shortcut(x)

        branch = F.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))
        branch = self.se(branch)  # Apply SE attention before the addition

        return F.relu(branch + identity)


class SEResNet18(nn.Module):
    """
    ResNet-18 with Squeeze-and-Excitation blocks.

    Layout: a 3x3 stem convolution (stride 1) on 3-channel input, four stages
    of two SEBasicBlock modules each (64, 128, 256, 512 channels; stages 2-4
    start with a stride-2 block), global average pooling and a linear
    classifier with ``num_classes`` outputs.
    """

    def __init__(self, num_classes: int = 10):
        super().__init__()

        # Running input width for the next block; updated by _make_layer.
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        self.layer1 = self._make_layer(out_channels=64, num_blocks=2, stride=1)
        self.layer2 = self._make_layer(out_channels=128, num_blocks=2, stride=2)
        self.layer3 = self._make_layer(out_channels=256, num_blocks=2, stride=2)
        self.layer4 = self._make_layer(out_channels=512, num_blocks=2, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, out_channels: int, num_blocks: int, stride: int) -> nn.Sequential:
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        block_strides = [stride, *([1] * (num_blocks - 1))]
        stage = []
        for block_stride in block_strides:
            block = SEBasicBlock(self.in_channels, out_channels, block_stride)
            stage.append(block)
            # Every following block consumes this block's output width.
            self.in_channels = out_channels
        return nn.Sequential(*stage)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        pooled = self.avgpool(x)
        # (batch, 512, 1, 1) -> (batch, 512) for the classifier.
        features = pooled.view(pooled.size(0), -1)
        return self.fc(features)

# Test SE-ResNet: push one random 32x32 RGB image through the network and
# report the output shape and the total parameter count.
se_resnet = SEResNet18(num_classes=10)
x = torch.randn(1, 3, 32, 32)
# Only the output shape is inspected, so skip autograd tracking for this pass.
with torch.no_grad():
    print(f"SE-ResNet18 output: {se_resnet(x).shape}")
print(f"Parameters: {sum(p.numel() for p in se_resnet.parameters()):,}")

## Summary

In this solution notebook, we implemented:

1. **LeNet with MaxPooling** - Replaced AvgPool with MaxPool for better feature detection
2. **Gradient Flow Comparison** - Demonstrated how ResNet maintains gradient flow at depth
3. **SE-ResNet** - Added Squeeze-and-Excitation attention to ResNet blocks

Key insights:
- MaxPooling preserves strongest activations (better for recognition)
- Skip connections are essential for training deep networks
- Channel attention (SE blocks) can improve accuracy with minimal overhead

In [None]:
# Cleanup
# Run Python's garbage collector first, then ask PyTorch to release its cached
# GPU memory when CUDA is available.
# NOTE(review): objects still bound to notebook variables (e.g. `model`,
# `se_resnet`, `x`) stay referenced and are presumably not freed by this cell.
import gc
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Cleanup complete!")