# LeNet 5 Architecture

The LeNet-5 architecture, introduced by Yann LeCun in 1998, is a pioneering convolutional neural network. A variant of LeNet-5 is given below:

MNIST Image (28×28) → C1 (28×28×6) → S2 (14×14×6) → C3 (14×14×16) → S4 (7×7×16) → C5 (120) → F6 (84) → Output (10)

1. Input: 28x28 grayscale image
2. C1: Convolutional layer (6 feature maps, 5x5 kernels, padding 2 so the 28x28 spatial size is preserved)
3. S2: Average pooling layer (2x2)
4. C3: Convolutional layer (16 feature maps, 5x5 kernels, padding 2 so the 14x14 spatial size is preserved)
5. S4: Average pooling layer (2x2)
6. C5: Fully connected layer (120 units)
7. F6: Fully connected layer (84 units)
8. Output: Fully connected layer (10 units)

# Do the following

- **Q2a** Build LeNet-5 Architecture. Note that the activation function is Sigmoid.

- **Q2b** The code provides the parameter count for LeNet-5. Justify this count by calculating the number of parameters of each layer by hand.

- **Q2c** Modify the code to use ReLU activation instead of Sigmoid. Which activation function is better: ReLU or Sigmoid? Explain.

- **Q2d** What changes would you make to this architecture to handle CIFAR-10 images?

In [29]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from random import randint
import time
# Clear the output and plot updated metrics
from IPython.display import clear_output, display
import torch
from sklearn.metrics import roc_curve, precision_recall_curve, auc, average_precision_score
from TutQ2utils import *

In [None]:
def get_device():
    """Pick the torch device to run on and print which one was chosen.

    The preference order is CUDA, then MPS, then CPU.

    Returns:
        torch.device: the selected device.
    """
    # A CUDA GPU takes priority whenever one is available.
    if torch.cuda.is_available():
        print("Using CUDA device.")
        return torch.device("cuda")

    # MPS (Apple M-series chips and newer macOS versions); getattr guards
    # against torch builds where ``torch.backends`` has no ``mps`` attribute.
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend and mps_backend.is_available():
        print("Using MPS device.")
        return torch.device("mps")

    # Neither accelerator is available: fall back to the CPU.
    print("Using CPU.")
    return torch.device("cpu")

# Select the compute device once; later cells pass `device` to the model and
# to the training/evaluation helpers.
device = get_device()
print(device)

In [None]:
# DO NOT REMOVE THIS CELL – THIS DOWNLOADS THE MNIST DATASET
# RUN THIS CELL BEFORE YOU RUN THE REST OF THE CELLS BELOW
# Requires torchvision to be installed.
from torchvision import datasets

# Fetch the MNIST train and test splits into the current directory (~63MB).
mnist_train = datasets.MNIST("./", train=True, download=True)
mnist_test = datasets.MNIST("./", train=False, download=True)

# Images and labels of each split.
x_train, y_train = mnist_train.data, mnist_train.targets
x_test, y_test = mnist_test.data, mnist_test.targets

# Floating-point copies of the images.
x_train_float = x_train.float()
x_test_float = x_test.float()

# Mean and standard deviation over all training pixels. Why?
# (Later cells standardize inputs as (x - mean) / std.)
# Both statistics are moved to `device` so they live on the same device as
# the data they are combined with, avoiding device mismatch errors.
mean = x_train_float.mean().to(device)
std = x_train_float.std().to(device)

print(x_train.shape)

# **Q2a** Build LeNet-5 Architecture. Note that the activation function is Sigmoid.

In [32]:
class LeNet_wiki(nn.Module):
    """LeNet-5 variant for single-channel 28x28 images (MNIST).

    Layer shapes (channels x height x width):
        input 1x28x28 -> C1 6x28x28 -> S2 6x14x14 -> C3 16x14x14
        -> S4 16x7x7 -> C5 120 -> F6 84 -> output 10

    Both convolutions use 5x5 kernels with padding 2, so they preserve the
    spatial size; each 2x2 average pooling halves it.

    Parameter count (weights + biases):
        C1:     6 * (1 * 5 * 5) + 6   =     156
        C3:     16 * (6 * 5 * 5) + 16 =   2,416
        C5:     (16 * 7 * 7) * 120 + 120 = 94,200
        F6:     120 * 84 + 84         =  10,164
        Output: 84 * 10 + 10          =     850
        Total                         = 107,786

    Args:
        activation: optional ``nn.Module`` applied after C1, C3, C5 and F6.
            Defaults to ``nn.Sigmoid()``; pass e.g. ``nn.ReLU()`` to swap the
            non-linearity without touching the rest of the network.
    """

    def __init__(self, activation=None):

        super(LeNet_wiki, self).__init__()

        # C1: 1 x 28 x 28 -> 6 x 28 x 28
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, padding=2)
        # S2: 6 x 28 x 28 -> 6 x 14 x 14
        self.pool1 = nn.AvgPool2d(kernel_size=2)
        # C3: 6 x 14 x 14 -> 16 x 14 x 14
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, padding=2)
        # S4: 16 x 14 x 14 -> 16 x 7 x 7
        self.pool2 = nn.AvgPool2d(kernel_size=2)
        # C5: flattened 16 * 7 * 7 = 784 -> 120
        self.fc1 = nn.Linear(16 * 7 * 7, 120)
        # F6: 120 -> 84
        self.fc2 = nn.Linear(120, 84)
        # Output: 84 -> 10 class scores
        self.fc3 = nn.Linear(84, 10)

        # Shared non-linearity (it has no parameters, so reusing one module
        # instance for every layer is fine).
        self.activation = nn.Sigmoid() if activation is None else activation

    def forward(self, x):
        """Map a (batch, 1, 28, 28) tensor to (batch, 10) raw class scores.

        No softmax is applied here; the scores are meant to be fed to a loss
        such as ``nn.CrossEntropyLoss`` that works on unnormalized scores.
        """

        #Conv1
        x = self.conv1(x)
        x = self.activation(x)
        x = self.pool1(x)

        #Conv2
        x = self.conv2(x)
        x = self.activation(x)
        x = self.pool2(x)

        # Flatten everything except the batch dimension: (batch, 784)
        x = x.flatten(start_dim=1)

        #Fully connected layers
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.fc3(x)

        return x

## Instantiate a LeNet model

In [None]:
# Build the LeNet_wiki network, move it to the selected device, and print
# its layer structure.
LeNet = LeNet_wiki().to(device)
print(LeNet)


# **Q2b** The code below provides the parameter count for LeNet-5. Justify this count by calculating the number of parameters of each layer by hand.

In [None]:
# Total number of scalar entries over every parameter tensor of the model.
lenet5_parameter_count = sum(tensor.numel() for tensor in LeNet.parameters())
print(f'There are {lenet5_parameter_count} parameters in this neural network')

# Select Loss function, learning rate and batch size

In [35]:
# Loss function: cross-entropy computed on the raw class scores of the network.
criterion = nn.CrossEntropyLoss()
# Initial learning rate.
initial_lr = 0.1
# Mini-batch size (the original name was misspelled as `btach_size`).
batch_size = 128
# Backward-compatible alias so code using the old misspelled name still runs.
btach_size = batch_size

# ROC curves for untrained model

In [None]:
# Bring the normalization statistics back to the CPU first.
mean, std = mean.cpu(), std.cpu()

# Standardize the test images with the training-set mean and std.
x_test_float = x_test.float()
x_test_normalized = (x_test_float - mean) / std
# NOTE(review): x_test_normalized is not passed to the helper below; it
# receives the raw x_test together with mean/std, so it presumably normalizes
# internally -- confirm in TutQ2utils.

auc_scores = calculate_roc_curves_test(
    model=LeNet,
    x_test=x_test,
    y_test=y_test,
    num_classes=10,
    device=device,
    mean=mean,
    std=std,
)

In [37]:
class TrainingMetrics:
    """Histories of training/evaluation metrics plus the run's start time."""

    def __init__(self):
        """Start with empty histories and record the creation time."""
        self.start_time = time.time()
        # Appended to on every update() call.
        self.train_losses = []
        self.train_errors = []
        # Appended to only when update() receives a value that is not None.
        self.test_errors = []
        self.mean_aucs = []
        self.mean_aps = []

    def update(self, train_loss = None, train_error = None, mean_auc=None, mean_ap=None, test_error=None):
        """Record one set of metrics.

        `train_loss` and `train_error` are always appended (even when None);
        `mean_auc`, `mean_ap` and `test_error` are appended only when they
        are not None.
        """
        self.train_losses.append(train_loss)
        self.train_errors.append(train_error)

        optional_entries = (
            (self.mean_aucs, mean_auc),
            (self.mean_aps, mean_ap),
            (self.test_errors, test_error),
        )
        for history, value in optional_entries:
            if value is not None:
                history.append(value)

In [38]:
def train_epoch(model, optimizer, x_train, y_train, criterion, device, mean, std, batch_size):
    """
    Train for one epoch over randomly shuffled mini-batches.

    Args:
        model: network to train; it is switched to training mode.
        optimizer: optimizer that updates the model's parameters.
        x_train: training images, indexable by an index tensor. A channel
            dimension is inserted at dim 1, so this presumably has shape
            (N, H, W) -- confirm against the caller.
        y_train: training labels, indexable by the same index tensor.
        criterion: loss called as ``criterion(scores, labels)``.
        device: device each mini-batch is moved to.
        mean, std: statistics used to normalize inputs as (x - mean) / std.
        batch_size: number of samples per mini-batch (the last batch may be
            smaller).

    Returns:
        (train_loss, train_error): the per-batch loss and the per-batch error
        rate, each averaged over the number of mini-batches.

    Raises:
        ValueError: if no mini-batch could be formed (e.g. empty `x_train`).
    """
    model.train()
    # running_loss stores the accumulated training loss in an epoch
    running_loss = 0
    # running_train_error stores the accumulated training error in an epoch
    running_train_error = 0
    # number of batches processed so far
    num_batches = 0

    # A fresh random permutation of the sample indices for this epoch
    shuffled_indices = torch.randperm(len(x_train))

    for count in range(0, len(x_train), batch_size):
        # Reset the gradients to zero (to prevent accumulation across batches)
        optimizer.zero_grad()

        # Select a minibatch, add the channel dimension and move it to the device
        indices = shuffled_indices[count:count + batch_size]
        minibatch_data = x_train[indices].unsqueeze(dim=1).float().to(device)
        minibatch_label = y_train[indices].to(device)

        # Normalize the input data with the dataset mean and standard deviation.
        # The result already lives on `device`, and no gradient with respect to
        # the inputs is needed to train the weights, so the inputs are not
        # marked with requires_grad.
        inputs = (minibatch_data - mean) / std

        # scores are the raw outputs of the last layer; the model applies no
        # softmax (a criterion such as nn.CrossEntropyLoss normalizes the
        # scores internally)
        scores = model(inputs)

        # Calculation of loss
        loss = criterion(scores, minibatch_label)

        # Calculation of gradients with respect to the weights
        loss.backward()

        # Updating the weights
        optimizer.step()

        # Accumulating loss and error rate; error_rate comes from outside
        # this block (presumably TutQ2utils) and must return a value that
        # supports .item()
        running_loss += loss.item()
        running_train_error += error_rate(scores.detach(), minibatch_label).item()

        # Updating the batch count
        num_batches += 1

    if num_batches == 0:
        # Without this check the averages below would fail with an opaque
        # ZeroDivisionError.
        raise ValueError("train_epoch processed no mini-batches: x_train is empty or batch_size is invalid")

    train_loss = running_loss / num_batches
    train_error = running_train_error / num_batches

    return train_loss, train_error

In [None]:
def train_model(model, x_train, y_train, x_test, y_test, num_epochs, batch_size, initial_lr, device, mean, std):
    """
    Main training loop: train, evaluate, log and plot once per epoch.

    Args:
        model: network to train.
        x_train, y_train: training images and labels, forwarded to train_epoch.
        x_test, y_test: test images and labels, forwarded to calculate_metrics.
        num_epochs: number of epochs; epochs are numbered from 1.
        batch_size: mini-batch size forwarded to train_epoch.
        initial_lr: learning rate each epoch's SGD optimizer is created with.
        device: device forwarded to the training and evaluation helpers.
        mean, std: normalization statistics forwarded to the helpers.

    Returns:
        TrainingMetrics: the accumulated per-epoch metric histories.
    """
    metrics = TrainingMetrics()
    criterion = nn.CrossEntropyLoss()
    # Initialize dynamic plots
    metrics_plotter = DynamicMetricsPlot()

    for epoch in range(1, num_epochs + 1):

        # Select the optimizer and initialize the learning rate with initial_lr.
        # NOTE(review): the optimizer is re-created every epoch. That is
        # harmless for plain SGD without momentum, which keeps no state, but
        # it would reset the state of a stateful optimizer.
        optimizer = torch.optim.SGD(model.parameters(), lr=initial_lr)
        # Update the learning rate; presumably the helper adjusts the
        # optimizer for this epoch and returns the rate in use -- confirm.
        current_lr = learning_rate_scheduler(optimizer, epoch, initial_lr)

        # Train for one epoch
        train_loss, train_error = train_epoch(model, optimizer, x_train, y_train, criterion, device, mean, std, batch_size)

        # Calculate ROC and PR curves on test set.
        # The interval is currently 1, so this branch runs every epoch.
        if epoch % 1 == 0:
            epoch_metrics = calculate_metrics(model, x_test, y_test, num_classes=10,device=device, mean=mean, std=std)
            mean_auc = epoch_metrics['mean_auc']
            mean_ap = epoch_metrics['mean_ap']
            test_error = epoch_metrics['test_error']
        else:
            # test_error must be bound here as well; otherwise changing the
            # evaluation interval above would raise NameError below.
            mean_auc = mean_ap = test_error = None

        # Update metrics
        metrics.update(train_loss = train_loss, train_error = train_error, mean_auc = mean_auc, mean_ap = mean_ap, test_error = test_error)

        # Print statistics; elapsed time is in minutes since metrics was created
        elapsed_time = (time.time() - metrics.start_time) / 60
        print_epoch_stats(epoch, elapsed_time, current_lr, train_loss, train_error, test_error, mean_auc, mean_ap)

        metrics_plotter.update(metrics)

    return metrics

# Usage: hyperparameters for this training run.
initial_lr, num_epochs, batch_size = 0.1, 15, 32

# Train LeNet on the training set; the test set is passed in as well.
# The model and the data are given positionally, in train_model's parameter order.
metrics = train_model(
    LeNet, x_train, y_train, x_test, y_test,
    num_epochs=num_epochs,
    batch_size=batch_size,
    initial_lr=initial_lr,
    device=device,
    mean=mean,
    std=std,
)

In [None]:
# Calculate and plot both ROC and PR curves for LeNet on the test set.
# The helper's two return values are unpacked as AUC scores and AP scores
# (presumably one entry per class -- confirm in TutQ2utils).
auc_scores, ap_scores = plot_roc_pr_curves(model=LeNet,  x_test=x_test, y_test=y_test, num_classes=10, device=device,   mean=mean, std=std)

In [None]:
# Choose a picture at random from the test set (randint is inclusive on both
# ends, hence the -1).
idx = randint(0, len(x_test) - 1)
im = x_test[idx]

# Display the picture
plt.imshow(im.numpy(), cmap='gray')
plt.show()

# Send to device and normalize exactly as during training: the network was
# trained on (x - mean) / std inputs, so feeding it raw 0-255 pixel values
# would not match what it has learned.
im = im.float().to(device)
im = (im - mean.to(device)) / std.to(device)
# View as a batch of 1 with one channel: (1, 1, 28, 28)
im = im.view(1, 28, 28).unsqueeze(dim=1)

# Feed it to the net and display the confidence scores. Inference needs no
# gradients, so it runs in eval mode under no_grad.
LeNet.eval()
with torch.no_grad():
    scores = LeNet(im)
    probs = torch.softmax(scores, dim=1)
show_prob_mnist(probs.cpu())

# **Q2c** Modify the code to use ReLU activation instead of Sigmoid. Which activation function is better: ReLU or Sigmoid? Explain.



# **Q2d** What changes would you make to this architecture to handle CIFAR-10 images?