## Importing libraries

In [None]:
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms

<br><br>
<hr>
<br><br>

## Affine layer

In [None]:
def affine_forward(x, w, b):
    """Compute the forward pass of an affine (fully-connected) layer.

    Every sample in ``x`` is flattened into a row of length D and then
    multiplied by the weight matrix.

    Args:
        x: Input array whose first axis is the batch (N); the remaining
            axes must flatten to D values per sample.
        w: Weight matrix of shape (D, M).
        b: Bias added to the (N, M) product via broadcasting.

    Returns:
        Tuple ``(out, cache)``: ``out`` is the (N, M) result and ``cache``
        is ``(x, w, b)`` for the backward pass.
    """
    batch_size = x.shape[0]
    in_features, _ = w.shape
    flat_inputs = x.reshape(batch_size, in_features)
    scores = flat_inputs @ w + b
    cache = (x, w, b)
    return scores, cache


In [None]:
def affine_backward(dout, cache):
    """Compute the backward pass of an affine (fully-connected) layer.

    Args:
        dout: Upstream gradient of shape (N, M).
        cache: Tuple ``(x, w, b)`` as returned by ``affine_forward``.

    Returns:
        Tuple ``(dx, dw, db)``: ``dx`` has the same shape as ``x``, ``dw``
        has shape (D, M) and ``db`` is ``dout`` summed over the batch axis.
    """
    x, w, b = cache
    in_features, _ = w.shape
    # Same (N, D) flattening as the forward pass.
    flat_inputs = x.reshape(x.shape[0], in_features)

    db = np.sum(dout, axis=0)
    dw = np.matmul(flat_inputs.T, dout)
    # The input gradient is computed on flattened rows, then restored to x's layout.
    dx = np.matmul(dout, w.T).reshape(x.shape)
    return dx, dw, db

## Convolution layer

In [None]:
def conv_forward_naive(x, w, b, conv_param):
    """Naive (loop-based) forward pass of a convolutional layer.

    Args:
        x: Input of shape (N, C, H, W).
        w: Filters of shape (F, C, HH, WW).
        b: Per-filter biases, indexed as ``b[f]``.
        conv_param: Dict with keys ``'stride'`` and ``'pad'`` (zero padding
            applied on both sides of the two spatial axes).

    Returns:
        Tuple ``(out, cache)``: ``out`` has shape (N, F, H_out, W_out) and
        ``cache`` is ``(x_pad, w, b, conv_param)``. Note that the cache holds
        the *padded* input.
    """
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param['stride'], conv_param['pad']

    # The filter must tile the padded input exactly along both spatial axes.
    out_dims = [1 + (size + 2 * pad - k) / stride for size, k in ((H, HH), (W, WW))]
    assert all(d == int(d) for d in out_dims)
    H_out, W_out = (int(d) for d in out_dims)

    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)))
    out = np.zeros((N, F, H_out, W_out))

    # Visit every (sample, filter, output row, output column) combination.
    for n, f, row, col in np.ndindex(N, F, H_out, W_out):
        top, left = row * stride, col * stride
        patch = x_pad[n, :, top:top + HH, left:left + WW]
        out[n, f, row, col] = np.dot(patch.ravel(), w[f].ravel()) + b[f]

    return out, (x_pad, w, b, conv_param)


In [None]:
def conv_backward_naive(dout, cache):
    """Naive (loop-based) backward pass of a convolutional layer.

    Args:
        dout: Upstream gradient of shape (N, F, H_out, W_out).
        cache: Tuple ``(x_pad, w, b, conv_param)`` as returned by
            ``conv_forward_naive``; ``x_pad`` is the zero-padded input and
            ``conv_param`` holds ``'stride'`` and ``'pad'``.

    Returns:
        Tuple ``(dx, dw, db)``: ``dx`` is the gradient with respect to the
        unpadded input, ``dw`` has the shape of ``w`` and ``db`` holds one
        value per filter.
    """
    x_pad, w, _, conv_param = cache
    stride = conv_param['stride']
    pad = conv_param['pad']
    N, _, H_pad, W_pad = x_pad.shape
    F, _, HH, WW = w.shape
    _, _, H_out, W_out = dout.shape

    dx_pad = np.zeros_like(x_pad)
    dw = np.zeros_like(w)
    # Each bias contributes once to every output pixel of its filter.
    db = dout.sum(axis=(0, 2, 3))

    for i in range(N):
        for h_o in range(H_out):
            top = h_o * stride
            bottom = top + HH
            for w_o in range(W_out):
                left = w_o * stride
                right = left + WW
                window = x_pad[i, :, top:bottom, left:right]
                for f in range(F):
                    upstream = dout[i, f, h_o, w_o]
                    dw[f] += window * upstream
                    dx_pad[i, :, top:bottom, left:right] += w[f] * upstream

    # Strip the padding using explicit end indices: the slice `pad:-pad`
    # would be empty when pad == 0.
    dx = dx_pad[:, :, pad:H_pad - pad, pad:W_pad - pad]
    return dx, dw, db


## Max pooling


In [None]:

def max_pool_forward_naive(x, pool_param):
    """Naive (loop-based) forward pass of a max-pooling layer.

    Args:
        x: Input of shape (N, C, H, W).
        pool_param: Dict with keys ``'pool_height'``, ``'pool_width'`` and
            ``'stride'``.

    Returns:
        Tuple ``(out, cache)``: ``out`` has shape (N, C, H_out, W_out) and
        ``cache`` is ``(x, pool_param)``.
    """
    N, C, H, W = x.shape
    pool_height = pool_param['pool_height']
    pool_width = pool_param['pool_width']
    stride = pool_param['stride']

    W_out = int((W - pool_width) / stride + 1)
    H_out = int((H - pool_height) / stride + 1)
    out = np.zeros((N, C, H_out, W_out))

    # All channels of one window are pooled at once.
    for n, row, col in np.ndindex(N, H_out, W_out):
        top, left = row * stride, col * stride
        window = x[n, :, top:top + pool_height, left:left + pool_width]
        out[n, :, row, col] = window.max(axis=(1, 2))

    return out, (x, pool_param)



In [None]:
def max_pool_backward_naive(dout, cache):
    """Naive (loop-based) backward pass of a max-pooling layer.

    The upstream gradient of each output pixel is routed to the position of
    the maximum inside its pooling window (the first maximum on ties, as
    selected by ``argmax``).

    Args:
        dout: Upstream gradient of shape (N, C, H_out, W_out).
        cache: Tuple ``(x, pool_param)`` as returned by
            ``max_pool_forward_naive``.

    Returns:
        ``dx``, the gradient with respect to ``x`` (same shape as ``x``).
    """
    x, pool_param = cache
    N, C, H_out, W_out = np.shape(dout)
    pool_height = pool_param['pool_height']
    pool_width = pool_param['pool_width']
    stride = pool_param['stride']

    dx = np.zeros_like(x)  # (N, C, H, W)
    channels = np.arange(C)

    for n, row, col in np.ndindex(N, H_out, W_out):
        top, left = row * stride, col * stride
        bottom, right = top + pool_height, left + pool_width

        # One row per channel so argmax picks the winner within each channel.
        flat_window = x[n, :, top:bottom, left:right].reshape((C, pool_width * pool_height))
        winners = np.argmax(flat_window, axis=1)

        # One-hot mask marking the max position of every channel.
        mask = np.zeros_like(flat_window)
        mask[channels, winners] = 1.0
        mask = mask.reshape((C, pool_height, pool_width))

        upstream = dout[n, :, row, col].reshape((C, 1, 1))
        dx[n, :, top:bottom, left:right] += mask * upstream

    return dx


## ReLU activation functions

In [None]:
def relu_forward(x):
    """Apply the ReLU activation element-wise.

    Args:
        x: Input array.

    Returns:
        Tuple ``(out, cache)``: ``out`` is ``max(x, 0)`` element-wise and
        ``cache`` is the input ``x`` itself (not a copy).
    """
    activated = np.maximum(x, 0)
    return activated, x


### ReLU backward

In [None]:
def relu_backward(dout, cache):
    """Compute the backward pass of the ReLU activation.

    The gradient passes through unchanged where the forward input was
    strictly positive and is zero elsewhere.

    Args:
        dout: Upstream gradient, broadcast-compatible with ``cache``.
        cache: The forward-pass input ``x`` as returned by ``relu_forward``.
            It is only read; the array is not modified.

    Returns:
        ``dx``, the gradient with respect to the forward input.
    """
    x = cache
    # Build a fresh 0/1 mask instead of overwriting the cached activations in
    # place, which would corrupt the caller's array. The mask keeps the
    # cache's dtype so the result dtype matches the previous implementation.
    mask = (x > 0).astype(x.dtype)
    dx = np.multiply(mask, dout)

    return dx

<br><br>
<hr>
<br><br>

## Simple multi layer network for MNIST

In [None]:
class SimpleCNN:
    """Small NumPy CNN: conv -> ReLU -> conv -> ReLU -> max-pool -> FC -> ReLU -> FC.

    The layer sizes are hard-coded. The first convolution takes one input
    channel, and ``W3`` expects 16 * 7 * 7 pooled features, which is what
    28x28 inputs produce with the strides and padding configured here.
    """

    def __init__(self):
        """Create the parameters with small Gaussian weights and zero biases."""
        np.random.seed(231)  # for reproducibility (seeds NumPy's global RNG)

        # Hardcoded network dimensions
        self.num_classes = 10
        self.hidden_dim = 1024

        # Conv1: 8 filters of size 4x4 over 1 input channel
        self.conv_param1 = {'stride': 2, 'pad': 1}
        self.W1 = 0.01 * np.random.randn(8, 1, 4, 4)
        self.b1 = np.zeros(8)

        # Conv2: 16 filters of size 3x3 over the 8 channels from Conv1
        self.conv_param2 = {'stride': 1, 'pad': 1}
        self.W2 = 0.01 * np.random.randn(16, 8, 3, 3)
        self.b2 = np.zeros(16)

        # Fully-connected hidden layer on the flattened pooled feature map
        self.W3 = 0.01 * np.random.randn(16 * 7 * 7, self.hidden_dim)
        self.b3 = np.zeros(self.hidden_dim)

        # Output layer producing one score per class
        self.W4 = 0.01 * np.random.randn(self.hidden_dim, self.num_classes)
        self.b4 = np.zeros(self.num_classes)

    def forward(self, X):
        """Run the forward pass and store every layer's cache on ``self``.

        Args:
            X: Batch of inputs, passed straight to the first convolution.

        Returns:
            Class scores from the output affine layer.
        """
        # Conv1 -> ReLU
        conv1, self.cache1 = conv_forward_naive(X, self.W1, self.b1, self.conv_param1)
        act1, self.relu_cache1 = relu_forward(conv1)

        # Conv2 -> ReLU
        conv2, self.cache2 = conv_forward_naive(act1, self.W2, self.b2, self.conv_param2)
        act2, self.relu_cache2 = relu_forward(conv2)

        # 2x2 max pooling with stride 2
        pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
        pooled, self.cache3 = max_pool_forward_naive(act2, pool_param)

        # Remember the pooled shape so backward() can undo the flattening.
        self.cache_reshape = pooled.shape
        flattened = pooled.reshape(pooled.shape[0], -1)

        # Hidden fully-connected layer -> ReLU
        hidden, self.cache4 = affine_forward(flattened, self.W3, self.b3)
        hidden_act, self.relu_cache4 = relu_forward(hidden)

        # Output layer
        scores, self.cache5 = affine_forward(hidden_act, self.W4, self.b4)
        return scores

    def backward(self, scores, y):
        """Compute the softmax cross-entropy loss and all parameter gradients.

        Uses the caches stored by the most recent ``forward`` call.

        Args:
            scores: Class scores returned by ``forward``, shape (N, classes).
            y: Integer class labels used to index the score columns.

        Returns:
            Tuple ``(loss, grads)`` where ``grads`` maps ``'W1'``..``'W4'``
            and ``'b1'``..``'b4'`` to their gradients.
        """
        grads = {}
        N = scores.shape[0]
        rows = np.arange(N)

        # Softmax; the row-wise max is subtracted for numerical stability.
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        probs = np.exp(shifted) / np.sum(np.exp(shifted), axis=1, keepdims=True)

        # Mean cross-entropy over the batch
        loss = -np.sum(np.log(probs[rows, y])) / N

        # Gradient of the loss with respect to the scores
        dscores = probs.copy()
        dscores[rows, y] -= 1
        dscores /= N

        # Output layer
        d_hidden_act, grads['W4'], grads['b4'] = affine_backward(dscores, self.cache5)

        # ReLU -> hidden fully-connected layer
        d_hidden = relu_backward(d_hidden_act, self.relu_cache4)
        d_flat, grads['W3'], grads['b3'] = affine_backward(d_hidden, self.cache4)

        # Undo the flattening, then max pooling
        d_pooled = d_flat.reshape(self.cache_reshape)
        d_act2 = max_pool_backward_naive(d_pooled, self.cache3)

        # ReLU -> Conv2
        d_conv2 = relu_backward(d_act2, self.relu_cache2)
        d_act1, grads['W2'], grads['b2'] = conv_backward_naive(d_conv2, self.cache2)

        # ReLU -> Conv1 (the gradient with respect to the input is not needed)
        d_conv1 = relu_backward(d_act1, self.relu_cache1)
        _, grads['W1'], grads['b1'] = conv_backward_naive(d_conv1, self.cache1)

        return loss, grads

    def update(self, grad, learning_rate):
        """Apply one plain gradient-descent step to every parameter.

        Args:
            grad: Dict of gradients keyed by parameter name, as returned by
                ``backward``.
            learning_rate: Step size multiplying each gradient.
        """
        attributes = vars(self)
        for name in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3', 'W4', 'b4'):
            attributes[name] -= learning_rate * grad[name]

<br><br>
<hr>
<br><br>

## Train network

### Initialize network and hyperparameters

In [None]:
# Optimisation settings
num_epochs = 10
batch_size = 20
learning_rate = 0.1

# Number of leading training samples kept when the dataset is subset
max_data = 100

# Network to be trained
simple_model = SimpleCNN()

### load MNIST dataset

In [None]:
# Preprocessing: convert each image to a tensor, then normalize with
# mean 0.5 and std 0.5
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# MNIST training split, restricted to the first `max_data` samples
trainset = torch.utils.data.Subset(
    torchvision.datasets.MNIST(
        root='./data', train=True, download=True, transform=transform
    ),
    range(max_data),
)

# Shuffled mini-batches of `batch_size` samples
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2
)

### run training
The goal is to train the model on a subset of MNIST. If the loss decreases, your implementation works — there is no need for optimal performance here. This only confirms basic functionality; further tuning can come later.

In [None]:
def train(model, dataloader, lr, num_epochs):
    """Train ``model`` with one gradient step per batch and print the loss.

    Args:
        model: Object exposing ``forward(X)``, ``backward(scores, y)``
            returning ``(loss, grads)``, and ``update(grads, lr)``.
        dataloader: Iterable of ``(X_batch, y_batch)`` pairs whose items
            provide a ``.numpy()`` method.
        lr: Learning rate forwarded to ``model.update``.
        num_epochs: Number of full passes over ``dataloader``.

    The value printed for each epoch is the *sum* of the batch losses, not
    their mean.
    """
    for epoch_number in range(1, num_epochs + 1):
        total_loss = 0
        for inputs, labels in dataloader:
            # The model works on NumPy arrays rather than tensors.
            inputs = inputs.numpy()
            labels = labels.numpy()

            batch_loss, grads = model.backward(model.forward(inputs), labels)
            model.update(grads, lr)

            total_loss += batch_loss

        print(f"Epoch {epoch_number:02d}, Loss: {total_loss:.4f}")


# Run training on the MNIST subset
train(
    model=simple_model,
    dataloader=trainloader,
    lr=learning_rate,
    num_epochs=num_epochs,
)

Epoch 01, Loss: 11.5139
Epoch 02, Loss: 11.4976
Epoch 03, Loss: 11.4812
Epoch 05, Loss: 11.4550
Epoch 06, Loss: 11.4437
Epoch 07, Loss: 11.4341
Epoch 08, Loss: 11.4211
Epoch 09, Loss: 11.4107
Epoch 10, Loss: 11.4030
