# Layer Ddefinition

The layers of AvgPool2D, Convo2D, Dense, TanH, Softmax, Relu are defined here

In [199]:
import numpy as np

In [None]:

class AvgPool2D:
    def __init__(self, pool_size, stride):
        self.ph, self.pw = pool_size
        self.s = stride           
        self.cache = None

    def forward(self, X):
        N, C, H_in, W_in = X.shape
        H_out = (H_in - self.ph) // self.s + 1
        W_out = (W_in - self.pw) // self.s + 1
        out = np.zeros((N, C, H_out, W_out))
        for h in range(H_out):
            for w in range(W_out):
                h_start = h * self.s
                h_end = h_start + self.ph
                w_start = w * self.s
                w_end = w_start + self.pw
                X_slice = X[:, :, h_start:h_end, w_start:w_end]
                out[:, :, h, w] = np.mean(X_slice, axis=(2, 3))
        self.cache = X
        return out

    def backward(self, dOut):
        X = self.cache
        N, C, H_in, W_in = X.shape
        _, _, H_out, W_out = dOut.shape
        dX = np.zeros_like(X)
        distribution_factor = 1.0 / (self.ph * self.pw)

        for h in range(H_out):
            for w in range(W_out):
                h_start = h * self.s
                h_end = h_start + self.ph
                w_start = w * self.s
                w_end = w_start + self.pw
                dOut_slice = dOut[:, :, h, w, np.newaxis, np.newaxis]
                dX_slice = dOut_slice * distribution_factor
                dX[:, :, h_start:h_end, w_start:w_end] += dX_slice


        return dX

In [201]:
def im2col(X, k):
    B, C, H, W = X.shape
    out_h = H - k + 1
    out_w = W - k + 1

    i0 = np.repeat(np.arange(k), k)
    i0 = np.tile(i0, C)
    i1 = np.repeat(np.arange(out_h), out_w)
    j0 = np.tile(np.arange(k), k * C)
    j1 = np.tile(np.arange(out_w), out_h)

    i = i0.reshape(-1,1) + i1.reshape(1,-1)
    j = j0.reshape(-1,1) + j1.reshape(1,-1)

    c = np.repeat(np.arange(C), k*k).reshape(-1,1)

    cols = X[:, c, i, j]
    return cols

def col2im(cols, X_shape, k):
    B, C, H, W = X_shape
    out_h = H - k + 1
    out_w = W - k + 1

    X_grad = np.zeros((B, C, H, W))

    i0 = np.repeat(np.arange(k), k)
    i0 = np.tile(i0, C)
    i1 = np.repeat(np.arange(out_h), out_w)
    j0 = np.tile(np.arange(k), k * C)
    j1 = np.tile(np.arange(out_w), out_h)

    i = i0.reshape(-1,1) + i1.reshape(1,-1)
    j = j0.reshape(-1,1) + j1.reshape(1,-1)
    c = np.repeat(np.arange(C), k*k).reshape(-1,1)

    np.add.at(X_grad, (slice(None), c, i, j), cols.reshape(B, -1, out_h*out_w))

    return X_grad

In [202]:
import numpy as np


class Conv2D:
    def __init__(self, in_channels, out_channels, kernel_size):
        self.in_ch = in_channels
        self.out_ch = out_channels
        self.k = kernel_size

        limit = np.sqrt(1 / (in_channels * kernel_size * kernel_size))
        self.params = {
            "W": np.random.uniform(-limit, limit,
                (out_channels, in_channels, kernel_size, kernel_size)),
            "b": np.zeros(out_channels)
        }
        self.grads = {
            "W": np.zeros_like(self.params["W"]),
            "b": np.zeros_like(self.params["b"])
        }

    def forward(self, X):
        X = np.ascontiguousarray(X)
        self.X = X
        B, C, H, W = X.shape
        k = self.k
        self.out_h = H - k + 1
        self.out_w = W - k + 1

        self.X_col = im2col(X, k)               
        self.X_col = self.X_col.transpose(0, 2, 1)  
        W_col = self.params['W'].reshape(self.out_ch, -1)

        out = self.X_col @ W_col.T
        out = out.transpose(0,2,1).reshape(B, self.out_ch, self.out_h, self.out_w)


        return out

    def backward(self, dout):
        X = np.ascontiguousarray(self.X)
        B = dout.shape[0]
        k = self.k
        C_out = self.out_ch

        dout_flat = dout.reshape(B, C_out, -1).transpose(0,2,1)

        dW_col = dout_flat.transpose(0,2,1) @ self.X_col 
        self.grads["W"] = dW_col.sum(axis=0).reshape(self.params["W"].shape)

        self.grads["b"] = dout.sum(axis=(0,2,3))

        W_col = self.params["W"].reshape(C_out, -1)
        dX_col = dout_flat @ W_col 
        dX_col = dX_col.transpose(0,2,1)

        dX = col2im(dX_col, self.X.shape, k)
        return dX


In [204]:
import numpy as np

class DenseLayer:
    def __init__(self, in_features, out_features):
        limit = np.sqrt(1 / in_features)
        self.params = {}
        self.params['W'] = np.random.uniform(-limit, limit, (in_features, out_features))
        self.params['b'] = np.zeros((1, out_features))

    def forward(self, X):
        self.X = X        
        return X @ self.params['W'] + self.params['b']

    def backward(self, d_out):
        self.grads = {}
        self.grads['W'] = self.X.T @ d_out
        self.grads['b'] = np.sum(d_out, axis=0, keepdims=True)
        return d_out @ self.params['W'].T


In [205]:
class Flatten:
    def forward(self, X):
        self.shape = X.shape
        return X.reshape(X.shape[0], -1)

    def backward(self, d_out):
        return d_out.reshape(self.shape)


In [206]:
import numpy as np

class ReLU:
    def __init__(self):
        pass

    def forward(self, X):
        self.X = X
        return np.maximum(0, X)

    def backward(self, d_out):
        return d_out * (self.X > 0)


In [207]:
class Sequence:
    def __init__(self, layers, loss_fn=None, optimizer=None):
        self.layers = layers
        self.loss_fn = loss_fn
        self.optimizer = optimizer

    def forward(self, X):
        out = X
        for layer in self.layers:
            out = layer.forward(out)
        self.out = out
        return out

    def backward(self, y):
        loss = self.loss_fn.forward(self.out, y)
        grad = self.loss_fn.backward()
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
        return loss

    def step(self):
        if self.optimizer is None:
            return

        for layer in self.layers:
            if hasattr(layer, "params"):
                for name in layer.params:
                    self.optimizer.update(
                        layer.params[name],
                        layer.grads[name]
                    )


In [208]:
import numpy as np

class Softmax:
    def __init__(self):
        pass

    def forward(self, X):
        self.X = X
        shifted = X - np.max(X, axis=1, keepdims=True)
        exp = np.exp(shifted)
        self.out = exp / np.sum(exp, axis=1, keepdims=True)
        return self.out

    def backward(self, d_out,*args):
        B, C = d_out.shape
        dX = np.zeros_like(d_out)

        for b in range(B):
            s = self.out[b].reshape(-1, 1)          # (C,1)
            jac = np.diagflat(s) - (s @ s.T)        # (C,C)
            dX[b] = jac @ d_out[b]

        return dX


In [209]:
import numpy as np

class Tanh:
    def __init__(self):
        pass

    def forward(self, X):
        self.out = np.tanh(X)
        return self.out

    def backward(self, d_out):
        return d_out * (1 - self.out ** 2)


# LossFunctions and Optimizers defined

Here Loss Function Softmax Cross Entropy is definied, and the optimizer of SGD is used

In [210]:
import numpy as np

class SoftmaxCrossEntropy:
    def __init__(self):
        pass

    def forward(self, logits, y_true):
        self.logits = logits
        B, C = logits.shape
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        exp = np.exp(shifted)
        self.probs = exp / np.sum(exp, axis=1, keepdims=True)
        log_likelihood = -np.log(self.probs[np.arange(B), y_true] + 1e-12)
        loss = np.mean(log_likelihood)
        self.y_true = y_true
        return loss

    def backward(self):
        B = self.logits.shape[0]
        grad = self.probs.copy()
        grad[np.arange(B), self.y_true] -= 1
        return grad / B


In [211]:
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, param, grad):
        param -= self.lr * grad


# Model 

The model implemented below follows the LeNet-5 convolutional neural network proposed by LeCun et al. in their 1998 paper on handwritten digit recognition.

The model implemented here however has some changes according to convinence. 

1. The original model used partial connections in C3, which is avoided here. 

1. The original model was trained till convergence, here it is trained till 40 epochs, which is a lot, but with SGD it isn't enough to converge. 

1. The original model used the subsampling layer with learnable parameters, which was avoided here by using simple AvgPooling

1. The original model used a variant of MSE + RBF style activation, but CrossEntropy is Used here.

In [212]:
class Lenet5(Sequence):
    def __init__(self):
        layers = [
            Conv2D(in_channels=1, out_channels=6, kernel_size=5),
            Tanh(),

            AvgPool2D((2,2),2),
            Tanh(),

            Conv2D(in_channels=6 ,out_channels=16, kernel_size=5),
            Tanh(),

            AvgPool2D((2,2),2),
            Tanh(),

            Conv2D(in_channels=16, out_channels=120, kernel_size=5),
            Tanh(),

            Flatten(),
            
            DenseLayer(120, 84),
            Tanh(),

            DenseLayer(84, 10)
        ]
        super().__init__(layers=layers,loss_fn=SoftmaxCrossEntropy(),optimizer=SGD())

## Utility Functions

1. Loading Dataset

1. Evaluation of Model

1. Getting Batches

In [213]:
from sklearn.datasets import fetch_openml
import numpy as np


def load_mnist():
    mnist = fetch_openml("mnist_784", version=1, as_frame=False)

    X = mnist.data.astype(np.float32) / 255.0   # normalize
    y = mnist.target.astype(np.int64)
    X = X.reshape(-1, 1, 28, 28)
    X_padded = np.pad(X, ((0,0), (0,0), (2,2), (2,2)), mode='constant')
    X_train, X_test = X_padded[:10000], X_padded[68000:]
    y_train, y_test = y[:10000], y[68000:]

    return X_train, y_train, X_test, y_test


In [214]:
import numpy as np

def evaluate(model, X, y, batch_size=128):
    correct = 0
    total = 0

    for i in range(0, len(X), batch_size):
        xb = X[i:i+batch_size]
        yb = y[i:i+batch_size]
        logits = model.forward(xb)
        preds = np.argmax(logits, axis=1)
        correct += np.sum(preds == yb)
        total += len(yb)

    return correct / total


In [215]:
def get_batches(X, y, batch_size):
    idx = np.random.permutation(len(X))
    X = X[idx]
    y = y[idx]

    for i in range(0, len(X), batch_size):
        yield X[i:i+batch_size], y[i:i+batch_size]


# Main Code

In [216]:
X_train, y_train, X_test, y_test = load_mnist()

In [217]:
print("Train:", X_train.shape, y_train.shape)
print("Test:",  X_test.shape,  y_test.shape)

Train: (10000, 1, 32, 32) (10000,)
Test: (2000, 1, 32, 32) (2000,)


In [219]:
model = Lenet5()

In [220]:
epochs = 40
batch_size = 128

for epoch in range(epochs):
    loss = 0
    for Xb, yb in get_batches(X_train, y_train, batch_size):
        logits = model.forward(Xb)
        loss = model.backward(yb)
        model.step()

    print(f"Epoch {epoch+1} | Loss: {loss:.4f}, | Acc: {evaluate(model,X_train,y_train)}")

Epoch 1 | Loss: 2.2845, | Acc: 0.2607
Epoch 2 | Loss: 2.2800, | Acc: 0.369
Epoch 3 | Loss: 2.2570, | Acc: 0.4296
Epoch 4 | Loss: 2.2587, | Acc: 0.4651
Epoch 5 | Loss: 2.2051, | Acc: 0.4731
Epoch 6 | Loss: 2.1340, | Acc: 0.4965
Epoch 7 | Loss: 2.1112, | Acc: 0.5271
Epoch 8 | Loss: 1.8623, | Acc: 0.5329
Epoch 9 | Loss: 1.7035, | Acc: 0.5441
Epoch 10 | Loss: 1.7427, | Acc: 0.5881
Epoch 11 | Loss: 1.2500, | Acc: 0.6391
Epoch 12 | Loss: 1.0329, | Acc: 0.6902
Epoch 13 | Loss: 1.0878, | Acc: 0.7211
Epoch 14 | Loss: 0.5486, | Acc: 0.7441
Epoch 15 | Loss: 0.8916, | Acc: 0.7644
Epoch 16 | Loss: 0.9002, | Acc: 0.7788
Epoch 17 | Loss: 0.6481, | Acc: 0.7907
Epoch 18 | Loss: 0.4694, | Acc: 0.8024
Epoch 19 | Loss: 0.5099, | Acc: 0.812
Epoch 20 | Loss: 0.8264, | Acc: 0.8204
Epoch 21 | Loss: 1.2078, | Acc: 0.8334
Epoch 22 | Loss: 0.5243, | Acc: 0.8375
Epoch 23 | Loss: 0.6102, | Acc: 0.8434
Epoch 24 | Loss: 0.5067, | Acc: 0.8487
Epoch 25 | Loss: 0.5669, | Acc: 0.8545
Epoch 26 | Loss: 0.3730, | Acc: 0.86

In [221]:
acc = evaluate(model, X_test, y_test)
print("Test Accuracy:", acc)

Test Accuracy: 0.928


# Conclusions

The model here reached almost 90~ accuracy on test set as compared to 99.2% of original model. This is due to the model was trained till 40 epochs only and on a smaller subset of MNIST dataset due to compute contraints. 