## **Scratch Implementation of CNN**

In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

from scipy import signal
# signal module from scipy performs operations on 2D matrices (typically used for image processing, filtering, and feature extraction)

from keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

### Base layer class to specify the layer properites

In [2]:
class Layer:

    def __init__(self):
        self.input = None
        self.output = None

    def forward(self, input):
        pass

    def backward(self, output_gradient, learning_rate):
        pass

### Forward and Backward Propagation in Convolution Layer

In [3]:
class Convolutional(Layer):

   def __init__(self, input_shape, kernel_size, depth):

        # Input_shape is 3 dimensional (dxhxw)
        # input_depth  = no.of channels
        # input_height = image height and
        # input_width  = image width
        input_depth, input_height, input_width = input_shape

        # Depth represents the number of kernels of our convolutional layer
        self.depth = depth
        self.input_shape = input_shape

        # number of channels in the image
        self.input_depth = input_depth

        # Calculating Conv layer output of 3 dimensions
        # first dim  = number of filters/kernels
        # second dim = height of the output matrix after applying convolution
        # third dim  = width of the output matrix after applying convolution
        self.output_shape = (depth, input_height - kernel_size + 1, input_width - kernel_size + 1)

        # Kernels shape takes 4 dimensions
        # depth = no. of kernels
        # input_depth = image channels
        # kernel_size = kernel dimension
        self.kernels_shape = (depth, input_depth, kernel_size, kernel_size)

        # Initalizing the Kernels weights randomly
        self.kernels = np.random.randn(*self.kernels_shape)

        # Initializing the biases randomly
        self.biases = np.random.rand(*self.output_shape)

    # Forward pass
   def forward(self, input):
        self.input = input
        # Inititialize output matrix with output_shape
        self.output = np.zeros(self.output_shape)

        # Nested loop for traversing across all filters (depth), then all channels (input_depth) in every input image
        for i in range(self.depth):
            for j in range(self.input_depth):
                # Output = Conv(Input, Kernel) + Bias
                self.output[i] = self.biases[i] + signal.correlate2d(self.input[j], self.kernels[i, j], "valid")    # valid stands for no padding
        return self.output

   def backward(self, output_gradient, learning_rate):
        # Intializing the gradient of the kernels as zeros
        kernels_gradient = np.zeros(self.kernels_shape)

        # Intializing the gradient of the input as zeros
        input_gradient = np.zeros(self.input_shape)

        # Nested loop for updating the gradients of kernels and inputs,
        # first traversing all filters (depth), then all channels (input_depth) in every input image
        for i in range(self.depth):
            for j in range(self.input_depth):
                # Calculate kernels gradient in every i and j index in the kernel,
                kernels_gradient[i,j] = signal.correlate2d(self.input[j], output_gradient[i], "valid")  # Computes the cross-correlation between two 2D arrays

                # Calculate input gradient by sliding the kernel on the output gradient matrix
                input_gradient[j] += signal.convolve2d(output_gradient[i], self.kernels[i, j], "full")  # Performs 2D convolution but flips the kernel before sliding over the input
                                                                                                        # full stands for full padding
                                                                                                        # padding = kernel size−1

        # Update the kernels and biases w.r.t. learned features (stored in gradients)
        self.kernels -= learning_rate * kernels_gradient
        self.biases -= learning_rate * np.sum(output_gradient)

        return input_gradient

### Activation Functions

In [4]:
# Base Activation class to specify the default properties of the Activation Layer

class Activation(Layer):

    def __init__(self, activation, derivative_activation):
        self.activation = activation
        self.derivative_activation = derivative_activation

    def forward(self, input):
        self.input = input
        return self.activation(self.input)

    def backward(self, output_gradient, learning_rate):
        return np.multiply(output_gradient, self.derivative_activation(self.input))

In [5]:
class ReLU(Activation):
  def __init__(self):
    def relu(x):
      return np.where(x>0, x, 0)

    def derivative_relu(x):
      return np.where(x>0, 1, 0)

    super().__init__(relu, derivative_relu)

In [6]:
class TanH(Activation):
    def __init__(self):
      def tanh(x):
        tanH = (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
        return tanH
        # return np.tanh(x)        # built-in function using numpy

      def derivative_tanh(x):
        return 1 - ((np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x)))**2
        # return 1 - np.tanh(x) ** 2

      super().__init__(tanh, derivative_tanh)

In [7]:
class Softmax(Layer):

    def forward(self, input):
      exp_z = np.exp(input)
      self.output = exp_z/np.sum(exp_z, axis=0)
      return self.output

    def backward(self, output_gradient, learning_rate):
      n = np.size(self.output)
      return np.dot((np.identity(n) - self.output.T) * self.output, output_gradient)

### Max Pooling

In [8]:
class MaxPool(Layer):

    def __init__(self, input_shape, kernel_size, depth, stride):
        input_depth, input_height, input_width = input_shape
        self.input_shape = input_shape
        self.kernel_size = kernel_size
        self.kernels_shape = (depth, input_depth, kernel_size, kernel_size)
        self.depth = depth
        self.input_depth = input_depth
        self.kernels = np.random.randn(*self.kernels_shape)
        self.stride = stride
        self.input_height, self.input_width = input_height, input_width

    def forward(self, input):
        self.input = input
        KH = ((self.input_height-self.kernel_size)//self.stride) + 1
        KW = ((self.input_width-self.kernel_size)//self.stride) + 1
        self.output = np.zeros((self.input_depth, KH, KW))

        for depth in range(self.input_depth):
            for r in range(0, self.input_height-1, self.stride):
                for c in range(0, self.input_width-1, self.stride):
                    self.output[depth, r//self.stride, c//self.stride] = np.max(self.input[depth,
                                                                                r:r+self.kernel_size,
                                                                                c:c+self.kernel_size])
        return self.output

    def backward(self, output_gradient, learning_rate):
        self.output_gradient = np.zeros(self.input_shape)
        dx = np.zeros(self.input_shape)
        for depth in range(self.input_depth):
            for r in range(0, self.input_height-1, self.stride):
                for c in range(0, self.input_width-1, self.stride):
                    grad_pool = self.output[depth, r*self.stride:r*self.stride+self.kernel_size, c*self.stride:c*self.stride+self.kernel_size]
                    mask = (grad_pool == np.max(grad_pool))
                    dx[depth, r*self.stride:r*self.stride+self.kernel_size, c*self.stride : c*self.stride+self.kernel_size] = mask*self.output_gradient[depth, r, c]

        return dx

### Reshaping

In [9]:
class Reshape(Layer):
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape

    def forward(self, input):
        return np.reshape(input, self.output_shape)

    def backward(self, output_gradient, learning_rate):
        return np.reshape(output_gradient, self.input_shape)

### Fully Connected Dense Layer

In [10]:
class Dense(Layer):
    def __init__(self, input_size, output_size):
        # Defining shape of weights matrix
        self.weights = np.random.randn(output_size, input_size)
        # Defining shape of bias matrix
        self.bias = np.random.randn(output_size, 1)

    def forward(self, input):
        self.input = input
        return np.dot(self.weights, self.input) + self.bias #X.W+b

    def backward(self, output_gradient, learning_rate):
        # Calculate weights gradient by dot product of output gradient and transpose of input
        weights_gradient = np.dot(output_gradient, self.input.T)

        # Calculating the input gradient by performing dot product of weights transpose and output gradient
        input_gradient = np.dot(self.weights.T, output_gradient)

        # Updating the weights of the layer
        self.weights -= learning_rate * weights_gradient

        # Updating the bias of the layer
        self.bias -= learning_rate * output_gradient

        return input_gradient

### Defining the Loss Function (for binary classes)

In [11]:
def log_loss(y_true, y_pred):
    return np.mean(-y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred))

def derivative_log_loss(y_true, y_pred):
    return ((1 - y_true) / (1 - y_pred) - y_true / y_pred) / np.size(y_true)

## **Loading the Dataset**

**Load MNIST Dataset**

In [12]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [13]:
X_train[:2]

array([[[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]], dtype=uint8)

In [14]:
y_train[:2]

array([5, 0], dtype=uint8)

In [15]:
X_train.shape, y_train.shape

((60000, 28, 28), (60000,))

#### For simplicity we select only 2000 images from class 0 and 1

Function to preprocess the MNIST dataset

In [16]:
def preprocess_data(x, y, limit):

    zero_index = np.where(y == 0)[0][:limit]
    one_index = np.where(y == 1)[0][:limit]

    all_indices = np.hstack((zero_index, one_index))
    all_indices = np.random.permutation(all_indices)
    x, y = x[all_indices], y[all_indices]

    # Reshaping the data
    x = x.reshape(len(x), 1, 28, 28)
    # Normalize all pixel values [between 0-1],
    x = x.astype("float32") / 255

    # One hot encode all the labels
    y = to_categorical(y)
    y = y.reshape(len(y), 2, 1)

    return x, y

In [17]:
limit = 5000
X_train, y_train = preprocess_data(X_train, y_train, limit)
X_test, y_test = preprocess_data(X_test, y_test, limit)

In [18]:
X_train.shape, y_train.shape

((10000, 1, 28, 28), (10000, 2, 1))

In [19]:
X_test.shape, y_test.shape

((2115, 1, 28, 28), (2115, 2, 1))

### Defining the network architecture

In [20]:
network = [
    #input_shape, kernel_size, depth/filter
    Convolutional((1, 28, 28), 3, 5),
    ReLU(),

    #input_shape, kernel_size, depth, stride
    MaxPool((5,26,26), 2, 5, 1),

    #input_shape, output_shape
    Reshape((5, 25, 25), (5 * 25 * 25, 1)),

    #input_size, output_size
    Dense(5 * 25 * 25, 100),
    TanH(),

    #input_size, output_size
    Dense(100, 2), #2classes
    Softmax()
]

In [21]:
network

[<__main__.Convolutional at 0x7b36aafdc110>,
 <__main__.ReLU at 0x7b3697b556d0>,
 <__main__.MaxPool at 0x7b3697b68b10>,
 <__main__.Reshape at 0x7b3697b68890>,
 <__main__.Dense at 0x7b3697b68850>,
 <__main__.TanH at 0x7b36978a4c10>,
 <__main__.Dense at 0x7b3697921f10>,
 <__main__.Softmax at 0x7b3697922b10>]

### Defining the train function

In [22]:
def train(network, loss, loss_derivative, x_train, y_train, epochs = 5, learning_rate = 0.01):
    for e in range(epochs):
        print('Epoch Start:',e)
        error = 0
        idx = 0
        for x, y in zip(x_train, y_train):
            if (idx+1) % 500 == 0:
              print(f"Epoch {e}: {idx+1}/{len(y_train)}")

            idx += 1

            # Forward pass to predict on the training data
            output = predict(network, x)

            # Summing the losses to optimize the network's weights and biases
            error += loss(y, output)

            # Perform backward pass through every layer
            grad = loss_derivative(y, output)
            for layer in reversed(network):
                grad = layer.backward(grad, learning_rate)

        error /= len(x_train)
        print(f"Epoch : {e + 1}/{epochs}, loss = {error}")

Function to make a prediction on a given input

In [23]:
def predict(network, input):
    output = input
    for layer in network:
        output = layer.forward(output)
    return output

### Fitting the model to the data by calling the train function

In [None]:
train(
    network,
    log_loss,
    derivative_log_loss,
    X_train,
    y_train,
    epochs = 5,
    learning_rate = 0.01
)

Epoch Start: 0
Epoch 0: 500/10000
Epoch 0: 1000/10000
Epoch 0: 1500/10000
Epoch 0: 2000/10000
Epoch 0: 2500/10000
Epoch 0: 3000/10000
Epoch 0: 3500/10000
Epoch 0: 4000/10000
Epoch 0: 4500/10000
Epoch 0: 5000/10000
Epoch 0: 5500/10000
Epoch 0: 6000/10000
Epoch 0: 6500/10000
Epoch 0: 7000/10000
Epoch 0: 7500/10000
Epoch 0: 8000/10000
Epoch 0: 8500/10000
Epoch 0: 9000/10000
Epoch 0: 9500/10000
Epoch 0: 10000/10000
Epoch : 1/5, loss = 0.052473486605098436
Epoch Start: 1
Epoch 1: 500/10000
Epoch 1: 1000/10000


  return np.mean(-y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred))
  return np.mean(-y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred))
  return ((1 - y_true) / (1 - y_pred) - y_true / y_pred) / np.size(y_true)


Epoch 1: 1500/10000
Epoch 1: 2000/10000
Epoch 1: 2500/10000
Epoch 1: 3000/10000
Epoch 1: 3500/10000
Epoch 1: 4000/10000
Epoch 1: 4500/10000
Epoch 1: 5000/10000
Epoch 1: 5500/10000
Epoch 1: 6000/10000
Epoch 1: 6500/10000
Epoch 1: 7000/10000
Epoch 1: 7500/10000
Epoch 1: 8000/10000
Epoch 1: 8500/10000
Epoch 1: 9000/10000
Epoch 1: 9500/10000
Epoch 1: 10000/10000
Epoch : 2/5, loss = nan
Epoch Start: 2
Epoch 2: 500/10000
Epoch 2: 1000/10000
Epoch 2: 1500/10000
Epoch 2: 2000/10000
Epoch 2: 2500/10000
Epoch 2: 3000/10000
Epoch 2: 3500/10000
Epoch 2: 4000/10000
Epoch 2: 4500/10000
Epoch 2: 5000/10000
Epoch 2: 5500/10000
Epoch 2: 6000/10000
Epoch 2: 6500/10000
Epoch 2: 7000/10000
Epoch 2: 7500/10000
Epoch 2: 8000/10000
Epoch 2: 8500/10000
Epoch 2: 9000/10000
Epoch 2: 9500/10000
Epoch 2: 10000/10000
Epoch : 3/5, loss = nan
Epoch Start: 3
Epoch 3: 500/10000
Epoch 3: 1000/10000
Epoch 3: 1500/10000
Epoch 3: 2000/10000
Epoch 3: 2500/10000
Epoch 3: 3000/10000
Epoch 3: 3500/10000
Epoch 3: 4000/10000
Ep

### Function to calculate Accuracy on the Test Data

In [25]:
len(X_test)

2115

In [26]:
correct = 0
for x, y in zip(X_test, y_test):
    output = predict(network, x)
    if np.equal(np.argmax(output), np.argmax(y)):
       correct += 1

print(f"Accuracy of the Network on Test data is {(correct/len(X_test)) * 100} %")

Accuracy of the Network on Test data is 46.335697399527184 %
