# Convolutional Neural Network (CNN) _from Scratch_

## Import required libraries

In [1]:
import os
import pickle
import numpy as np
from abc import ABC, abstractmethod

In [2]:
# Seed NumPy's global random generator so the random weight/kernel
# initialisation done by the layers below is reproducible between runs.
np.random.seed(37)

## Activation Functions

In [3]:
class Activation(ABC):
    """Abstract interface shared by the activation functions in this file.

    Concrete subclasses must implement both the forward evaluation
    (``__call__``) and ``derivative``.
    """

    @abstractmethod
    def __call__(self, z: np.ndarray) -> np.ndarray:
        """Apply the activation to ``z`` and return the result."""

    @abstractmethod
    def derivative(self, z: np.ndarray) -> np.ndarray:
        """Return the derivative of the activation evaluated at ``z``."""

### Linear

In [4]:
class Linear(Activation):
    """Identity activation: the output is the input, unchanged."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Return ``x`` itself (the same object, no copy is made)."""
        return x

    def derivative(self, z: np.ndarray) -> np.ndarray:
        """Return an array of ones with the shape and dtype of ``z``."""
        # The slope of the identity function is 1 everywhere.
        return np.full_like(z, 1)

### ReLU

In [5]:
class ReLU(Activation):
    """Rectified linear unit: element-wise ``max(0, z)``."""

    def __call__(self, z: np.ndarray) -> np.ndarray:
        """Return ``z`` with every negative entry replaced by zero."""
        rectified = np.maximum(0, z)
        return rectified

    def derivative(self, z: np.ndarray) -> np.ndarray:
        """Return 1.0 where ``z`` is strictly positive and 0.0 elsewhere."""
        is_positive = z > 0
        # Cast the boolean mask to floats so it can scale gradients directly.
        return is_positive.astype(float)

### Sigmoid

In [6]:
class Sigmoid(Activation):
    """Logistic sigmoid activation: ``1 / (1 + exp(-z))``."""

    def __call__(self, z: np.ndarray) -> np.ndarray:
        """Return the element-wise logistic sigmoid of ``z``."""
        decay = np.exp(-z)
        return 1 / (1 + decay)

    def derivative(self, z: np.ndarray) -> np.ndarray:
        """Return ``s * (1 - s)`` where ``s`` is the sigmoid of ``z``."""
        activated = self(z)
        complement = 1 - activated
        return activated * complement

### Softmax

In [7]:
class Softmax(Activation):
    """Softmax activation, normalising along axis 1 (one row per sample)."""

    def __call__(self, z: np.ndarray) -> np.ndarray:
        """Return the softmax of ``z`` computed along axis 1.

        The maximum along axis 1 is subtracted before exponentiating so that
        ``np.exp`` never receives a large positive argument; the shift does
        not change the result.
        """
        shifted = z - np.max(z, axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=1, keepdims=True)

    def derivative(self, z: np.ndarray) -> np.ndarray:
        """Return the softmax Jacobian ``diag(s) - s sᵀ`` with ``s = softmax(z)``.

        ``z`` holds pre-activation values, as for the other activations in
        this file, so the softmax is applied first.  Previously the formula
        was evaluated on ``z`` directly, which is only the Jacobian if the
        caller passes probabilities instead of logits.

        A 1-D ``z`` is treated as a single sample.  The probabilities are
        flattened into one column vector, so the result has shape
        ``(z.size, z.size)``.
        NOTE(review): for input with more than one row the flattening also
        produces cross-sample blocks, which are not part of any per-sample
        Jacobian — use one sample at a time.
        """
        probs = self(np.atleast_2d(z)).reshape(-1, 1)
        return np.diagflat(probs) - np.dot(probs, probs.T)

### Tanh

In [8]:
class Tanh:
    """Hyperbolic tangent activation.

    NOTE(review): unlike the other activation classes in this file, this
    class does not subclass ``Activation``; it only provides the same two
    methods.
    """

    def __call__(self, z: np.ndarray) -> np.ndarray:
        """Return the element-wise hyperbolic tangent of ``z``."""
        activated = np.tanh(z)
        return activated

    def derivative(self, z: np.ndarray) -> np.ndarray:
        """Return ``1 - tanh(z)**2`` element-wise."""
        activated = self(z)
        squared = activated ** 2
        return 1 - squared

## Loss Functions

In [9]:
class Loss(ABC):
    """Abstract interface shared by the loss functions in this file.

    Concrete subclasses must implement both the loss evaluation
    (``__call__``) and ``derivative``.
    """

    @abstractmethod
    def __call__(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """Return the loss of predictions ``y_pred`` against ``y_true``."""

    @abstractmethod
    def derivative(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """Return the gradient used to start back-propagation."""

### Mean Squared Error (MSE)

In [10]:
class MSE(Loss):
    """Halved mean squared error."""

    def __call__(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """Return ``mean((y_pred - y_true)**2) / 2`` over all elements."""
        residual = y_pred - y_true
        mean_squared = np.mean(residual ** 2)
        return mean_squared / 2

    def derivative(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """Return the residual divided by the length of ``y_true``'s first axis."""
        n_rows = y_true.shape[0]
        residual = y_pred - y_true
        return residual / n_rows

### Binary Cross Entropy Loss

In [11]:
class BinaryCrossEntropy(Loss):
    """Binary cross-entropy on probabilities clipped to ``[1e-9, 1 - 1e-9]``."""

    def __call__(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """Return the mean binary cross-entropy over all elements."""
        # Keep probabilities strictly inside (0, 1) so neither log sees 0.
        probs = np.clip(y_pred, 1e-9, 1 - 1e-9)
        positive_term = y_true * np.log(probs)
        negative_term = (1 - y_true) * np.log(1 - probs)
        return -np.mean(positive_term + negative_term)

    def derivative(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """Return ``(p - y) / (p * (1 - p) * n)`` with ``n = y_true.shape[0]``."""
        # Clipping keeps the denominator away from zero.
        probs = np.clip(y_pred, 1e-9, 1 - 1e-9)
        numerator = probs - y_true
        denominator = probs * (1 - probs) * y_true.shape[0]
        return numerator / denominator

### Cross Entropy Loss (Sparse Categorical)
_Expects one-hot encoded targets with the same shape as the predictions._

In [12]:
class SparseCategoricalCrossEntropy(Loss):
    """Cross-entropy between predicted probabilities and target weights.

    ``y_true`` is multiplied element-wise with ``log(y_pred)``, so both
    arrays must be broadcast-compatible (e.g. one-hot targets).
    """

    def __call__(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """Return ``-sum(y_true * log(y_pred))`` divided by ``y_true.shape[0]``."""
        # Clip so the logarithm never receives zero.
        probs = np.clip(y_pred, 1e-9, 1 - 1e-9)
        n_rows = y_true.shape[0]
        weighted_log_total = np.sum(y_true * np.log(probs))
        return -weighted_log_total / n_rows

    def derivative(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
        """Return ``(y_pred - y_true)`` divided by ``y_true.shape[0]``."""
        n_rows = y_true.shape[0]
        difference = y_pred - y_true
        return difference / n_rows

## Layer

In [13]:
class Layer(ABC):
    """Abstract interface for a network layer.

    Subclasses must provide a constructor, a ``forward`` pass and a
    ``backward`` pass.
    """

    @abstractmethod
    def __init__(self, in_dim: int, out_dim: int):
        """Set up the layer for the given input and output sizes."""

    @abstractmethod
    def forward(self, x: np.ndarray) -> np.ndarray:
        """Compute and return the layer output for input ``x``."""

    @abstractmethod
    def backward(self, dA: np.ndarray, lr: float) -> np.ndarray:
        """Back-propagate ``dA`` with learning rate ``lr``; return a gradient array."""

### Dense Layer

In [14]:
class Dense(Layer):
    """Fully connected layer computing ``activation(x @ weights + bias)``.

    ``forward`` caches its input and pre-activation so that ``backward`` can
    compute the gradients and apply a plain gradient-descent update.
    """

    def __init__(self, in_dim: int, out_dim: int, activation: Activation):
        """Create the layer.

        Args:
            in_dim: Number of input features.
            out_dim: Number of output units.
            activation: Activation applied to the affine output.
        """
        # Uniform samples from [0, 1) scaled by sqrt(1 / in_dim).
        self.weights = np.random.random((in_dim, out_dim)) * np.sqrt(1 / in_dim)
        self.bias = np.zeros((1, out_dim))
        self.activation = activation
        self.x = None  # input of the most recent forward pass
        self.z = None  # pre-activation of the most recent forward pass

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Return ``activation(x @ weights + bias)`` and cache ``x`` and ``z``."""
        self.x = x
        self.z = self.x @ self.weights + self.bias
        return self.activation(self.z)

    def backward(self, dA: np.ndarray, lr: float) -> np.ndarray:
        """Update the parameters and return the gradient w.r.t. the layer input.

        Args:
            dA: Gradient of the loss w.r.t. this layer's output.  For a
                ``Softmax`` activation it is used directly as the gradient
                w.r.t. the pre-activation (the activation derivative is
                skipped).
            lr: Learning rate of the gradient-descent step.

        Returns:
            Gradient of the loss w.r.t. the input of the last ``forward`` call.
        """
        if isinstance(self.activation, Softmax):
            dz = dA
        else:
            dz = dA * self.activation.derivative(self.z)

        dw = self.x.T @ dz
        db = np.sum(dz, axis=0, keepdims=True)

        # The gradient passed upstream must be computed with the weights that
        # produced the forward output, i.e. BEFORE they are updated below.
        dx = dz @ self.weights.T

        # prevent exploding gradients
        np.clip(dw, -1, 1, out=dw)
        np.clip(db, -1, 1, out=db)

        self.weights -= lr * dw
        self.bias -= lr * db
        return dx

    @property
    def params_(self):
        """Return the trainable arrays as ``(weights, bias)``."""
        return (self.weights, self.bias)

### Convolutional Layer

In [18]:
class Convo2D(Layer):
    """2-D convolution layer (sliding-window dot product) with one bias per filter.

    Input arrays have shape ``(N, in_channels, H, W)`` and outputs have shape
    ``(N, num_filters, H_out, W_out)`` where
    ``H_out = (H + 2 * padding - kh) // stride + 1`` (likewise for ``W_out``).
    """

    def __init__(
        self,
        in_channels: int,
        num_filters: int,
        kernel_size: int | tuple,
        stride: int,
        padding: int | str | None = None,
    ):
        """Create the layer.

        Args:
            in_channels: Number of channels of the input.
            num_filters: Number of kernels, i.e. output channels.
            kernel_size: Kernel height/width; an int means a square kernel.
            stride: Step between consecutive windows (must be >= 1).
            padding: Zero padding added on every side of height and width.
                ``None`` and ``"valid"`` both mean no padding.

        Raises:
            ValueError: If ``stride`` is not positive or ``padding`` is not
                ``None``, ``"valid"`` or a non-negative integer.
        """
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride!r}")

        self.in_channels = in_channels
        self.num_filters = num_filters
        self.stride = stride
        # Always stored as an int so it can be used in shape arithmetic.
        self.padding = self._resolve_padding(padding)
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.kh, self.kw = self.kernel_size

        # setting limit to prevent exploding gradients
        limit = np.sqrt(1 / (self.in_channels * self.kh * self.kw))
        self.kernels = np.random.uniform(-limit, limit, (num_filters, in_channels, self.kh, self.kw))
        self.bias = np.zeros((num_filters, 1))

        self.x = None  # input of the most recent forward pass

    @staticmethod
    def _resolve_padding(padding) -> int:
        """Translate the ``padding`` argument into a non-negative pad width."""
        if padding is None:
            return 0
        if isinstance(padding, str):
            if padding == "valid":
                return 0
            raise ValueError(f"unsupported padding mode {padding!r}; use 'valid' or an integer")
        if isinstance(padding, (int, np.integer)) and padding >= 0:
            return int(padding)
        raise ValueError(f"padding must be None, 'valid' or a non-negative integer, got {padding!r}")

    def _pad(self, x: np.ndarray) -> np.ndarray:
        """Zero-pad the two spatial axes of ``x`` by ``self.padding``."""
        p = self.padding
        return np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)))

    def _windows(self, H_out: int, W_out: int):
        """Yield ``(h, w, row_slice, col_slice)`` for every output position."""
        for h in range(H_out):
            h_start = h * self.stride
            rows = slice(h_start, h_start + self.kh)
            for w in range(W_out):
                w_start = w * self.stride
                yield h, w, rows, slice(w_start, w_start + self.kw)

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Convolve ``x`` with every kernel and add the per-filter bias.

        Args:
            x: Input of shape ``(N, in_channels, H, W)``.

        Returns:
            Array of shape ``(N, num_filters, H_out, W_out)``.

        Raises:
            ValueError: If ``x`` is not 4-D or its channel count differs from
                ``in_channels``.
        """
        self.x = x
        # N: input images, c_in: input channels, (H, W): Height, Width
        N, c_in, H, W = x.shape
        if c_in != self.in_channels:
            raise ValueError(f"expected {self.in_channels} input channels, got {c_in}")

        H_out = (H + 2 * self.padding - self.kh) // self.stride + 1
        W_out = (W + 2 * self.padding - self.kw) // self.stride + 1

        pad_x = self._pad(x)
        output_x = np.zeros((N, self.num_filters, H_out, W_out))

        for h, w, rows, cols in self._windows(H_out, W_out):
            # image windows where the kernels are applied: (N, c_in, kh, kw)
            window = pad_x[:, :, rows, cols]
            # Contract channel/height/width against every kernel at once,
            # giving one value per (image, filter) pair.
            output_x[:, :, h, w] = np.tensordot(window, self.kernels, axes=([1, 2, 3], [1, 2, 3]))

        # One bias per filter, broadcast over images and spatial positions.
        output_x += self.bias.reshape(1, -1, 1, 1)
        return output_x

    def backward(self, dA: np.ndarray, lr: float) -> np.ndarray:
        """Update kernels and bias; return the gradient w.r.t. the layer input.

        Args:
            dA: Gradient w.r.t. the layer output, shape
                ``(N, num_filters, H_out, W_out)``.
            lr: Learning rate of the gradient-descent step.

        Returns:
            Gradient w.r.t. the input of the last ``forward`` call, with the
            padding removed.

        Raises:
            RuntimeError: If called before ``forward``.
        """
        if self.x is None:
            raise RuntimeError("forward() must be called before backward()")

        _, _, H_out, W_out = dA.shape
        pad_x = self._pad(self.x)

        # Float buffer so accumulation also works for integer-typed inputs.
        dx_padded = np.zeros(pad_x.shape, dtype=float)
        dk = np.zeros_like(self.kernels)

        for h, w, rows, cols in self._windows(H_out, W_out):
            window = pad_x[:, :, rows, cols]  # (N, c_in, kh, kw)
            grad = dA[:, :, h, w]             # (N, num_filters)

            # Sum over images: each filter accumulates grad * window.
            dk += np.tensordot(grad, window, axes=([0], [0]))
            # Sum over filters: each image window receives grad * kernel.
            # The kernels are still the ones used in the forward pass here.
            dx_padded[:, :, rows, cols] += np.tensordot(grad, self.kernels, axes=([1], [0]))

        # Bias gradient: sum dA over images and spatial positions.
        db = np.sum(dA, axis=(0, 2, 3), dtype=float).reshape(self.bias.shape)

        np.clip(dk, -1, 1, out=dk)
        np.clip(db, -1, 1, out=db)

        self.kernels -= lr * dk
        self.bias -= lr * db

        # remove padding from dx
        p = self.padding
        if p > 0:
            return dx_padded[:, :, p:-p, p:-p]
        return dx_padded

    @property
    def params_(self):
        """Return the trainable arrays as ``(kernels, bias)``."""
        return (self.kernels, self.bias)

### Pooling Layers

#### Max Pooling

## Neural Network

In [15]:
class NeuralNetwork:
    def __init__(self, layers: list[Layer], loss: Loss):
        self.layers = layers
        self.loss_fn = loss

    def add(self, layer: Layer):
        self.layers.append(layer)

    def forward(self, x: np.ndarray) -> np.ndarray:
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, y_pred: np.ndarray, y_true: np.ndarray, lr: float):
        dA = self.loss_fn.derivative(y_pred, y_true)
        for layer in reversed(self.layers):
            dA = layer.backward(dA, lr)

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        val_X: np.ndarray = None,
        val_y: np.ndarray = None,
        n_iters: int = 100,
        lr: float = 0.01,
        verbose: bool = True
    ):
        self.history = []
        epoch_10pct = 1 if n_iters < 10 else int(n_iters * 0.1)

        for epoch in range(1, n_iters + 1):
            y_pred = self.forward(X)
            train_loss = self.loss_fn(y_pred, y)
            self.backward(y_pred, y, lr)

            if val_X is not None and val_y is not None:
                y_val_pred = self.forward(val_X)
                val_loss = self.loss_fn(y_val_pred, val_y)

            self.history.append(train_loss if val_X is None else (train_loss, val_loss))

            if verbose and epoch % epoch_10pct == 0:
                l = len(str(n_iters))
                print(
                    f"Epoch {epoch:{len(str(n_iters))}d} | Train Loss: {train_loss:.4f}" +
                    ("" if val_X is None else f" | Val Loss: {val_loss:.4f}")
                     )
        return self.history

    def predict(self, x: np.ndarray) -> np.ndarray:
        return self.forward(x)

    @property
    def params_(self) -> np.ndarray:
        return [layer.params_ for layer in self.layers]

    def summary(self):
        print("Model Summary:")
        print("-" * 60)
        total_params = 0

        for i, layer in enumerate(self.layers, start=1):
            name = layer.__class__.__name__
            w_shape = layer.weights.shape
            b_shape = layer.bias.shape
            num_params = np.prod(w_shape) + np.prod(b_shape)
            total_params += num_params
            print(f"{name} layer {i}: Params: {num_params}"
                  f" ({w_shape[0]} x {w_shape[1]} + {b_shape[1]})")

        print("-" * 60)
        print(f"Total Layers: {i}")
        print(f"Total Trainable Parameters: {total_params}")
        print("-" * 60)

    def save(self, filepath: str):
        with open(filepath, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filepath: str) -> "NeuralNetwork":
        with open(filepath, "rb") as f:
            return pickle.load(f)


## Saving model

In [52]:
# Directory for saved models; the path is relative, so it resolves against
# the current working directory.
MODEL_BASE_PATH = "../Models/Custom_CNN/"
# Create the directory (and any missing parents); do nothing if it exists.
os.makedirs(MODEL_BASE_PATH, exist_ok=True)