###### What do we need?
* 1. Forward Function ✅
* 2. Backward Function (Backpropagation) ✅
* 3. Convolution ✅
* 4. Pooling (MaxPooling) ✅
* 5. Dropout ✅
* 6. Loss (Cross-Entropy) ✅
---------------------------------
* 7. Optimizer (SGD) ✅
    * SGD is simpler to implement
---------------------------------
* 8. Fully Connected (Dense or Linear) ✅
* 9. Activation (ReLU & Sigmoid) ✅

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# --- Only cell we're going to use tensorflow --- #
from tensorflow.keras.datasets import mnist
# ----------------------------------------------- #

(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Keep the raw integer labels: y_train / y_test are replaced by their one-hot
# matrices below, and the summary at the end reports both shapes.
raw_y_train = y_train
raw_y_test = y_test

# --- Converting raw labels to one-hot encoding --- #
# Row d of the 10x10 identity matrix is the one-hot vector of digit d.
labels_dictionary = dict(enumerate(np.eye(10, dtype=int)))

# Indexing the identity matrix with the label vector encodes every sample at
# once; the result is a float array of shape (n_samples, 10).
new_y_train = np.eye(10)[raw_y_train]
new_y_test = np.eye(10)[raw_y_test]
# ------------------------------------------------- #

y_train = new_y_train
y_test = new_y_test

print('X_train:', str(X_train.shape))
print('raw y_train:', str(raw_y_train.shape))
print('encoded y_train:', str(new_y_train.shape))
print("-----------------------------")
print('X_test:', str(X_test.shape))
print('raw y_test:', str(raw_y_test.shape))
print('encoded y_test:', str(new_y_test.shape))

In [None]:
# --- Show nine consecutive training images in a 3x3 grid --- #

c = 300  # index of the first image to display; advanced once per panel
fig, ax = plt.subplots(3, 3)
for i, axes_row in enumerate(ax):
    for j, panel in enumerate(axes_row):
        panel.imshow(X_train[c], cmap='gray')
        c += 1

In [None]:
class Loss:
    """Loss functions evaluated on a fixed (y_true, y_pred) pair.

    Every method reduces with ``np.mean`` over all elements of the
    broadcast arrays and returns a single scalar.
    """

    def __init__(self, y_true, y_pred):
        self.y_pred = y_pred
        self.y_true = y_true

    def _clipped_prediction(self):
        """Return y_pred limited to [1e-15, 1 - 1e-15] so log() stays finite."""
        return np.clip(self.y_pred, 1e-15, 1-1e-15)

    def BinaryCrossEntropy(self):
        """Mean binary cross-entropy between y_true and the clipped y_pred."""
        prob = self._clipped_prediction()
        positive_term = self.y_true * np.log(prob)
        negative_term = (1-self.y_true) * np.log(1-prob)
        return -np.mean(positive_term + negative_term)

    def CategoricalCrossEntropy(self):
        """Mean of ``y_true * log(y_pred)``, negated.

        The mean runs over every element (class entries included), not
        only over samples.
        """
        log_prob = np.log(self._clipped_prediction())
        return -np.mean(self.y_true*log_prob)

    def MeanSquaredError(self):
        """Mean of the squared element-wise differences."""
        residual = self.y_true - self.y_pred
        return np.mean(np.square(residual))

    def MeanAbsoluteError(self):
        """Mean of the absolute element-wise differences."""
        residual = self.y_true - self.y_pred
        return np.mean(np.abs(residual))

In [None]:
class Activation:
    """Element-wise activation functions applied to a stored pre-activation.

    Args:
        z = pre-activation values (array-like)
    """

    def __init__(self, z):
        self.z = z

    def ReLU(self):
        """Return max(0, z) element-wise."""
        output = np.maximum(0, self.z)
        return(output)

    def Sigmoid(self):
        """Return the logistic function 1 / (1 + exp(-z)) element-wise.

        Used for binary classification.
        """
        output = 1 / (1 + np.exp(-self.z))
        return(output)

    def Softmax(self):
        """Return exp(z) normalised by the sum over ALL elements of z.

        Used for multi-class classification. The normalisation is global
        (no axis argument), so z is expected to hold the logits of a single
        sample.
        """
        z = np.asarray(self.z, dtype=float)
        # Subtracting the maximum leaves the result mathematically unchanged
        # but keeps exp() from overflowing to inf (and the ratio from turning
        # into NaN) when the logits are large.
        exp_shifted = np.exp(z - np.max(z))
        return(exp_shifted / np.sum(exp_shifted))

In [None]:
class Optimizer:
    """Parameter-update rules for paired lists of parameters and gradients.

    Args:
        params = sequence of parameter arrays (updated in place)
        grads = sequence of gradients, aligned index-by-index with params
        learning_rate = step size, default 1e-3
    """

    def __init__(self, params, grads, learning_rate=1e-3):
        self.learning_rate = learning_rate
        self.grads = grads
        self.params = params

    def SGD(self):
        """Apply one plain gradient-descent step and return the params.

        Each entry is updated with ``-=``, so NumPy arrays held in
        ``params`` are modified in place.
        """
        for index, _ in enumerate(self.params):
            step = self.learning_rate * self.grads[index]
            self.params[index] -= step
        return self.params

In [None]:
class NeuralNetwork:
    """Minimal 1-D CNN: Convolution -> MaxPooling -> Dropout -> FullyConnected.

    The network processes one sample at a time. Every layer stores its
    inputs, outputs and parameters in ``self.cache``; ``Backpropagation``
    reads those entries to compute gradients and applies one SGD step.

    Layer parameters are created on first use and then reused, so updates
    made by ``Backpropagation`` carry over to the next ``forward`` call.
    They are re-created only when the requested shape changes.

    Relies on the ``Activation``, ``Loss`` and ``Optimizer`` classes defined
    elsewhere in this file.
    """

    def __init__(self):
        self.init = "init function"
        # Per-layer inputs / outputs / parameters, keyed "<what>_<layer>".
        self.cache = {}

    def XavierWeights(self, dim_in, dim_out):
        """Xavier initialization for weights.

        Returns:
            (weights, biases): weights are uniform in [-limit, limit] with
            limit = sqrt(6 / (dim_in + dim_out)) and shape (dim_out, dim_in);
            biases are standard-normal with shape (dim_out, 1).
        """
        limit = np.sqrt(6 / (dim_in + dim_out))
        weights = np.random.uniform(-limit, limit, (dim_out, dim_in))
        biases = np.random.randn(dim_out, 1)
        return(weights, biases)

    def HeWeights(self, dim_in, dim_out):
        """He initialization for weights.

        Returns:
            (weights, biases): weights are standard-normal scaled by
            sqrt(2 / dim_in) with shape (dim_out, dim_in); biases are
            standard-normal with shape (dim_out, 1).
        """
        weights = np.random.randn(dim_out, dim_in) * np.sqrt(2 / dim_in)
        biases = np.random.randn(dim_out, 1)
        return(weights, biases)

    def _get_or_init_params(self, layer, dim_in, dim_out, weight, fallback):
        """Return the cached (weights, biases) of `layer`, creating them if absent or mis-shaped."""
        w = self.cache.get("weight_" + layer)
        b = self.cache.get("bias_" + layer)
        if w is None or b is None or w.shape != (dim_out, dim_in):
            initializers = {'xavier': self.XavierWeights, 'he': self.HeWeights}
            w, b = initializers.get(weight, fallback)(dim_in, dim_out)
            self.cache["weight_" + layer] = w
            self.cache["bias_" + layer] = b
        return(w, b)

    @staticmethod
    def _window_index(length, size, stride, what):
        """Return an (n_windows, size) array of input positions for a sliding window."""
        if length < size:
            raise ValueError(f"input length {length} is shorter than {what}={size}")
        count = ((length - size) // stride) + 1
        return(np.arange(count)[:, None] * stride + np.arange(size))

    def Convolution(self, x, filters, kernel_size=3, stride=1, activation='relu', weight='xavier'):
        """'1D-Convolution'
           Args:
               x = input data, a 1-D sequence
               filters = number of neurons (output channels)
               kernel_size = number of kernel size
               stride = number of stride
               activation = accepted for interface compatibility; ReLU is
                            always applied
               weight = 'xavier' (default) or 'he'; used only when the
                        layer's parameters have to be created
           Returns:
               Array of shape (filters, (len(x) - kernel_size) // stride + 1).
           Raises:
               ValueError: if x is not 1-D or is shorter than kernel_size.
        """
        x = np.asarray(x, dtype=float)
        if x.ndim != 1:
            raise ValueError(f"Convolution expects a 1-D input, got shape {x.shape}")

        # --- Weights (created once, then reused across calls) --- #
        w, b = self._get_or_init_params("conv", kernel_size, filters, weight, self.XavierWeights)

        # --- Calculation --- #
        # windows[j] is the input segment seen by output position j.
        windows = x[self._window_index(x.shape[0], kernel_size, stride, "kernel_size")]
        self.cache['input_conv'] = x
        self.cache['windows_conv'] = windows

        # z[i, j] = w[i] . windows[j] + b[i]
        z = np.dot(w, windows.T) + b
        self.cache['output_conv'] = z

        # --- Activation --- #
        z = Activation(z).ReLU()
        self.cache['activation_conv'] = z

        # --- Output --- #
        return(z)

    def MaxPooling(self, x, pool_size=2, stride=1):
        """'1D-MaxPooling'
           Args:
               x = input data; pooling runs along the LAST axis, so a
                   (filters, length) input is pooled per filter
               pool_size = number of pool size
               stride = number of stride
           Returns:
               Array shaped like x with the last axis reduced to
               (length - pool_size) // stride + 1.
           Raises:
               ValueError: if the last axis is shorter than pool_size.
        """
        # --- Calculation --- #
        x = np.asarray(x, dtype=float)
        self.cache['input_maxpool'] = x

        index = self._window_index(x.shape[-1], pool_size, stride, "pool_size")
        windows = x[..., index]
        z = windows.max(axis=-1)

        # Input position of each window's maximum; the backward pass routes
        # the gradient of every pooled value to exactly that position.
        self.cache['winner_maxpool'] = index[:, 0] + windows.argmax(axis=-1)
        self.cache['output_maxpool'] = z

        # --- Output --- #
        return(z)

    def Dropout(self, x, rate=0.1):
        """'Dropout' (inverted: kept values are scaled by 1 / (1 - rate))
           Args:
               x = input data (left unmodified)
               rate = probability of zeroing each element, 0 <= rate < 1
           Returns:
               New array with the same shape as x.
           Raises:
               ValueError: if rate is outside [0, 1).
        """
        if not 0 <= rate < 1:
            raise ValueError(f"rate must be in [0, 1), got {rate}")

        # --- Calculation --- #
        x = np.asarray(x, dtype=float)
        self.cache['input_dropout'] = x

        # Uniform samples in [0, 1): each element is kept with probability
        # 1 - rate. One independent draw per element, whatever x's shape.
        mask = np.random.random(x.shape) >= rate
        z = x * mask / (1 - rate)

        self.cache['mask_dropout'] = mask
        self.cache['rate_dropout'] = rate
        self.cache['output_dropout'] = z

        # --- Output --- #
        return(z)

    def FullyConnected(self, x, filters, activation='softmax', weight='he'):
        """'Fully Connected (Dense)'
           Args:
               x = input data; flattened to a column vector
               filters = number of classes (output units)
               activation = accepted for interface compatibility; softmax
                            is always applied
               weight = 'he' (default) or 'xavier'; used only when the
                        layer's parameters have to be created
           Returns:
               Column vector of shape (filters, 1).
        """
        # --- Calculation --- #
        x = np.asarray(x, dtype=float).reshape(-1, 1)
        self.cache['input_fc'] = x

        w, b = self._get_or_init_params("fc", x.shape[0], filters, weight, self.HeWeights)

        z = np.dot(w, x) + b
        self.cache['output_fc'] = z

        z = Activation(z).Softmax()
        self.cache['activation_fc'] = z

        # --- Output --- #
        return(z)

    def forward(self, x):
        """Forward pass through the network.

        Args:
            x = one 1-D input sample
        Returns:
            Column vector of shape (10, 1) produced by the softmax layer.
        """
        x = self.Convolution(x, filters=256)
        x = self.MaxPooling(x, pool_size=2, stride=1)
        x = self.Dropout(x, rate=0.1)
        x = self.FullyConnected(x, filters=10)
        return(x)

    def Backpropagation(self, y_true, y_pred, loss='cross_entropy', optimizer='sgd', learning_rate=1e-3):
        """Backward pass through the network followed by one SGD step.

        Must be called after ``forward``: every intermediate value is read
        from ``self.cache`` (a missing entry raises KeyError).

        Args:
            y_true = one-hot target with as many entries as y_pred
            y_pred = softmax output returned by ``forward``
            loss = accepted for interface compatibility; the gradient is
                   that of softmax + cross-entropy
            optimizer = accepted for interface compatibility; SGD is used
            learning_rate = SGD step size
        Returns:
            [weight_fc, bias_fc, weight_conv, bias_conv] after the update.
        """
        y_pred = np.asarray(y_pred, dtype=float).reshape(-1, 1)
        y_true = np.asarray(y_true, dtype=float).reshape(-1, 1)

        # Stored for monitoring only; the gradients below do not use it.
        self.cache['loss'] = Loss(y_true, y_pred).CategoricalCrossEntropy()

        # Output layer: for softmax + (summed) cross-entropy the gradient
        # with respect to the logits is y_pred - y_true.
        delta_fc = y_pred - y_true
        dw_fc = np.dot(delta_fc, self.cache['input_fc'].T)
        db_fc = delta_fc

        # Fully connected -> dropout output.
        d_dropout = np.dot(self.cache['weight_fc'].T, delta_fc)
        d_dropout = d_dropout.reshape(self.cache['output_dropout'].shape)

        # Dropout: the same mask and scaling as in the forward pass.
        d_pool = d_dropout * self.cache['mask_dropout'] / (1 - self.cache['rate_dropout'])

        # Max pooling: each pooled value passes its gradient to the input
        # position that held the maximum. Windows may overlap, so gradients
        # are accumulated with np.add.at (plain fancy-index += would drop
        # repeated positions).
        pooled_input = self.cache['input_maxpool']
        length = pooled_input.shape[-1]
        d_act = np.zeros((pooled_input.size // length, length))
        winners = self.cache['winner_maxpool'].reshape(d_act.shape[0], -1)
        rows = np.arange(d_act.shape[0])[:, None]
        np.add.at(d_act, (rows, winners), d_pool.reshape(winners.shape))
        d_act = d_act.reshape(pooled_input.shape)

        # ReLU: gradient flows only where the pre-activation was positive.
        d_z_conv = d_act * (self.cache['output_conv'] > 0)

        # Convolution: dW[i, t] = sum_j dZ[i, j] * windows[j, t].
        dw_conv = np.dot(d_z_conv, self.cache['windows_conv'])
        db_conv = np.sum(d_z_conv, axis=1).reshape(-1, 1)

        params = [self.cache['weight_fc'], self.cache['bias_fc'], self.cache['weight_conv'], self.cache['bias_conv']]
        grads = [dw_fc, db_fc, dw_conv, db_conv]

        backpropagated_result = Optimizer(params, grads, learning_rate).SGD()

        # Write the updated parameters back so the next forward pass uses them.
        (self.cache['weight_fc'], self.cache['bias_fc'],
         self.cache['weight_conv'], self.cache['bias_conv']) = backpropagated_result

        return(backpropagated_result)