# Import Libraries

In [22]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Report the library versions in use; the recorded run printed 1.26.4 (NumPy) and 2.2.3 (pandas).
numpy_version = np.__version__
pandas_version = pd.__version__
print("Numpy version = ", numpy_version)
print("Pandas version = ", pandas_version)

Numpy version =  1.26.4
Pandas version =  2.2.3


# Load dataset for training and testing

In [5]:
data = pd.read_csv('./dataset/train.csv') # load the training CSV into a pandas DataFrame

In [6]:
data = np.array(data) # convert the pandas DataFrame to a NumPy array
m, n = data.shape # m = number of images (rows), n = 1 label + 784 pixels (columns); (42000, 785) for this dataset
np.random.shuffle(data) # shuffle the rows in place so the dev/train split below is random

In [7]:
# Hold out the first 1000 (already shuffled) rows as the dev set.
# Transposing puts one example per column: row 0 holds the labels and
# rows 1..n-1 hold the pixel values.
data_dev = data[:1000].T
Y_dev = data_dev[0]  # labels of the 1000 dev examples
X_dev = data_dev[1:n] / 255.  # pixels rescaled from 0-255 to 0.0-1.0

# Every remaining row is used for training, laid out the same way.
data_train = data[1000:].T
Y_train = data_train[0]  # labels of the training examples
X_train = data_train[1:n] / 255.  # pixels rescaled from 0-255 to 0.0-1.0

In [8]:
X_train.shape # (784, 41000) in the recorded run: one row per pixel, one column per training image

(784, 41000)

# Init Parameters
#### There are several algorithms we can use to initialize the parameters; which one to pick depends on the purpose and on the activation function.


##### If your weights never go negative, your model would be severely limited — it could only add influences and never subtract them. That would hurt performance, especially in more complex problems.
Weights determine how strongly a neuron responds to its inputs.
> - Negative weight = the neuron is inhibited by that input.
> - Positive weight = the neuron is activated by that input.

Biases shift the activation threshold:
> - A negative bias can make it harder for the neuron to activate.
> - A positive bias can make it easier to activate.



In [17]:
# tanh / sigmoid
def xavier_init(input_nodes, output_nodes):
    """Draw a weight matrix using Xavier (Glorot) uniform initialization.

    Every entry is sampled uniformly from [-limit, limit), where
    limit = sqrt(6 / (input_nodes + output_nodes)).

    Example bounds:
      - 784 input nodes + 10 output nodes -> limit ~= 0.0869
      - 10 input nodes + 10 output nodes  -> limit ~= 0.5477

    Returns:
        numpy array of shape (output_nodes, input_nodes).
    """
    fan_total = input_nodes + output_nodes
    bound = np.sqrt(6 / fan_total)
    weight_shape = (output_nodes, input_nodes)
    return np.random.uniform(low=-bound, high=bound, size=weight_shape)

# ReLU / LeakyReLU
def he_init(input_nodes, output_nodes):
    """Draw a weight matrix using He (Kaiming) normal initialization.

    Entries are sampled from a zero-mean normal distribution with
    standard deviation sqrt(2 / input_nodes). The values are unbounded
    and can be negative as well as positive.

    Example standard deviations:
      - 784 input nodes -> std ~= 0.0505
      - 10 input nodes  -> std ~= 0.4472

    Returns:
        numpy array of shape (output_nodes, input_nodes).
    """
    scale = np.sqrt(2 / input_nodes)
    samples = np.random.standard_normal(size=(output_nodes, input_nodes))
    return samples * scale

# selu
def lecun_init(input_nodes, output_nodes):
    """Draw a weight matrix using LeCun normal initialization.

    Entries are sampled from a zero-mean normal distribution with
    standard deviation sqrt(1 / input_nodes). The values are unbounded
    and can be negative as well as positive.

    Example standard deviations:
      - 784 input nodes -> std ~= 0.0357
      - 10 input nodes  -> std ~= 0.3162

    Returns:
        numpy array of shape (output_nodes, input_nodes).
    """
    scale = np.sqrt(1 / input_nodes)
    samples = np.random.standard_normal(size=(output_nodes, input_nodes))
    return samples * scale

# Ignores Activation Functions
def uniform_random_init(input_nodes, output_nodes):
    """Draw a weight matrix uniformly from [-0.5, 0.5).

    The range is fixed and does not depend on the layer sizes or on the
    activation function.

    Returns:
        numpy array of shape (output_nodes, input_nodes).
    """
    weight_shape = (output_nodes, input_nodes)
    unit_samples = np.random.rand(*weight_shape)  # uniform in [0, 1)
    return unit_samples - 0.5

def zero_init(input_nodes, output_nodes):
    """Return an all-zero float array of shape (input_nodes, output_nodes).

    NOTE: the axis order is (input_nodes, output_nodes), which is the
    reverse of the random initializers in this file (they return
    (output_nodes, input_nodes)). init_params relies on this order: it
    calls zero_init(10, 1) to obtain a (10, 1) bias column.
    """
    zeros_shape = (input_nodes, output_nodes)
    return np.zeros(zeros_shape)

In [16]:
def init_params(method="he", input_nodes=784, hidden_nodes=10, output_nodes=10):
    """Create the initial weights and biases of a two-layer network.

    Args:
        method: weight initialization scheme. "he", "xavier" and "lecun"
            select the matching initializer; any other value falls back
            to uniform_random_init.
        input_nodes: number of input features (default 784).
        hidden_nodes: number of nodes in the hidden layer (default 10).
        output_nodes: number of output nodes (default 10).

    Returns:
        Tuple (W1, b1, W2, b2) where
          - W1 has shape (hidden_nodes, input_nodes),
          - b1 has shape (hidden_nodes, 1),
          - W2 has shape (output_nodes, hidden_nodes),
          - b2 has shape (output_nodes, 1).
        Biases are all zeros.
    """
    initializers = {
        "he": he_init,
        "xavier": xavier_init,
        "lecun": lecun_init,
    }
    # Unrecognized method names fall back to plain uniform random weights.
    init_weights = initializers.get(method, uniform_random_init)

    # W1 is drawn before W2 so the random stream is consumed in a fixed order.
    W1 = init_weights(input_nodes, hidden_nodes)
    W2 = init_weights(hidden_nodes, output_nodes)

    # zero_init(rows, 1) yields a (rows, 1) column of zeros: one bias per node.
    b1 = zero_init(hidden_nodes, 1)
    b2 = zero_init(output_nodes, 1)

    return W1, b1, W2, b2

# Activation Functions
#### Without activation functions, a neural network would just be a linear function — a stack of linear layers is still just linear.

In [None]:
# Properties: Output: (0, 1), Smooth & differentiable, Used historically in binary classification
# Problem: Vanishing gradient for large |x|, Activations saturate (output close to 0 or 1), Non-zero-centered (bad for gradient updates)
def sigmoid(Z):
    """Apply the logistic sigmoid 1 / (1 + exp(-Z)) element-wise.

    Args:
        Z: scalar or numpy array of pre-activation values.

    Returns:
        Value(s) in (0, 1) with the same shape as Z.
    """
    # The body must use the parameter name `Z`; a lowercase `z` is undefined here.
    return 1 / (1 + np.exp(-Z))

# Properties: Output: (-1, 1), Zero-centered: better than sigmoid
# Problem: Vanishing gradient for large |x|
def tanh(Z):
    """Apply the hyperbolic tangent element-wise.

    Equivalent to (exp(Z) - exp(-Z)) / (exp(Z) + exp(-Z)).

    Args:
        Z: scalar or numpy array of pre-activation values.

    Returns:
        Value(s) in (-1, 1) with the same shape as Z.
    """
    return np.tanh(Z)

# Properties: Fast & simple, Sparse activation (many zeros), Avoids vanishing gradient for positive values
# Problem: Dying ReLU problem: neurons can “die” (output zero) and never recover
def reLU(Z):
    """Apply the rectified linear unit element-wise: max(Z, 0).

    Args:
        Z: scalar or numpy array of pre-activation values.

    Returns:
        Z with every negative entry replaced by 0.
    """
    floor = 0
    return np.maximum(Z, floor)

# Fixes dying ReLU by allowing small gradient when x < 0
def lReLU(Z, alpha=0.01):
    """Apply the leaky ReLU element-wise: max(Z, alpha * Z).

    Args:
        Z: scalar or numpy array of pre-activation values.
        alpha: slope applied to negative inputs (default 0.01).

    Returns:
        The element-wise maximum of Z and alpha * Z.
    """
    leaked = alpha * Z
    return np.maximum(Z, leaked)

# Properties: Smooth (vs. ReLU's sharp corner), Can push mean activation closer to 0 (like BatchNorm)
def elu(x, alpha=1.0):
    """Apply the exponential linear unit element-wise.

    elu(x) = x                      for x > 0
    elu(x) = alpha * (exp(x) - 1)   for x <= 0

    Args:
        x: scalar or numpy array of pre-activation values.
        alpha: scale of the negative branch (default 1.0).

    Returns:
        Array with the same shape as x.
    """
    # np.where evaluates both branches for every element, so the exponential
    # is only fed the non-positive part of x: this avoids overflow warnings
    # for large positive inputs whose exponential would be discarded anyway.
    # expm1 keeps precision for inputs very close to zero.
    negative_branch = alpha * np.expm1(np.minimum(x, 0))
    return np.where(x > 0, x, negative_branch)

def softmax(Z):
    """Apply softmax along axis 0 (each column becomes a probability vector).

    Args:
        Z: numpy array of scores; for a 2-D input each column is
            normalized independently, for a 1-D input the whole vector is.

    Returns:
        Array with the same shape as Z whose entries are positive and sum
        to 1 along axis 0.
    """
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps exp() from overflowing on large scores.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    A = exp_scores / np.sum(exp_scores, axis=0, keepdims=True)
    return A