## Neural network with 2 hidden layers for image classification (digit-recognition):
- using Relu as activation function
- using Softmax

In [48]:
from typing import Dict, Tuple
import numpy as np
import math 
import pandas as pd
from matplotlib import pyplot as plt

###  Load MNIST Data from IDX Files
We load the handwritten digit images and labels from the raw IDX format using `idx2numpy`. The training set contains 60,000 images of size 28×28, and the test set contains 10,000.

In [49]:
import idx2numpy

# Training data
X_train = idx2numpy.convert_from_file("archive/train-images.idx3-ubyte")
y_train = idx2numpy.convert_from_file("archive/train-labels.idx1-ubyte")

# Test data
X_test = idx2numpy.convert_from_file("archive/t10k-images.idx3-ubyte")
y_test = idx2numpy.convert_from_file("archive/t10k-labels.idx1-ubyte")

print("X_train shape:", X_train.shape)  # Should be (60000, 28, 28)
print("y_train shape:", y_train.shape)  # Should be (60000,)


X_train shape: (60000, 28, 28)
y_train shape: (60000,)


###  Flatten and Normalize Data
We flatten each 28×28 image to a 784-length vector, normalize pixel values to [0, 1], and transpose the dataset so each column corresponds to one image sample.

In [51]:
# We want to flatten the shape (784, num_batches or samples) to [1,0]
X_train_flat = X_train.reshape(X_train.shape[0], -1).T / 255.0 # shape: (784, 60000)
X_test_flat =  X_test.reshape(X_test.shape[0], -1).T / 255.0 # shape: (784, 10000)

print(f"X_train_flat shape:{X_train_flat.shape}")
print(f"X_test_flat shape:{X_test_flat.shape}")
print(X_train.shape[0])


X_train_flat shape:(784, 60000)
X_test_flat shape:(784, 10000)
60000


In [52]:
np.random.seed(42)

def initialze_parameters()-> Dict[str, np.ndarray]:
    W1 = np.random.rand(128,784) * np.sqrt(2/784) # formula for He init => sqrt(2/m) with m is the number of inputs to the layer
    b1 = np.zeros((128,1))
    W2 = np.random.rand(64,128) * np.sqrt(2/128)
    b2 = np.zeros((64, 1))
    W3 = np.random.rand(10,64) * np.sqrt(2/64)
    b3 = np.zeros((10,1))
    return W1, b1, W2, b2, W3, b3

In [53]:
# define the activation functions (ReLu + Softmax)
def relu(Z: np.ndarray)-> np.ndarray:
    return np.maximum(0,Z)

def softmax(Z: np.ndarray)-> np.ndarray:
    expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True)) # prevent overflow
    return expZ / np.sum(expZ, axis=0, keepdims=True)
    

In [54]:
# forward_pass
def forward_pass(X: np.ndarray, parameters: Dict[str, np.ndarray])-> Tuple[np.ndarray, Tuple]:

    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]
    W3, b3 = parameters["W3"], parameters["b3"]
    
    Z1 = W1 @ X + b1
    A1 = relu(Z1)
    
    Z2 = W2 @ A1 + b2
    A2 = relu(Z2)
    
    Z3 = W3 @ A2 + b3
    A3 = softmax(Z3)
    
    cache = (Z1, A1, Z2, A2, Z3, A3)
    
    return A3, cache

###  Evaluate Accuracy on First Batch
We compute predictions using `argmax` on the softmax output and compare them to the true labels to calculate accuracy for the first mini-batch.

In [60]:
# test the forward pass on one batch
W1, b1, W2, b2, W3, b3 = initialze_parameters()

X_batch = X_test_flat[:, :32]
A3, cache = forward_pass(X_batch, W1, b1, W2, b2, W3, b3)

prediction = np.argmax(A3, axis=0)
true_labels = y_train[:32]
accuracy = np.mean(prediction == true_labels)

print("Output probabilities shape:", A3.shape)  # (10, 32)
print(f"First prediction :{np.round(A3[:, 0], 3)}")
print(f"Accuracy on first batch: {accuracy * 100:.2f}%")


Output probabilities shape: (10, 32)
First prediction :[0.662 0.    0.    0.02  0.    0.    0.    0.    0.275 0.042]
Accuracy on first batch: 6.25%


# Loss Function(Cross-Entropy)
- I have chosen Cross-Entropy for my loss function becasue Softmax plus Cross entropy is easier to compute from scratch

In [58]:
def compute_loss(A3: np.ndarray, Y: np.ndarray)-> float:
    """
    Cross-entropy loss:
    - A3: predicted probabilities (10, m)
    - Y: one-hot true labels (10, m)
    """
    
    m = Y.shape[1]
    loss = -np.sum(Y * np.log(A3 + 1e-8)) / m
    return loss
    

# One-Hot Encode Labels
- Before backprop, we make sure y_train in one-hot encoded()

In [None]:
def one_hot_encode(y :np.ndarray, num_classes: int = 10)->np.ndarray:
    one_hot= np.zeros(num_classes, y.size)
    one_hot[y, np.arange(y.size)] = 1
    return one_hot

Y_batch = one_hot_encode(y_train)

In [None]:
def backward_pass(X: np.ndarray, Y: np.ndarray, cache: Tuple, parameters: Dict[str, np.ndarray])-> Dict[str, np.ndarray]:
    """
    Backprop through the 3-layer NN using cached forward values.
    """
    W1, W2, W3 = parameters["W1"], parameters["W2"], parameters["W3"]
    Z1, A1, Z2, A2, Z3, A3 = cache
    m = X.shape[1]

    # Output layer gradient (Softmax + CrossEntropy simplifies)
    dZ3 = A3 - Y  # (10, m)
    dW3 = (1 / m) * dZ3 @ A2.T
    db3 = (1 / m) * np.sum(dZ3, axis=1, keepdims=True)

    # Hidden Layer 2
    dA2 = W3.T @ dZ3
    dZ2 = dA2 * (Z2 > 0)  # ReLU backward
    dW2 = (1 / m) * dZ2 @ A1.T
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    # Hidden Layer 1
    dA1 = W2.T @ dZ2
    dZ1 = dA1 * (Z1 > 0)  # ReLU backward
    dW1 = (1 / m) * dZ1 @ X.T
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {
        "dW1": dW1, "db1": db1,
        "dW2": dW2, "db2": db2,
        "dW3": dW3, "db3": db3
    }

    return grads


In [None]:
def update_parameters(parameters: Dict[str, np.ndarray], grads: Dict[str, np.ndarray], learning_rate: float=0.1)-> Dict[str, np.ndarray]:
    for key in parameters:
        parameters[key] -= learning_rate * grads["d" + key]
    return parameters


In [61]:
epochs = 10
batch_size = 64
learning_rate = 0.1

parameters = initialze_parameters()
m = X_train_flat.shape[1]  # total training samples

for epoch in range(epochs):
    # Shuffle the data at the start of each epoch
    indices = np.random.permutation(m)
    X_shuffled = X_train_flat[:, indices]
    y_shuffled = y_train[indices]
    
    total_loss = 0
    correct = 0
    batches = m // batch_size

    for i in range(0, m, batch_size):
        # Get mini-batch
        X_batch = X_shuffled[:, i:i+batch_size]
        y_batch = y_shuffled[i:i+batch_size]
        Y_batch = one_hot_encode(y_batch)

        # Forward pass
        A3, cache = forward_pass(X_batch, parameters)

        # Compute loss
        loss = compute_loss(A3, Y_batch)
        total_loss += loss

        # Accuracy
        predictions = np.argmax(A3, axis=0)
        correct += np.sum(predictions == y_batch)

        # Backward pass
        grads = backward_pass(X_batch, Y_batch, cache, parameters)

        # Update weights
        parameters = update_parameters(parameters, grads, learning_rate)

    # Epoch summary
    epoch_loss = total_loss / batches
    epoch_accuracy = correct / m * 100
    print(f"Epoch {epoch+1}/{epochs} | Loss: {epoch_loss:.4f} | Accuracy: {epoch_accuracy:.2f}%")
