In [None]:
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
from sklearn.preprocessing import OneHotEncoder

# Load the MNIST dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten each 28x28 image into a 784-element row, cast to float32 and divide by 255
X_train = X_train.reshape(X_train.shape[0], 784).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], 784).astype('float32') / 255

# One-hot encode the labels (multi-class classification).
# OneHotEncoder returns a SciPy sparse matrix by default. The NumPy code in the
# later cells (argmax-based indexing in the loss, `a2 - y_true` and keepdims sums
# in backprop) needs plain dense arrays, so convert with .toarray().
# .toarray() is used rather than a constructor flag because that flag was renamed
# (`sparse` -> `sparse_output`) between scikit-learn releases.
encoder = OneHotEncoder()
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
y_test_onehot = encoder.transform(y_test.reshape(-1, 1)).toarray()

In [None]:
# Layer sizes of the two-layer network
input_size = 784   # one input per pixel of a flattened 28x28 image
hidden_size = 128  # number of hidden units
output_size = 10   # one output per digit class (0-9)

# Seed NumPy's global RNG so the initial weights are reproducible
np.random.seed(42)

# Weights are standard-normal draws scaled by sqrt(2 / fan_in), i.e. He-style
# scaling (the usual companion of ReLU); biases start at zero.
# W1 is sampled before W2 - the draw order determines the values obtained.
W1 = np.sqrt(2. / input_size) * np.random.randn(input_size, hidden_size)    # input -> hidden
b1 = np.zeros((1, hidden_size))                                             # hidden bias
W2 = np.sqrt(2. / hidden_size) * np.random.randn(hidden_size, output_size)  # hidden -> output
b2 = np.zeros((1, output_size))                                             # output bias

In [None]:
# Activation functions
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and x."""
    rectified = np.maximum(0, x)
    return rectified

def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating so that
    large scores do not overflow; the resulting ratios are unaffected.
    """
    row_peak = np.max(x, axis=1, keepdims=True)
    exps = np.exp(x - row_peak)
    return exps / np.sum(exps, axis=1, keepdims=True)

# Forward propagation
def forward(X):
    """Run one forward pass through the two-layer network.

    Reads the module-level parameters W1, b1, W2 and b2.

    Returns:
        (a1, a2): the hidden-layer ReLU activations and the softmax
        output probabilities, in that order.
    """
    # Input -> hidden: affine transform followed by ReLU
    hidden = relu(np.dot(X, W1) + b1)

    # Hidden -> output: affine transform followed by softmax
    probs = softmax(np.dot(hidden, W2) + b2)

    return hidden, probs

In [None]:
# Cross-entropy loss
def cross_entropy_loss(y_true, y_pred, eps=1e-12):
    """Mean cross-entropy between one-hot labels and predicted probabilities.

    Args:
        y_true: One-hot label matrix, one row per sample. The true class of a
            row is taken as the argmax of that row.
        y_pred: Predicted class probabilities, one row per sample, with columns
            in the same class order as y_true.
        eps: Lower bound applied to the selected probabilities before taking
            the log, so a probability of exactly 0 gives a large finite loss
            instead of inf.

    Returns:
        The average negative log-probability assigned to the true class.
    """
    m = y_true.shape[0]
    # Flatten the class indices to 1-D. For np.matrix / SciPy sparse labels,
    # argmax comes back as an (m, 1) matrix, which would broadcast against the
    # row index into an (m, m) selection instead of one entry per row.
    true_class = np.asarray(np.argmax(y_true, axis=1)).ravel()
    picked = np.asarray(y_pred)[np.arange(m), true_class]
    # Floor at eps: log(0) would otherwise turn the whole loss into inf.
    log_likelihood = -np.log(np.maximum(picked, eps))
    return np.sum(log_likelihood) / m

In [None]:
# Backpropagation
def backprop(X, y_true, a1, a2):
    """Gradients of the mean softmax cross-entropy loss for the two-layer network.

    Reads the module-level weight matrix W2.

    Args:
        X: Input batch, one row per sample.
        y_true: One-hot labels, one row per sample (dense array, np.matrix or
            SciPy sparse matrix).
        a1: Hidden-layer ReLU activations returned by forward().
        a2: Softmax output probabilities returned by forward().

    Returns:
        (dW1, db1, dW2, db2): gradients for W1, b1, W2 and b2, each averaged
        over the batch.
    """
    # Labels may arrive as a SciPy sparse matrix (OneHotEncoder's default output).
    # Subtracting one from an ndarray produces an np.matrix, whose sum() does not
    # accept keepdims, so densify first and stay in plain ndarrays below.
    if hasattr(y_true, "toarray"):
        y_true = y_true.toarray()
    y_true = np.asarray(y_true)

    m = X.shape[0]

    # Output layer: for softmax + cross-entropy the error is probabilities - labels
    dz2 = a2 - y_true
    dW2 = np.dot(a1.T, dz2) / m
    db2 = np.sum(dz2, axis=0, keepdims=True) / m

    # Hidden layer: propagate through W2, then gate by the ReLU derivative
    # (1 where the activation was positive, 0 elsewhere)
    dz1 = np.dot(dz2, W2.T) * (a1 > 0)
    dW1 = np.dot(X.T, dz1) / m
    db1 = np.sum(dz1, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2

In [None]:
# Hyperparameters
learning_rate = 0.1
epochs = 10
# Report about ten times per run regardless of `epochs`. A fixed interval of 100
# would print only epoch 0 when epochs is 10, hiding whether the loss decreases.
log_every = max(1, epochs // 10)

# Training loop: full-batch gradient descent (each epoch is one pass over X_train)
for epoch in range(epochs):
    # Forward pass
    a1, a2 = forward(X_train)

    # Loss of the parameters as they are *before* this epoch's update
    loss = cross_entropy_loss(y_train_onehot, a2)

    # Backpropagation
    dW1, db1, dW2, db2 = backprop(X_train, y_train_onehot, a1, a2)

    # Gradient descent update of the module-level parameters read by
    # forward() and backprop()
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2

    # Print the loss periodically and always for the final epoch
    if epoch % log_every == 0 or epoch == epochs - 1:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

In [None]:
# Evaluate the trained network on the test split
a1_test, a2_test = forward(X_test)
predictions = a2_test.argmax(axis=1)        # most probable class for each sample
accuracy = (predictions == y_test).mean()   # fraction of samples predicted correctly
print(f'Test Accuracy: {accuracy * 100:.2f}%')