In [14]:
import math
import numpy as np
from torchvision.datasets import MNIST
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [15]:
def download_mnist(is_train: bool):
    dataset = MNIST(root='./data',
                    transform=lambda x: np.array(x).flatten() / 255.0,
                    download=True,
                    train=is_train)
    mnist_data = []
    mnist_labels = []
    for image, label in dataset:
        mnist_data.append(image)
        mnist_labels.append(label)
    return np.array(mnist_data), np.array(mnist_labels)

In [16]:
def one_hot_encode(labels):
    encoder = OneHotEncoder(sparse_output=False)
    return encoder.fit_transform(labels.reshape(-1, 1))

In [17]:
def weight_initialization_Xavier_Uniform(input_dim, output_dim):
    low_bound = -math.sqrt(6/(input_dim + output_dim))
    upper_bound = math.sqrt(6/(input_dim + output_dim))
    W = np.random.uniform(low_bound, upper_bound, size=(input_dim, output_dim))
    b = np.zeros((1, output_dim))
    return W, b

In [18]:
def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    return expZ / np.sum(expZ, axis=1, keepdims=True)

In [19]:
def sigmoid(Z):
    return 1/(1 + np.exp(-Z))

In [20]:
def forward_propagation(X, W1, b1, W2, b2):
    Z1 = np.dot(X, W1) + b1
    A1 = sigmoid(Z1)

    Z2 = np.dot(A1, W2) +b2
    A2 = softmax(Z2)
    return A1, A2

In [21]:
def compute_loss_cross_entropy(A, Y):
    m = Y.shape[0]
    log_likelihood = -np.log(A[range(m), Y.argmax(axis=1)])
    return np.sum(log_likelihood) / m

In [22]:
def backward_propagation(X, Y, A2, A1, W2):
    m = X.shape[0]
    
    # Output layer error term (dZ2)
    dZ2 = A2 - Y  # Derivative of softmax + cross-entropy
    dW2 = np.dot(A1.T, dZ2) / m  # Gradient w.r.t. weights (output layer)
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m  # Gradient w.r.t. biases (output layer)
    
    # Hidden layer error term (dZ1)
    dZ1 = np.dot(dZ2, W2.T) * A1 * (1 - A1)  # For sigmoid activation (derivative of sigmoid)
    dW1 = np.dot(X.T, dZ1) / m  # Gradient w.r.t. weights (hidden layer)
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m  # Gradient w.r.t. biases (hidden layer)
    
    return dW1, db1, dW2, db2


In [23]:
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2
    
    return W1, b1, W2, b2

In [24]:
def train_model(train_X, train_Y, input_dim, intermediary_dim, output_dim, epochs=100, learning_rate=0.01, batch_size=100):
    W1, b1 = weight_initialization_Xavier_Uniform(input_dim, intermediary_dim)
    W2, b2 = weight_initialization_Xavier_Uniform(intermediary_dim, output_dim)
    
    for epoch in range(epochs):
        perm = np.random.permutation(train_X.shape[0])
        train_X = train_X[perm]
        train_Y = train_Y[perm]
        
        for i in range(0, train_X.shape[0], batch_size):
            X_batch = train_X[i:i+batch_size]
            Y_batch = train_Y[i:i+batch_size]
            
            # Forward propagation
            A1, A2 = forward_propagation(X_batch, W1, b1, W2, b2)
            
            # Compute loss (optional for tracking)
            loss = compute_loss_cross_entropy(A2, Y_batch)
            
            # Backward propagation
            dW1, db1, dW2, db2 = backward_propagation(X_batch, Y_batch, A2, A1, W2) 
            
            # Update parameters
            W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)
            
        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}')
    
    return W1, b1, W2, b2

In [25]:
def predict(X, W1, b1, W2, b2):
    A2 = forward_propagation(X, W1, b1, W2, b2)[1]
    return np.argmax(A2, axis=1)

In [26]:
train_X, train_Y = download_mnist(True)
test_X, test_Y = download_mnist(False)
    
train_Y = one_hot_encode(train_Y)
test_Y_one_hot = one_hot_encode(test_Y)
    
input_dim = 784
intermediary_dim = 100
output_dim = 10
epochs = 100
learning_rate = 0.1
batch_size = 100
    
W1, b1, W2, b2 = train_model(train_X, train_Y, input_dim, intermediary_dim,output_dim, epochs, learning_rate, batch_size)
    
train_predictions = predict(train_X, W1, b1, W2, b2)
test_predictions = predict(test_X, W1, b1, W2, b2)
    
print(f'Training Accuracy: {accuracy_score(np.argmax(train_Y, axis=1), train_predictions) * 100:.2f}%')
print(f'Test Accuracy: {accuracy_score(test_Y, test_predictions) * 100:.2f}%')

Epoch 1/100, Loss: 0.4114
Epoch 2/100, Loss: 0.4418
Epoch 3/100, Loss: 0.3443
Epoch 4/100, Loss: 0.2656
Epoch 5/100, Loss: 0.2281
Epoch 6/100, Loss: 0.2603
Epoch 7/100, Loss: 0.2192
Epoch 8/100, Loss: 0.1227
Epoch 9/100, Loss: 0.1983
Epoch 10/100, Loss: 0.2050
Epoch 11/100, Loss: 0.2496
Epoch 12/100, Loss: 0.1112
Epoch 13/100, Loss: 0.1834
Epoch 14/100, Loss: 0.1699
Epoch 15/100, Loss: 0.2448
Epoch 16/100, Loss: 0.2066
Epoch 17/100, Loss: 0.1567
Epoch 18/100, Loss: 0.2134
Epoch 19/100, Loss: 0.0881
Epoch 20/100, Loss: 0.1386
Epoch 21/100, Loss: 0.0729
Epoch 22/100, Loss: 0.1888
Epoch 23/100, Loss: 0.2213
Epoch 24/100, Loss: 0.1195
Epoch 25/100, Loss: 0.1726
Epoch 26/100, Loss: 0.1811
Epoch 27/100, Loss: 0.2322
Epoch 28/100, Loss: 0.1857
Epoch 29/100, Loss: 0.0421
Epoch 30/100, Loss: 0.0697
Epoch 31/100, Loss: 0.0857
Epoch 32/100, Loss: 0.0943
Epoch 33/100, Loss: 0.1002
Epoch 34/100, Loss: 0.1723
Epoch 35/100, Loss: 0.0916
Epoch 36/100, Loss: 0.1303
Epoch 37/100, Loss: 0.0982
Epoch 38/1