In [1]:
import numpy as np
import os
import urllib.request

# Download MNIST dataset (mnist.npz) if not already available.
# The archive is written to a temporary ".part" file and only renamed to its
# final name once the transfer has finished. An interrupted download therefore
# never leaves a truncated "mnist.npz" behind that a later run would mistake
# for a complete dataset and skip re-downloading.
if not os.path.exists("mnist.npz"):
    print("Downloading mnist.npz...")
    url = "https://s3.amazonaws.com/img-datasets/mnist.npz"
    partial_path = "mnist.npz.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # os.replace overwrites the destination in a single step.
        os.replace(partial_path, "mnist.npz")
    finally:
        # After a successful rename the partial file no longer exists; it is
        # only still present (and removed here) when something went wrong.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Load data from the .npz file.
# np.load returns an NpzFile that keeps the archive open; using it as a context
# manager closes the underlying file once the four arrays have been read.
with np.load("mnist.npz") as data:
    x_train = data["x_train"]
    y_train = data["y_train"]
    x_test = data["x_test"]
    y_test = data["y_test"]

# Preprocess data
# Both splits get the same treatment: every image is flattened into a
# 784-element row, cast to float32 and divided by 255 to normalize pixel
# values to [0, 1].
x_train, x_test = (
    images.reshape(-1, 28 * 28).astype(np.float32) / 255.0
    for images in (x_train, x_test)
)

# One-hot encode the labels
def one_hot(y, num_classes=10):
    """Convert integer class labels into one-hot vectors.

    Row ``k`` of the identity matrix is the one-hot vector for class ``k``, so
    indexing the identity matrix with the labels yields one row per label.

    Args:
        y: Integer label, or array of integer labels, used to select rows.
        num_classes: Number of classes, i.e. the length of each one-hot vector.

    Returns:
        Float array holding one one-hot row of length ``num_classes`` per label.
    """
    identity = np.identity(num_classes)
    return identity[y]

# Build the one-hot label matrices for both splits (10 classes).
y_train_oh, y_test_oh = (one_hot(labels, 10) for labels in (y_train, y_test))

# Activation functions and their derivatives
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``.

    Args:
        x: Array (or scalar) of pre-activation values.

    Returns:
        ``x`` with every negative entry replaced by 0.
    """
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(x):
    """Derivative of ReLU with respect to its input, evaluated element-wise.

    The input is coerced with ``np.asarray`` so that Python scalars and nested
    lists are accepted, just as ``relu`` accepts them; without the coercion a
    plain ``x > 0`` on a scalar yields a Python ``bool`` (no ``.astype``) and on
    a list raises ``TypeError``.

    Args:
        x: Pre-activation values (array, scalar, or nested sequence).

    Returns:
        float32 array (or float32 scalar for scalar input) that is 1.0 where
        ``x > 0`` and 0.0 elsewhere, including at exactly 0.
    """
    return (np.asarray(x) > 0).astype(np.float32)

def softmax(x, axis=1):
    """Numerically stable softmax along one axis.

    Args:
        x: Array of logits.
        axis: Axis along which the probabilities are normalized. Defaults to 1,
            i.e. each row of a ``(batch, classes)`` matrix; pass ``axis=0`` (or
            ``-1``) for a single 1-D vector of logits.

    Returns:
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the per-slice maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large logits.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

# Hyperparameters
input_size = 28 * 28      # number of pixels in one flattened image
hidden_size = 128         # units in the hidden layer
output_size = 10          # one output per class
learning_rate = 0.1
epochs = 10
batch_size = 128
seed = 42                 # fixed RNG seed so a fresh-kernel run repeats exactly

# Seed NumPy's global generator before any random draw: the weight
# initialization below and every later np.random call then follow the same
# sequence on each Restart & Run All.
np.random.seed(seed)

# Initialize weights and biases with He initialization for ReLU layers:
# standard-normal draws scaled by sqrt(2 / fan_in); biases start at zero.
W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
b2 = np.zeros((1, output_size))

# Training loop
# Mini-batch gradient descent on a 784 -> hidden (ReLU) -> output (softmax)
# network with cross-entropy loss. Gradients are derived by hand below.
num_samples = x_train.shape[0]
num_batches = num_samples // batch_size   # a trailing partial batch is skipped

for epoch in range(epochs):
    # Draw a fresh random visiting order at the start of each epoch. Batches
    # are gathered through this permutation rather than by overwriting
    # x_train / y_train_oh with shuffled copies, so both arrays stay aligned
    # with the integer labels in y_train and keep their original order.
    indices = np.random.permutation(num_samples)

    epoch_loss = 0.0
    for i in range(num_batches):
        start = i * batch_size
        end = start + batch_size
        batch_indices = indices[start:end]
        x_batch = x_train[batch_indices]
        y_batch = y_train_oh[batch_indices]

        # Forward pass
        z1 = np.dot(x_batch, W1) + b1          # Linear transformation for hidden layer
        a1 = relu(z1)                          # ReLU activation
        z2 = np.dot(a1, W2) + b2               # Linear transformation for output layer
        a2 = softmax(z2)                       # Softmax activation for probabilities

        # Compute cross-entropy loss (1e-8 keeps log() away from log(0))
        loss = -np.sum(y_batch * np.log(a2 + 1e-8)) / batch_size
        epoch_loss += loss

        # Backward pass (gradient computation)
        # For softmax followed by cross-entropy the gradient w.r.t. the logits
        # is simply (probabilities - one-hot targets).
        dz2 = a2 - y_batch                     # Derivative of loss w.r.t. z2
        dW2 = np.dot(a1.T, dz2) / batch_size
        db2 = np.sum(dz2, axis=0, keepdims=True) / batch_size

        da1 = np.dot(dz2, W2.T)
        dz1 = da1 * relu_derivative(z1)        # Backprop through ReLU
        dW1 = np.dot(x_batch.T, dz1) / batch_size
        db1 = np.sum(dz1, axis=0, keepdims=True) / batch_size

        # Update weights and biases using gradient descent
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1

    # Mean of the per-batch losses seen during this epoch
    avg_loss = epoch_loss / num_batches
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# Evaluation on test set
# One forward pass over all test images with the trained parameters; the
# predicted class is the index of the largest output probability, and accuracy
# is the fraction of predictions that equal the integer labels in y_test.
z1_test = x_test @ W1 + b1
a1_test = relu(z1_test)
z2_test = a1_test @ W2 + b2
a2_test = softmax(z2_test)
predictions = a2_test.argmax(axis=1)
accuracy = (predictions == y_test).mean()
print("Test accuracy: {:.2f}%".format(accuracy * 100))


Downloading mnist.npz...
Epoch 1/10, Loss: 0.4567
Epoch 2/10, Loss: 0.2588
Epoch 3/10, Loss: 0.2099
Epoch 4/10, Loss: 0.1785
Epoch 5/10, Loss: 0.1557
Epoch 6/10, Loss: 0.1382
Epoch 7/10, Loss: 0.1244
Epoch 8/10, Loss: 0.1128
Epoch 9/10, Loss: 0.1036
Epoch 10/10, Loss: 0.0954
Test accuracy: 96.84%
