In [1]:
import numpy as np

In [2]:
def load_mnist(path="mnist.npz"):
    """Load the MNIST train/test arrays from a NumPy ``.npz`` archive.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the archive. Defaults to ``"mnist.npz"`` in the current
        working directory. The archive must contain the keys ``x_train``,
        ``y_train``, ``x_test`` and ``y_test``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` exactly as stored in the archive.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist (raised by ``np.load``).
    KeyError
        If one of the four expected keys is missing from the archive.
    """
    # np.load on an .npz returns a lazily-read NpzFile that keeps the file
    # open; the context manager closes it once the arrays are in memory.
    with np.load(path) as data:
        X_train = data["x_train"]
        y_train = data["y_train"]
        X_test = data["x_test"]
        y_test = data["y_test"]
    return X_train, y_train, X_test, y_test

In [3]:
X_train, y_train, X_test, y_test = load_mnist()

In [6]:
## Flattening images 
# Flatten images
X_train = X_train.reshape(-1, 784) / 255.0
X_test = X_test.reshape(-1, 784) / 255.0

In [8]:
def one_hot(y, num_classes=10):
    """Convert integer class labels into one-hot rows.

    Each label selects the matching row of a ``num_classes`` x ``num_classes``
    identity matrix, so the result has one extra trailing axis of length
    ``num_classes`` compared with ``y``.
    """
    identity = np.eye(num_classes)
    encoded = identity[y]
    return encoded

# One-hot encode the training and test labels with the default class count.
y_train_oh, y_test_oh = map(one_hot, (y_train, y_test))

In [10]:
# Inspect the one-hot encoded test labels (the printed array is abbreviated with "...").
y_test_oh

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Fix the global NumPy RNG so the initial weights are reproducible.
np.random.seed(42)

# Layer widths: input features -> hidden units -> output classes.
input_size, hidden_size, output_size = 784, 128, 10

# Hidden layer: small Gaussian weights, zero bias row.
# (W1 must be drawn before W2 so the random stream is consumed in the same order.)
W1 = 0.01 * np.random.randn(input_size, hidden_size)
b1 = np.zeros(shape=(1, hidden_size))

# Output layer: same initialisation scheme.
W2 = 0.01 * np.random.randn(hidden_size, output_size)
b2 = np.zeros(shape=(1, output_size))

In [12]:
def relu(z):
    """Rectified linear unit: element-wise maximum of ``0`` and ``z``."""
    rectified = np.maximum(0, z)
    return rectified

def relu_derivative(z):
    """Derivative of ReLU with respect to its input, evaluated element-wise.

    Returns ``1.0`` where ``z > 0`` and ``0.0`` elsewhere (including at
    ``z == 0``). The result is a float array with the same shape as ``z``.

    ``np.greater`` is used instead of the bare ``z > 0`` so that Python
    scalars and lists are accepted too, matching what ``relu`` accepts; for
    ndarray input the result is unchanged.
    """
    return np.greater(z, 0).astype(float)


In [13]:
def softmax(z):
    """Row-wise softmax over axis 1.

    The maximum of each row is subtracted before exponentiating so large
    inputs do not overflow; this shift does not change the result.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    unnormalized = np.exp(z - row_max)
    normalizer = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / normalizer

In [15]:
def forward(X):
    """Run the two-layer network on ``X`` using the global parameters.

    Returns the tuple ``(z1, a1, z2, a2)``: hidden pre-activation, hidden
    ReLU activation, output pre-activation, and softmax output.
    """
    # Hidden layer: affine map followed by ReLU.
    hidden_pre = X @ W1 + b1
    hidden_act = relu(hidden_pre)

    # Output layer: affine map followed by softmax.
    logits = hidden_act @ W2 + b2
    probs = softmax(logits)

    return hidden_pre, hidden_act, logits, probs

In [14]:
def cross_entropy(y_true, y_pred):
    """Mean cross-entropy between ``y_true`` and ``y_pred``.

    Products of ``y_true`` and ``log(y_pred + 1e-8)`` are summed over axis 1
    and the negated mean over the remaining axis is returned. The small
    constant keeps the logarithm finite when a predicted value is zero.
    """
    epsilon = 1e-8
    log_probs = np.log(y_pred + epsilon)
    per_sample = np.sum(y_true * log_probs, axis=1)
    return -np.mean(per_sample)


In [16]:
def backward(X, y, z1, a1, a2, lr=0.1):
    """Backpropagate one batch and apply a gradient-descent step in place.

    ``X`` is the input batch, ``y`` the targets compared against ``a2``, and
    ``z1``, ``a1``, ``a2`` are the values returned by ``forward`` for the same
    batch. Gradients are averaged over the ``X.shape[0]`` rows of the batch.
    The global ``W1``, ``b1``, ``W2`` and ``b2`` are updated with step size
    ``lr``; nothing is returned.
    """
    global W1, b1, W2, b2
    n_rows = X.shape[0]

    # Output layer: error signal between the prediction and the target.
    delta_out = a2 - y
    grad_W2 = a1.T @ delta_out / n_rows
    grad_b2 = np.sum(delta_out, axis=0, keepdims=True) / n_rows

    # Hidden layer: push the error back through W2 (still the pre-update
    # values at this point) and gate it by the ReLU derivative.
    delta_hidden = (delta_out @ W2.T) * relu_derivative(z1)
    grad_W1 = X.T @ delta_hidden / n_rows
    grad_b1 = np.sum(delta_hidden, axis=0, keepdims=True) / n_rows

    # Gradient-descent step, applied only after every gradient is computed.
    W2 -= lr * grad_W2
    b2 -= lr * grad_b2
    W1 -= lr * grad_W1
    b1 -= lr * grad_b1


In [20]:
# Training hyperparameters.
epochs = 40
batch_size = 128
learning_rate = 0.1

# Mini-batch gradient descent. backward() updates the global W1/b1/W2/b2 in
# place, so re-running this cell continues from the current weights rather
# than from the initial ones.
for epoch in range(epochs):
    # Walk the training set in its stored order, batch_size rows at a time;
    # the final slice may be shorter than batch_size.
    for i in range(0, X_train.shape[0], batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train_oh[i:i+batch_size]

        z1, a1, z2, a2 = forward(X_batch)
        backward(X_batch, y_batch, z1, a1, a2, learning_rate)

    # Progress report: loss on the first 1000 training rows only.
    _, _, _, train_pred = forward(X_train[:1000])
    loss = cross_entropy(y_train_oh[:1000], train_pred)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")


Epoch 1, Loss: 0.0610
Epoch 2, Loss: 0.0585
Epoch 3, Loss: 0.0562
Epoch 4, Loss: 0.0540
Epoch 5, Loss: 0.0521
Epoch 6, Loss: 0.0502
Epoch 7, Loss: 0.0485
Epoch 8, Loss: 0.0470
Epoch 9, Loss: 0.0456
Epoch 10, Loss: 0.0442
Epoch 11, Loss: 0.0426
Epoch 12, Loss: 0.0415
Epoch 13, Loss: 0.0400
Epoch 14, Loss: 0.0390
Epoch 15, Loss: 0.0379
Epoch 16, Loss: 0.0368
Epoch 17, Loss: 0.0357
Epoch 18, Loss: 0.0344
Epoch 19, Loss: 0.0333
Epoch 20, Loss: 0.0323
Epoch 21, Loss: 0.0314
Epoch 22, Loss: 0.0303
Epoch 23, Loss: 0.0295
Epoch 24, Loss: 0.0286
Epoch 25, Loss: 0.0279
Epoch 26, Loss: 0.0268
Epoch 27, Loss: 0.0261
Epoch 28, Loss: 0.0253
Epoch 29, Loss: 0.0245
Epoch 30, Loss: 0.0238
Epoch 31, Loss: 0.0230
Epoch 32, Loss: 0.0225
Epoch 33, Loss: 0.0217
Epoch 34, Loss: 0.0210
Epoch 35, Loss: 0.0204
Epoch 36, Loss: 0.0197
Epoch 37, Loss: 0.0190
Epoch 38, Loss: 0.0184
Epoch 39, Loss: 0.0179
Epoch 40, Loss: 0.0174


In [21]:
def accuracy(X, y):
    """Fraction of rows in ``X`` whose predicted class equals ``y``.

    The predicted class is the argmax over axis 1 of the network output, so
    ``y`` is compared as class indices rather than one-hot rows.
    """
    probabilities = forward(X)[-1]
    predicted_labels = np.argmax(probabilities, axis=1)
    hits = predicted_labels == y
    return np.mean(hits)

In [22]:
# Accuracy on the first 5000 training rows, then on the full test set.
train_acc = accuracy(X_train[:5000], y_train[:5000])
print("Train accuracy:", train_acc)
test_acc = accuracy(X_test, y_test)
print("Test accuracy:", test_acc)


Train accuracy: 0.9976
Test accuracy: 0.9799
