# In [None]:  (Jupyter cell marker, kept as a comment so the file parses as Python)
import numpy as np
import matplotlib.pyplot as plt

def load_train_images(path=r"D:\OneDrive\Desktop\College\MrM Research\Coding\archive\train-images-idx3-ubyte"):
    """Load the training images from an ``idx3-ubyte`` file.

    Args:
        path: Location of the image file. Defaults to the previously
            hard-coded location, so existing zero-argument calls behave
            exactly as before.

    Returns:
        A ``uint8`` array of shape ``(num_images, 28, 28)``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is shorter than the 16 skipped bytes or the
            remaining payload is not a whole number of 28x28 images.
    """
    with open(path, 'rb') as f:
        raw = f.read()
    # Skip the first 16 bytes (presumably the IDX3 header: magic number,
    # image count, rows, columns) and treat the rest as raw pixel bytes.
    pixels = np.frombuffer(raw, dtype=np.uint8, offset=16)
    return pixels.reshape(-1, 28, 28)

def load_train_labels(path=r"D:\OneDrive\Desktop\College\MrM Research\Coding\archive\train-labels-idx1-ubyte"):
    """Load the training labels from an ``idx1-ubyte`` file.

    Args:
        path: Location of the label file. Defaults to the previously
            hard-coded location, so existing zero-argument calls behave
            exactly as before.

    Returns:
        A 1-D ``uint8`` array with one label per byte of payload.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is shorter than the 8 skipped bytes.
    """
    with open(path, 'rb') as f:
        raw = f.read()
    # Skip the first 8 bytes (presumably the IDX1 header: magic number and
    # label count); every remaining byte is one label.
    return np.frombuffer(raw, dtype=np.uint8, offset=8)

def load_test_images(path=r"D:\OneDrive\Desktop\College\MrM Research\Coding\archive\t10k-images-idx3-ubyte"):
    """Load the test images from an ``idx3-ubyte`` file.

    Args:
        path: Location of the image file. Defaults to the previously
            hard-coded location, so existing zero-argument calls behave
            exactly as before.

    Returns:
        A ``uint8`` array of shape ``(num_images, 28, 28)``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is shorter than the 16 skipped bytes or the
            remaining payload is not a whole number of 28x28 images.
    """
    with open(path, 'rb') as f:
        raw = f.read()
    # Skip the first 16 bytes (presumably the IDX3 header: magic number,
    # image count, rows, columns) and treat the rest as raw pixel bytes.
    pixels = np.frombuffer(raw, dtype=np.uint8, offset=16)
    return pixels.reshape(-1, 28, 28)

def load_test_labels(path=r"D:\OneDrive\Desktop\College\MrM Research\Coding\archive\t10k-labels-idx1-ubyte"):
    """Load the test labels from an ``idx1-ubyte`` file.

    Args:
        path: Location of the label file. Defaults to the previously
            hard-coded location, so existing zero-argument calls behave
            exactly as before.

    Returns:
        A 1-D ``uint8`` array with one label per byte of payload.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is shorter than the 8 skipped bytes.
    """
    with open(path, 'rb') as f:
        raw = f.read()
    # Skip the first 8 bytes (presumably the IDX1 header: magic number and
    # label count); every remaining byte is one label.
    return np.frombuffer(raw, dtype=np.uint8, offset=8)
# Read both splits from disk (images first, then labels, train before test).
X_train, y_train = load_train_images(), load_train_labels()
X_test, y_test = load_test_images(), load_test_labels()

# Rescale the uint8 pixel values from [0, 255] to floats in [0, 1].
X_train = X_train / 255.0
X_test = X_test / 255.0
# Human-readable name for each class index (position in the list == label).
class_names = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

# Preview the first 16 training images in a 4x4 grid, each titled with the
# name of its label.
plt.figure(figsize=(6, 6))
for i in range(16):
    plt.subplot(4, 4, i + 1)  # subplot positions are 1-based
    plt.imshow(X_train[i], cmap='gray')
    plt.title(class_names[y_train[i]])
    plt.axis('off')
plt.show()

# Flatten every 28x28 image into a single row of 784 values so that each
# sample can be fed to the fully connected layers as one vector.
X = X_train.reshape((-1, 28 * 28))
X_test_flat = X_test.reshape((-1, 28 * 28))


def he_init(fan_in, fan_out):
    """Draw a ``(fan_in, fan_out)`` weight matrix with He initialisation.

    Entries are standard-normal samples scaled by ``sqrt(2 / fan_in)``.
    Uses NumPy's global random state.
    """
    scale = np.sqrt(2.0 / fan_in)
    samples = np.random.randn(fan_in, fan_out)
    return samples * scale
# ---- Hyperparameters ----
input_size = 28 * 28    # one input per pixel of a flattened 28x28 image
hidden_size = 128       # width of the single hidden layer
output_size = 10        # number of output classes
dropout_rate = 0.2      # fraction of hidden activations zeroed in forward_prop
num_epochs = 20
batch_size = 128
learning_rate = 1e-3
l2_lambda = 1e-4        # weight of the L2 penalty on W1 and W2

# ---- Trainable parameters ----
# Weights use He initialisation; biases start at zero.
W1, b1 = he_init(input_size, hidden_size), np.zeros(hidden_size)
W2, b2 = he_init(hidden_size, output_size), np.zeros(output_size)
def relu(z):
    """Rectified linear unit: element-wise ``max(0, z)``."""
    rectified = np.maximum(0, z)
    return rectified

def relu_derivative(z):
    """Gradient of ReLU: ``1.0`` where ``z > 0`` and ``0.0`` elsewhere.

    ``z`` must be an array-like whose comparison result supports ``astype``
    (for example a NumPy array).
    """
    is_positive = z > 0
    return is_positive.astype(float)
# Quick visual sanity check of the activation: plot ReLU over 100 evenly
# spaced points in [-5, 5].
x = np.linspace(-5, 5, num=100)
plt.plot(x, relu(x))
plt.title("ReLU Activation")
plt.show()

def softmax(z):
    """Row-wise softmax of a 2-D score array.

    The maximum of each row is subtracted before exponentiating so large
    scores do not overflow; this does not change the result.
    """
    row_peak = np.max(z, axis=1, keepdims=True)
    weights = np.exp(z - row_peak)
    normaliser = np.sum(weights, axis=1, keepdims=True)
    return weights / normaliser
    
def cross_entropy(y_true, y_pred):
    """Mean negative log-likelihood of the true classes.

    Args:
        y_true: Integer class index for each sample.
        y_pred: Per-sample class probabilities, one row per sample.

    Returns:
        The average of ``-log(p_true + 1e-8)`` over the batch; the small
        constant keeps the logarithm finite when a probability is zero.
    """
    m = y_true.shape[0]
    rows = np.arange(m)
    true_class_prob = y_pred[rows, y_true]
    log_likelihood = np.log(true_class_prob + 1e-8)
    return -np.sum(log_likelihood) / m

# Forward pass
def forward_prop(X, params, training=True):
    """Run the network forward: affine -> ReLU -> dropout -> affine -> softmax.

    Args:
        X: Input batch with one sample per row (it is used as ``X @ W1``).
        params: Dict holding ``"W1"``, ``"b1"``, ``"W2"`` and ``"b2"``.
        training: When True (the default, which matches the previous
            behaviour) a fresh random dropout mask is applied to the hidden
            activations. Pass False for evaluation/inference so no units
            are dropped and the output is deterministic.

    Returns:
        ``(A2, cache)`` where ``A2`` holds the softmax probabilities and
        ``cache`` stores the intermediates ``X``, ``Z1``, ``A1``, ``Z2``,
        ``A2`` and ``dropout_mask`` for use by the backward pass.
    """
    W1, b1 = params["W1"], params["b1"]
    W2, b2 = params["W2"], params["b2"]

    # Hidden layer
    Z1 = X @ W1 + b1
    A1 = relu(Z1)

    if training:
        # Inverted dropout: each hidden unit is kept with probability
        # (1 - dropout_rate) and the survivors are scaled up by
        # 1 / (1 - dropout_rate) so the expected activation is unchanged.
        dropout_mask = (np.random.rand(*A1.shape) > dropout_rate)
        A1 = (A1 * dropout_mask) / (1 - dropout_rate)
    else:
        # Evaluation: keep every unit. The all-True mask is still stored so
        # the cache always has the same keys.
        dropout_mask = np.ones(A1.shape, dtype=bool)

    # Output layer
    Z2 = A1 @ W2 + b2
    A2 = softmax(Z2)

    cache = {
        "X": X,
        "Z1": Z1,
        "A1": A1,
        "Z2": Z2,
        "A2": A2,
        "dropout_mask": dropout_mask,
    }

    return A2, cache


def backward_prop(y, params, cache):
    """Compute gradients of the L2-regularised cross-entropy loss.

    Args:
        y: Integer class label for each sample in the batch.
        params: Dict holding the weights ``"W1"`` and ``"W2"`` (the biases
            are not read here).
        cache: Dict produced by the forward pass; ``"X"``, ``"Z1"``,
            ``"A1"``, ``"A2"`` and ``"dropout_mask"`` are read.

    Returns:
        Dict with the gradient arrays under ``"W1"``, ``"b1"``, ``"W2"``
        and ``"b2"``.
    """
    W1, W2 = params["W1"], params["W2"]
    X, Z1 = cache["X"], cache["Z1"]
    A1, A2 = cache["A1"], cache["A2"]

    m = y.shape[0]

    # Output layer: with softmax + cross-entropy the gradient w.r.t. the
    # logits is (A2 - one_hot(y)) / m. Work on a copy so the cached A2 is
    # left untouched.
    dZ2 = A2.copy()
    dZ2[np.arange(m), y] -= 1
    dZ2 /= m
    dW2 = A1.T @ dZ2 + l2_lambda * W2
    db2 = np.sum(dZ2, axis=0)

    # Hidden layer: back through the second affine map, then through dropout
    # (same mask and scaling as the forward pass), then through ReLU.
    # NOTE: dW1 must be computed only after dZ1 exists; an earlier stray
    # copy of the dW1 line sat above this point and raised UnboundLocalError.
    dA1 = dZ2 @ W2.T
    dA1 = dA1 * cache["dropout_mask"] / (1 - dropout_rate)
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = X.T @ dZ1 + l2_lambda * W1
    db1 = np.sum(dZ1, axis=0)

    grads = {
        "W1": dW1, "b1": db1,
        "W2": dW2, "b2": db2
    }

    return grads

# Optimiser: Adam
def init_adam(params):
    """Create zeroed Adam state for every entry of ``params``.

    For each key ``k`` the returned dict holds ``"m_k"`` (first moment) and
    ``"v_k"`` (second moment), both zero arrays shaped like ``params[k]``,
    plus a step counter ``"t"`` starting at 0.
    """
    state = {}
    for name, value in params.items():
        state["m_" + name] = np.zeros_like(value)
        state["v_" + name] = np.zeros_like(value)
    state["t"] = 0
    return state
def adam_update(params, grads, adam,
                lr=0.001,
                beta1=0.9,
                beta2=0.999,
                eps=1e-8):
    """Apply one Adam step to every entry of ``params``, in place.

    Args:
        params: Dict of parameter arrays; each is updated in place.
        grads: Dict of gradients under the same keys as ``params``.
        adam: Optimiser state with ``"m_<key>"``, ``"v_<key>"`` and ``"t"``;
            the moments and the step counter are updated.
        lr: Step size.
        beta1: Decay rate of the first-moment running average.
        beta2: Decay rate of the second-moment running average.
        eps: Small constant added to the denominator for stability.

    Returns:
        None.
    """
    adam["t"] += 1
    step = adam["t"]

    # The bias-correction denominators depend only on the step count, so
    # they are shared by all parameters.
    first_correction = 1 - beta1 ** step
    second_correction = 1 - beta2 ** step

    for name in params:
        grad = grads[name]
        m_key, v_key = "m_" + name, "v_" + name

        # Exponential moving averages of the gradient and its square.
        first_moment = beta1 * adam[m_key] + (1 - beta1) * grad
        second_moment = beta2 * adam[v_key] + (1 - beta2) * (grad ** 2)
        adam[m_key] = first_moment
        adam[v_key] = second_moment

        # Bias-corrected estimates, then the parameter step.
        m_hat = first_moment / first_correction
        v_hat = second_moment / second_correction
        params[name] -= lr * m_hat / (np.sqrt(v_hat) + eps)
# Bundle the trainable arrays under the keys the forward/backward passes and
# the optimiser look up.
params = dict(W1=W1, b1=b1, W2=W2, b2=b2)

# Fresh optimiser state: zeroed moment estimates and a step counter of 0.
adam = init_adam(params)

# Mini-batch training loop
y = y_train

num_samples = X.shape[0]

# One entry per epoch: the mean regularised loss over that epoch.
loss_history = []

for epoch in range(num_epochs):

    # Reshuffle every epoch so the batches differ from epoch to epoch.
    perm = np.random.permutation(num_samples)
    X_shuffled = X[perm]
    y_shuffled = y[perm]

    # Running sum of per-sample losses; weighting each batch by its size
    # keeps the mean exact even when the final batch is shorter.
    epoch_loss = 0.0

    for i in range(0, num_samples, batch_size):

        X_batch = X_shuffled[i:i+batch_size]
        y_batch = y_shuffled[i:i+batch_size]

        # Forward pass
        A2, cache = forward_prop(X_batch, params)

        # Loss = cross-entropy + L2 penalty on the weights. backward_prop
        # adds l2_lambda * W to the weight gradients, which is the gradient
        # of this 0.5 * l2_lambda * ||W||^2 term.
        data_loss = cross_entropy(y_batch, A2)
        l2_loss = 0.5 * l2_lambda * (
            np.sum(params["W1"] ** 2) + np.sum(params["W2"] ** 2)
        )
        loss = data_loss + l2_loss
        epoch_loss += loss * X_batch.shape[0]

        # Backward pass
        grads = backward_prop(y_batch, params, cache)

        # Adam update (exactly one optimiser step per batch)
        adam_update(params, grads, adam, lr=learning_rate)

    # Normalise once and record exactly one value per epoch.
    epoch_loss /= max(1, num_samples)
    loss_history.append(epoch_loss)

    if epoch % 2 == 0:
        print(f"Epoch {epoch}, Loss: {epoch_loss:.4f}")


