Name : Saarthak Khamkar

Roll No : D088

SAP ID : 60009230057

### Step 1: Load MNIST

In [1]:
from sklearn.datasets import fetch_openml
import numpy as np

# Load MNIST (70,000 images); as_frame=False asks for arrays, not a DataFrame
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
# Divide pixel values by 255 and convert the labels to integers
X, y = X / 255.0, y.astype(int)

# First 60,000 rows are the training split; everything after is the test split
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

# One-hot encoding labels
def one_hot(y, num_classes=10):
    """Convert integer class labels into a one-hot matrix.

    Args:
        y: Integer class labels as any array-like (list, 1-D array, or an
            (n, 1) column vector). The input is flattened before encoding.
        num_classes: Number of columns in the result (default 10).

    Returns:
        Float array of shape (n_labels, num_classes) holding 1 in column
        ``y[i]`` of row ``i`` and 0 everywhere else.
    """
    # Flatten first: a column vector would otherwise broadcast against the
    # row index below and switch on several columns in every row.
    labels = np.asarray(y).reshape(-1)
    out = np.zeros((labels.size, num_classes))
    out[np.arange(labels.size), labels] = 1
    return out

# One-hot targets for the training and test labels
Y_train, Y_test = one_hot(y_train), one_hot(y_test)

### Step 2: Define Neural Network

In [2]:
def init_weights(n_in=784, n_hidden=128, n_out=10, scale=0.01):
    """Create the parameters of a two-layer fully connected network.

    Weights are drawn from a standard normal through NumPy's global RNG and
    multiplied by ``scale``; biases start at zero. W1 is drawn before W2, so
    a given seed always produces the same pair of matrices.

    Args:
        n_in: Number of input features (default 784).
        n_hidden: Width of the hidden layer (default 128).
        n_out: Number of output classes (default 10).
        scale: Multiplier applied to the random weights (default 0.01).

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with shapes ``(n_in, n_hidden)``,
        ``(1, n_hidden)``, ``(n_hidden, n_out)`` and ``(1, n_out)``.
    """
    W1 = np.random.randn(n_in, n_hidden) * scale
    b1 = np.zeros((1, n_hidden))
    W2 = np.random.randn(n_hidden, n_out) * scale
    b2 = np.zeros((1, n_out))
    return W1, b1, W2, b2

# Activation + softmax
def relu(x):
    """Element-wise rectified linear unit: the larger of 0 and ``x``."""
    return np.maximum(0, x)
def relu_deriv(x):
    """Slope of ReLU: 1.0 where ``x`` is strictly positive, 0.0 elsewhere."""
    is_positive = x > 0
    return is_positive.astype(float)
def softmax(x):
    """Softmax along axis 1 (one distribution per row).

    The row maximum is subtracted before exponentiating, so the largest
    exponent in every row is exp(0).
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=1, keepdims=True)

# Forward + loss
def forward(X, W1, b1, W2, b2):
    """Run a batch through the two-layer network.

    Args:
        X: Input batch, one sample per row.
        W1, b1: Weights and bias of the hidden layer.
        W2, b2: Weights and bias of the output layer.

    Returns:
        Tuple ``(z1, a1, z2, a2)``: hidden pre-activation, hidden ReLU
        activation, output logits, and softmax probabilities.
    """
    hidden_pre = X @ W1 + b1
    hidden_act = relu(hidden_pre)
    logits = hidden_act @ W2 + b2
    probs = softmax(logits)
    return hidden_pre, hidden_act, logits, probs

def cross_entropy(y_true, y_pred):
    """Cross-entropy summed over all entries and divided by the row count.

    Args:
        y_true: Target matrix (one-hot rows in this notebook).
        y_pred: Predicted probabilities with the same shape as ``y_true``.

    Returns:
        Scalar loss. 1e-9 is added to ``y_pred`` before the log so a zero
        probability does not yield ``-inf``.
    """
    n_rows = y_true.shape[0]
    log_probs = np.log(y_pred + 1e-9)
    return -np.sum(y_true * log_probs) / n_rows

### Step 3: Backpropagation

In [3]:
def backward(X, y, z1, a1, a2, W2):
    """Compute batch-averaged gradients for the two-layer network.

    Args:
        X: Input batch fed to the forward pass.
        y: Targets with the same shape as ``a2``.
        z1: Hidden pre-activation from the forward pass.
        a1: Hidden activation from the forward pass.
        a2: Output probabilities from the forward pass.
        W2: Output-layer weights, used to propagate the error backwards.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)``, each divided by the batch size.
    """
    batch = X.shape[0]

    # Output layer: error on the logits is taken as a2 - y
    delta_out = a2 - y
    grad_W2 = np.matmul(a1.T, delta_out) / batch
    grad_b2 = delta_out.sum(axis=0, keepdims=True) / batch

    # Hidden layer: carry the error back through W2, then gate by ReLU's slope
    delta_hidden = np.matmul(delta_out, W2.T) * relu_deriv(z1)
    grad_W1 = np.matmul(X.T, delta_hidden) / batch
    grad_b1 = delta_hidden.sum(axis=0, keepdims=True) / batch

    return grad_W1, grad_b1, grad_W2, grad_b2

### Step 4: Implement Optimizers

In [4]:
def sgd(params, grads, lr, state=None):
    """Plain gradient-descent step, applied to each parameter in place.

    ``state`` is never read; it exists so the signature lines up with the
    other optimizers. Nothing is returned.
    """
    for param, grad in zip(params, grads):
        step = lr * grad
        param -= step

def sgd_momentum(params, grads, lr, state, beta=0.9):
    """Gradient descent on an exponential moving average of the gradients.

    For each parameter: ``v = beta * v + (1 - beta) * grad`` followed by
    ``param -= lr * v`` (in place).

    Args:
        params: Arrays to update in place.
        grads: Gradients matching ``params``.
        lr: Learning rate.
        state: List of velocity arrays from the previous call, or None to
            start from zeros.
        beta: Decay factor of the moving average.

    Returns:
        The velocity list (the same list object when ``state`` was given).
    """
    velocity = state if state is not None else [np.zeros_like(p) for p in params]
    for k, (param, grad) in enumerate(zip(params, grads)):
        velocity[k] = beta*velocity[k] + (1-beta)*grad
        param -= lr * velocity[k]
    return velocity

def adagrad(params, grads, lr, state, eps=1e-8):
    """Adagrad step: scale each gradient by its accumulated squared history.

    Args:
        params: Arrays to update in place.
        grads: Gradients matching ``params``.
        lr: Learning rate.
        state: List of running sums of squared gradients, or None to start
            from zeros.
        eps: Added to the square root to avoid division by zero.

    Returns:
        The list of running sums, to be passed back on the next call.
    """
    sq_sums = state if state is not None else [np.zeros_like(p) for p in params]
    for k, (param, grad) in enumerate(zip(params, grads)):
        sq_sums[k] += grad*grad
        per_weight_lr = lr / (np.sqrt(sq_sums[k])+eps)
        param -= per_weight_lr * grad
    return sq_sums

def rmsprop(params, grads, lr, state, beta=0.9, eps=1e-8):
    """RMSProp step: scale each gradient by a moving average of its square.

    Args:
        params: Arrays to update in place.
        grads: Gradients matching ``params``.
        lr: Learning rate.
        state: List of moving averages of squared gradients, or None to
            start from zeros.
        beta: Decay factor of the moving average.
        eps: Added to the square root to avoid division by zero.

    Returns:
        The list of moving averages, to be passed back on the next call.
    """
    avg_sq = state if state is not None else [np.zeros_like(p) for p in params]
    for k, (param, grad) in enumerate(zip(params, grads)):
        avg_sq[k] = beta*avg_sq[k] + (1-beta)*grad*grad
        per_weight_lr = lr / (np.sqrt(avg_sq[k])+eps)
        param -= per_weight_lr * grad
    return avg_sq

def adadelta(params, grads, lr, state, rho=0.95, eps=1e-6):
    """AdaDelta step driven by moving averages of squared gradients and steps.

    ``lr`` is accepted for signature parity with the other optimizers but is
    not used; the step size comes from the ratio of the two averages.

    Args:
        params: Arrays to update in place.
        grads: Gradients matching ``params``.
        lr: Unused.
        state: List of ``[avg_sq_grad, avg_sq_step]`` pairs, or None to start
            from zeros. The arrays are updated in place.
        rho: Decay factor of both moving averages.
        eps: Added inside both square roots.

    Returns:
        The list of average pairs, to be passed back on the next call.
    """
    averages = state if state is not None else [
        [np.zeros_like(p), np.zeros_like(p)] for p in params
    ]
    for k, (param, grad) in enumerate(zip(params, grads)):
        avg_sq_grad, avg_sq_step = averages[k]
        avg_sq_grad[:] = rho*avg_sq_grad + (1-rho)*grad*grad
        rms_ratio = np.sqrt(avg_sq_step+eps)/np.sqrt(avg_sq_grad+eps)
        step = -rms_ratio * grad
        param += step
        # The step average is refreshed after the step it just produced
        avg_sq_step[:] = rho*avg_sq_step + (1-rho)*step*step
    return averages

def adam(params, grads, lr, state, beta1=0.9, beta2=0.999, eps=1e-8):
    """Adam step with bias-corrected first and second moment estimates.

    Args:
        params: Arrays to update in place.
        grads: Gradients matching ``params``.
        lr: Learning rate.
        state: List of ``[m, v, t]`` entries (moment arrays and step count),
            or None to start from zeros with ``t = 0``.
        beta1: Decay factor of the first-moment average.
        beta2: Decay factor of the second-moment average.
        eps: Added to the square root of the second moment.

    Returns:
        The list of ``[m, v, t]`` entries, to be passed back on the next call.
    """
    moments = state if state is not None else [
        [np.zeros_like(p), np.zeros_like(p), 0] for p in params
    ]
    for k, (param, grad) in enumerate(zip(params, grads)):
        first, second, steps = moments[k]
        steps = steps + 1
        first[:] = beta1*first + (1-beta1)*grad
        second[:] = beta2*second + (1-beta2)*grad*grad
        # Divide out the bias the zero initialisation leaves in early steps
        first_hat = first / (1-beta1**steps)
        second_hat = second / (1-beta2**steps)
        param -= lr * first_hat / (np.sqrt(second_hat)+eps)
        moments[k][2] = steps
    return moments

### Step 5: Training Loop

In [5]:
def train(optimizer_fn, lr=0.01, batch_size=128, epochs=10):
    """Train a fresh network with mini-batches and return its test accuracy.

    Reads the module-level arrays ``X_train``, ``Y_train``, ``X_test`` and
    ``y_test``. The training rows are reshuffled at the start of every epoch.

    Args:
        optimizer_fn: Callable ``(params, grads, lr, state)`` that updates
            the parameter arrays in place. A non-None return value is kept
            and passed back as ``state`` on the next call.
        lr: Learning rate forwarded to ``optimizer_fn``.
        batch_size: Rows per mini-batch; the last batch may be smaller.
        epochs: Number of passes over the training set.

    Returns:
        Fraction of test rows whose arg-max prediction equals ``y_test``.
    """
    W1, b1, W2, b2 = init_weights()
    n_train = len(X_train)
    opt_state = None

    for _ in range(epochs):
        order = np.random.permutation(n_train)
        X_epoch, Y_epoch = X_train[order], Y_train[order]

        for start in range(0, n_train, batch_size):
            stop = start + batch_size
            xb, yb = X_epoch[start:stop], Y_epoch[start:stop]

            z1, a1, _, a2 = forward(xb, W1, b1, W2, b2)
            grads = list(backward(xb, yb, z1, a1, a2, W2))

            # Optimizers mutate the arrays in place, so W1..b2 stay current
            new_state = optimizer_fn([W1, b1, W2, b2], grads, lr, opt_state)
            if new_state is not None:
                opt_state = new_state

    test_probs = forward(X_test, W1, b1, W2, b2)[-1]
    predictions = np.argmax(test_probs, axis=1)
    return np.mean(predictions == y_test)

### Step 6: Compare All Optimizers

In [6]:
# Every optimizer already accepts (params, grads, lr, state), so the functions
# are registered directly instead of being wrapped in pass-through lambdas.
optimizers = {
    "SGD": sgd,
    "SGD Momentum": sgd_momentum,
    "Adagrad": adagrad,
    "RMSProp": rmsprop,
    "AdaDelta": adadelta,
    "Adam": adam,
}

SEED = 42  # shared by every run so the optimizers are compared like-for-like

for name, opt in optimizers.items():
    # train() initialises weights and shuffles batches through NumPy's global
    # RNG. Reseeding before each run gives every optimizer the same starting
    # weights and batch order, and makes the printed accuracies repeatable.
    # NOTE: re-run this cell to refresh any previously recorded output.
    np.random.seed(SEED)
    acc = train(opt, lr=0.01)
    print(f"{name}: accuracy={acc:.4f}")

SGD: accuracy=0.9063
SGD Momentum: accuracy=0.9063
Adagrad: accuracy=0.9632
RMSProp: accuracy=0.9713
AdaDelta: accuracy=0.9793
Adam: accuracy=0.9697


COLAB LINK : https://colab.research.google.com/drive/1yJME4X0h374rmMkuuJJC5dyRvBEdc2EF?usp=sharing