In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import HTML
from tensorflow.keras.datasets import mnist

In [None]:
# Load MNIST and unpack it into (images, labels) pairs for the train and test splits.
# NOTE(review): images are presumably shaped (N, 28, 28) — confirm against the Keras docs.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [None]:
def extract_features(X):
    """Summarise each image by two scalars: its pixel mean and pixel std.

    Args:
        X: array-like whose axes 1 and 2 are reduced (one image per entry
           along axis 0).

    Returns:
        Array with the per-image mean in column 0 and the per-image standard
        deviation in column 1.
    """
    # Apply both reductions over the two image axes, then place the results
    # side by side as columns.
    per_image_stats = [reduce(X, axis=(1, 2)) for reduce in (np.mean, np.std)]
    return np.column_stack(per_image_stats)
# Reduce every training image to its two summary features (pixel mean, pixel std).
X_train_feat = extract_features(X_train)

In [None]:
# Standardise each feature column to zero mean and unit variance.
# NOTE: this overwrites X_train_feat, so re-running the cell recomputes
# `mean`/`std` from the already-standardised data (about 0 and 1) and loses
# the original statistics. Run it once per kernel session.
mean = X_train_feat.mean(axis=0)
std = X_train_feat.std(axis=0)
X_train_feat = (X_train_feat - mean) / std

In [None]:
def one_hot(y, num_classes=10):
    """One-hot encode integer class labels.

    Args:
        y: integer label(s) used to index rows of an identity matrix.
        num_classes: width of the encoding (size of the identity matrix).

    Returns:
        Float array in which each label is replaced by the matching row of
        the ``num_classes`` x ``num_classes`` identity matrix.
    """
    identity = np.eye(num_classes)
    # Row k of the identity matrix is exactly the one-hot code for class k.
    encoded = identity[y]
    return encoded
# One-hot encode the training labels (one_hot defaults to 10 classes).
y_train_oh = one_hot(y_train)

In [None]:
def softmax(z):
    """Row-wise softmax of a 2-D array of logits.

    Args:
        z: logits with one sample per row; the softmax is taken along axis 1.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting each row's maximum leaves the result unchanged but keeps
    # the exponentials from overflowing.
    row_max = np.max(z, axis=1, keepdims=True)
    numerators = np.exp(z - row_max)
    denominators = numerators.sum(axis=1, keepdims=True)
    return numerators / denominators

In [None]:
def compute_loss(X, y, W):
    """Mean cross-entropy of the linear softmax model ``softmax(X @ W)``.

    Args:
        X: feature matrix, one sample per row.
        y: target matrix multiplied element-wise with the log-probabilities
           (one-hot rows in this notebook).
        W: weight matrix applied as ``X @ W``.

    Returns:
        The summed cross-entropy divided by the number of rows of ``X``.
    """
    n_samples = X.shape[0]
    # The small constant guards against log(0) for vanishing probabilities.
    log_probs = np.log(softmax(X @ W) + 1e-8)
    total = np.sum(y * log_probs)
    return -total / n_samples

In [None]:
def compute_gradient(X, y, W):
    """Gradient of the mean softmax cross-entropy with respect to ``W``.

    Args:
        X: feature matrix, one sample per row.
        y: target matrix with the same shape as ``softmax(X @ W)``.
        W: weight matrix applied as ``X @ W``.

    Returns:
        Array shaped like ``W``: ``X.T @ (probs - y)`` averaged over the rows
        of ``X``.
    """
    # Prediction error per sample and class.
    residual = softmax(X @ W) - y
    return (X.T @ residual) / X.shape[0]

In [None]:
# Hyper-parameters shared by every optimizer defined below.
learning_rate = 0.1   # step size used in every weight update
iterations = 100      # number of updates (and recorded history entries) per run
# Common all-zero starting point: 2 input features x 10 classes.
initial_W = np.zeros((2, 10))

# sgd() and mini_batch() sample from NumPy's global RNG; seed it so that a
# fresh "Restart & Run All" reproduces the same curves and trajectories.
SEED = 42
np.random.seed(SEED)


In [None]:
def gradient_descent(X, y):
    """Full-batch gradient descent.

    Starts from a copy of the module-level ``initial_W`` and performs
    ``iterations`` updates with step size ``learning_rate``.

    Returns:
        ``(W, history)``: the weights after the last update, and a list with
        one ``(weights_copy, loss)`` tuple per iteration, recorded before that
        iteration's update is applied.
    """
    weights = initial_W.copy()
    trace = []
    for _ in range(iterations):
        # Snapshot the state first, then take the step.
        trace.append((weights.copy(), compute_loss(X, y, weights)))
        step = learning_rate * compute_gradient(X, y, weights)
        weights -= step
    return weights, trace


In [None]:
def sgd(X, y):
    """Stochastic gradient descent using one randomly drawn sample per step.

    Starts from a copy of ``initial_W`` and runs ``iterations`` updates with
    step size ``learning_rate``. The recorded loss is evaluated on the full
    ``X``/``y``, while the gradient uses a single sample.

    Returns:
        ``(W, history)`` where ``history`` holds one ``(weights_copy, loss)``
        tuple per iteration, taken before the update.
    """
    weights = initial_W.copy()
    trace = []
    n_samples = len(y)
    for _ in range(iterations):
        trace.append((weights.copy(), compute_loss(X, y, weights)))
        pick = np.random.randint(0, n_samples)
        # A length-1 slice keeps the sample two-dimensional for the matmul.
        sample = slice(pick, pick + 1)
        step = learning_rate * compute_gradient(X[sample], y[sample], weights)
        weights -= step
    return weights, trace


In [None]:
def mini_batch(X, y, batch_size=64):
    """Mini-batch gradient descent.

    Each of the ``iterations`` steps draws ``batch_size`` distinct row indices
    (sampling without replacement) and updates the weights with the gradient
    on that batch, scaled by ``learning_rate``. The recorded loss is evaluated
    on the full ``X``/``y``.

    Args:
        X: feature matrix, one sample per row.
        y: target matrix aligned with ``X``.
        batch_size: number of rows drawn per step.

    Returns:
        ``(W, history)`` where ``history`` holds one ``(weights_copy, loss)``
        tuple per iteration, taken before the update.
    """
    weights = initial_W.copy()
    trace = []
    n_samples = len(y)
    for _ in range(iterations):
        trace.append((weights.copy(), compute_loss(X, y, weights)))
        batch = np.random.choice(n_samples, batch_size, replace=False)
        step = learning_rate * compute_gradient(X[batch], y[batch], weights)
        weights -= step
    return weights, trace


In [None]:
def nesterov(X, y, momentum=0.9):
    """Gradient descent with Nesterov-style look-ahead momentum.

    The gradient is evaluated at ``W - momentum * V`` rather than at ``W``;
    the velocity ``V`` then accumulates ``learning_rate * grad`` and is
    subtracted from the weights.

    Args:
        X: feature matrix, one sample per row.
        y: target matrix aligned with ``X``.
        momentum: decay factor applied to the velocity on each step.

    Returns:
        ``(W, history)`` where ``history`` holds one ``(weights_copy, loss)``
        tuple per iteration, taken before the update.
    """
    weights = initial_W.copy()
    velocity = np.zeros_like(weights)
    trace = []
    for _ in range(iterations):
        trace.append((weights.copy(), compute_loss(X, y, weights)))
        # Look at where the current velocity is about to carry the weights.
        ahead = weights - momentum * velocity
        velocity = momentum * velocity + learning_rate * compute_gradient(X, y, ahead)
        weights -= velocity
    return weights, trace


In [None]:
def adagrad(X, y):
    """Adagrad: per-weight steps scaled by the accumulated squared gradients.

    Starts from a copy of ``initial_W`` and runs ``iterations`` full-batch
    updates with base step size ``learning_rate``.

    Returns:
        ``(W, history)`` where ``history`` holds one ``(weights_copy, loss)``
        tuple per iteration, taken before the update.
    """
    eps = 1e-8  # keeps the denominator away from zero
    weights = initial_W.copy()
    sq_grad_sum = np.zeros_like(weights)
    trace = []
    for _ in range(iterations):
        trace.append((weights.copy(), compute_loss(X, y, weights)))
        grad = compute_gradient(X, y, weights)
        sq_grad_sum += grad ** 2
        scale = np.sqrt(sq_grad_sum) + eps
        weights -= learning_rate * grad / scale
    return weights, trace


In [None]:
def rmsprop(X, y, beta=0.9):
    """RMSProp: steps scaled by a running average of squared gradients.

    Args:
        X: feature matrix, one sample per row.
        y: target matrix aligned with ``X``.
        beta: decay factor of the running average of squared gradients.

    Returns:
        ``(W, history)`` where ``history`` holds one ``(weights_copy, loss)``
        tuple per iteration, taken before the update.
    """
    eps = 1e-8  # keeps the denominator away from zero
    weights = initial_W.copy()
    sq_grad_avg = np.zeros_like(weights)
    trace = []
    for _ in range(iterations):
        trace.append((weights.copy(), compute_loss(X, y, weights)))
        grad = compute_gradient(X, y, weights)
        sq_grad_avg = beta * sq_grad_avg + (1 - beta) * grad ** 2
        scale = np.sqrt(sq_grad_avg) + eps
        weights -= learning_rate * grad / scale
    return weights, trace


In [None]:
def adam(X, y):
    """Adam: bias-corrected running averages of the gradient and its square.

    Uses ``beta1 = 0.9``, ``beta2 = 0.999`` and ``eps = 1e-8``; starts from a
    copy of ``initial_W`` and runs ``iterations`` full-batch updates with base
    step size ``learning_rate``.

    Returns:
        ``(W, history)`` where ``history`` holds one ``(weights_copy, loss)``
        tuple per iteration, taken before the update.
    """
    beta1, beta2 = 0.9, 0.999
    eps = 1e-8
    weights = initial_W.copy()
    first_moment = np.zeros_like(weights)
    second_moment = np.zeros_like(weights)
    trace = []
    for step in range(iterations):
        t = step + 1  # 1-based step count used by the bias correction
        trace.append((weights.copy(), compute_loss(X, y, weights)))
        grad = compute_gradient(X, y, weights)
        first_moment = beta1 * first_moment + (1 - beta1) * grad
        second_moment = beta2 * second_moment + (1 - beta2) * (grad ** 2)
        # Undo the bias towards zero caused by the all-zero initial moments.
        first_hat = first_moment / (1 - beta1 ** t)
        second_hat = second_moment / (1 - beta2 ** t)
        weights -= learning_rate * first_hat / (np.sqrt(second_hat) + eps)
    return weights, trace


In [None]:
# Train the same softmax model with every optimizer. Each call returns the
# final weights and a per-iteration history of (weights, loss) tuples.
# NOTE: sgd and mini_batch draw from NumPy's global RNG, so their results
# depend on the RNG state and on the order of these calls.
W_gd, hist_gd = gradient_descent(X_train_feat, y_train_oh)
W_sgd, hist_sgd = sgd(X_train_feat, y_train_oh)
W_mb, hist_mb = mini_batch(X_train_feat, y_train_oh)
W_nest, hist_nest = nesterov(X_train_feat, y_train_oh)
W_ada, hist_ada = adagrad(X_train_feat, y_train_oh)
W_rms, hist_rms = rmsprop(X_train_feat, y_train_oh)
W_adam, hist_adam = adam(X_train_feat, y_train_oh)


In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Loss recorded at every iteration, per optimizer. Extracted once here so the
# animation callback does not rebuild these lists on every frame.
# Dict order is the plotting / legend order.
loss_curves = {
    "GD": [h[1] for h in hist_gd],
    "SGD": [h[1] for h in hist_sgd],
    "Mini-Batch": [h[1] for h in hist_mb],
    "Nesterov": [h[1] for h in hist_nest],
    "Adagrad": [h[1] for h in hist_ada],
    "RMSProp": [h[1] for h in hist_rms],
    "Adam": [h[1] for h in hist_adam],
}

max_loss = max(max(curve) for curve in loss_curves.values())

ax.set_xlim(0, iterations)
ax.set_ylim(0, max_loss)

# Create empty lines (filled in frame by frame by `update`)
loss_lines = {label: ax.plot([], [], label=label)[0] for label in loss_curves}
(gd_line, sgd_line, mb_line,
 nest_line, ada_line, rms_line, adam_line) = loss_lines.values()

ax.set_xlabel("Iteration")
ax.set_ylabel("Loss")
ax.set_title("Loss vs Iteration (All Optimizers)")
ax.legend()

def update(frame):
    """Draw the first `frame` recorded losses of every optimizer.

    Returns the tuple of line artists, in plotting order.
    """
    for label, line in loss_lines.items():
        ys = loss_curves[label][:frame]
        line.set_data(range(len(ys)), ys)
    return tuple(loss_lines.values())

ani = animation.FuncAnimation(
    fig,
    update,
    # `update` slices with [:frame], so the last frame must equal `iterations`
    # for the final recorded loss to be drawn.
    frames=range(1, iterations + 1),
    interval=80,
    blit=False
)

# Render the animation first, then close the figure so the inline backend
# does not additionally display a static copy of it below the player.
ani_html = ani.to_jshtml()
plt.close(fig)
HTML(ani_html)


In [None]:
# Grid over the two weights of class 0 (W[0,0] and W[1,0]) on which the loss
# is evaluated. All other weights stay fixed at Adam's final solution.
w1_vals = np.linspace(-5, 5, 100)
w2_vals = np.linspace(-5, 5, 100)

W1, W2 = np.meshgrid(w1_vals, w2_vals)
Loss_surface = np.zeros_like(W1)

W_base = W_adam.copy()

# Only two entries of the working copy change per grid point, and both are
# overwritten every time, so one copy can be reused for the whole sweep.
W_temp = W_base.copy()
for i, (w1_row, w2_row) in enumerate(zip(W1, W2)):
    for j, (w1_value, w2_value) in enumerate(zip(w1_row, w2_row)):
        W_temp[0, 0] = w1_value
        W_temp[1, 0] = w2_value
        Loss_surface[i, j] = compute_loss(X_train_feat, y_train_oh, W_temp)


In [None]:
# Path of each optimizer in the (W[0,0], W[1,0]) plane: one row per recorded
# iteration, taken from the weight snapshots stored in the histories.
traj_gd   = np.array([weights[0:2, 0] for weights, _loss in hist_gd])
traj_sgd  = np.array([weights[0:2, 0] for weights, _loss in hist_sgd])
traj_mb   = np.array([weights[0:2, 0] for weights, _loss in hist_mb])
traj_nest = np.array([weights[0:2, 0] for weights, _loss in hist_nest])
traj_ada  = np.array([weights[0:2, 0] for weights, _loss in hist_ada])
traj_rms  = np.array([weights[0:2, 0] for weights, _loss in hist_rms])
traj_adam = np.array([weights[0:2, 0] for weights, _loss in hist_adam])


In [None]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.contour(W1, W2, Loss_surface, levels=50)

lines = {}

names = ["GD", "SGD", "MB", "Nesterov", "Adagrad", "RMSProp", "Adam"]
colors = ["r", "g", "b", "c", "m", "y", "k"]

for name, color in zip(names, colors):
    lines[name], = ax.plot([], [], marker='o', linestyle='-', color=color, label=name)

# Trajectory of each optimizer, keyed by the same names as `lines`.
trajectories_2d = dict(zip(
    names,
    (traj_gd, traj_sgd, traj_mb, traj_nest, traj_ada, traj_rms, traj_adam),
))

ax.set_xlabel("W[0,0]")
ax.set_ylabel("W[1,0]")
ax.set_title("2D Contour - Optimizer Trajectories")
ax.legend()

def update2d(frame):
    """Draw every trajectory up to and including iteration `frame`.

    Returns the list of line artists.
    """
    # Frames run 0..iterations-1. Slicing with frame + 1 shows the starting
    # point on the first frame and the final point on the last one.
    upto = frame + 1
    for name, traj in trajectories_2d.items():
        lines[name].set_data(traj[:upto, 0], traj[:upto, 1])
    return list(lines.values())

ani2d = animation.FuncAnimation(
    fig,
    update2d,
    frames=iterations,
    interval=80
)

# Render the animation first, then close the figure so the inline backend
# does not additionally display a static copy of it below the player.
ani2d_html = ani2d.to_jshtml()
plt.close(fig)
HTML(ani2d_html)


In [None]:
fig3d = plt.figure(figsize=(10, 7))
ax3d = fig3d.add_subplot(111, projection='3d')

ax3d.plot_surface(W1, W2, Loss_surface, cmap='viridis', alpha=0.6)

points = {}

# `names` and `colors` come from the 2D contour cell above.
for name, color in zip(names, colors):
    points[name], = ax3d.plot([], [], [], marker='o', linestyle='None', color=color, label=name)

ax3d.set_xlabel("W[0,0]")
ax3d.set_ylabel("W[1,0]")
ax3d.set_zlabel("Loss")
ax3d.set_title("3D Surface - Optimizer Movement")
ax3d.legend()

# (name, trajectory, history) per optimizer. Built once, because it does not
# change between frames.
optimizer_data = [
    ("GD", traj_gd, hist_gd),
    ("SGD", traj_sgd, hist_sgd),
    ("MB", traj_mb, hist_mb),
    ("Nesterov", traj_nest, hist_nest),
    ("Adagrad", traj_ada, hist_ada),
    ("RMSProp", traj_rms, hist_rms),
    ("Adam", traj_adam, hist_adam)
]

def update3d(frame):
    """Move every optimizer's marker to its state at iteration `frame`.

    Returns the list of point artists.
    """
    # NOTE(review): z is the loss recorded during training (all weights of
    # that iteration), whereas Loss_surface varies only W[0,0] and W[1,0]
    # around W_base. The markers therefore need not lie on the surface.
    for name, traj, hist in optimizer_data:
        w1, w2 = traj[frame]
        loss = hist[frame][1]
        points[name].set_data([w1], [w2])
        points[name].set_3d_properties([loss])

    return list(points.values())

ani3d = animation.FuncAnimation(
    fig3d,
    update3d,
    frames=iterations,
    interval=80
)

# Render the animation first, then close the figure so the inline backend
# does not additionally display a static copy of it below the player.
ani3d_html = ani3d.to_jshtml()
plt.close(fig3d)
HTML(ani3d_html)
