In [None]:
import numpy as np
import matplotlib.pyplot as plt


In [None]:
def f(x, y):
    """Evaluate the objective f(x, y) = x^2 + y^2.

    Args:
        x: First coordinate.
        y: Second coordinate.

    Returns:
        The sum of the squares of ``x`` and ``y``.
    """
    x_squared = x**2
    y_squared = y**2
    return x_squared + y_squared

def grad_f(x, y):
    """Return the gradient of x^2 + y^2 at (x, y).

    Args:
        x: First coordinate.
        y: Second coordinate.

    Returns:
        A NumPy array ``[2 * x, 2 * y]`` holding the two partial derivatives.
    """
    df_dx = 2 * x
    df_dy = 2 * y
    return np.array([df_dx, df_dy])

# Standard Gradient Descent (GD)
def gradient_descent(x, y, learning_rate=0.1, epochs=100):
    """Run plain gradient descent starting from (x, y).

    Each epoch moves the point against the gradient returned by ``grad_f``,
    scaled by ``learning_rate``.

    Args:
        x: Starting x coordinate.
        y: Starting y coordinate.
        learning_rate: Step size applied to the gradient.
        epochs: Number of update steps to perform.

    Returns:
        A list with one ``(x, y, f(x, y))`` tuple per epoch, recorded after
        the update for that epoch has been applied.
    """
    trajectory = []
    for _step in range(epochs):
        grad_x, grad_y = grad_f(x, y)
        x -= learning_rate * grad_x
        y -= learning_rate * grad_y
        loss = f(x, y)
        trajectory.append((x, y, loss))
    return trajectory

# Stochastic Gradient Descent (SGD)
def stochastic_gradient_descent(x, y, learning_rate=0.1, epochs=100,
                                noise_std=0.1, rng=None):
    """Simulate SGD by adding Gaussian noise to the exact gradient.

    Every epoch evaluates ``grad_f`` at the current point, perturbs both
    components with zero-mean normal noise, and takes a gradient step.

    Args:
        x: Starting x coordinate.
        y: Starting y coordinate.
        learning_rate: Step size applied to the noisy gradient.
        epochs: Number of update steps to perform.
        noise_std: Standard deviation of the noise added to each gradient
            component. Defaults to 0.1.
        rng: Optional object exposing ``normal(loc, scale, size=...)``, for
            example ``np.random.default_rng(seed)``. When ``None`` the noise
            is drawn from NumPy's global ``np.random`` state, so
            ``np.random.seed`` controls reproducibility.

    Returns:
        A list with one ``(x, y, f(x, y))`` tuple per epoch, recorded after
        the update for that epoch has been applied.
    """
    # Default to the global RNG so existing callers see no change in behaviour.
    sampler = np.random if rng is None else rng
    history = []
    for _ in range(epochs):
        noise = sampler.normal(0, noise_std, size=2)
        grad = grad_f(x, y) + noise
        x -= learning_rate * grad[0]
        y -= learning_rate * grad[1]
        history.append((x, y, f(x, y)))
    return history

# Momentum-based Gradient Descent
def momentum_gradient_descent(x, y, learning_rate=0.1, momentum=0.9, epochs=100):
    """Run gradient descent with a momentum (velocity) term from (x, y).

    A per-coordinate velocity accumulates ``momentum`` times its previous
    value plus ``learning_rate`` times the current gradient; the point is then
    moved by subtracting that velocity.

    Args:
        x: Starting x coordinate.
        y: Starting y coordinate.
        learning_rate: Weight of the current gradient in the velocity update.
        momentum: Fraction of the previous velocity carried into the next step.
        epochs: Number of update steps to perform.

    Returns:
        A list with one ``(x, y, f(x, y))`` tuple per epoch, recorded after
        the update for that epoch has been applied.
    """
    trajectory = []
    velocity_x = velocity_y = 0  # both velocities start at rest
    for _step in range(epochs):
        grad_x, grad_y = grad_f(x, y)
        velocity_x = momentum * velocity_x + learning_rate * grad_x
        velocity_y = momentum * velocity_y + learning_rate * grad_y
        x -= velocity_x
        y -= velocity_y
        loss = f(x, y)
        trajectory.append((x, y, loss))
    return trajectory

# Adam Optimizer
def adam_optimizer(x, y, learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8, epochs=100):
    """Minimise ``f`` from (x, y) using Adam-style updates.

    Keeps exponential moving averages of the gradient and of the squared
    gradient per coordinate, applies bias correction to both, and steps by
    ``learning_rate * m_hat / (sqrt(v_hat) + epsilon)``.

    Args:
        x: Starting x coordinate.
        y: Starting y coordinate.
        learning_rate: Step size multiplier.
        beta1: Decay rate of the first-moment (gradient) average.
        beta2: Decay rate of the second-moment (squared gradient) average.
        epsilon: Small constant added to the denominator of the step.
        epochs: Number of update steps to perform.

    Returns:
        A list with one ``(x, y, f(x, y))`` tuple per epoch, recorded after
        the update for that epoch has been applied.
    """
    trajectory = []
    mean_x = mean_y = 0  # first-moment estimates
    sq_x = sq_y = 0      # second-moment estimates
    for step in range(1, epochs + 1):
        grad_x, grad_y = grad_f(x, y)

        # Update the moving averages of the gradient and its square.
        mean_x = beta1 * mean_x + (1 - beta1) * grad_x
        mean_y = beta1 * mean_y + (1 - beta1) * grad_y
        sq_x = beta2 * sq_x + (1 - beta2) * grad_x**2
        sq_y = beta2 * sq_y + (1 - beta2) * grad_y**2

        # Bias-correction denominators depend only on the step number,
        # so they are shared by both coordinates.
        mean_correction = 1 - beta1**step
        sq_correction = 1 - beta2**step

        mean_x_hat = mean_x / mean_correction
        mean_y_hat = mean_y / mean_correction
        sq_x_hat = sq_x / sq_correction
        sq_y_hat = sq_y / sq_correction

        denom_x = np.sqrt(sq_x_hat) + epsilon
        denom_y = np.sqrt(sq_y_hat) + epsilon
        x -= learning_rate * mean_x_hat / denom_x
        y -= learning_rate * mean_y_hat / denom_y

        trajectory.append((x, y, f(x, y)))
    return trajectory



In [None]:
# Seed NumPy's global RNG so the noisy SGD run below produces the same curve
# on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

x, y = 5.0, 5.0  # Starting point

# Run every optimizer from the same starting point with the same step budget.
gd_history = gradient_descent(x, y, learning_rate=0.1, epochs=100)
sgd_history = stochastic_gradient_descent(x, y, learning_rate=0.1, epochs=100)
momentum_history = momentum_gradient_descent(x, y, learning_rate=0.1, momentum=0.9, epochs=100)
adam_history = adam_optimizer(x, y, learning_rate=0.1, epochs=100)

# Each history entry is (x, y, loss); read the stored loss directly instead of
# re-evaluating f at every recorded point.
gd_loss = [loss for _, _, loss in gd_history]
sgd_loss = [loss for _, _, loss in sgd_history]
momentum_loss = [loss for _, _, loss in momentum_history]
adam_loss = [loss for _, _, loss in adam_history]



In [None]:
# Draw each optimizer's per-epoch loss as its own labelled line.
loss_curves = [
    (gd_loss, "Gradient Descent"),
    (sgd_loss, "Stochastic Gradient Descent"),
    (momentum_loss, "Momentum Gradient Descent"),
    (adam_loss, "Adam Optimizer"),
]
for curve, curve_label in loss_curves:
    plt.plot(curve, label=curve_label)

plt.title("Comparison of Gradient Descent Variants on f(x, y) = x^2 + y^2")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()