In [1]:
import numpy as np

In [2]:
def adam(f, grad_f, theta0, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, max_iter=1000, tol=1e-6):
    """Minimise an objective with the Adam update rule, using only its gradient.

    Parameters
    ----------
    f : callable
        The objective. It is accepted so the call signature names both the
        objective and its gradient, but it is never evaluated here.
    grad_f : callable
        Called as ``grad_f(theta)``; must return the gradient at ``theta``.
    theta0 : array_like
        Starting point. It is copied into a float array, so the caller's
        object is never modified or returned.
    alpha : float
        Step size.
    beta1, beta2 : float
        Exponential decay rates for the first and second moment estimates.
    epsilon : float
        Added to the denominator of the update to avoid division by zero.
    max_iter : int
        Maximum number of gradient evaluations / update steps.
    tol : float
        The loop stops as soon as the Euclidean norm of the gradient at the
        current iterate is below this value.

    Returns
    -------
    numpy.ndarray
        The last iterate: the first one whose gradient norm is below ``tol``,
        or the iterate reached after ``max_iter`` updates.
    """
    # Copy + float-coerce so list/int inputs work and the input is not aliased.
    theta = np.array(theta0, dtype=float)

    m = np.zeros_like(theta)  # 1st moment vector
    v = np.zeros_like(theta)  # 2nd moment vector

    for t in range(1, max_iter + 1):
        grad = np.asarray(grad_f(theta), dtype=float)  # gradient at current theta

        # Test convergence BEFORE stepping. Adam's step size is nearly
        # independent of the gradient's magnitude, so stepping first would push
        # an already-converged theta away by roughly alpha before stopping.
        if np.linalg.norm(grad) < tol:
            break

        m = beta1 * m + (1 - beta1) * grad         # biased first moment estimate
        v = beta2 * v + (1 - beta2) * (grad ** 2)  # biased second raw moment estimate

        # Bias-corrected moment estimates (m and v start at zero).
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)

        # Parameter update
        theta = theta - alpha * m_hat / (np.sqrt(v_hat) + epsilon)

    return theta

In [3]:
def f(x):
    """Quadratic test objective: return ``x`` raised to the power 2."""
    squared = pow(x, 2)
    return squared

def grad_f(x):
    """Derivative of the quadratic ``x**2``: return ``2 * x``."""
    slope = 2 * x
    return slope

In [4]:
# Run Adam on the quadratic from x = 3 with every hyperparameter left at its
# default. The recorded result is still far from the minimiser at 0, i.e. the
# default step size and iteration budget do not reach convergence here.
theta0 = np.array([3.0])
optimized = adam(f=f, grad_f=grad_f, theta0=theta0)
print("Optimized Theta = ", optimized)

Optimized Theta =  [2.07678563]
