## Gradient Descent Algorithms 

In [10]:
from typing import (
    List,
    Dict,
    Tuple
)
from numpy import (
    array,
    cov,
    diag,
    eye,
    hstack,
    ravel,
    kron,
    mean,
    multiply,
    ones,
    savez_compressed,
    sqrt,
    squeeze,
    vstack,
    zeros,
)
import numpy as np
import torch
import tensorflow as tf

from torch.utils.data import TensorDataset, DataLoader

### Batch Gradient Descent

In [3]:
np.random.seed(42)

def make_linear_data(
    n: int = 200, 
    weight: float = 3.0, 
    bias: float = 2.0, 
    noise: float = 1.0
) -> Tuple[np.ndarray, np.ndarray]:

    x = np.random.randn(n, 1)
    y = weight * x + bias + noise * np.random.randn(n, 1)
    return x, y

X, y = make_linear_data()
X_b = hstack([np.ones((X.shape[0], 1)), X])

def mse_loss(
    y_true: np.ndarray, 
    y_pred: np.ndarray
) -> float:
 
    return float(np.mean((y_true - y_pred) ** 2))

def predict(
    X: np.ndarray, 
    w: np.ndarray
) -> np.ndarray:

    return X.dot(w)

def batch_gradient_descent(
    X: np.ndarray, 
    y: np.ndarray, 
    learning_rate: float = 0.1, 
    epochs: int = 100
) -> Tuple[np.ndarray, List[float]]:

    m, n = X.shape
    w = np.zeros((n, 1))
    losses: List[float] = []
    for epoch in range(epochs):
        y_pred = predict(X, w)
        error = y_pred - y
        grad = (2.0 / m) * X.T.dot(error)
        w = w - learning_rate * grad
        losses.append(mse_loss(y, y_pred))
    return w, losses

w_bgd, losses_bgd = batch_gradient_descent(X_b, y, learning_rate=0.1, epochs=200)
print('Batch GD final weights (bias, w):', w_bgd.ravel())
print('Final MSE:', losses_bgd[-1])

DATA = dict(X=X, y=y, X_b=X_b)
BGD_RESULT = dict(w=w_bgd, losses=losses_bgd)

Batch GD final weights (bias, w): [2.08998074 3.10086975]
Final MSE: 0.9605306224045634


In [4]:
X_t = torch.from_numpy(DATA['X']).float()
y_t = torch.from_numpy(DATA['y']).float()
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.MSELoss()
for epoch in range(200):
    optimizer.zero_grad()
    preds = model(X_t)
    loss = criterion(preds, y_t)
    loss.backward()
    optimizer.step()
w_t = model.weight.detach().ravel()[0]
b_t = model.bias.detach().ravel()[0]
print('PyTorch final weights (bias, w):', b_t, w_t)
print('PyTorch final MSE:', loss.item())

PyTorch final weights (bias, w): tensor(2.0900) tensor(3.1009)
PyTorch final MSE: 0.960530698299408


In [8]:
model = tf.keras.Sequential([tf.keras.layers.Input(shape=(1,)), tf.keras.layers.Dense(1)])
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1), loss='mse')
model.fit(DATA['X'], DATA['y'], epochs=200, batch_size=DATA['X'].shape[0], verbose=0)
w_tf = model.layers[0].kernel.numpy().ravel()[0]
b_tf = model.layers[0].bias.numpy().ravel()[0]
loss_tf = model.evaluate(DATA['X'], DATA['y'], verbose=0)
print('TensorFlow final weights (bias, w):', b_tf, w_tf)
print('TensorFlow final MSE:', loss_tf)

TensorFlow final weights (bias, w): 2.0899804 3.1008692
TensorFlow final MSE: 0.960530698299408


### Stochastic Gradient Descent

In [9]:
def stochastic_gradient_descent(
    X: np.ndarray, 
    y: np.ndarray, 
    learning_rate: float = 0.01, 
    epochs: int = 5
) -> Tuple[np.ndarray, List[float]]:

    m, n = X.shape
    w = np.zeros((n, 1))
    losses: List[float] = []
    for epoch in range(epochs):
        perm = np.random.permutation(m)
        for i in perm:
            xi = X[i:i+1]
            yi = y[i:i+1]
            pred = xi.dot(w)
            grad = 2.0 * xi.T.dot(pred - yi)
            w = w - learning_rate * grad
        losses.append(mse_loss(y, predict(X, w)))
    return w, losses

w_sgd, losses_sgd = stochastic_gradient_descent(DATA['X_b'], DATA['y'], learning_rate=0.01, epochs=50)
print('SGD final weights (bias, w):', w_sgd.ravel())
print('SGD final MSE:', losses_sgd[-1])
SGD_RESULT = dict(w=w_sgd, losses=losses_sgd)

SGD final weights (bias, w): [2.16366577 3.08969605]
SGD final MSE: 0.9661351264515031


In [11]:
X_t = torch.from_numpy(DATA['X']).float()
y_t = torch.from_numpy(DATA['y']).float()
dataset = TensorDataset(X_t, y_t)
loader = DataLoader(dataset, batch_size=1, shuffle=True)
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()
for epoch in range(50):
    for xb, yb in loader:
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
print('PyTorch SGD final weights (bias, w):', model.bias.item(), model.weight.item())

PyTorch SGD final weights (bias, w): 2.1767284870147705 3.095686674118042


In [12]:
ds = tf.data.Dataset.from_tensor_slices((DATA['X'], DATA['y'])).shuffle(200).batch(1)
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
loss_fn = tf.keras.losses.MeanSquaredError()
for epoch in range(50):
    for xb, yb in ds:
        with tf.GradientTape() as tape:
            preds = model(xb, training=True)
            loss = loss_fn(yb, preds)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
w_tf = model.layers[0].kernel.numpy().ravel()[0]
b_tf = model.layers[0].bias.numpy().ravel()[0]
print('TensorFlow SGD final weights (bias, w):', b_tf, w_tf)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorFlow SGD final weights (bias, w): 2.093462 3.1641674


### Mini-Batch Gradient Descent

In [None]:
def minibatch_gradient_descent(
    X: np.ndarray, 
    y: np.ndarray, 
    learning_rate: float = 0.05, 
    epochs: int = 100, 
    batch_size: int = 20
) -> Tuple[np.ndarray, List[float]]:
    m, n = X.shape
    w = np.zeros((n, 1))
    losses = []
    for epoch in range(epochs):
        perm = np.random.permutation(m)
        for i in range(0, m, batch_size):
            idx = perm[i:i+batch_size]
            xb = X[idx]
            yb = y[idx]
            pred = xb.dot(w)
            grad = (2.0 / xb.shape[0]) * xb.T.dot(pred - yb)
            w = w - learning_rate * grad
        losses.append(mse_loss(y, predict(X, w)))
    return w, losses

w_mbgd, losses_mbgd = minibatch_gradient_descent(DATA['X_b'], DATA['y'], learning_rate=0.05, epochs=200, batch_size=32)
print('Mini-batch GD final weights (bias, w):', w_mbgd.ravel())
print('Mini-batch final MSE:', losses_mbgd[-1])

Mini-batch GD final weights (bias, w): [2.04203987 3.11233255]
Mini-batch final MSE: 0.9629872982499463


In [14]:
X_t = torch.from_numpy(DATA['X']).float()
y_t = torch.from_numpy(DATA['y']).float()
dataset = TensorDataset(X_t, y_t)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
criterion = torch.nn.MSELoss()
for epoch in range(200):
    for xb, yb in loader:
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
print('PyTorch mini-batch final weights (bias, w):', model.bias.item(), model.weight.item())

PyTorch mini-batch final weights (bias, w): 2.0999948978424072 3.0703885555267334


In [15]:
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.05), loss='mse')
model.fit(DATA['X'], DATA['y'], epochs=200, batch_size=32, verbose=0)
print('TensorFlow mini-batch final weights (bias, w):', model.layers[0].bias.numpy().ravel()[0], model.layers[0].kernel.numpy().ravel()[0])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorFlow mini-batch final weights (bias, w): 2.058479 3.0675073


### Momentum-Based Gradient Descent

In [None]:
def momentum_gradient_descent(
    X: np.ndarray, 
    y: np.ndarray, 
    learning_rate: float = 0.05, 
    epochs: int = 100, 
    beta: float = 0.9
)-> Tuple[np.ndarray, List[float]]:
    m, n = X.shape
    w = np.zeros((n, 1))
    v = np.zeros_like(w)
    losses = []
    for epoch in range(epochs):
        y_pred = X.dot(w)
        grad = (2.0 / m) * X.T.dot(y_pred - y)
        v = beta * v + learning_rate * grad
        w = w - v
        losses.append(mse_loss(y, predict(X, w)))
    return w, losses

w_mom, losses_mom = momentum_gradient_descent(DATA['X_b'], DATA['y'], learning_rate=0.05, epochs=200)
print('Momentum GD final weights (bias, w):', w_mom.ravel())
print('Momentum final MSE:', losses_mom[-1])

Momentum GD final weights (bias, w): [2.09001287 3.10094807]
Momentum final MSE: 0.960530628531266


In [17]:
X_t = torch.from_numpy(DATA['X']).float()
y_t = torch.from_numpy(DATA['y']).float()
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.9)
criterion = torch.nn.MSELoss()
for epoch in range(200):
    optimizer.zero_grad()
    pred = model(X_t)
    loss = criterion(pred, y_t)
    loss.backward()
    optimizer.step()
print('PyTorch Momentum final weights (bias, w):', model.bias.item(), model.weight.item())

PyTorch Momentum final weights (bias, w): 2.0900185108184814 3.10093092918396


In [18]:
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.05, momentum=0.9), loss='mse')
model.fit(DATA['X'], DATA['y'], epochs=200, batch_size=DATA['X'].shape[0], verbose=0)
print('TensorFlow Momentum final weights (bias, w):', model.layers[0].bias.numpy().ravel()[0], model.layers[0].kernel.numpy().ravel()[0])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorFlow Momentum final weights (bias, w): 2.09001 3.1009314


### Adaptive Gradient Descent (AdaGrad)

In [19]:
def adagrad(
    X: np.ndarray, 
    y: np.ndarray, 
    learning_rate: float = 0.5, 
    epochs: int = 100, 
    eps: float = 1e-8
) -> Tuple[np.ndarray, List[float]]:
    
    m, n = X.shape
    w = np.zeros((n, 1))
    g_acc = np.zeros_like(w)
    losses = []
    for epoch in range(epochs):
        y_pred = X.dot(w)
        grad = (2.0 / m) * X.T.dot(y_pred - y)
        g_acc += grad * grad
        adjusted_learning_rate = learning_rate / (np.sqrt(g_acc) + eps)
        w = w - adjusted_learning_rate * grad
        losses.append(mse_loss(y, predict(X, w)))
    return w, losses

w_adagrad, losses_adagrad = adagrad(DATA['X_b'], DATA['y'], learning_rate=0.5, epochs=200)
print('AdaGrad final weights (bias, w):', w_adagrad.ravel())
print('AdaGrad final MSE:', losses_adagrad[-1])

AdaGrad final weights (bias, w): [2.08998071 3.10086935]
AdaGrad final MSE: 0.9605306224047068


In [20]:
X_t = torch.from_numpy(DATA['X']).float()
y_t = torch.from_numpy(DATA['y']).float()
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.5)
criterion = torch.nn.MSELoss()
for epoch in range(200):
    optimizer.zero_grad()
    pred = model(X_t)
    loss = criterion(pred, y_t)
    loss.backward()
    optimizer.step()
print('PyTorch AdaGrad final weights (bias, w):', model.bias.item(), model.weight.item())

PyTorch AdaGrad final weights (bias, w): 2.089967966079712 3.100628614425659


In [21]:
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5), loss='mse')
model.fit(DATA['X'], DATA['y'], epochs=200, batch_size=DATA['X'].shape[0], verbose=0)
print('TensorFlow AdaGrad final weights (bias, w):', model.layers[0].bias.numpy().ravel()[0], model.layers[0].kernel.numpy().ravel()[0])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorFlow AdaGrad final weights (bias, w): 2.0899801 3.1008692


### RMSProp

In [22]:
def rmsprop(
    X: np.ndarray, 
    y: np.ndarray, 
    lr: float = 0.01, 
    epochs: int = 200, 
    beta: float = 0.9, 
    eps: float = 1e-8
) -> Tuple[np.ndarray, List[float]]:
    
    m, n = X.shape
    w = np.zeros((n, 1))
    s = np.zeros_like(w)
    losses = []
    for epoch in range(epochs):
        y_pred = X.dot(w)
        grad = (2.0 / m) * X.T.dot(y_pred - y)
        s = beta * s + (1 - beta) * (grad * grad)
        w = w - (lr / (np.sqrt(s) + eps)) * grad
        losses.append(mse_loss(y, predict(X, w)))
    return w, losses

w_rms, losses_rms = rmsprop(DATA['X_b'], DATA['y'], lr=0.01, epochs=200)
print('RMSProp final weights (bias, w):', w_rms.ravel())
print('RMSProp final MSE:', losses_rms[-1])

RMSProp final weights (bias, w): [1.88754513 2.00681497]
RMSProp final MSE: 2.0177368866177257


In [23]:
X_t = torch.from_numpy(DATA['X']).float()
y_t = torch.from_numpy(DATA['y']).float()
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()
for epoch in range(200):
    optimizer.zero_grad()
    pred = model(X_t)
    loss = criterion(pred, y_t)
    loss.backward()
    optimizer.step()
print('PyTorch RMSProp final weights (bias, w):', model.bias.item(), model.weight.item())

PyTorch RMSProp final weights (bias, w): 1.9795793294906616 2.9544224739074707


In [24]:
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.01), loss='mse')
model.fit(DATA['X'], DATA['y'], epochs=200, batch_size=DATA['X'].shape[0], verbose=0)
print('TensorFlow RMSProp final weights (bias, w):', model.layers[0].bias.numpy().ravel()[0], model.layers[0].kernel.numpy().ravel()[0])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorFlow RMSProp final weights (bias, w): 1.8863635 1.9090302


### ADAM: Adaptive Moment Estimation

- Adam is a variant of SGD that uses adaptive learning rates for each parameter. It combines the benefits of both AdaGrad and RMSProp.
- Mathematically, Adam is defined as follows:

In [25]:
def adam(
    X: np.ndarray, 
    y: np.ndarray, 
    learning_rate: float = 0.1, 
    epochs: int = 200, 
    beta1: float = 0.9, 
    beta2: float = 0.999, 
    epsilon: float = 1e-8
)-> Tuple[np.ndarray, List[float]]:
    
    m, n = X.shape
    w = np.zeros((n, 1))
    m_t = np.zeros_like(w)
    v_t = np.zeros_like(w)
    losses = []
    for t in range(1, epochs + 1):
        y_pred = X.dot(w)
        grad = (2.0 / m) * X.T.dot(y_pred - y)
        m_t = beta1 * m_t + (1 - beta1) * grad
        v_t = beta2 * v_t + (1 - beta2) * (grad * grad)
        m_hat = m_t / (1 - beta1 ** t)
        v_hat = v_t / (1 - beta2 ** t)
        w = w - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        losses.append(mse_loss(y, predict(X, w)))
    return w, losses

w_adam, losses_adam = adam(DATA['X_b'], DATA['y'], learning_rate=0.1, epochs=200)
print('Adam final weights (bias, w):', w_adam.ravel())
print('Adam final MSE:', losses_adam[-1])

Adam final weights (bias, w): [2.08993389 3.10095987]
Adam final MSE: 0.960530631961434


In [26]:
X_t = torch.from_numpy(DATA['X']).float()
y_t = torch.from_numpy(DATA['y']).float()
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
criterion = torch.nn.MSELoss()

for epoch in range(200):
    optimizer.zero_grad()
    pred = model(X_t)
    loss = criterion(pred, y_t)
    loss.backward()
    optimizer.step()
print('PyTorch Adam final weights (bias, w):', model.bias.item(), model.weight.item())

PyTorch Adam final weights (bias, w): 2.08988094329834 3.100698471069336


In [27]:
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), loss='mse')
model.fit(DATA['X'], DATA['y'], epochs=200, batch_size=DATA['X'].shape[0], verbose=0)
print('TensorFlow Adam final weights (bias, w):', model.layers[0].bias.numpy().ravel()[0], model.layers[0].kernel.numpy().ravel()[0])

TensorFlow Adam final weights (bias, w): 2.0899477 3.1011472
