In [1]:
import numpy as np

Batch Gradient Descent

In [None]:
def batch_gradient_descent(X, y, lr = 0.01, n_iters = 1000, tol = 1e-6):
  '''
  Fit linear-regression weights by full-batch gradient descent on the
  mean-squared-error loss, starting from all-zero weights.

  Args:
    X: feature matrix, shape (n_samples, n_features)
    y: target vector, shape (n_samples,)
    lr: step size multiplied into each gradient
    n_iters: maximum number of weight updates
    tol: stop early once the gradient's L2 norm drops below this value

  Returns:
    w: learned weights, shape (n_features,)
    history: one MSE value per executed iteration, measured with the
      weights as they were before that iteration's update
  '''
  num_rows, num_cols = X.shape
  history = []
  w = np.zeros(num_cols)

  for _ in range(n_iters):
    residual = X @ w - y
    # Loss is taken from the pre-update residual.
    history.append(np.mean(np.square(residual)))
    gradient = (2 / num_rows) * (X.T @ residual)
    w -= lr * gradient
    # The update above is still applied on the iteration that triggers the stop.
    if np.linalg.norm(gradient) < tol:
      break
  return w, history

Stochastic Gradient Descent

In [None]:
def stochastic_gradient_descent(X, y, lr = 0.01, n_epochs = 50, batch_size = 1):
  '''
  Fit linear-regression weights with (mini-batch) stochastic gradient
  descent on the mean-squared-error loss, starting from all-zero weights.

  Each epoch reshuffles the samples with np.random.permutation (NumPy's
  global random state; call np.random.seed beforehand for reproducible
  runs) and then sweeps over them in consecutive slices of batch_size rows.

  Args:
    X: feature matrix, shape (n_samples, n_features)
    y: target vector, shape (n_samples,)
    lr: learning rate
    n_epochs: number of full passes over the data
    batch_size: rows per update; 1 corresponds to pure SGD

  Returns:
    w: learned weights, shape (n_features,)
    history: MSE over the full data set, recorded once at the end of
      every epoch

  Raises:
    ValueError: if batch_size is smaller than 1
  '''
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  n_samples, n_features = X.shape
  w = np.zeros(n_features)
  history = []

  for epoch in range(n_epochs):
    # Random permutation of the sample indices for this epoch.
    perm = np.random.permutation(n_samples)
    X_shuffled = X[perm]
    y_shuffled = y[perm]

    for start in range(0, n_samples, batch_size):
      # Extract one mini-batch of features and targets.
      X_batch = X_shuffled[start:start+batch_size]
      y_batch = y_shuffled[start:start+batch_size]
      error = X_batch.dot(w) - y_batch
      # Average over the rows actually present: the last slice of an epoch
      # is shorter than batch_size when n_samples is not a multiple of it.
      grad = (2/len(X_batch)) * X_batch.T.dot(error)
      w -= lr * grad

    loss = np.mean((X.dot(w) - y)**2)
    history.append(loss)
  return w, history


Momentum Gradient Descent

In [None]:
def momentum_gradient_descent(X, y, lr = 0.01, n_iters = 1000, momentum = 0.9):
  '''
  Fit linear-regression weights by full-batch gradient descent with
  classical momentum on the mean-squared-error loss.

  The velocity is updated as momentum * velocity + lr * gradient and then
  subtracted from the weights. Weights and velocity both start at zero.

  Args:
    X: feature matrix, shape (n_samples, n_features)
    y: target vector, shape (n_samples,)
    lr: learning rate applied to the gradient inside the velocity update
    n_iters: number of updates (there is no early stopping)
    momentum: fraction of the previous velocity carried forward

  Returns:
    w: learned weights, shape (n_features,)
    history: MSE per iteration, measured before that iteration's update
  '''
  num_rows, num_cols = X.shape
  w = np.zeros(num_cols)
  velocity = np.zeros_like(w)
  history = []

  for _ in range(n_iters):
    residual = X @ w - y
    history.append(np.mean(np.square(residual)))
    gradient = (2 / num_rows) * (X.T @ residual)
    velocity = lr * gradient + momentum * velocity
    w -= velocity
  return w, history

In [None]:
def nesterov_gradient_descent(X, y, lr = 0.01, n_iters = 1000, momentum = 0.9):
  '''
  Fit linear-regression weights by full-batch gradient descent with
  Nesterov (look-ahead) momentum on the mean-squared-error loss.

  The gradient is evaluated at the look-ahead point w - momentum * v
  rather than at w itself; the velocity and weight updates are otherwise
  v = momentum * v + lr * grad and w -= v. Weights and velocity start at
  zero.

  Args:
    X: feature matrix, shape (n_samples, n_features)
    y: target vector, shape (n_samples,)
    lr: learning rate applied to the gradient inside the velocity update
    n_iters: number of updates (there is no early stopping)
    momentum: fraction of the previous velocity carried forward

  Returns:
    w: learned weights, shape (n_features,)
    history: MSE per iteration, evaluated at the actual weights w before
      that iteration's update (not at the look-ahead point)
  '''
  n_samples, n_features = X.shape
  w = np.zeros(n_features)
  v = np.zeros_like(w)
  history = []

  for _ in range(n_iters):
    # Record the loss of the real iterate; the look-ahead point below is
    # only a probe for the gradient and is never returned to the caller.
    history.append(np.mean((X.dot(w) - y)**2))

    w_lookahead = w - momentum * v
    error = X.dot(w_lookahead) - y
    grad = (2/n_samples) * X.T.dot(error)
    v = momentum * v + lr * grad
    w -= v
  return w, history


AdaGrad - Adaptive Learning Rate

In [None]:
def adagrad(X, y, lr = 0.01, n_epochs = 100, epsilon = 1e-8):
  '''
  Fit linear-regression weights with AdaGrad using per-sample updates on
  the squared-error loss, starting from all-zero weights.

  Samples are visited in their stored order (no shuffling). The running
  sum of squared gradients G is accumulated across all samples and all
  epochs and is never reset, so the per-coordinate step size
  lr / (sqrt(G) + epsilon) can only shrink over time.

  Args:
    X: feature matrix, shape (n_samples, n_features)
    y: target vector, shape (n_samples,)
    lr: base learning rate
    n_epochs: number of full passes over the data
    epsilon: small constant added to sqrt(G) to avoid division by zero

  Returns:
    w: learned weights, shape (n_features,)
    history: MSE over the full data set, recorded once at the end of
      every epoch
  '''
  n_samples, n_features = X.shape
  w = np.zeros(n_features)
  G = np.zeros(n_features)
  history = []

  for epoch in range(n_epochs):
    for i in range(n_samples):
      xi = X[i]
      error = xi.dot(w) - y[i]
      grad = 2 * xi * error
      # Accumulate first, then scale: the current gradient already counts
      # toward its own step size.
      G += grad**2
      adjusted_lr = lr / (np.sqrt(G) + epsilon)
      w -= adjusted_lr * grad
    preds_full = X.dot(w)
    history.append(np.mean((preds_full - y)**2))
  return w, history


In [2]:
def rmsprop(X, y, lr = 0.01, n_epochs = 100, decay = 0.9, epsilon = 1e-8):
  '''
  Fit linear-regression weights with RMSProp using per-sample updates on
  the squared-error loss, starting from all-zero weights.

  Samples are visited in their stored order. An exponential moving
  average of the squared gradient scales each coordinate's step to
  lr / (sqrt(average) + epsilon).

  Args:
    X: feature matrix, shape (n_samples, n_features)
    y: target vector, shape (n_samples,)
    lr: base learning rate
    n_epochs: number of full passes over the data
    decay: weight given to the previous moving average
    epsilon: small constant added to the root to avoid division by zero

  Returns:
    w: learned weights, shape (n_features,)
    history: MSE over the full data set, recorded once at the end of
      every epoch
  '''
  num_rows, num_cols = X.shape
  w = np.zeros(num_cols)
  sq_grad_avg = np.zeros(num_cols)
  history = []

  for _ in range(n_epochs):
    for row in range(num_rows):
      features = X[row]
      residual = features.dot(w) - y[row]
      grad = 2 * features * residual
      sq_grad_avg = (1 - decay) * np.square(grad) + decay * sq_grad_avg
      step_size = lr / (np.sqrt(sq_grad_avg) + epsilon)
      w -= step_size * grad
    history.append(np.mean(np.square(X.dot(w) - y)))
  return w, history

In [4]:
def adam(X, y, lr = 0.01, n_iters = 1000, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8):
  '''
  Fit linear-regression weights with Adam using full-batch gradients of
  the mean-squared-error loss, starting from all-zero weights.

  Moving averages of the gradient and of the squared gradient are kept,
  each divided by (1 - beta**t) to correct the bias from their zero
  initialisation, with t counting iterations from 1.

  Args:
    X: feature matrix, shape (n_samples, n_features)
    y: target vector, shape (n_samples,)
    lr: learning rate
    n_iters: number of updates (there is no early stopping)
    beta1: decay rate of the gradient average
    beta2: decay rate of the squared-gradient average
    epsilon: small constant added to the root to avoid division by zero

  Returns:
    w: learned weights, shape (n_features,)
    history: MSE per iteration, measured before that iteration's update
  '''
  num_rows, num_cols = X.shape
  w = np.zeros(num_cols)
  second_moment = np.zeros(num_cols)
  first_moment = np.zeros(num_cols)
  history = []

  for step in range(1, n_iters + 1):
    residual = X @ w - y
    history.append(np.mean(np.square(residual)))
    grad = (2 / num_rows) * (X.T @ residual)

    first_moment = (1 - beta1) * grad + beta1 * first_moment
    second_moment = (1 - beta2) * np.square(grad) + beta2 * second_moment

    # Bias-corrected estimates for this step.
    first_hat = first_moment / (1 - beta1**step)
    second_hat = second_moment / (1 - beta2**step)

    w -= lr * first_hat / (np.sqrt(second_hat) + epsilon)
  return w, history