In [None]:
import torch.nn as nn
import torch.optim as optim

# A single linear layer (10 inputs -> 1 output). All three optimizers below are
# bound to this same set of parameters; they only illustrate constructor options.
model = nn.Linear(in_features=10, out_features=1)

# Plain SGD: no momentum argument is passed, so each update is just the current
# gradient scaled by the learning rate.
optimizer_vanilla = optim.SGD(model.parameters(), lr=0.01)

# SGD with momentum: a running velocity accumulates past gradients (weighted by
# momentum=0.9), which smooths the updates and can speed up progress along
# directions where successive gradients agree.
optimizer_momentum = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Nesterov momentum: same velocity idea, but nesterov=True applies a look-ahead
# correction to the update, which tends to reduce overshooting.
optimizer_nesterov = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    nesterov=True,
)

In [None]:
import torch.optim as optim

# Adagrad (Adaptive Gradient): each parameter gets its own effective learning
# rate, scaled down by the accumulated squared gradients seen for that parameter.
# lr_decay=0 and weight_decay=0 disable learning-rate decay and L2 penalty.
optimizer = optim.Adagrad(
    model.parameters(),
    lr=0.01,
    lr_decay=0,
    weight_decay=0,
)

In [None]:
import math

def rms_prop_step(weight, grad, running_var, lr=0.01, alpha=0.99, eps=1e-8):
    """Apply a single RMSProp update to one scalar weight.

    Args:
        weight: Current value of the parameter.
        grad: Gradient of the loss with respect to ``weight`` at this step.
        running_var: Running average of squared gradients from earlier steps.
        lr: Base learning rate.
        alpha: Decay factor; the closer to 1, the more weight past squared
            gradients keep relative to the current one.
        eps: Small constant for numerical stability. In this implementation it
            is added inside the square root (to the variance), not to the
            square root itself.

    Returns:
        A ``(new_weight, new_var)`` tuple: the updated weight and the updated
        running average of squared gradients to pass into the next call.
    """
    squared_grad = grad ** 2

    # Exponential moving average of squared gradients: keep ``alpha`` of the
    # history and blend in ``1 - alpha`` of the newest squared gradient.
    updated_var = (alpha * running_var) + ((1 - alpha) * squared_grad)

    # Effective learning rate shrinks when recent gradients have been large.
    scaled_lr = lr / math.sqrt(updated_var + eps)

    return weight - (scaled_lr * grad), updated_var

In [None]:
# Hyperparameters used for the RMSProp walkthrough: learning rate, decay factor
# for the running variance, and the numerical-stability constant.
learning_rate, alpha_decay, epsilon = 0.1, 0.9, 0.001

# Starting state: the weight being optimized and an empty running variance.
current_weight, current_var = 10.0, 0.0

In [None]:
# First RMSProp update, driven by a large gradient. The running state
# (current_weight, current_var) is overwritten with the updated values.
gradient_1 = 8.0

current_weight, current_var = rms_prop_step(
    current_weight, gradient_1, current_var,
    lr=learning_rate, alpha=alpha_decay, eps=epsilon,
)

In [None]:
# Report the running variance and the weight after the most recent update.
print(
    f"  -> New Variance (V): {current_var:.4f}",
    f"  -> New Weight:       {current_weight:.4f}",
    sep="\n",
)

  -> New Variance (V): 6.4000
  -> New Weight:       9.6838


In [None]:
# Second RMSProp update, this time with a much smaller gradient. The running
# variance still remembers the earlier, larger gradient (weighted by alpha), so
# the effective step lr / sqrt(running_var + eps) stays small for this update.
gradient_2 = 1.0

current_weight, current_var = rms_prop_step(
    current_weight, gradient_2, current_var,
    lr=learning_rate, alpha=alpha_decay, eps=epsilon,
)

In [None]:
# Report the running variance and the weight after the most recent update.
print(
    f"  -> New Variance (V): {current_var:.4f}",
    f"  -> New Weight:       {current_weight:.4f}",
    sep="\n",
)

  -> New Variance (V): 5.8600
  -> New Weight:       9.6425


In [None]:
def adam_step(weight, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    """Apply a single Adam update to one scalar weight.

    Args:
        weight: Current value of the parameter.
        grad: Gradient of the loss with respect to ``weight`` at this step.
        m: Running first-moment estimate (average gradient) from earlier steps.
        v: Running second-moment estimate (average squared gradient).
        t: Step index used for bias correction. With ``t = 0`` both correction
            denominators are zero and the division raises ZeroDivisionError.
        lr: Learning rate.
        beta1: Decay factor for the first-moment estimate.
        beta2: Decay factor for the second-moment estimate.
        eps: Small constant added to the square root for numerical stability.

    Returns:
        A ``(new_weight, m_new, v_new)`` tuple. The returned moments are the
        raw (not bias-corrected) estimates, ready to pass into the next call.
    """
    # First moment: exponential moving average of the gradient (the "momentum").
    first_moment = (beta1 * m) + ((1 - beta1) * grad)

    # Second moment: exponential moving average of the squared gradient.
    second_moment = (beta2 * v) + ((1 - beta2) * (grad ** 2))

    # Bias correction: scale the moments up by 1 / (1 - beta ** t), which has
    # the largest effect for small t.
    first_corrected = first_moment / (1 - beta1 ** t)
    second_corrected = second_moment / (1 - beta2 ** t)

    denominator = math.sqrt(second_corrected) + eps
    update = lr * first_corrected / denominator

    return weight - update, first_moment, second_moment


In [None]:
# Initial weight, empty moment estimates, and the first step index for Adam.
w = 10.0
m_hist, v_hist = 0.0, 0.0
step_count = 1
gradient = 10.0

# One Adam update; the returned moments replace the running history.
w, m_hist, v_hist = adam_step(w, gradient, m=m_hist, v=v_hist, t=step_count)

print(
    f"Step 1 Momentum (m): {m_hist:.4f}",
    f"Step 1 Variance (v): {v_hist:.4f}",
    sep="\n",
)

Step 1 Momentum (m): 1.0000
Step 1 Variance (v): 0.1000


In [None]:
# NOTE(review): w is reset to 1.0 here instead of continuing from the weight
# returned by the previous adam_step call -- confirm this is intended.
w = 1.0
# m_hist and v_hist carry over from the previous adam_step call, so this is the
# second update: the bias-correction step index must be 2, not 1. Reusing 1
# would over-correct the already-accumulated moments.
step_count = 2
gradient = 9.0

w, m_hist, v_hist = adam_step(w, gradient, m_hist, v_hist, step_count)

print(f"Step 2 Momentum (m): {m_hist:.4f}")
print(f"Step 2 Variance (v): {v_hist:.4f}")

# Adam combines the two earlier ideas: the first moment (m) plays the role of
# SGD momentum, and the second moment (v) scales the step like RMSProp. Bias
# correction compensates for the zero-initialized moments during early steps.

Step 1 Momentum (m): 1.8000
Step 1 Variance (v): 0.1809


In [None]:
import torch
# NAdam (Nesterov-accelerated Adam): Adam's adaptive per-parameter step sizes
# combined with Nesterov-style momentum. This rebinds the name `optimizer`.
optimizer = torch.optim.NAdam(
    model.parameters(),
    lr=0.001,
)

In [None]:
import math

def adamw_step(weight, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.01):
    """Apply a single AdamW update (Adam with decoupled weight decay) to a scalar weight.

    Args:
        weight: Current value of the parameter.
        grad: Gradient of the loss with respect to ``weight`` at this step.
        m: Running first-moment estimate from earlier steps.
        v: Running second-moment estimate from earlier steps.
        t: Step index used for bias correction. With ``t = 0`` both correction
            denominators are zero and the division raises ZeroDivisionError.
        lr: Learning rate; scales both the Adam term and the decay term.
        beta1: Decay factor for the first-moment estimate.
        beta2: Decay factor for the second-moment estimate.
        eps: Small constant added to the square root for numerical stability.
        weight_decay: Decay coefficient applied directly to the weight.

    Returns:
        A ``(new_weight, m_new, v_new)`` tuple. The returned moments are the
        raw (not bias-corrected) estimates, ready to pass into the next call.
    """
    # Moving averages of the gradient and of the squared gradient.
    first_moment = (beta1 * m) + ((1 - beta1) * grad)
    second_moment = (beta2 * v) + ((1 - beta2) * (grad ** 2))

    # Bias-corrected versions of both moments.
    first_corrected = first_moment / (1 - beta1 ** t)
    second_corrected = second_moment / (1 - beta2 ** t)

    gradient_update = lr * (first_corrected / (math.sqrt(second_corrected) + eps))

    # Decoupled weight decay: depends only on the current weight and the
    # learning rate, and is not passed through the adaptive gradient scaling.
    decay_update = lr * weight_decay * weight

    return weight - gradient_update - decay_update, first_moment, second_moment

In [None]:
# With a zero gradient the Adam term is zero, so the only change to the weight
# comes from the decay term: lr * weight_decay * w.
w, grad = 10.0, 0.0
m, v, t = 0.0, 0.0, 1
lr, decay = 0.1, 0.1

new_w, new_m, new_v = adamw_step(w, grad, m, v, t, lr=lr, weight_decay=decay)

print(
    "ADAMW RESULT:",
    f"New Weight: {new_w:.4f}",
    f"Change:     {new_w - w:.4f}",
    sep="\n",
)

ADAMW RESULT:
New Weight: 9.9000
Change:     -0.1000
