# The Adam optimizer

The primary usefulness of the adam optimizer is its improved capabilites over other optimizers in almost all cases. To demonstrate this improvement we are going to implement several different optimizers on a basic toy problem.

### But first, some set up:

In [62]:
import time
from contextlib import contextmanager
# function for measuring time taken curtises of ChatGPT (!MAY NOT USE, WE SHALL SEE)
@contextmanager
def time_block(label="Elapsed time"):
    start = time.perf_counter()
    try:
        yield
    finally:
        end = time.perf_counter()
        print(f"{label}: {end - start:.6f} seconds")

# function for measuring accuracy
num_classes = 10
acc = Accuracy(task="multiclass", num_classes=num_classes)
def get_acc(X_test, y_test):
    classifier.eval()
    with torch.no_grad():
        y_pred = classifier(torch.from_numpy(X_test).float())  # [899, 10] 
        y_true = torch.tensor(y_test)                           # [899]
    
        # Convert logits â†’ predicted class indices
        y_pred_labels = torch.argmax(y_pred, dim=1)             # [899]
    
    accuracy = acc(y_pred_labels, y_true).item()
    
    print(
        f"Final accuracy: {accuracy:.4f}"
    )

In [63]:
with time_block("My slow code"):
    total = sum(i * i for i in range(10_000_000))

My slow code: 0.747598 seconds


### Toy digits dataset

In [64]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1)) #flattens the data
X_train, X_test, y_train, y_test = train_test_split( #splits into training and testing sets
    data, digits.target, test_size=0.5, shuffle=False
)

In [21]:
len(X_train[0])

64

### Basic Classification Model

In [22]:
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from torchmetrics.classification import Accuracy
import matplotlib.pyplot as plt

class classifier_model(torch.nn.Module):
    
    def __init__(self,input_size,output_size):
        super().__init__()
        self.layer1 = torch.nn.Sequential(torch.nn.Linear(input_size,output_size)) # single layer model
        
    def forward(self,X):
        z = self.layer1(X)         # Apply the first layer (and only)
        return(z)            # Return the result (and the latent space variable)
            
classifier = classifier_model(64,10) #defines the model

optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent

L = torch.nn.CrossEntropyLoss()

In [24]:
#training
def train_model(L,optimizer,classifier, X_train, y_train, display=False):
    n_epochs = 15
    for epoch in range(n_epochs):
        loss = 0
        for X,y in zip(X_train,y_train): #y are labels
            labels = torch.tensor(y)
            inputs = torch.from_numpy(X).to(torch.float32)
        
            optimizer.zero_grad()  # Clear gradients
            outputs = classifier(inputs) #gets the outputs of the model
            train_loss = L(outputs,labels) #gets loss
            #print(train_loss)
            train_loss.backward() # compute accumulated gradients
            optimizer.step()# perform parameter update based on current gradients
            loss += train_loss.item() # add the mini-batch training loss to epoch loss
        loss = loss / len(X_train) # compute the epoch training loss
        if display:
            print(f"epoch : {epoch + 1}/{n_epochs}, loss = {loss}")
    return loss #this is the final loss at the end of all the training process
train_model(L,optimizer,classifier, X_train, y_train)

0.008544730517154124

### Gradient Scaling
Unlike some basic optimizers, the Adam Optimizer is invariant to gradient scaling, so the default learning rate works well for most datasets. Run the optimizer on the dataset scaled datasets and compare the final accuracy. Is there any scalar that causes a noticable decrease in performance? Then, try the same (or different) scalars on Stochastic Gradient Descent. When does accuracy start to drop?

In [85]:
classifier = classifier_model(64,10) #defines the model
"""
Implement the Adam Optimizer here
"""
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
with time_block("Adam optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train, y_train)
    get_acc(X_test, y_test)
    print(
        f"This is the final loss of the model: {final_loss:.4f} "
    )

Final accuracy: 0.9121
This is the final loss of the model: 0.0106 
Adam optimizer: 2.321951 seconds


In [77]:
"""
Scale the training/testing data. Compare the accuracies.
"""
classifier = classifier_model(64,10) #defines the model
##################### Have people do?
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
#####################
X_train_scaled = X_train/50
with time_block("Adam optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train_scaled, y_train)
    get_acc(X_test/50, y_test)


classifier = classifier_model(64,10) #defines the model
##################### Have people do?
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
#####################
X_train_scaled = X_train*50
with time_block("Adam optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train_scaled, y_train)
    get_acc(X_test*50, y_test)

Final accuracy: 0.9043
Adam optimizer: 2.299483 seconds
Final accuracy: 0.8810
Adam optimizer: 2.303372 seconds


#### Compare this with regular Stochastic Gradient Descent

In [78]:
classifier = classifier_model(64,10) #defines the model
"""
Now, implement and run Stochastic Gradient Descent with at least twice with different scalars. Compare the change in accuracy to the Adam Optimizer.
"""
##################### Have people do?
optimizer = torch.optim.SGD(classifier.parameters(), lr=1e-3) 
L = torch.nn.CrossEntropyLoss()
#####################
with time_block("SGD optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train*(1/5), y_train)
    get_acc(X_test*(1/5), y_test)

classifier = classifier_model(64,10) #defines the model
with time_block("SGD optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train*(5), y_train)
    get_acc(X_test*(5), y_test)

Final accuracy: 0.9166
SGD optimizer: 1.641946 seconds
Final accuracy: 0.1079
SGD optimizer: 1.532072 seconds


If scaling does not affect the data, what about normalization? Compare the accuracy of the model before and after normalization. As a bonus, implement both with cross validation for more stable results!

In [82]:
"""
Normalize the training/testing data and run the Adam Optimizer
"""
classifier = classifier_model(64,10) #defines the model
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()

mean = np.mean(X_train, axis=0, keepdims=True) 
std  = np.std(X_train, axis=0, keepdims=True)   

eps = 1e-8
X_train_scaled = (X_train - mean) / (std + eps)
X_test_scaled  = (X_test  - mean) / (std + eps)


with time_block("Adam optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train_scaled, y_train)
    get_acc(X_test_scaled, y_test)

Final accuracy: 0.9299
Adam optimizer: 2.296721 seconds


In [37]:
#torch.optim.LBFGS
#torch.optim.RMSprop
#import torchmin #this is how you get the newtons method full 

classifier = classifier_model(64,10) #defines the model
##################### Have people do?
optimizer = torch.optim.RMSprop(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
#####################
with time_block("RMSprop optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train, y_train)
    get_acc(X_test, y_test)

Final accuracy: 0.8977
RMSprop optimizer: 2.095619 seconds


In [37]:
classifier = classifier_model(64,10) #defines the model
##################### Have people do?
optimizer = torch.optim.SGD(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
#####################
#train_model(L,optimizer,classifier)
with time_block("SGD optimizer"):
    final_loss = train_model(L,optimizer,classifier)
    print(f"This is the final loss of the model: {final_loss}")

This is the final loss of the model: 0.024601183763795054
SGD optimizer: 0.868933 seconds


In [47]:
classifier = classifier_model(64,10) #defines the model
##################### Have people do?
optimizer = torch.optim.Adagrad(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
#####################
#train_model(L,optimizer,classifier)
with time_block("Adagrad optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train, y_train)
    get_acc(X_test, y_test)
    print(f"This is the final loss of the model: {final_loss}")

Final accuracy: 0.4727
This is the final loss of the model: 1.31173566483347
Adagrad optimizer: 2.018979 seconds


# The Speed of the hessian matrix

One would think that if newtons method is so exact that we would want to use it more often. Unfortunualy the hessian is a slow matrix to calculate as you will see here:

In [10]:
#inputs
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)

def f(a, b):
    return a**2 * b

output = f(x, y)
df_dx = torch.autograd.grad(outputs=output, inputs=x, create_graph=True)[0]
print(f"First derivative df/dx: {df_dx.item()}")

df_dy = torch.autograd.grad(outputs=output, inputs=y, create_graph=True)[0]
print(f"First derivative df/dy: {df_dy.item()}")

#d2f_dy2 = torch.autograd.grad(outputs=df_dy, inputs=y,retain_graph=True,allow_unused=True)[0]
#print(f"Second derivative d^2f/dy^2: {d2f_dy2.item()}")

d2f_dx2 = torch.autograd.grad(outputs=df_dx, inputs=x,retain_graph=True)[0]
print(f"Second derivative d^2f/dx^2: {d2f_dx2.item()}")

d2f_dydx = torch.autograd.grad(outputs=df_dx, inputs=y,retain_graph=True)[0]
print(f"Mixed derivative d^2f/dydx: {d2f_dydx.item()}")

d2f_dxdy = torch.autograd.grad(outputs=df_dy, inputs=x,retain_graph=True)[0]
print(f"Mixed derivative d^2f/dxdy: {d2f_dxdy.item()}")



#d2f_dy2 = torch.autograd.grad(outputs=df_dy, inputs=y,retain_graph=True,allow_unused=True)[0]
#print(f"Mixed derivative d2f_dy2: {d2f_dy2.item()}")

First derivative df/dx: 12.0
First derivative df/dy: 4.0
Second derivative d^2f/dx^2: 6.0
Mixed derivative d^2f/dydx: 4.0
Mixed derivative d^2f/dxdy: 4.0


Some sore of activtiyy where they calucate the full hessian, the code isnt finalized or even working right now but thats the concept

In [1]:
import torch
from torch.func import hessian, jacfwd, jacrev

# 1. Define a function that returns a single scalar loss
def compute_loss(model_params, input_data, targets, model, loss_fn):
    # Use functional_call to call the model with specific parameters
    outputs = torch.func.functional_call(model, model_params, input_data)
    loss = loss_fn(outputs, targets)
    return loss

# 2. Instantiate your model and loss function
model = torch.nn.Linear(10, 1) # Example model
loss_fn = torch.nn.MSELoss()    # Example loss function
input_data = torch.randn(1, 10)
targets = torch.randn(1, 1)

# 3. Extract parameters as a flat dictionary or tuple (functorch format)
# Note: make_functional is a utility to easily get functional model and params
from torch.func import make_functional
fnet, params = make_functional(model)

# 4. Compute the Hessian
# We need to wrap the loss computation in a function that takes params as input
def get_hessian_func(params):
    return compute_loss(params, input_data, targets, model, loss_fn)

# Calculate the full Hessian matrix
# argnums=0 specifies we want the Hessian with respect to the first argument (params)
full_hessian = hessian(get_hessian_func, argnums=0)(params)

ImportError: cannot import name 'make_functional' from 'torch.func' (/Users/noahwanless/Desktop/Spring2026/CSCI457/.venv/lib/python3.11/site-packages/torch/func/__init__.py)

In [None]:
def test_func(x):
    return x.sin().sum()
x = torch.randn(5) #takes in 5 inputs, so 5x5 hessian

full_hessian = hessian(test_func, argnums=0)(x)
print(full_hessian)

tensor([[ 0.1208,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.9627,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.4295,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.1072,  0.0000],
        [-0.0000, -0.0000, -0.0000, -0.0000, -0.8535]])


The Hessian encodes local curvature of the objective function, it allows the gradient to be scaled so that 'dierctions' that have little movement are downscaled, while directions with more curvuture are scaled up so more progress is made