# Gradient Descent types

In [None]:
import torch
import numpy as np 
import matplotlib.pyplot as plt
import warnings

import torch.nn as nn
import torch.nn.functional as F



# XOR example

 X1 | X2| Y 
----|----|----
0|0|0
0|1|1
1|0|1
1|1|0


<p align="center">
  <img src="images/Ann_1.jpg" alt="Computational Graph">
</p>

In [None]:
# XOR truth table as training data: each row of X is one (x1, x2) input pair
# and the matching row of y is the XOR of that pair.
X = torch.tensor(
    [[0, 0], [0, 1], [1, 0], [1, 1]],
    dtype=torch.float32,
)
y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32)

In [None]:
# Set seeds for reproducibility. The weights are drawn with torch.rand, so the
# PyTorch generator has to be seeded; seeding NumPy alone leaves them random.
np.random.seed(1)
torch.manual_seed(1)

# Network dimensions
input_layer_neurons = X.shape[1]  # number of input features (columns of X)
hidden_layer_neurons = 3  # total of 3 hidden units
output_neuron = 1  # output size

In [None]:
def initialize(input_layer_neurons, hidden_layer_neurons, output_neuron):
    """Create randomly initialised parameters for a one-hidden-layer network.

    Every entry is drawn as ``2 * torch.rand(...) - 1``, i.e. uniformly from
    [-1, 1). The shape of each parameter is printed as a side effect.

    Args:
        input_layer_neurons: number of input features.
        hidden_layer_neurons: number of hidden units.
        output_neuron: number of output units.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` of ``torch.nn.Parameter`` objects with shapes
        (input, hidden), (1, hidden), (hidden, output) and (1, output).
    """

    def uniform_param(rows, cols):
        # Rescale torch.rand's [0, 1) samples to [-1, 1) and make them trainable.
        return torch.nn.Parameter(2 * torch.rand((rows, cols)) - 1, requires_grad=True)

    # Input -> hidden layer (sampling order W1, b1, W2, b2 is kept on purpose
    # so a seeded generator yields the same values)
    W1 = uniform_param(input_layer_neurons, hidden_layer_neurons)
    b1 = uniform_param(1, hidden_layer_neurons)
    print(f"Weights shape from input to hidden layer w_1: {W1.shape}")
    print(f"Bias shape from input to hidden layer b_1: {b1.shape}")

    # Hidden -> output layer
    W2 = uniform_param(hidden_layer_neurons, output_neuron)
    b2 = uniform_param(1, output_neuron)
    print(f"Weights shape from hidden layer to output layer w_2: {W2.shape}")
    print(f"Bias shape from hidden layer to output layer b_2: {b2.shape}")

    return W1, b1, W2, b2

In [None]:
# Forward pass
def forward_prop(X, W1, b1, W2, b2):
    """Run one forward pass through the two-layer sigmoid network.

    Args:
        X: input tensor that is matrix-multiplied with ``W1``.
        W1, b1: weights and bias of the input -> hidden layer.
        W2, b2: weights and bias of the hidden -> output layer.

    Returns:
        Tuple ``(z1, a1, z2, a2)``: hidden pre-activation, hidden sigmoid
        activation, output pre-activation and output sigmoid activation.
    """
    # Input -> hidden layer
    hidden_pre = X @ W1 + b1
    hidden_act = torch.sigmoid(hidden_pre)

    # Hidden -> output layer
    out_pre = hidden_act @ W2 + b2
    out_act = torch.sigmoid(out_pre)

    return hidden_pre, hidden_act, out_pre, out_act

In [None]:
# Number of training iterations shared by the optimizer loops below
epochs = 1_000

# Gradient Descent


$$
W \leftarrow W - \eta \nabla_W L \\[10pt]
b \leftarrow b - \eta \nabla_b L \\[10pt]
$$

In [None]:
# Initialize weights
W1, b1, W2, b2 = initialize(input_layer_neurons, hidden_layer_neurons, output_neuron)

eta = 1e-1  # learning rate
losses = []
# Report progress about ten times per run (a fixed interval larger than
# `epochs` would only ever log the first iteration).
log_every = max(1, epochs // 10)

for epoch in range(epochs):
    z1, a1, z2, a2 = forward_prop(X, W1, b1, W2, b2)

    # Half mean squared error over the full batch
    loss = torch.mean((y - a2) ** 2) / 2
    losses.append(loss.item())

    # forward_prop rebuilds the autograd graph every iteration, so the graph
    # does not need to be retained after backward().
    loss.backward()
    with torch.no_grad():
        for param in (W1, b1, W2, b2):
            # Plain gradient-descent step: p <- p - eta * dL/dp
            param -= eta * param.grad
            # Gradients accumulate across backward() calls; reset after use
            param.grad.zero_()

    if epoch % log_every == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")



In [None]:
# Scatter the four XOR inputs, coloured by their target label
plt.scatter(X[:, 0], X[:, 1], c=y.numpy().ravel(), cmap='viridis', marker='o', s=100, edgecolor='k')
plt.title('XOR Problem GD')

# Dense 100 x 100 mesh over [-0.5, 1.5]^2 on which the network is evaluated
grid_axis = np.linspace(-0.5, 1.5, 100)
xx, yy = np.meshgrid(grid_axis, grid_axis)
grid = torch.tensor(np.column_stack((xx.ravel(), yy.ravel())), dtype=torch.float32)

# Network output at every mesh point, reshaped back onto the mesh
with torch.no_grad():
    a2_grid = forward_prop(grid, W1, b1, W2, b2)[-1]
a2_grid = a2_grid.numpy().reshape(xx.shape)

# Shade the two decision regions, split where the output crosses 0.5
plt.contourf(xx, yy, a2_grid, levels=[0, 0.5, 1], alpha=0.2, colors=['blue', 'yellow'])
plt.colorbar()
plt.show()

# Training-loss curve
plt.plot(losses)
plt.title('Loss over iterations')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

# Threshold the network output on the training inputs at 0.5
final_output = forward_prop(X, W1, b1, W2, b2)[-1]
predictions = np.where(final_output.detach().numpy() > 0.5, 1, 0)

print("Final predictions:\n", predictions)

# Momentum based Gradient Descent

$$
\text{Initialize: }v_0(W) = v_0(b) = 0\\[10pt]
v_t(W) =  \gamma v_{t-1}(W) + \eta \nabla_W L \\[10pt]
W \leftarrow W- v_t(W)\\[10pt]
v_t(b) = \gamma v_{t-1}(b) + \eta \nabla_b L \\[10pt]
b \leftarrow b-v_t(b)
$$

In [None]:
# Initialize weights
W1, b1, W2, b2 = initialize(input_layer_neurons, hidden_layer_neurons, output_neuron)

# Hyperparameters
eta = 1e-1  # learning rate
momentum = 0.9  # velocity decay factor (gamma in the update rule)
losses = []
# Report progress about ten times per run (a fixed interval larger than
# `epochs` would only ever log the first iteration).
log_every = max(1, epochs // 10)

# Initialize velocities (one per parameter, same shape, all zeros)
v_W1 = torch.zeros_like(W1)
v_b1 = torch.zeros_like(b1)
v_W2 = torch.zeros_like(W2)
v_b2 = torch.zeros_like(b2)

for epoch in range(epochs):
    z1, a1, z2, a2 = forward_prop(X, W1, b1, W2, b2)

    # Half mean squared error over the full batch
    loss = torch.mean((y - a2) ** 2) / 2
    losses.append(loss.item())

    # forward_prop rebuilds the autograd graph every iteration, so the graph
    # does not need to be retained after backward().
    loss.backward()
    with torch.no_grad():
        # Update velocities: v <- gamma * v + eta * grad
        v_W1 = momentum * v_W1 + eta * W1.grad
        v_b1 = momentum * v_b1 + eta * b1.grad
        v_W2 = momentum * v_W2 + eta * W2.grad
        v_b2 = momentum * v_b2 + eta * b2.grad

        # Update weights: p <- p - v
        W1 -= v_W1
        b1 -= v_b1
        W2 -= v_W2
        b2 -= v_b2

        # Gradients accumulate across backward() calls; reset after use
        for param in (W1, b1, W2, b2):
            param.grad.zero_()

    if epoch % log_every == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")


In [None]:
# Scatter the four XOR inputs, coloured by their target label
plt.scatter(X[:, 0], X[:, 1], c=y.numpy().ravel(), cmap='viridis', marker='o', s=100, edgecolor='k')
plt.title('XOR Problem Momentum')

# Dense 100 x 100 mesh over [-0.5, 1.5]^2 on which the network is evaluated
grid_axis = np.linspace(-0.5, 1.5, 100)
xx, yy = np.meshgrid(grid_axis, grid_axis)
grid = torch.tensor(np.column_stack((xx.ravel(), yy.ravel())), dtype=torch.float32)

# Network output at every mesh point, reshaped back onto the mesh
with torch.no_grad():
    a2_grid = forward_prop(grid, W1, b1, W2, b2)[-1]
a2_grid = a2_grid.numpy().reshape(xx.shape)

# Shade the two decision regions, split where the output crosses 0.5
plt.contourf(xx, yy, a2_grid, levels=[0, 0.5, 1], alpha=0.2, colors=['blue', 'yellow'])
plt.colorbar()
plt.show()

# Training-loss curve
plt.plot(losses)
plt.title('Loss over iterations')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

# Threshold the network output on the training inputs at 0.5
final_output = forward_prop(X, W1, b1, W2, b2)[-1]
predictions = np.where(final_output.detach().numpy() > 0.5, 1, 0)

print("Final predictions:\n", predictions)

# Nesterov Accelerated Gradient

$$

\text{Initialize: }v_0(W) = v_0(b) = 0 \\[10pt]

v_t(W) =  \gamma v_{t-1}(W) + \eta \nabla_W L(W-\gamma v_{t-1}(W)) \\[10pt]
W \leftarrow W- v_t(W)\\[10pt]
v_t(b) = \gamma v_{t-1}(b) + \eta \nabla_b L(b-\gamma v_{t-1}(b)) \\[10pt]
b \leftarrow b-v_t(b)
$$

In [None]:
# Initialize weights
W1, b1, W2, b2 = initialize(input_layer_neurons, hidden_layer_neurons, output_neuron)

# Hyperparameters
eta = 1e-1  # learning rate
momentum = 0.9  # velocity decay factor (gamma in the update rule)
losses = []
# Report progress about ten times per run (a fixed interval larger than
# `epochs` would only ever log the first iteration).
log_every = max(1, epochs // 10)

# Initialize velocities (one per parameter, same shape, all zeros)
v_W1 = torch.zeros_like(W1)
v_b1 = torch.zeros_like(b1)
v_W2 = torch.zeros_like(W2)
v_b2 = torch.zeros_like(b2)

for epoch in range(epochs):
    # Lookahead step: the point the current velocity would carry each
    # parameter to. The loss is evaluated there, and because the lookahead is
    # the parameter minus a constant, backward() stores in W1.grad etc. the
    # gradient taken at the lookahead point.
    W1_lookahead = W1 - momentum * v_W1
    b1_lookahead = b1 - momentum * v_b1
    W2_lookahead = W2 - momentum * v_W2
    b2_lookahead = b2 - momentum * v_b2

    z1, a1, z2, a2 = forward_prop(X, W1_lookahead, b1_lookahead, W2_lookahead, b2_lookahead)

    # Half mean squared error (measured at the lookahead parameters)
    loss = torch.mean((y - a2) ** 2) / 2
    losses.append(loss.item())

    # The graph (lookahead + forward pass) is rebuilt every iteration, so it
    # does not need to be retained after backward().
    loss.backward()
    with torch.no_grad():
        # Update velocities: v <- gamma * v + eta * grad(lookahead)
        v_W1 = momentum * v_W1 + eta * W1.grad
        v_b1 = momentum * v_b1 + eta * b1.grad
        v_W2 = momentum * v_W2 + eta * W2.grad
        v_b2 = momentum * v_b2 + eta * b2.grad

        # Update weights: p <- p - v
        W1 -= v_W1
        b1 -= v_b1
        W2 -= v_W2
        b2 -= v_b2

        # Gradients accumulate across backward() calls; reset after use
        for param in (W1, b1, W2, b2):
            param.grad.zero_()

    if epoch % log_every == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")


In [None]:
# Scatter the four XOR inputs, coloured by their target label
plt.scatter(X[:, 0], X[:, 1], c=y.numpy().ravel(), cmap='viridis', marker='o', s=100, edgecolor='k')
plt.title('XOR Problem Nesterov')

# Dense 100 x 100 mesh over [-0.5, 1.5]^2 on which the network is evaluated
grid_axis = np.linspace(-0.5, 1.5, 100)
xx, yy = np.meshgrid(grid_axis, grid_axis)
grid = torch.tensor(np.column_stack((xx.ravel(), yy.ravel())), dtype=torch.float32)

# Network output at every mesh point, reshaped back onto the mesh
with torch.no_grad():
    a2_grid = forward_prop(grid, W1, b1, W2, b2)[-1]
a2_grid = a2_grid.numpy().reshape(xx.shape)

# Shade the two decision regions, split where the output crosses 0.5
plt.contourf(xx, yy, a2_grid, levels=[0, 0.5, 1], alpha=0.2, colors=['blue', 'yellow'])
plt.colorbar()
plt.show()

# Training-loss curve
plt.plot(losses)
plt.title('Loss over iterations')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

# Threshold the network output on the training inputs at 0.5
final_output = forward_prop(X, W1, b1, W2, b2)[-1]
predictions = np.where(final_output.detach().numpy() > 0.5, 1, 0)

print("Final predictions:\n", predictions)

# RMS PROP

$$
\text{Initialize: }v_0(W) = v_0(b) = 0\\[10pt]
v_t(W) = \beta v_{t-1}(W) + (1- \beta)(\nabla_W L)^2 \\[10pt]
W \leftarrow W - \frac{\eta}{\sqrt{v_t(W)} + \epsilon}\nabla_W L \\[10pt]
v_t(b) = \beta v_{t-1}(b) + (1- \beta)(\nabla_b L)^2 \\[10pt]
b \leftarrow b -\frac{\eta}{\sqrt{v_t(b)} + \epsilon}\nabla_b L
$$




In [None]:
import torch
import torch.nn.functional as F



# RMSProp parameters
beta = 0.9  # decay rate of the running average of squared gradients
epsilon = 1e-8  # keeps the denominator away from zero

# Initialize weights and the running averages of squared gradients (all zeros)
W1, b1, W2, b2 = initialize(input_layer_neurons, hidden_layer_neurons, output_neuron)
v_W1, v_b1, v_W2, v_b2 = torch.zeros_like(W1), torch.zeros_like(b1), torch.zeros_like(W2), torch.zeros_like(b2)

eta = 1e-1  # learning rate
losses = []
# Report progress about ten times per run (a fixed interval larger than
# `epochs` would only ever log the first iteration).
log_every = max(1, epochs // 10)

for epoch in range(epochs):
    z1, a1, z2, a2 = forward_prop(X, W1, b1, W2, b2)

    # Half mean squared error over the full batch
    loss = torch.mean((y - a2) ** 2) / 2
    losses.append(loss.item())

    # forward_prop rebuilds the autograd graph every iteration, so the graph
    # does not need to be retained after backward().
    loss.backward()

    with torch.no_grad():
        # Update the moving averages of the squared gradients
        v_W1 = beta * v_W1 + (1 - beta) * W1.grad ** 2
        v_b1 = beta * v_b1 + (1 - beta) * b1.grad ** 2
        v_W2 = beta * v_W2 + (1 - beta) * W2.grad ** 2
        v_b2 = beta * v_b2 + (1 - beta) * b2.grad ** 2

        # RMSProp step: scale each gradient by the root of its running average
        W1 -= eta * W1.grad / (torch.sqrt(v_W1) + epsilon)
        b1 -= eta * b1.grad / (torch.sqrt(v_b1) + epsilon)
        W2 -= eta * W2.grad / (torch.sqrt(v_W2) + epsilon)
        b2 -= eta * b2.grad / (torch.sqrt(v_b2) + epsilon)

        # Gradients accumulate across backward() calls; reset after use
        for param in (W1, b1, W2, b2):
            param.grad.zero_()

    if epoch % log_every == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")


In [None]:
# Scatter the four XOR inputs, coloured by their target label
plt.scatter(X[:, 0], X[:, 1], c=y.numpy().ravel(), cmap='viridis', marker='o', s=100, edgecolor='k')
plt.title('XOR Problem RMS Prop')

# Dense 100 x 100 mesh over [-0.5, 1.5]^2 on which the network is evaluated
grid_axis = np.linspace(-0.5, 1.5, 100)
xx, yy = np.meshgrid(grid_axis, grid_axis)
grid = torch.tensor(np.column_stack((xx.ravel(), yy.ravel())), dtype=torch.float32)

# Network output at every mesh point, reshaped back onto the mesh
with torch.no_grad():
    a2_grid = forward_prop(grid, W1, b1, W2, b2)[-1]
a2_grid = a2_grid.numpy().reshape(xx.shape)

# Shade the two decision regions, split where the output crosses 0.5
plt.contourf(xx, yy, a2_grid, levels=[0, 0.5, 1], alpha=0.2, colors=['blue', 'yellow'])
plt.colorbar()
plt.show()

# Training-loss curve
plt.plot(losses)
plt.title('Loss over iterations')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

# Threshold the network output on the training inputs at 0.5
final_output = forward_prop(X, W1, b1, W2, b2)[-1]
predictions = np.where(final_output.detach().numpy() > 0.5, 1, 0)

print("Final predictions:\n", predictions)

# ADAM optimizer (Momentum + RMS Prop)

$$
\text{Initialize: }m_0, v_0 = 0\\[10pt]
m_t(W) = \beta _1m_{t-1}(W) + (1-\beta_1)\nabla_WL \\[10pt]
v_t(W) = \beta _2v_{t-1}(W) + (1-\beta_2)(\nabla_WL)^2 \\[10pt]
\hat{m}_t(W) = \frac{m_t(W)}{1-\beta_1^t}\\[10pt]
\hat{v}_t(W) = \frac{v_t(W)}{1-\beta_2^t}\\[10pt]
W \leftarrow W - \eta \frac{\hat{m}_t(W)}{\sqrt{\hat{v}_t(W)}+\epsilon} \\[10pt]

m_t(b) = \beta_1 m_{t-1}(b) + (1-\beta_1)\nabla_bL \\[10pt]
v_t(b) = \beta_2 v_{t-1}(b) + (1-\beta_2)(\nabla_b L)^2\\[10pt]

\hat{m}_t(b) = \frac{m_t (b)}{1-\beta_1^t}\\[10pt]
\hat{v}_t(b) = \frac{v_t (b)}{1-\beta_2^t}\\[10pt]
b \leftarrow b - \eta \frac{\hat{m}_t(b)}{\sqrt{\hat{v}_t(b)}+\epsilon} \\[10pt]

$$


In [None]:
# Initialize weights
W1, b1, W2, b2 = initialize(input_layer_neurons, hidden_layer_neurons, output_neuron)

# Hyperparameters
eta = 1e-3  # learning rate
beta1 = 0.9  # decay rate of the first-moment (mean of gradients) estimate
beta2 = 0.999  # decay rate of the second-moment (mean of squared gradients) estimate
epsilon = 1e-8  # keeps the denominator away from zero
losses = []
# Report progress about ten times per run (a fixed interval larger than
# `epochs` would never log anything).
log_every = max(1, epochs // 10)

# Initialize moment estimates (all zeros, same shapes as the parameters)
m_W1, v_W1 = torch.zeros_like(W1), torch.zeros_like(W1)
m_b1, v_b1 = torch.zeros_like(b1), torch.zeros_like(b1)
m_W2, v_W2 = torch.zeros_like(W2), torch.zeros_like(W2)
m_b2, v_b2 = torch.zeros_like(b2), torch.zeros_like(b2)

# Iterate over the configured number of epochs. The step counter t starts at 1
# because the bias corrections divide by (1 - beta ** t), which is 0 at t = 0.
# (The bound must be `epochs`; `epoch` is only a leftover loop variable from
# earlier cells.)
for t in range(1, epochs + 1):
    z1, a1, z2, a2 = forward_prop(X, W1, b1, W2, b2)

    # Half mean squared error over the full batch
    loss = torch.mean((y - a2) ** 2) / 2
    losses.append(loss.item())

    # forward_prop rebuilds the autograd graph every iteration, so the graph
    # does not need to be retained after backward().
    loss.backward()
    with torch.no_grad():
        # Update biased first moment estimate
        m_W1 = beta1 * m_W1 + (1 - beta1) * W1.grad
        m_b1 = beta1 * m_b1 + (1 - beta1) * b1.grad
        m_W2 = beta1 * m_W2 + (1 - beta1) * W2.grad
        m_b2 = beta1 * m_b2 + (1 - beta1) * b2.grad

        # Update biased second moment estimate
        v_W1 = beta2 * v_W1 + (1 - beta2) * W1.grad ** 2
        v_b1 = beta2 * v_b1 + (1 - beta2) * b1.grad ** 2
        v_W2 = beta2 * v_W2 + (1 - beta2) * W2.grad ** 2
        v_b2 = beta2 * v_b2 + (1 - beta2) * b2.grad ** 2

        # Bias-correction factors are the same for every parameter at step t
        m_correction = 1 - beta1 ** t
        v_correction = 1 - beta2 ** t

        # Compute bias-corrected first moment estimate
        m_hat_W1 = m_W1 / m_correction
        m_hat_b1 = m_b1 / m_correction
        m_hat_W2 = m_W2 / m_correction
        m_hat_b2 = m_b2 / m_correction

        # Compute bias-corrected second moment estimate
        v_hat_W1 = v_W1 / v_correction
        v_hat_b1 = v_b1 / v_correction
        v_hat_W2 = v_W2 / v_correction
        v_hat_b2 = v_b2 / v_correction

        # Update weights
        W1 -= eta * m_hat_W1 / (torch.sqrt(v_hat_W1) + epsilon)
        b1 -= eta * m_hat_b1 / (torch.sqrt(v_hat_b1) + epsilon)
        W2 -= eta * m_hat_W2 / (torch.sqrt(v_hat_W2) + epsilon)
        b2 -= eta * m_hat_b2 / (torch.sqrt(v_hat_b2) + epsilon)

        # Gradients accumulate across backward() calls; reset after use
        for param in (W1, b1, W2, b2):
            param.grad.zero_()

    if t % log_every == 0:
        print(f"Epoch {t}, Loss: {loss.item()}")


In [None]:
# Scatter the four XOR inputs, coloured by their target label
plt.scatter(X[:, 0], X[:, 1], c=y.numpy().ravel(), cmap='viridis', marker='o', s=100, edgecolor='k')
plt.title('XOR Problem ADAM')

# Dense 100 x 100 mesh over [-0.5, 1.5]^2 on which the network is evaluated
grid_axis = np.linspace(-0.5, 1.5, 100)
xx, yy = np.meshgrid(grid_axis, grid_axis)
grid = torch.tensor(np.column_stack((xx.ravel(), yy.ravel())), dtype=torch.float32)

# Network output at every mesh point, reshaped back onto the mesh
with torch.no_grad():
    a2_grid = forward_prop(grid, W1, b1, W2, b2)[-1]
a2_grid = a2_grid.numpy().reshape(xx.shape)

# Shade the two decision regions, split where the output crosses 0.5
plt.contourf(xx, yy, a2_grid, levels=[0, 0.5, 1], alpha=0.2, colors=['blue', 'yellow'])
plt.colorbar()
plt.show()

# Training-loss curve
plt.plot(losses)
plt.title('Loss over iterations')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

# Threshold the network output on the training inputs at 0.5
final_output = forward_prop(X, W1, b1, W2, b2)[-1]
predictions = np.where(final_output.detach().numpy() > 0.5, 1, 0)

print("Final predictions:\n", predictions)

# Stochastic Gradient Descent (SGD) 

Instead of computing each update on the whole dataset, we pass one mini-batch of data at a time.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_moons, make_regression
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Generate dataset with more samples

In [None]:
# Two-moons binary classification data, generated with a fixed random_state
n_samples = 5000
X_cls, y_cls = make_moons(
    n_samples=n_samples,
    noise=0.10,
    random_state=42,
)

# Convert to tensors: float32 features, integer (long) class labels
X_cls = torch.tensor(X_cls, dtype=torch.float32)
y_cls = torch.tensor(y_cls, dtype=torch.long)


# Split the dataset

In [None]:
# Two-stage split into train / val / test (roughly 70% / 20% / 10%):
# first hold out 30% of the data, then send 33% of that held-out part to test.
X_train_cls, X_temp_cls, y_train_cls, y_temp_cls = train_test_split(
    X_cls,
    y_cls,
    test_size=0.3,
    random_state=42,
)
X_val_cls, X_test_cls, y_val_cls, y_test_cls = train_test_split(
    X_temp_cls,
    y_temp_cls,
    test_size=0.33,
    random_state=42,
)


# Plot the dataset

In [None]:
# Labels as a flat NumPy array, used below to build per-class boolean masks
y_train_cls_np = y_train_cls.numpy().ravel()

# Select the training points of each class
class_0_points = X_train_cls[y_train_cls_np == 0]
class_1_points = X_train_cls[y_train_cls_np == 1]

# One scatter series per class so the legend can label them
plt.figure(figsize=(10, 6))
scatter_0 = plt.scatter(class_0_points[:, 0], class_0_points[:, 1], color='blue', label='Class 0', s=10)
scatter_1 = plt.scatter(class_1_points[:, 0], class_1_points[:, 1], color='red', label='Class 1', s=10)
plt.title('Training Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend(handles=[scatter_0, scatter_1])
plt.grid(True)
plt.show()


In [None]:
# Set seeds for reproducibility. Weight initialisation (torch.rand) and the
# shuffling DataLoader both draw from PyTorch's generator, so it has to be
# seeded; seeding NumPy alone leaves them random.
np.random.seed(1)
torch.manual_seed(1)

# Network dimensions
input_layer_neurons = X_train_cls.shape[1]  # number of input features (columns)
hidden_layer_neurons = 3  # total of 3 hidden units
output_neuron = 1  # output size

# Load dataset into batches

In [None]:
# Pair each split's features with its labels so they are batched together
train_dataset_cls, val_dataset_cls, test_dataset_cls = (
    TensorDataset(features, labels)
    for features, labels in (
        (X_train_cls, y_train_cls),
        (X_val_cls, y_val_cls),
        (X_test_cls, y_test_cls),
    )
)

# Mini-batches of 32; only the training loader shuffles its samples
train_loader_cls = DataLoader(train_dataset_cls, batch_size=32, shuffle=True)
val_loader_cls, test_loader_cls = (
    DataLoader(dataset, batch_size=32, shuffle=False)
    for dataset in (val_dataset_cls, test_dataset_cls)
)


# initialize weights

In [None]:
def initialize(input_layer_neurons, hidden_layer_neurons, output_neuron):
    """Build the four trainable tensors of a one-hidden-layer network.

    Each tensor is sampled as ``2 * torch.rand(shape) - 1`` (uniform on
    [-1, 1)) and wrapped in ``torch.nn.Parameter``. The shapes are printed as
    a side effect.

    Args:
        input_layer_neurons: number of input features.
        hidden_layer_neurons: number of hidden units.
        output_neuron: number of output units.

    Returns:
        ``(W1, b1, W2, b2)`` with shapes (input, hidden), (1, hidden),
        (hidden, output) and (1, output).
    """
    # Shapes of the input -> hidden parameters
    w1_shape = (input_layer_neurons, hidden_layer_neurons)
    b1_shape = (1, hidden_layer_neurons)
    # Shapes of the hidden -> output parameters
    w2_shape = (hidden_layer_neurons, output_neuron)
    b2_shape = (1, output_neuron)

    # Sampling order (W1, b1, W2, b2) is kept so a seeded generator gives the
    # same values.
    W1 = torch.nn.Parameter(2 * torch.rand(w1_shape) - 1, requires_grad=True)
    b1 = torch.nn.Parameter(2 * torch.rand(b1_shape) - 1, requires_grad=True)

    # Let's print the shapes
    print(f"Weights shape from input to hidden layer w_1: {W1.shape}")
    print(f"Bias shape from input to hidden layer b_1: {b1.shape}")

    W2 = torch.nn.Parameter(2 * torch.rand(w2_shape) - 1, requires_grad=True)
    b2 = torch.nn.Parameter(2 * torch.rand(b2_shape) - 1, requires_grad=True)

    # Let's print the shapes
    print(f"Weights shape from hidden layer to output layer w_2: {W2.shape}")
    print(f"Bias shape from hidden layer to output layer b_2: {b2.shape}")

    return W1, b1, W2, b2

In [None]:
# Forward pass
def forward_prop(X, W1, b1, W2, b2):
    """Run one forward pass: ReLU hidden layer followed by a sigmoid output.

    Args:
        X: input tensor that is matrix-multiplied with ``W1``.
        W1, b1: weights and bias of the input -> hidden layer.
        W2, b2: weights and bias of the hidden -> output layer.

    Returns:
        Tuple ``(z1, a1, z2, a2)``: hidden pre-activation, hidden ReLU
        activation, output pre-activation and output sigmoid activation.
    """
    # Input -> hidden layer
    hidden_pre = X @ W1 + b1
    hidden_act = torch.relu(hidden_pre)

    # Hidden -> output layer
    out_pre = hidden_act @ W2 + b2
    out_act = torch.sigmoid(out_pre)

    return hidden_pre, hidden_act, out_pre, out_act

In [None]:
# Initialize weights
W1, b1, W2, b2 = initialize(input_layer_neurons, hidden_layer_neurons, output_neuron)

# Hyperparameters (the number of epochs is the notebook-level `epochs` variable)
eta = 1e-1  # learning rate
losses_train_cls = []
losses_val_cls = []
accuracies_train_cls = []
accuracies_val_cls = []

for epoch in range(epochs):
    epoch_losses_train = []
    epoch_correct_train = 0
    epoch_total_train = 0

    # Training phase: one parameter update per mini-batch
    for inputs, targets in train_loader_cls:
        z1, a1, z2, a2 = forward_prop(inputs, W1, b1, W2, b2)

        # Half mean squared error; integer labels are cast to a float column
        # so they line up with the (batch, 1) network output
        loss = torch.mean((targets.float().view(-1, 1) - a2) ** 2) / 2
        epoch_losses_train.append(loss.item())

        # forward_prop rebuilds the autograd graph for every batch, so the
        # graph does not need to be retained after backward().
        loss.backward()
        with torch.no_grad():
            for param in (W1, b1, W2, b2):
                # Plain gradient-descent step: p <- p - eta * dL/dp
                param -= eta * param.grad
                # Gradients accumulate across backward() calls; reset after use
                param.grad.zero_()

        # Training accuracy uses the outputs computed before this batch's update
        predicted_train = (a2 >= 0.5).float()
        epoch_correct_train += (predicted_train == targets.float().view_as(predicted_train)).sum().item()
        epoch_total_train += targets.size(0)

    # Mean of the per-batch losses and overall accuracy for the training set
    losses_train_cls.append(np.mean(epoch_losses_train))
    accuracy_train = epoch_correct_train / epoch_total_train
    accuracies_train_cls.append(accuracy_train)

    # Evaluation phase (validation): no parameter updates, no gradient tracking
    epoch_losses_val = []
    epoch_correct_val = 0
    epoch_total_val = 0

    with torch.no_grad():
        for inputs, targets in val_loader_cls:
            z1, a1, z2, a2 = forward_prop(inputs, W1, b1, W2, b2)
            loss_val = torch.mean((targets.float().view(-1, 1) - a2) ** 2) / 2
            epoch_losses_val.append(loss_val.item())

            # Calculate validation accuracy
            predicted_val = (a2 >= 0.5).float()
            epoch_correct_val += (predicted_val == targets.float().view_as(predicted_val)).sum().item()
            epoch_total_val += targets.size(0)

    # Mean of the per-batch losses and overall accuracy for the validation set
    losses_val_cls.append(np.mean(epoch_losses_val))
    accuracy_val = epoch_correct_val / epoch_total_val
    accuracies_val_cls.append(accuracy_val)

    # Print progress every epoch
    print(f"Epoch {epoch}, Train Loss: {losses_train_cls[-1]:.4f}, Val Loss: {losses_val_cls[-1]:.4f}, "
          f"Train Acc: {accuracy_train:.4f}, Val Acc: {accuracy_val:.4f}")


In [None]:
# Loss (left) and accuracy (right) per epoch, training vs. validation
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Each panel: (axes, metric name, training curve, validation curve)
panels = (
    (ax1, 'Loss', losses_train_cls, losses_val_cls),
    (ax2, 'Accuracy', accuracies_train_cls, accuracies_val_cls),
)
for ax, metric, train_curve, val_curve in panels:
    ax.plot(train_curve, label=f'Training {metric}')
    ax.plot(val_curve, label=f'Validation {metric}')
    ax.set_title(f'{metric} over epochs')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric)
    ax.legend()
    ax.grid(True)

plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()


In [None]:
# Scatter the held-out test points, coloured by their true class
plt.scatter(X_test_cls[:, 0], X_test_cls[:, 1], c=y_test_cls.numpy().ravel(), cmap='viridis', marker='o', s=100, edgecolor='k')
plt.title('Moon Problem Test set')

# Dense 100 x 100 mesh over [-3.5, 3.5] x [-2.5, 2.5] on which the network is evaluated
x_axis = np.linspace(-3.5, 3.5, 100)
y_axis = np.linspace(-2.5, 2.5, 100)
xx, yy = np.meshgrid(x_axis, y_axis)
grid = torch.tensor(np.column_stack((xx.ravel(), yy.ravel())), dtype=torch.float32)

# Network output at every mesh point, reshaped back onto the mesh
with torch.no_grad():
    a2_grid = forward_prop(grid, W1, b1, W2, b2)[-1]
a2_grid = a2_grid.numpy().reshape(xx.shape)

# Shade the two decision regions, split where the output crosses 0.5
plt.contourf(xx, yy, a2_grid, levels=[0, 0.5, 1], alpha=0.2, colors=['blue', 'yellow'])
plt.colorbar()
plt.show()


# Exercise
1. Experiment with SGD combined with Momentum, Nesterov momentum, and ADAM, and plot the decision surface for each.
2. Observe the key differences in the decision surfaces across the optimization strategies.