# Assignment 3

This project compares three feedforward neural network training algorithms: Stochastic Gradient Descent (SGD), Scaled Conjugate Gradient (SCG), and LeapFrog. Using six datasets—three for classification and three for regression—the study evaluates convergence speed, stability, and predictive accuracy. Each network has a single hidden layer, with experiments across different hidden layer sizes and hyperparameters. Performance is measured using accuracy and F1-score for classification, and MSE, RMSE, and R² for regression, alongside training time and convergence behavior. The results highlight the strengths and weaknesses of each optimizer across problems of varying complexity.

## Data

In [4]:
! pip3 install -r requirements.txt



In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.datasets import load_iris
import torch
import torch.nn as nn
import torch.optim as optim
from typing import List, Tuple, Optional

### Classification

In [14]:
# Iris Dataset
iris = sns.load_dataset("iris")
print(iris.head())

X = iris.drop("species", axis=1)
y = iris["species"]

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


### Function approx.

## Pre Processing

### Classification

In [15]:
# Iris Dataset

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Function Approx.

## Model

In [16]:
class FeedforwardNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, activation_fn=nn.ReLU):
        super(FeedforwardNN, self).__init__()
        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.activation = activation_fn()
        self.output = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.hidden(x)
        x = self.activation(x)
        x = self.output(x)
        return x

### Training Algorithms

In [17]:
# SGD Training Function
def train_sgd(model, X_train, y_train, X_test, y_test, epochs=50, lr=0.01, batch_size=32):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    train_losses = []
    test_losses = []

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long)

    dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for xb, yb in loader:
            optimizer.zero_grad()
            outputs = model(xb)
            loss = criterion(outputs, yb)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * xb.size(0)
        train_loss = running_loss / len(loader.dataset)
        train_losses.append(train_loss)

        # Evaluate on test set
        model.eval()
        with torch.no_grad():
            outputs = model(X_test_tensor)
            loss = criterion(outputs, y_test_tensor)
            test_losses.append(loss.item())

    return train_losses, test_losses

In [24]:
model = FeedforwardNN(input_dim=4, hidden_dim=16, output_dim=3)
train_losses, test_losses = train_sgd(model, X_train_scaled, y_train, X_test_scaled, y_test)
print("Train Losses:", train_losses)
print("Test Losses:", test_losses)

Train Losses: [1.1124460776646932, 1.0765247583389281, 1.04386301835378, 1.0144209702809652, 0.9876567800839742, 0.9623398423194885, 0.9400183161099752, 0.9194311340649922, 0.9003729701042176, 0.8827384432156881, 0.8662549535433451, 0.8513033986091614, 0.837253475189209, 0.8241028110186259, 0.8117499510447185, 0.7997944355010986, 0.7890545566876729, 0.7787570675214132, 0.7689792116483053, 0.7597044746081034, 0.7508195241292318, 0.7420593738555908, 0.7340576489766438, 0.726137085755666, 0.7183952450752258, 0.7110705574353536, 0.7041035016377767, 0.6971396883328755, 0.6904532392819722, 0.6839908321698507, 0.6777046680450439, 0.6715979139010112, 0.6656497041384379, 0.6597405234972636, 0.6540937701861064, 0.64858078956604, 0.6431231141090393, 0.6379727005958558, 0.632970949014028, 0.6280665834744771, 0.6230239272117615, 0.6182572523752848, 0.6136135737101237, 0.6090578039487203, 0.6047172824541728, 0.6004226088523865, 0.5961479902267456, 0.5919347723325094, 0.5878912289937337, 0.5840573946

In [28]:
# SCG Training Function
def train_scg(
    model: nn.Module,
    X_train: torch.Tensor,
    y_train: torch.Tensor,
    X_test: torch.Tensor,
    y_test: torch.Tensor,
    max_epochs: int = 1000,
    tolerance: float = 1e-6,
    sigma: float = 5e-5,
    lambda_init: float = 5e-7,
    verbose: bool = True,
    eval_freq: int = 10
) -> Tuple[List[float], List[float]]:
    """
    Train a PyTorch neural network using Scaled Conjugate Gradient algorithm.
    
    Args:
        model: PyTorch neural network model
        X_train: Training input data
        y_train: Training target data
        X_test: Test input data  
        y_test: Test target data
        max_epochs: Maximum number of training epochs
        tolerance: Convergence tolerance for gradient norm
        sigma: Parameter for Hessian approximation
        lambda_init: Initial regularization parameter
        verbose: Whether to print progress
        eval_freq: Frequency of evaluation and printing
        
    Returns:
        Tuple of (train_losses, test_losses)
    """
    
    device = next(model.parameters()).device
    X_train = X_train.to(device)
    y_train = y_train.to(device)
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    
    # Determine loss function based on model output
    if y_train.dtype == torch.long or (y_train.ndim == 1 and len(torch.unique(y_train)) <= 10):
        criterion = nn.CrossEntropyLoss()
        task_type = 'classification'
    else:
        criterion = nn.MSELoss()
        task_type = 'regression'
    
    # Get total number of parameters
    n_params = sum(p.numel() for p in model.parameters())
    
    # Helper functions
    def get_weights():
        """Extract all model parameters as a single vector"""
        return torch.cat([p.view(-1) for p in model.parameters()])
    
    def set_weights(weights):
        """Set model parameters from a single vector"""
        idx = 0
        for p in model.parameters():
            param_length = p.numel()
            p.data = weights[idx:idx + param_length].view(p.shape)
            idx += param_length
    
    def compute_loss_and_gradient(weights):
        """Compute loss and gradient for given weights"""
        set_weights(weights)
        model.zero_grad()
        
        outputs = model(X_train)
        if task_type == 'classification':
            loss = criterion(outputs, y_train)
        else:
            loss = criterion(outputs, y_train)
        
        loss.backward()
        
        # Extract gradients
        grad = torch.cat([p.grad.view(-1) for p in model.parameters()])
        
        return loss.item(), grad
    
    def evaluate_model():
        """Evaluate model on train and test sets"""
        model.eval()
        with torch.no_grad():
            # Training loss
            train_outputs = model(X_train)
            if task_type == 'classification':
                train_loss = criterion(train_outputs, y_train).item()
            else:
                train_loss = criterion(train_outputs, y_train).item()
            
            # Test loss
            test_outputs = model(X_test)
            if task_type == 'classification':
                test_loss = criterion(test_outputs, y_test).item()
            else:
                test_loss = criterion(test_outputs, y_test).item()
        
        model.train()
        return train_loss, test_loss
    
    # Initialize SCG variables
    w_k = get_weights()
    f_k, g_k = compute_loss_and_gradient(w_k)
    r_k = g_k.clone()
    r_k_prev = None  # Will store previous gradient for beta calculation
    p_k = -r_k.clone()
    
    lambda_k = lambda_init
    lambda_bar = 0.0
    success = True
    k = 0
    
    train_losses = []
    test_losses = []
    
    # Initial evaluation
    train_loss, test_loss = evaluate_model()
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    
    if verbose:
        print(f"Initial - Train Loss: {train_loss:.6f}, Test Loss: {test_loss:.6f}")
    
    # Main SCG loop
    for epoch in range(max_epochs):
        # Step 1: Calculate scaling parameter if successful step
        if success:
            sigma_k = sigma / torch.sqrt(torch.dot(p_k, p_k))
        
        # Step 2: Approximate Hessian-vector product
        w_temp = w_k + sigma_k * p_k
        _, g_temp = compute_loss_and_gradient(w_temp)
        s_k = (g_temp - g_k) / sigma_k
        
        # Step 3 & 4: Scale the search direction
        delta_k = torch.dot(p_k, s_k)
        
        if delta_k <= 0:
            s_k = s_k + (lambda_k - delta_k) * p_k
            delta_k = lambda_k * torch.dot(p_k, p_k)
            lambda_k = 2 * lambda_k
        
        # Step 5: Calculate step length
        mu_k = torch.dot(p_k, r_k)
        alpha_k = mu_k / delta_k
        
        # Step 6: Calculate comparison parameter
        Delta_k = -alpha_k * mu_k
        
        # Step 7: Update weights and evaluate
        w_new = w_k + alpha_k * p_k
        f_new, _ = compute_loss_and_gradient(w_new)
        Delta_f = f_new - f_k
        
        # Step 8: Test for successful reduction
        if Delta_f < 0.25 * Delta_k:
            success = True
            lambda_bar = 0
            
            if Delta_f >= 0.75 * Delta_k:
                lambda_k = lambda_k / 4
            
            # Accept the step
            w_k = w_new
            f_k = f_new
            r_k_prev = r_k.clone()  # Store previous gradient
            _, g_k = compute_loss_and_gradient(w_k)
            r_k = g_k.clone()
            lambda_bar = lambda_bar + lambda_k
            lambda_k = lambda_bar
            
        else:
            success = False
            lambda_bar = lambda_bar + lambda_k
            lambda_k = lambda_bar
        
        # Step 9: Update search direction (only if successful)
        if success:
            # Check for restart condition
            if k % n_params == 0 or r_k_prev is None:
                p_k = -r_k.clone()  # Restart with steepest descent
            else:
                # Polak-Ribiere formula
                beta_k = torch.dot(r_k, r_k - r_k_prev) / (torch.dot(r_k_prev, r_k_prev) + 1e-10)
                p_k = -r_k + beta_k * p_k
            
            k += 1
            
            # Check convergence
            grad_norm = torch.norm(r_k).item()
            if grad_norm < tolerance:
                if verbose:
                    print(f"Converged at epoch {epoch}, gradient norm: {grad_norm:.2e}")
                break
            
            # Evaluate and store losses
            if epoch % eval_freq == 0 or epoch == max_epochs - 1:
                train_loss, test_loss = evaluate_model()
                train_losses.append(train_loss)
                test_losses.append(test_loss)
                
                if verbose:
                    print(f"Epoch {epoch:4d} - Train Loss: {train_loss:.6f}, "
                          f"Test Loss: {test_loss:.6f}, Grad Norm: {grad_norm:.2e}")
    
    # Final evaluation if not already done
    if (max_epochs - 1) % eval_freq != 0:
        train_loss, test_loss = evaluate_model()
        train_losses.append(train_loss)
        test_losses.append(test_loss)
    
    if verbose:
        print(f"Training completed. Final - Train Loss: {train_losses[-1]:.6f}, "
              f"Test Loss: {test_losses[-1]:.6f}")
    
    return train_losses, test_losses

In [32]:
model = FeedforwardNN(input_dim=4, hidden_dim=16, output_dim=3)
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_losses, test_losses = train_scg(
    model=model,
    X_train=X_train_tensor,
    y_train=y_train_tensor,
    X_test=X_test_tensor,
    y_test=y_test_tensor,
    max_epochs=500,
    tolerance=1e-5,
    verbose=True,
    eval_freq=1
)
print("Train Losses:", train_losses)
print("Test Losses:", test_losses)

Initial - Train Loss: 1.319261, Test Loss: 1.270033
Training completed. Final - Train Loss: 1.319261, Test Loss: 1.270033
Train Losses: [1.3192613124847412]
Test Losses: [1.2700326442718506]


### Visualisatin and Results