# Assignment 10 — Gradient Descent Optimization
California Housing — Batch, Stochastic, Mini‑Batch

## Import Libraries

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error


## Load and Inspect Dataset

In [None]:

# Fetch the California Housing data as pandas objects (as_frame=True) and keep
# the combined features+target DataFrame for the cells that follow.
data = fetch_california_housing(as_frame=True)
df = data["frame"]

# Preview the first rows.
df.head()


## Prepare Features and Target

In [None]:

# Separate the feature matrix from the target; the target is reshaped into a
# column vector so it lines up with the (n, 1) weight vectors used below.
X = df.drop('MedHouseVal', axis=1).values
y = df['MedHouseVal'].values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no test-set information leaks into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Prepend a constant column of ones so the first weight acts as an intercept.
# The features are standardised (zero mean) but the target is left on its
# original scale, so without this column the linear model is forced through
# the origin and cannot fit the target's offset.
X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

X_train.shape, y_train.shape


## Helper Functions — Cost and Predictions

In [None]:

def predict(X, W):
    """Return the linear-model predictions: the dot product of X with W."""
    y_hat = np.dot(X, W)
    return y_hat

def cost(X, y, W):
    """Return the half mean-squared-error of the linear model W on (X, y).

    Computes ``1 / (2 * m) * sum((X·W - y) ** 2)`` where ``m = len(y)``.
    """
    n_samples = len(y)
    residuals = predict(X, W) - y
    scale = 1 / (2 * n_samples)
    return scale * np.sum(residuals ** 2)


## Batch Gradient Descent

In [None]:

def batch_gradient_descent(X, y, lr=0.01, epochs=200):
    """Fit linear-regression weights with full-batch gradient descent.

    Args:
        X: 2-D feature matrix of shape (m, n).
        y: Targets; should be shaped (m, 1) so they line up with the
            (m, 1) predictions produced from the (n, 1) weight vector.
        lr: Learning rate applied to every update.
        epochs: Number of full-dataset gradient steps.

    Returns:
        A tuple ``(W, history)``: the final (n, 1) weight vector and a list
        holding the training cost recorded after each update.
    """
    n_samples, n_features = X.shape
    W = np.zeros((n_features, 1))
    history = []
    for _ in range(epochs):
        # Gradient of the half-MSE cost, averaged over the whole dataset.
        residuals = predict(X, W) - y
        gradient = (1 / n_samples) * X.T.dot(residuals)
        W -= lr * gradient
        history.append(cost(X, y, W))
    return W, history

# Train with full-batch gradient descent on the training split.
W_bgd, hist_bgd = batch_gradient_descent(
    X=X_train,
    y=y_train,
    lr=0.05,
    epochs=150,
)


## Stochastic Gradient Descent

In [None]:

def stochastic_gradient_descent(X, y, lr=0.01, epochs=15, shuffle=True):
    """Fit linear-regression weights with per-sample (stochastic) updates.

    Args:
        X: 2-D feature matrix of shape (m, n).
        y: Targets; should be shaped (m, 1) so each one-row slice lines up
            with the (1, 1) prediction for that sample.
        lr: Learning rate applied to every single-sample update.
        epochs: Number of passes over the dataset.
        shuffle: When True (default), visit the samples in a fresh random
            order every epoch. When False, visit them in storage order on
            every pass.

    Returns:
        A tuple ``(W, history)``: the final (n, 1) weight vector and a list
        holding the full-dataset cost recorded at the end of each epoch.
    """
    m, n = X.shape
    W = np.zeros((n, 1))
    history = []
    for _ in range(epochs):
        # Reshuffle each epoch so the updates are not applied in the same
        # fixed sequence on every pass (which can bias the trajectory when
        # the data has any ordering).
        order = np.random.permutation(m) if shuffle else np.arange(m)
        for i in order:
            # One-row slices keep xi 2-D (1, n) and yi 2-D (1, 1).
            xi = X[i:i + 1]
            yi = y[i:i + 1]
            grad = xi.T.dot(predict(xi, W) - yi)
            W -= lr * grad
        history.append(cost(X, y, W))
    return W, history

# Train with stochastic (one-sample-at-a-time) gradient descent.
W_sgd, hist_sgd = stochastic_gradient_descent(
    X=X_train,
    y=y_train,
    lr=0.01,
    epochs=40,
)


## Mini‑Batch Gradient Descent

In [None]:

def mini_batch_gradient_descent(X, y, lr=0.02, epochs=60, batch_size=32):
    """Fit linear-regression weights with mini-batch gradient descent.

    Args:
        X: 2-D feature matrix of shape (m, n).
        y: Targets; should be shaped (m, 1) so they line up with the
            per-batch predictions.
        lr: Learning rate applied to every mini-batch update.
        epochs: Number of passes over the dataset.
        batch_size: Number of samples per update; the last batch of an epoch
            may be smaller.

    Returns:
        A tuple ``(W, history)``: the final (n, 1) weight vector and a list
        holding the full-dataset cost recorded at the end of each epoch.
    """
    n_samples, n_features = X.shape
    W = np.zeros((n_features, 1))
    history = []
    for _ in range(epochs):
        # Draw a new random visiting order for this epoch.
        order = np.random.permutation(n_samples)
        for start in range(0, n_samples, batch_size):
            batch_idx = order[start:start + batch_size]
            xb = X[batch_idx]
            yb = y[batch_idx]
            # Average the gradient over the actual batch length.
            residuals = predict(xb, W) - yb
            gradient = (1 / len(yb)) * xb.T.dot(residuals)
            W -= lr * gradient
        history.append(cost(X, y, W))
    return W, history

# Train with mini-batch gradient descent using the function's default
# learning rate, epoch count and batch size.
W_mbgd, hist_mbgd = mini_batch_gradient_descent(
    X=X_train,
    y=y_train,
)


## Compare Convergence Curves

In [None]:

# Overlay the per-epoch training cost of the three optimisers.
plt.figure(figsize=(8, 4))
plt.plot(hist_bgd, label="Batch GD")
plt.plot(hist_sgd, label="Stochastic GD")
plt.plot(hist_mbgd, label="Mini‑Batch GD")
# Each history entry is recorded once per epoch (for SGD and mini-batch one
# epoch contains many weight updates), so the x-axis counts epochs.
plt.xlabel("Epochs")
plt.ylabel("Cost")
plt.title("Convergence Comparison")
plt.legend()
plt.show()


## Evaluate on Test Data

In [None]:

def rmse(X, y, W):
    """Return the root-mean-squared error of the linear model W on (X, y)."""
    mse = mean_squared_error(y, predict(X, W))
    return np.sqrt(mse)

# Score each trained weight vector on the held-out test split.
rmse_bgd, rmse_sgd, rmse_mbgd = (
    rmse(X_test, y_test, weights) for weights in (W_bgd, W_sgd, W_mbgd)
)

rmse_bgd, rmse_sgd, rmse_mbgd


## Discussion — Write Your Observations
- Which converged fastest?
- Which curve was smoothest?
- Which had the most noise?
- Which optimizer would you choose and why?