In this notebook we'll test how Gradient Descent impacts the results of our previous regression approach.

In [1]:
from sklearn.datasets import fetch_california_housing

# Step 1: Load the California housing dataset and pull out features and target
housing = fetch_california_housing()
X, y = housing.data, housing.target

# Report the dimensions of the feature matrix and the target vector
print("Data shape:", X.shape)
print("Target shape:", y.shape)


Data shape: (20640, 8)
Target shape: (20640,)


In [2]:
from sklearn.model_selection import train_test_split

# Step 2: Hold out 20% of the rows for testing; the fixed random_state keeps
# the split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


Training data shape: (16512, 8)
Testing data shape: (4128, 8)


In [3]:
# Sanity check: display the shape of each of the four splits
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((16512, 8), (4128, 8), (16512,), (4128,))

In [4]:
# Step 3: Bring feature scaling into the picture
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data never influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Using scikit-learn's `SGDRegressor` model first

In [7]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 4: Fit scikit-learn's SGD regressor on the scaled training data
model = SGDRegressor(
    max_iter=1000,
    eta0=0.01,
    learning_rate='invscaling',
    penalty='l2',
    random_state=42,
)
model.fit(X_train_scaled, y_train)

# Step 5: Predict on the scaled test set
y_pred = model.predict(X_test_scaled)

# Step 6: Evaluate the scikit-learn model on the held-out data
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 0.550598777585777
R^2 Score: 0.5798267665069695


# From Scratch
Let's try to code the algorithm from scratch

In [9]:
import numpy as np
# -------------------------------
# Compute Mean Squared Error loss with L2 regularization
# -------------------------------
def compute_loss(X, y, w, b, alpha):
    """Return the L2-regularized half mean-squared-error of a linear model.

    The value computed is
    ``0.5 * (mean((y - (X.dot(w) + b)) ** 2) + alpha * sum(w ** 2))``.

    Parameters
    ----------
    X : array of shape (m, n)
        Feature matrix.
    y : array of shape (m,) or (m, 1)
        Target values.
    w : array of shape (n,) or (n, 1)
        Model weights.
    b : float
        Model bias (intercept).
    alpha : float
        L2 regularization strength.

    Returns
    -------
    float
        The combined loss.
    """
    # Flatten predictions and targets before subtracting. With a column-vector
    # w (n, 1) the predictions are (m, 1); subtracting them from a 1-D y of
    # shape (m,) would silently broadcast to an (m, m) matrix and corrupt the
    # mean squared error.
    y_pred = np.ravel(X.dot(w) + b)
    residual = np.ravel(y) - y_pred
    # Mean squared error over the samples
    mse = np.mean(residual ** 2)
    # L2 penalty (Ridge regularization) on the weights only, not the bias
    reg = alpha * np.sum(w ** 2)
    # Combined loss, halved so the gradient has no stray factor of 2
    return 0.5 * (mse + reg)

# -------------------------------
# Train linear regression model using Mini-Batch Stochastic Gradient Descent
# -------------------------------
def train_sgd_regressor(
    X,
    y,
    max_iter=100,
    batch_size=32,
    eta0=0.01,
    alpha=0.0001,
    power_t=0.25,
    learning_rate='invscaling',
    random_state=None,
):
    """Fit a linear model with mini-batch SGD on an L2-regularized squared error.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Training features.
    y : array-like of shape (m,) or (m, 1)
        Training targets.
    max_iter : int
        Number of epochs (full passes over the data).
    batch_size : int
        Number of samples per mini-batch; the last batch of an epoch may be
        smaller when ``m`` is not a multiple of ``batch_size``.
    eta0 : float
        Initial learning rate.
    alpha : float
        L2 regularization strength applied to the weights (not the bias).
    power_t : float
        Exponent of the inverse-scaling schedule.
    learning_rate : str
        ``'invscaling'`` uses ``eta0 / t ** power_t`` where ``t`` is the
        1-based count of updates performed so far; any other value keeps the
        step size constant at ``eta0``.
    random_state : int, numpy.random.RandomState or None
        Seed (or generator) for the weight initialization and the per-epoch
        shuffles. ``None`` draws from NumPy's global ``np.random`` state.

    Returns
    -------
    w : ndarray of shape (n, 1)
        Learned weights.
    b : float
        Learned bias.
    """
    # Accept any array-like input and keep the targets as a column vector so
    # they line up with the (batch, 1) predictions.
    X = np.asarray(X)
    y = np.asarray(y).reshape(-1, 1)
    # Number of samples (m) and features (n)
    m, n = X.shape

    # Source of randomness: the global NumPy state unless a seed/generator is given
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Random initial weights (n x 1) and zero bias
    w = rng.randn(n, 1)
    b = 0.0

    # Running count of parameter updates. Deriving it from m // batch_size
    # undercounts the batches per epoch when m is not a multiple of
    # batch_size, which makes consecutive epochs reuse the same step index.
    t = 0

    for _ in range(max_iter):
        # Reshuffle the samples at the start of every epoch
        indices = rng.permutation(m)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        for start in range(0, m, batch_size):
            end = start + batch_size
            xb = X_shuffled[start:end]
            yb = y_shuffled[start:end]

            # Step size for this update
            t += 1
            eta = eta0 / (t ** power_t) if learning_rate == 'invscaling' else eta0

            # Forward pass and prediction error on the mini-batch
            error = xb.dot(w) + b - yb

            # Gradients of the regularized half-MSE; the bias is not penalized
            grad_w = xb.T.dot(error) / len(xb) + alpha * w
            grad_b = np.mean(error)

            # Gradient step
            w -= eta * grad_w
            b -= eta * grad_b

    # Final learned weights and bias
    return w, b

# -------------------------------
# Predict using learned linear model
# -------------------------------
def predict(X, w, b):
    """Return the linear model's predictions ``X.dot(w) + b``.

    X holds the input features, w the learned weights and b the bias.
    """
    # Weighted sum of the features, then shift by the bias
    linear_term = X.dot(w)
    return linear_term + b


# -------------------------------
# Train the custom SGD model
# -------------------------------
# train_sgd_regressor draws its initial weights and its per-epoch shuffles
# from NumPy's random module, so seed the global generator first; otherwise
# every Restart & Run All produces different metrics.
np.random.seed(42)

w, b = train_sgd_regressor(
    X_train_scaled, y_train,   # Scaled training data
    max_iter=100,              # Number of epochs
    batch_size=64,             # Mini-batch size
    eta0=0.01,                 # Initial learning rate
    alpha=0.0001,              # L2 regularization strength
    learning_rate='invscaling' # Learning rate schedule
)

# -------------------------------
# Make predictions and evaluate model
# -------------------------------

# Predict on the scaled test set with the learned weights and bias
y_pred = predict(X_test_scaled, w, b)

# Evaluate with the same metrics used for the scikit-learn model
# (Mean Squared Error and R^2) so the two runs are directly comparable
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 0.5659752078049892
R^2 Score: 0.5680926968580774
