# Lab Assignment 3 - Machine Learning (UML501)

## Q1: K-Fold Cross Validation for Multiple Linear Regression (Least Squares Fit)

Download the USA Housing dataset and implement **5-fold cross validation** using Least Squares Error Fit.

In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split

# Step a: Load dataset
df = pd.read_csv('USA_Housing.csv')

# Separate input features (X) and target (y).
# Only numeric columns are kept as features: a free-text column (e.g. an
# address field, if the CSV has one) cannot be standardized or regressed on.
X = df.drop(columns="Price").select_dtypes(include="number").to_numpy()
y = df["Price"].to_numpy().reshape(-1, 1)

# Step b: Scale input features
# NOTE(review): the scaler is fit on the full dataset before the folds are
# formed, so each fold's held-out rows contribute to the scaling statistics.
# X_scaled is kept as a module-level name because later cells reuse it.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step c: 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_beta = None
best_r2 = -np.inf
r2_scores = []

# Step d: Perform CV
for fold, (train_idx, test_idx) in enumerate(kf.split(X_scaled)):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Add bias term (a leading column of ones so beta[0] is the intercept)
    X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    # Compute beta using Least Squares.
    # lstsq solves min ||X beta - y||^2 directly instead of forming and
    # inverting X^T X, so it stays well-behaved when features are collinear
    # (where an explicit inverse would raise or lose precision).
    beta, *_ = np.linalg.lstsq(X_train_bias, y_train, rcond=None)

    # Predictions
    y_pred = X_test_bias @ beta

    # R² score
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    print(f"Fold {fold+1}: R² = {r2:.4f}")

    # Remember the coefficients of the best-scoring fold
    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("\nAverage R² across folds:", np.mean(r2_scores))
print("Best R² score:", best_r2)

# Step e: Evaluate the best CV beta on a 70% / 30% split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Prepend the intercept column of ones to both partitions.
X_train_bias = np.hstack([np.ones((len(X_train), 1)), X_train])
X_test_bias = np.hstack([np.ones((len(X_test), 1)), X_test])

# No refit happens here: both predictions reuse best_beta from the CV loop.
y_train_pred = X_train_bias.dot(best_beta)
y_test_pred = X_test_bias.dot(best_beta)

print("\nFinal Model Performance:")
for label, actual, predicted in (
    ("Train R²:", y_train, y_train_pred),
    ("Test R²:", y_test, y_test_pred),
):
    print(label, r2_score(actual, predicted))


## Q2: Concept of Validation Set (Gradient Descent Optimization)

Use the same dataset, split into Training (56%), Validation (14%), and Test (30%). Train using **Gradient Descent** with learning rates {0.001, 0.01, 0.1, 1}.

In [None]:

# Gradient Descent Implementation for Linear Regression
def gradient_descent(X, y, lr=0.01, epochs=1000):
    """Fit linear-regression coefficients with full-batch gradient descent.

    Minimizes the mean squared error ``mean((X @ beta - y) ** 2)`` starting
    from an all-zero coefficient vector.

    Args:
        X: Design matrix of shape (m, n). A bias column, if wanted, must
            already be part of ``X``; none is added here.
        y: Targets, given either as shape (m,) or as a column (m, 1).
        lr: Step size applied to the gradient on every epoch.
        epochs: Number of full-batch updates to perform.

    Returns:
        Coefficient array of shape (n, 1).

    Raises:
        ValueError: If ``y`` does not hold exactly one target per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    # Force a column vector: a 1-D y would broadcast (m, 1) - (m,) into an
    # (m, m) residual matrix and break the in-place update of beta.
    y = np.asarray(y, dtype=float).reshape(-1, 1)

    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError(
            f"X has {m} rows but y has {y.shape[0]} targets"
        )

    beta = np.zeros((n, 1))
    for _ in range(epochs):
        # Gradient of the mean squared error with respect to beta.
        gradients = (2 / m) * X.T @ (X @ beta - y)
        beta -= lr * gradients
    return beta

# Split into train (56%), validation (14%), test (30%)
# 30% is held out for testing first; 20% of the remaining 70% (14% of the
# whole dataset) then becomes the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Add bias
X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_val_bias = np.c_[np.ones((X_val.shape[0], 1)), X_val]
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

lrs = [0.001, 0.01, 0.1, 1]
best_beta, best_r2_val = None, -np.inf

for lr in lrs:
    # Overflow / invalid-value warnings are expected when a step size is too
    # large; silence them here and detect the divergence explicitly below.
    with np.errstate(over="ignore", invalid="ignore"):
        beta = gradient_descent(X_train_bias, y_train, lr=lr, epochs=1000)
        val_pred = X_val_bias @ beta
        test_pred = X_test_bias @ beta

    # A diverged run yields inf/NaN predictions, which r2_score rejects with
    # a ValueError. Skip that learning rate instead of aborting the sweep.
    if not (np.isfinite(val_pred).all() and np.isfinite(test_pred).all()):
        print(f"Learning Rate {lr}: diverged (non-finite predictions), skipped")
        continue

    r2_val = r2_score(y_val, val_pred)
    r2_test = r2_score(y_test, test_pred)
    print(f"Learning Rate {lr}: Validation R² = {r2_val:.4f}, Test R² = {r2_test:.4f}")
    # Model selection uses the validation score only; the test score is
    # printed for reporting.
    if r2_val > best_r2_val:
        best_r2_val = r2_val
        best_beta = beta

print("\nBest Beta coefficients found with Validation R²:", best_r2_val)


## Q3: Pre-processing and Multiple Linear Regression (Car Price Prediction)

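Download the Car Price dataset (`imports-85.data`), impute the missing values, convert the categorical columns to numbers, and scale the input features. Train a linear regression model before and after PCA and compare the test R² scores.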

In [None]:

# Load Car Price dataset
# The raw file has no header row, so the column names are supplied here and
# "?" entries are read as NaN.
columns = ["symboling","normalized_losses","make","fuel_type","aspiration","num_doors",
           "body_style","drive_wheels","engine_location","wheel_base","length","width",
           "height","curb_weight","engine_type","num_cylinders","engine_size","fuel_system",
           "bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg","highway_mpg","price"]

car_df = pd.read_csv("imports-85.data", names=columns, na_values="?")

# Step 1: Handle missing values
# Rows without a target are useless for regression, so they are dropped.
car_df = car_df.dropna(subset=["price"])
# Numeric columns: impute with the column median.
car_df = car_df.fillna(car_df.median(numeric_only=True))
# The median fill above skips text columns (such as num_doors), so any NaN
# left at this point is imputed with the column's most frequent value.
# Otherwise it would survive the conversions below and end up in the
# feature matrix.
for col in car_df.columns[car_df.isna().any()]:
    most_frequent = car_df[col].mode(dropna=True)
    if not most_frequent.empty:
        car_df[col] = car_df[col].fillna(most_frequent.iloc[0])

# Step 2: Convert categorical values
# Number words -> integers. map() yields a numeric column directly.
door_words = {"two": 2, "four": 4}
cylinder_words = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}
car_df["num_doors"] = car_df["num_doors"].map(door_words)
car_df["num_cylinders"] = car_df["num_cylinders"].map(cylinder_words)

# One-hot encode the multi-valued style columns
car_df = pd.get_dummies(car_df, columns=["body_style","drive_wheels"])
from sklearn.preprocessing import LabelEncoder
# Integer-code the remaining text columns (a fresh encoder per column)
for col in ["make","aspiration","engine_location","fuel_type"]:
    car_df[col] = LabelEncoder().fit_transform(car_df[col].astype(str))

# Binary flags: 1 when the value contains the substring, 0 otherwise
car_df["fuel_system"] = car_df["fuel_system"].apply(lambda x: 1 if "pfi" in str(x) else 0)
car_df["engine_type"] = car_df["engine_type"].apply(lambda x: 1 if "ohc" in str(x) else 0)

# Step 3: Features and target
X_car = car_df.drop("price", axis=1)
y_car = car_df["price"].astype(float)

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

# Standardize every input feature.
# NOTE(review): the scaler and the PCA below are both fit on the full dataset
# before it is split, so test rows influence the fitted transforms.
car_scaler = StandardScaler()
X_car_scaled = car_scaler.fit_transform(X_car)

# The same split settings are used for both experiments so that the two
# models are trained and scored on identical rows.
split_options = {"test_size": 0.3, "random_state": 42}

# Baseline: linear regression on all scaled features
X_train, X_test, y_train, y_test = train_test_split(
    X_car_scaled, y_car, **split_options
)
lr = LinearRegression().fit(X_train, y_train)
baseline_r2 = lr.score(X_test, y_test)
print("Without PCA - Test R²:", baseline_r2)

# Reduced model: project onto 10 principal components, then regress
pca = PCA(n_components=10)
X_car_pca = pca.fit_transform(X_car_scaled)
X_train_pca, X_test_pca, y_train, y_test = train_test_split(
    X_car_pca, y_car, **split_options
)
lr_pca = LinearRegression().fit(X_train_pca, y_train)
pca_r2 = lr_pca.score(X_test_pca, y_test)
print("With PCA - Test R²:", pca_r2)
