# Assignment 3(1)

### Import libraries

In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold,train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

### Q1.

In [30]:
df_house = pd.read_csv("USA_Housing.csv")
X = df_house.drop(columns=["Price"])
y = df_house["Price"].values

In [31]:
X = pd.get_dummies(X,drop_first=True)

In [32]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [33]:
def add_intercept(X):
    return np.hstack([np.ones((X.shape[0],1)),X])

In [34]:
def closed_form_beta(X,y):
    return np.linalg.pinv(X.T @ X) @ (X.T @ y)

In [35]:
kf = KFold(n_splits=5,shuffle=True,random_state=42)
fold_results = []

In [36]:
for i, (train_idx,test_idx) in enumerate(kf.split(X_scaled), start=1):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    beta = closed_form_beta(add_intercept(X_train), y_train)
    y_pred = add_intercept(X_test) @ beta

    r2 = r2_score(y_test, y_pred)
    fold_results.append((i, r2))

    print(f"Fold {i} → R2 Score: {r2:.4f}")

Fold 1 → R2 Score: 0.9180
Fold 2 → R2 Score: 0.9146
Fold 3 → R2 Score: 0.9116
Fold 4 → R2 Score: 0.9193
Fold 5 → R2 Score: 0.9244


In [37]:
best_fold = max(fold_results, key=lambda x: x[1])
print("Best Fold:", best_fold)

Best Fold: (5, 0.9243869413350317)


### Q2.

In [38]:
X_temp, X_test, y_temp, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.20, random_state=42)

In [39]:
X_train_i, X_val_i, X_test_i = add_intercept(X_train), add_intercept(X_val), add_intercept(X_test)

In [40]:
def gradient_descent(X, y, lr=0.01, n_iter=1000):
    n, d = X.shape
    beta = np.zeros(d)
    for _ in range(n_iter):
        error = X @ beta - y
        grad = (2/n) * (X.T @ error)
        beta -= lr * grad
    return beta

In [41]:
for lr in [0.001, 0.01, 0.1, 1]:
    beta = gradient_descent(X_train_i, y_train, lr=lr, n_iter=1000)

    val_r2 = r2_score(y_val, X_val_i @ beta)
    test_r2 = r2_score(y_test, X_test_i @ beta)

    print(f"LR={lr} → Val R2={val_r2:.4f}, Test R2={test_r2:.4f}")

LR=0.001 → Val R2=0.6820, Test R2=0.6490
LR=0.01 → Val R2=0.9098, Test R2=0.9148
LR=0.1 → Val R2=0.9098, Test R2=0.9148
LR=1 → Val R2=-inf, Test R2=-inf


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


### Q3.

In [42]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
columns = ["symboling","normalized_losses","make","fuel_type","aspiration","num_doors",
           "body_style","drive_wheels","engine_location","wheel_base","length","width",
           "height","curb_weight","engine_type","num_cylinders","engine_size","fuel_system",
           "bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg",
           "highway_mpg","price"]
df_auto = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    header=None,
    names=columns,
    na_values="?"
)

In [43]:
df_auto["price"] = pd.to_numeric(df_auto["price"], errors="coerce")
df_auto = df_auto.dropna(subset=["price"])

##### Preprocessing

In [44]:
num_map = {"one":1,"two":2,"three":3,"four":4,"five":5,
           "six":6,"seven":7,"eight":8,"nine":9,"ten":10}

df_auto["num_doors"] = df_auto["num_doors"].map(num_map)
df_auto["num_cylinders"] = df_auto["num_cylinders"].map(num_map)

In [45]:
df_auto = pd.get_dummies(df_auto, columns=["body_style","drive_wheels"], drop_first=True)

In [46]:
for col in ["make","aspiration","engine_location","fuel_type"]:
    df_auto[col] = LabelEncoder().fit_transform(df_auto[col].astype(str))

In [47]:
df_auto["fuel_system"] = df_auto["fuel_system"].astype(str).apply(lambda x: 1 if "pfi" in x else 0)

In [48]:
df_auto["engine_type"] = df_auto["engine_type"].astype(str).apply(lambda x: 1 if "ohc" in x else 0)

##### Linear Regression with and without PCA

In [49]:
X = df_auto.drop(columns=["price"])
y = df_auto["price"].astype(float)

In [50]:
X = X.fillna(X.mean())

In [51]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [53]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print("Baseline R2:", r2_score(y_test, lr.predict(X_test)))

Baseline R2: 0.8721906380914359


In [54]:
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
Xp_train, Xp_test, yp_train, yp_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

In [55]:
lr_pca = LinearRegression()
lr_pca.fit(Xp_train, yp_train)
print("PCA R2:", r2_score(yp_test, lr_pca.predict(Xp_test)))

PCA R2: 0.8757822928276745
