In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score

url = "https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX"
df = pd.read_csv(url)

X = df.drop(columns=['Price']).to_numpy()
y = df['Price'].to_numpy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores, betas = [], []

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    X_train_bias = np.c_[np.ones(X_train.shape[0]), X_train]
    beta = np.linalg.inv(X_train_bias.T @ X_train_bias) @ X_train_bias.T @ y_train
    X_test_bias = np.c_[np.ones(X_test.shape[0]), X_test]
    y_pred = X_test_bias @ beta
    score = r2_score(y_test, y_pred)
    r2_scores.append(score)
    betas.append(beta)

best_idx = np.argmax(r2_scores)
best_beta = betas[best_idx]

print("R2 scores:", r2_scores)
print("Best R2 Score:", r2_scores[best_idx])
print("Best Beta Matrix:", best_beta)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train_bias = np.c_[np.ones(X_train.shape[0]), X_train]
X_test_bias = np.c_[np.ones(X_test.shape[0]), X_test]
y_pred = X_test_bias @ best_beta
print("Final R2 Score :", r2_score(y_test, y_pred))


R2 scores: [0.9179971706985147, 0.9145677884802818, 0.9116116385364478, 0.9193091764960816, 0.9243869413350316]
Best R2 Score: 0.9243869413350316
Best Beta Matrix: [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]
Final R2 Score : 0.9147458156636434


In [4]:
from sklearn.metrics import r2_score

X_train_val, X_test, y_train_val, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

X_train_bias = np.c_[np.ones(X_train.shape[0]), X_train]
X_val_bias = np.c_[np.ones(X_val.shape[0]), X_val]
X_test_bias = np.c_[np.ones(X_test.shape[0]), X_test]

def gradient_descent(X, y, lr, iterations=1000):
    n_samples, n_features = X.shape
    beta = np.zeros(n_features)
    for i in range(iterations):
        gradients = -(2/n_samples) * (X.T @ (y - X @ beta))
        beta -= lr * gradients
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
results = {}

for lr in learning_rates:
    beta = gradient_descent(X_train_bias, y_train, lr=lr, iterations=1000)
    y_val_pred = X_val_bias @ beta
    y_test_pred = X_test_bias @ beta
    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)
    results[lr] = {"beta": beta, "R2_val": r2_val, "R2_test": r2_test}

for lr, res in results.items():
    print(f"LR={lr}, Validation R2={res['R2_val']:.4f}, Test R2={res['R2_test']:.4f}")


LR=0.001, Validation R2=0.6820, Test R2=0.6490
LR=0.01, Validation R2=0.9098, Test R2=0.9148
LR=0.1, Validation R2=0.9098, Test R2=0.9148
LR=1, Validation R2=-inf, Test R2=-inf


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration","num_doors", 
           "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width", 
           "height", "curb_weight", "engine_type", "num_cylinders", "engine_size", "fuel_system", 
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg", 
           "highway_mpg", "price"]

df = pd.read_csv(url, names=columns, na_values="?")

for col in df.columns:
    if df[col].dtype == "object":
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

df.dropna(subset=['price'], inplace=True)
df['price'] = df['price'].astype(float)

df['num_doors'].replace({"two": 2, "four": 4}, inplace=True)
df['num_cylinders'].replace({"two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}, inplace=True)

df = pd.get_dummies(df, columns=["body_style", "drive_wheels"], drop_first=True)

for col in ["make", "aspiration", "engine_location", "fuel_type"]:
    df[col] = LabelEncoder().fit_transform(df[col])

df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if "pfi" in str(x).lower() else 0)
df['engine_type'] = df['engine_type'].apply(lambda x: 1 if "ohc" in str(x).lower() else 0)

X = df.drop(columns=['price']).values
y = df['price'].values

X_scaled = StandardScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
print("R2 score without PCA:", r2_score(y_test, lr.predict(X_test)))

pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train_pca)
print("R2 score with PCA:", r2_score(y_test_pca, lr_pca.predict(X_test_pca)))


R2 score without PCA: 0.79622312209087
R2 score with PCA: 0.7757066649686492


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values