# Multiple Linear Regression — Toyota Corolla

This notebook performs EDA and preprocessing, trains multiple linear regression models, evaluates them, and applies Ridge/Lasso regularization. Update `DATA_PATH` in the first code cell to point to your copy of the dataset.

In [None]:
# Configuration & Imports
import warnings
warnings.filterwarnings("ignore")

%pip install scikit-learn

import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import statsmodels.api as sm

DATA_PATH = r"/mnt/data/ToyotaCorolla - MLR.csv"  # change to your file path (e.g., r"D:\...")
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Quick EDA: dataset dimensions followed by per-column summary statistics
print(f"Shape: {df.shape}")
display(df.describe().transpose())
# The pairplot stays disabled because it may be heavy to render:
# sns.pairplot(df.sample(200))

In [None]:
# Preprocessing: validate columns, coerce types, encode FuelType, and build X / y
numeric_features = ['Age', 'KM', 'HP', 'Automatic', 'CC', 'Doors', 'Weight', 'Quarterly_Tax']
missing_cols = [col for col in numeric_features + ['Price'] if col not in df.columns]
if missing_cols:
    # Fail early and show the real header so a column-name mismatch is easy to diagnose.
    raise KeyError(
        f"Columns missing from the data: {missing_cols}; available: {list(df.columns)}"
    )

# Rows without a target are unusable; .copy() makes the column assignments below
# operate on an independent frame instead of a possible view (no SettingWithCopyWarning).
df = df.dropna(subset=['Price']).copy()
df['Doors'] = pd.to_numeric(df['Doors'], errors='coerce')
# Map 'Yes'/'No' labels to 1/0; any value that is still non-numeric afterwards becomes 0.
df['Automatic'] = (
    pd.to_numeric(df['Automatic'].replace({'Yes': 1, 'No': 0}), errors='coerce')
    .fillna(0)
    .astype(int)
)

X = df[numeric_features].copy()
if 'FuelType' in df.columns:
    # dtype=float: pandas >= 2.0 produces bool dummies by default, which turns the mixed
    # design matrix into an object array that statsmodels' OLS refuses to fit.
    X = pd.get_dummies(
        pd.concat([X, df[['FuelType']]], axis=1),
        columns=['FuelType'],
        drop_first=True,
        dtype=float,
    )

# Coercion above can introduce NaN (e.g., unparsable Doors values). scikit-learn estimators
# reject NaN inputs and OLS does not drop them by default, so keep only complete rows.
complete_rows = X.notna().all(axis=1)
n_dropped = int((~complete_rows).sum())
if n_dropped:
    print(f"Dropped {n_dropped} row(s) with missing feature values")
X = X.loc[complete_rows]
y = df.loc[complete_rows, 'Price']

In [None]:
# Train-test split (fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline OLS using statsmodels.
# - astype(float): bool/mixed-dtype columns (e.g., dummies from pd.get_dummies) would
#   otherwise be cast to an object array, which OLS rejects.
# - has_constant="add": the default ("skip") omits the intercept whenever a column happens
#   to be constant within a split, leaving train and test matrices with different shapes.
X_train_sm = sm.add_constant(X_train.astype(float), has_constant="add")
model_ols = sm.OLS(y_train, X_train_sm).fit()
print(model_ols.summary())

# Evaluate on the held-out test set
X_test_sm = sm.add_constant(X_test.astype(float), has_constant="add")
y_pred = model_ols.predict(X_test_sm)
print("MAE:", mean_absolute_error(y_test, y_pred))
# RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in scikit-learn 1.4 and
# removed in 1.6, so this form works across versions.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2:", r2_score(y_test, y_pred))

In [None]:
# RidgeCV and LassoCV (with scaling)
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Ridge: pick alpha from 50 log-spaced candidates in [1e-3, 1e3] with 5-fold CV
ridge = RidgeCV(alphas=np.logspace(-3, 3, 50), cv=5).fit(X_train_s, y_train)
print("Ridge alpha:", ridge.alpha_)
y_ridge = ridge.predict(X_test_s)
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
print("Ridge RMSE:", np.sqrt(mean_squared_error(y_test, y_ridge)))

# Lasso: alpha grid chosen by LassoCV itself, 5-fold CV
lasso = LassoCV(cv=5, random_state=42, max_iter=5000).fit(X_train_s, y_train)
print("Lasso alpha:", lasso.alpha_)
y_lasso = lasso.predict(X_test_s)
print("Lasso RMSE:", np.sqrt(mean_squared_error(y_test, y_lasso)))

## End of notebook