
# House Price Prediction — Complete ML Pipeline (Module 6)

This notebook performs the full workflow on the **Ames Housing** dataset (Kaggle: 1460 rows, ~80 features):

1. **EDA**  
2. **Simple Linear Regression** (SalePrice ~ Area/GrLivArea)  
3. **Multiple Linear Regression**  
4. **Dimensionality Reduction** (PCA-like, via TruncatedSVD on the preprocessed, one-hot-encoded feature matrix) + Linear Regression  
5. **Lasso & Ridge Regression**  
6. **Support Vector Regression (SVR)**  
7. **Decision Tree Regressor**  
8. **Random Forest Regressor**  
9. **Hyperparameter Tuning** with `GridSearchCV` and `RandomizedSearchCV`  
10. **Model Selection**: compare all models on a common test set (R², RMSE, MAE) and save each model with a different file name.

> **Instructions**  
> - Put your dataset file (`train.csv`) from Kaggle in the same folder as this notebook.  
> - If your dataset uses a different file name, or its living-area column is not named `GrLivArea` or `Area`, update the config cell (`DATA_PATH`, `AREA_COL_CANDIDATES`) accordingly.


In [None]:

# ====== Config ======
import os

# --- Data ---
DATA_PATH = "train.csv"  # input CSV; change if your file is named differently
TARGET_COL = "SalePrice"
AREA_COL_CANDIDATES = ["GrLivArea", "Area"]  # the first name found in the data is used

# --- Reproducibility / compute ---
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_JOBS = -1  # -1 means "use all cores"

# --- Outputs ---
MODELS_DIR = "models"
RESULTS_PATH = "model_results.csv"

# Create the model directory up front so later cells can write into it.
os.makedirs(MODELS_DIR, exist_ok=True)
print("Models will be saved to:", MODELS_DIR)


In [None]:

# ====== Imports ======
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


In [None]:

# ====== Load Data ======
# Read the raw CSV once; the cells below all work from `df`.
df = pd.read_csv(DATA_PATH)
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()


## 1) EDA — Structure & Missingness

In [None]:

# Column inventory and a count of columns per dtype
print("Columns:", df.columns.tolist())
print("\nData types:")
print(df.dtypes.value_counts())

# Null counts per column, largest first
print("\nMissing values (top 30):")
missing = df.isna().sum().sort_values(ascending=False)
display(missing.head(30))

# Summary statistics of the target, including tail percentiles
print("\nTarget stats:")
target_percentiles = [.05, .25, .5, .75, .95]
display(df[TARGET_COL].describe(percentiles=target_percentiles))

# Histogram of the target (matplotlib only)
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df[TARGET_COL].dropna(), bins=50)
ax.set_title("SalePrice Distribution")
ax.set_xlabel("SalePrice")
ax.set_ylabel("Count")
plt.show()


### Numeric correlations with SalePrice (Top 15)

In [None]:

# Correlation of every numeric feature with the target, sorted from the largest value down.
numeric_cols = df.select_dtypes(include=[np.number]).columns.drop(TARGET_COL)
corr_input = df[numeric_cols.tolist() + [TARGET_COL]]
corr_matrix = corr_input.corr(numeric_only=True)
corr = (
    corr_matrix[TARGET_COL]
    .drop(TARGET_COL)          # remove the target's correlation with itself
    .sort_values(ascending=False)
)
top15 = corr.head(15)

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(top15.index, top15.values)
plt.xticks(rotation=45, ha='right')
ax.set_title("Top 15 numeric correlations with SalePrice")
ax.set_ylabel("Correlation")
plt.tight_layout()
plt.show()

top15


In [None]:

# ====== Feature columns ======
# Area column for the simple regression: first candidate that exists in the data.
AREA_COL = next((name for name in AREA_COL_CANDIDATES if name in df.columns), None)
if AREA_COL is None:
    raise ValueError("No area column found. Make sure 'GrLivArea' or 'Area' exists in the dataset.")

print("Using area column for Simple Linear Regression:", AREA_COL)

# One shared train/test split so every model is scored on the same held-out rows.
y = df[TARGET_COL]
X_full = df.drop(columns=[TARGET_COL])
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Column lists (numeric vs. everything else) consumed by the preprocessing ColumnTransformer.
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

len(num_features), len(cat_features), AREA_COL


In [None]:

# ====== Helpers ======
def evaluate_and_save(model, name, X_test, y_test, save_dir=MODELS_DIR):
    """Score a fitted model on held-out data and persist it with joblib.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X_test)``.
    name : str
        Label stored in the result row; also the stem of the saved file name.
    X_test, y_test
        Held-out features and true targets used for scoring.
    save_dir : str, default MODELS_DIR
        Directory that receives ``<name>.pkl``. Created if it does not exist.

    Returns
    -------
    dict
        Keys ``model``, ``r2``, ``rmse``, ``mae`` and ``path`` (the pickle location).
    """
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    # Take the square root explicitly: the `squared=False` flag of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
    # while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)

    # Callers may pass a directory other than the one the config cell created.
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, f"{name}.pkl")
    joblib.dump(model, path)
    return {"model": name, "r2": r2, "rmse": rmse, "mae": mae, "path": path}

# One result row per model; filled in by the modelling cells below.
results = []


## 2) Simple Linear Regression (SalePrice ~ Area)

In [None]:

# Pipeline: median-impute the single area column, then fit LinearRegression.
# Keeping the imputer inside the Pipeline means the pickled model can score raw
# (possibly missing) area values by itself, instead of requiring a separately
# fitted imputer that is never saved.
X_train_area = X_train[[AREA_COL]]
X_test_area = X_test[[AREA_COL]]

lin_simple = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", LinearRegression()),
])
lin_simple.fit(X_train_area, y_train)

res = evaluate_and_save(lin_simple, "model_simple_linear_area", X_test_area, y_test)
results.append(res)
res


## 3) Multiple Linear Regression

In [None]:

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` flag to `sparse_output` and
# 1.4 removed the old name. Try the current spelling first and fall back for
# older installations, so the notebook runs on either.
try:
    onehot_dense = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot_dense = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical columns: fill gaps with the mode, then one-hot encode to a dense
# array. Categories unseen during fit are encoded as all zeros (handle_unknown="ignore").
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot_dense)
])

# Shared preprocessing step reused by every model pipeline below.
preprocessor_dense = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features)
    ],
    remainder="drop"
)

pipe_lin = Pipeline(steps=[
    ("preprocess", preprocessor_dense),
    ("model", LinearRegression())
])

pipe_lin.fit(X_train, y_train)
res = evaluate_and_save(pipe_lin, "model_linear_multiple", X_test, y_test)
results.append(res)
res


## 4) Dimensionality Reduction (TruncatedSVD as PCA for OHE) + Linear Regression

In [None]:

# Preprocess -> project onto a reduced number of SVD components -> linear regression.
pipe_svd_lin = Pipeline(steps=[
    ("preprocess", preprocessor_dense),
    ("svd", TruncatedSVD(n_iter=10, random_state=RANDOM_STATE)),
    ("model", LinearRegression())
])

# Candidate numbers of retained components.
# NOTE(review): each value presumably has to stay below the number of columns the
# preprocessor emits; confirm if the feature set shrinks.
param_grid_svd = {
    "svd__n_components": [50, 100, 150, 200]
}

# Parallelism follows the notebook-wide N_JOBS setting rather than a local literal.
grid_svd = GridSearchCV(pipe_svd_lin, param_grid=param_grid_svd, cv=5, n_jobs=N_JOBS)
grid_svd.fit(X_train, y_train)

best_svd_model = grid_svd.best_estimator_
res = evaluate_and_save(best_svd_model, "model_linear_with_svd", X_test, y_test)
results.append(res)

grid_svd.best_params_, res


## 5) Lasso & Ridge Regression

In [None]:

# Both searches take their parallelism from the notebook-wide N_JOBS setting.

# Ridge: grid-search the L2 penalty strength.
pipe_ridge = Pipeline(steps=[
    ("preprocess", preprocessor_dense),
    ("model", Ridge(random_state=RANDOM_STATE))
])

param_grid_ridge = {
    "model__alpha": [0.1, 1.0, 3.0, 10.0, 30.0, 100.0]
}

grid_ridge = GridSearchCV(pipe_ridge, param_grid=param_grid_ridge, cv=5, n_jobs=N_JOBS)
grid_ridge.fit(X_train, y_train)

res = evaluate_and_save(grid_ridge.best_estimator_, "model_ridge", X_test, y_test)
results.append(res)

# Lasso: grid-search the L1 penalty strength; max_iter is raised well above the
# library default to give coordinate descent room to converge.
# NOTE(review): these alphas are very small relative to an unscaled price target,
# so the penalty may have little effect; confirm the grid is intended.
pipe_lasso = Pipeline(steps=[
    ("preprocess", preprocessor_dense),
    ("model", Lasso(random_state=RANDOM_STATE, max_iter=20000))
])

param_grid_lasso = {
    "model__alpha": [0.0005, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0]
}

grid_lasso = GridSearchCV(pipe_lasso, param_grid=param_grid_lasso, cv=5, n_jobs=N_JOBS)
grid_lasso.fit(X_train, y_train)

res2 = evaluate_and_save(grid_lasso.best_estimator_, "model_lasso", X_test, y_test)
results.append(res2)

grid_ridge.best_params_, grid_lasso.best_params_, res, res2


## 6) Support Vector Regression (SVR)

In [None]:

# SVR is sensitive to the scale of the target as well as the features: the
# epsilon and C values searched below are sized for a target of roughly unit
# variance. TransformedTargetRegressor standardises y during fit and converts
# predictions back to the original units, so the metrics stay comparable with
# the other models.
svr_scaled_target = TransformedTargetRegressor(
    regressor=SVR(),
    transformer=StandardScaler(),
)

pipe_svr = Pipeline(steps=[
    ("preprocess", preprocessor_dense),
    ("model", svr_scaled_target)
])

# Parameters are addressed through the wrapper: model -> regressor -> SVR parameter.
param_dist_svr = {
    "model__regressor__kernel": ["rbf", "linear"],
    "model__regressor__C": [0.1, 1, 10, 30, 100],
    "model__regressor__epsilon": [0.01, 0.1, 0.2, 0.3],
    "model__regressor__gamma": ["scale", "auto"]
}

# Seed and parallelism come from the config cell rather than local literals.
rand_svr = RandomizedSearchCV(
    pipe_svr, param_distributions=param_dist_svr, n_iter=20,
    cv=5, random_state=RANDOM_STATE, n_jobs=N_JOBS
)
rand_svr.fit(X_train, y_train)

best_svr = rand_svr.best_estimator_
res = evaluate_and_save(best_svr, "model_svr", X_test, y_test)
results.append(res)

rand_svr.best_params_, res


## 7) Decision Tree Regressor

In [None]:

# Seed and parallelism come from the config cell (RANDOM_STATE, N_JOBS) so one
# edit there applies to every model.
pipe_dt = Pipeline(steps=[
    ("preprocess", preprocessor_dense),
    ("model", DecisionTreeRegressor(random_state=RANDOM_STATE))
])

# Exhaustive grid over tree depth and the two minimum-sample constraints
# (5 x 4 x 4 = 80 candidates, each evaluated with 5-fold CV).
param_grid_dt = {
    "model__max_depth": [None, 5, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 4, 8]
}

grid_dt = GridSearchCV(pipe_dt, param_grid=param_grid_dt, cv=5, n_jobs=N_JOBS)
grid_dt.fit(X_train, y_train)

best_dt = grid_dt.best_estimator_
res = evaluate_and_save(best_dt, "model_decision_tree", X_test, y_test)
results.append(res)

grid_dt.best_params_, res


## 8) Random Forest Regressor

In [None]:

# Seed and parallelism come from the config cell (RANDOM_STATE, N_JOBS).
pipe_rf = Pipeline(steps=[
    ("preprocess", preprocessor_dense),
    ("model", RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=N_JOBS))
])

param_dist_rf = {
    "model__n_estimators": [100, 200, 300, 500, 800],
    "model__max_depth": [None, 10, 20, 30, 50],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    # 1.0 means "consider all features at each split". It replaces the string
    # "auto", which meant the same for regressors but was removed in
    # scikit-learn 1.3, so candidates using it fail to fit on current versions.
    "model__max_features": [1.0, "sqrt", 0.3, 0.5, 0.7]
}

# Randomized search: 25 sampled parameter combinations, each scored with 5-fold CV.
rand_rf = RandomizedSearchCV(
    pipe_rf, param_distributions=param_dist_rf, n_iter=25,
    cv=5, random_state=RANDOM_STATE, n_jobs=N_JOBS
)
rand_rf.fit(X_train, y_train)

best_rf = rand_rf.best_estimator_
res = evaluate_and_save(best_rf, "model_random_forest", X_test, y_test)
results.append(res)

rand_rf.best_params_, res


## 9–10) Model Comparison & Selection

In [None]:

# Build the comparison table. Re-running a model cell appends a second row for
# the same model name, so keep only the most recent row per model before ranking.
results_df = (
    pd.DataFrame(results)
    .drop_duplicates(subset="model", keep="last")
    .sort_values(by="rmse")
)

# Write to the path declared in the config cell instead of a repeated literal.
results_df.to_csv(RESULTS_PATH, index=False)
display(results_df)

# Bars appear in ascending RMSE order, so the best model is on the left.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(results_df["model"], results_df["rmse"])
plt.xticks(rotation=45, ha='right')
ax.set_title("Model RMSE (lower is better)")
ax.set_ylabel("RMSE")
plt.tight_layout()
plt.show()

print("Results saved to:", RESULTS_PATH)



> **Note on LDA:** Linear Discriminant Analysis (LDA) is primarily a **classification** technique: it needs discrete class labels and is therefore not suitable for a continuous target like `SalePrice`.  
> Instead, PCA-like dimensionality reduction via **TruncatedSVD** is used for the regression setting here. Unlike PCA, TruncatedSVD does not center the data before decomposing it.
