In [14]:
# Cell 1: imports & load data
import os
import warnings

import joblib
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Location of the training CSV. Setting the HOUSING_DATA_PATH environment
# variable overrides the default, so the notebook can run on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    "HOUSING_DATA_PATH",
    r"F:\Data Science\housing-prices\data\train.csv",
)

# Fail with an explicit exception rather than `assert`: assertions are removed
# when Python runs with -O, which would silently skip this check.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data file not found at {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
print("Loaded dataframe:", df.shape)

# Modeling target: log1p(SalePrice). Later cells undo it with np.expm1 to
# report errors in dollars. TARGET is defined first so the column name is
# written only once here.
TARGET = 'log_SalePrice'
df[TARGET] = np.log1p(df['SalePrice'])


Loaded dataframe: (1460, 81)


In [15]:
# Cell 2: numeric-only baselines -- median DummyRegressor vs. plain LinearRegression
y = df[TARGET].copy()
num_df = (
    df.select_dtypes(include=[np.number])
      .drop(columns=['Id', 'SalePrice', 'log_SalePrice'])
)

# Quick baseline only: missing numeric values are replaced with 0.
X = num_df.fillna(0)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# The target is log1p(SalePrice); expm1 maps it back to dollars for scoring.
y_test_price = np.expm1(y_test)

# Baseline 1: always predict the median of the training target.
dummy = DummyRegressor(strategy='median')
dummy.fit(X_train, y_train)
dummy_preds_log = dummy.predict(X_test)
dummy_preds_price = np.expm1(dummy_preds_log)

# Baseline 2: ordinary least squares on the numeric columns.
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_preds_log = lr.predict(X_test)
lr_preds_price = np.expm1(lr_preds_log)

# Built-in round() is used because the metric value is rounded as a plain number.
dummy_mae = mean_absolute_error(y_test_price, dummy_preds_price)
lr_mae = mean_absolute_error(y_test_price, lr_preds_price)
print("Dummy MAE ($):", round(dummy_mae, 2))
print("Linear MAE ($):", round(lr_mae, 2))


Dummy MAE ($): 59568.25
Linear MAE ($): 19160.83


In [16]:
# Cell 3: split columns by dtype and assemble the preprocessing ColumnTransformer
y_full = df['log_SalePrice']
X_full = df.drop(columns=['Id', 'SalePrice', 'log_SalePrice'])

# Numeric columns go through impute+scale; object/category columns through impute+one-hot.
num_cols = list(X_full.select_dtypes(include=[np.number]).columns)
cat_cols = list(X_full.select_dtypes(include=['object', 'category']).columns)

print(f"Number numeric cols: {len(num_cols)}  Number categorical cols: {len(cat_cols)}")

# Numeric branch: median imputation, then standardization.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become the literal category 'missing';
# categories unseen at fit time are ignored at transform time; output is dense.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Any column not listed in num_cols or cat_cols is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
    remainder='drop',
)

Number numeric cols: 36  Number categorical cols: 43


In [17]:
# Cell 4: end-to-end pipeline (preprocessing + RandomForest) scored with 5-fold CV
forest = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
pipe = Pipeline([
    ('preproc', preprocessor),
    ('model', forest),
])

print("Running 5-fold cross-validation (this may take a bit)...")
# The scorer returns negated MAE (higher is better), so flip the sign to get MAE.
neg_mae_per_fold = cross_val_score(
    pipe,
    X_full,
    y_full,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
cv_scores = -neg_mae_per_fold
print("CV MAE (log-target):", cv_scores.mean().round(4), "±", cv_scores.std().round(4))


Running 5-fold cross-validation (this may take a bit)...
CV MAE (log-target): 0.0954 ± 0.0028


In [18]:
# Cell 5: fit on a fixed 80/20 split and report hold-out error in dollars
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.20, random_state=42
)

print("Fitting pipeline on training data (this may take a little)...")
pipe.fit(X_train, y_train)

print("Predicting on test set...")
preds_log = pipe.predict(X_test)

# Both predictions and truth are on the log1p scale; expm1 converts back to prices.
preds_price = np.expm1(preds_log)
y_test_price = np.expm1(y_test)

mae = mean_absolute_error(y_test_price, preds_price)
# RMSE is taken as the square root of MSE rather than via a metric keyword argument.
mse = mean_squared_error(y_test_price, preds_price)
rmse = np.sqrt(mse)

print(f"Test MAE ($): {mae:,.2f}")
print(f"Test RMSE ($): {rmse:,.2f}")

Fitting pipeline on training data (this may take a little)...
Predicting on test set...
Test MAE ($): 17,279.35
Test RMSE ($): 29,619.66


In [19]:
# Cell 6: persist the fitted pipeline (preprocessing + model) with joblib
OUT_PATH = r"F:\Data Science\housing-prices\models\final_model.joblib"

# Create the target folder if it is missing; exist_ok makes re-runs harmless.
models_dir = os.path.dirname(OUT_PATH)
os.makedirs(models_dir, exist_ok=True)

joblib.dump(pipe, OUT_PATH)
print("Saved pipeline to:", OUT_PATH)


Saved pipeline to: F:\Data Science\housing-prices\models\final_model.joblib


In [20]:
# Cell 7: map the RandomForest importances back to feature names and show the top 30
print("Extracting feature names and importances...")

pre = pipe.named_steps['preproc']
model = pipe.named_steps['model']

# Numeric columns keep their original names (copied so num_cols is never aliased).
num_features = list(num_cols)

# One-hot encoded names, one per (categorical column, category) pair.
ohe = pre.named_transformers_['cat'].named_steps['ohe'] if 'cat' in pre.named_transformers_ else None
if ohe is not None and len(cat_cols) > 0:
    try:
        ohe_names = ohe.get_feature_names_out(cat_cols).tolist()
    except AttributeError:
        # Only the "encoder has no get_feature_names_out" case is handled here;
        # any other failure (e.g. a column mismatch) should surface, not be hidden.
        # Names use the same "<column>_<category>" form as get_feature_names_out.
        ohe_names = [
            f"{col}_{val}"
            for col, cats in zip(cat_cols, ohe.categories_)
            for val in cats
        ]
else:
    ohe_names = []

# Numeric names first, then one-hot names, following the order in which the
# 'num' and 'cat' transformers were declared for the ColumnTransformer.
feature_names = num_features + ohe_names
importances = model.feature_importances_

# Guard against silently mislabeled importances if the preprocessor emitted a
# different number of columns than the names reconstructed above.
if len(feature_names) != len(importances):
    raise ValueError(
        f"Feature name count ({len(feature_names)}) does not match "
        f"importance count ({len(importances)})"
    )

imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
imp_df = imp_df.sort_values('importance', ascending=False).reset_index(drop=True)
display(imp_df.head(30))


Extracting feature names and importances...


Unnamed: 0,feature,importance
0,OverallQual,0.542774
1,GrLivArea,0.115815
2,TotalBsmtSF,0.044078
3,GarageCars,0.040398
4,GarageArea,0.024956
5,BsmtFinSF1,0.022028
6,1stFlrSF,0.018957
7,YearBuilt,0.015243
8,LotArea,0.014246
9,OverallCond,0.009728


In [21]:
# Cell 8 (optional): small RandomizedSearchCV over a few RF hyperparameters (can be slow)
# RandomizedSearchCV is imported with the other sklearn imports in Cell 1.

# The 'model__' prefix routes each parameter to the pipeline step named 'model'.
param_dist = {
    'model__n_estimators': [100, 200, 400],
    'model__max_depth': [None, 10, 20, 30],
    'model__max_features': ['sqrt', 'log2', 0.3, 0.5]
}

# 6 sampled candidates x 3 folds = 18 fits; random_state fixes which candidates are drawn.
rs = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=6, cv=3,
                        scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42, verbose=1)
print("Starting randomized search (this can take a while)...")
rs.fit(X_train, y_train)
# best_score_ is negated MAE on the log target, so flip the sign for display.
print("Best CV MAE (log):", -rs.best_score_)
print("Best params:", rs.best_params_)

# Save the best estimator. The output folder is created here so this cell does
# not depend on an earlier cell having created it.
TUNED_OUT_PATH = r"F:\Data Science\housing-prices\models\final_model_tuned.joblib"
os.makedirs(os.path.dirname(TUNED_OUT_PATH), exist_ok=True)
joblib.dump(rs.best_estimator_, TUNED_OUT_PATH)
print("Saved tuned pipeline to models/final_model_tuned.joblib")


Starting randomized search (this can take a while)...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best CV MAE (log): 0.0932241842668189
Best params: {'model__n_estimators': 200, 'model__max_features': 0.3, 'model__max_depth': 30}
Saved tuned pipeline to models/final_model_tuned.joblib
