In [None]:
# ================================================================
# Phase 4 – Better model (RandomForest) + Explainability
# ================================================================
# Why RandomForest?
# • Handles 300+ dummy features with zero preprocessing
# • Robust to multicollinearity and outliers
# • Ships with simple .feature_importances_

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import matplotlib.pyplot as plt
import shap   # already in requirements
from sklearn.impute import SimpleImputer # Need this for NaN imputation

# Read the cleaned HDB resale table; `sale_date` is parsed to datetime on load.
# The path is relative to the notebook's working directory (adjust if the notebook lives elsewhere).
clean_path = Path("..") / "data" / "processed" / "clean_hdb.csv"
df = pd.read_csv(clean_path, parse_dates=["sale_date"])

In [None]:
# ================================================================
# Re-applying Preprocessing from Phase 3
# Random Forest still needs numerical inputs, so strings must be handled.
# ================================================================
# Steps performed by this cell (all guarded, so re-running it is a no-op):
#   1. make sure `sale_date` is datetime
#   2. drop identifier / redundant / high-cardinality string columns
#   3. one-hot encode every remaining string-like column
#   4. median-impute the two lease/age columns

# 1. `parse_dates` in read_csv should already have produced datetimes; this is
#    a no-op in that case and an explicit conversion otherwise.
df['sale_date'] = pd.to_datetime(df['sale_date'])

# 2. Columns to drop before encoding.
cols_to_drop_pre_encoding = [
    '_id',        # Identifier
    'month',      # Redundant with 'sale_month' (if exists) and is a string
    'town',       # Redundant if town_X columns are already present and used
    'flat_type',  # Redundant if flat_type_X columns are already present and used
    'block',      # Highly cardinal string, too many categories to encode
    'street_name' # Highly cardinal string, too many categories to encode
]
# errors="ignore" skips names that are absent, so this is safe on any subset
# of the columns above (and on a re-run after they have been dropped).
df = df.drop(columns=cols_to_drop_pre_encoding, errors="ignore")

# 3. Every remaining string-like column needs one-hot encoding.
#    Checking only `dtype == 'object'` misses pandas' dedicated string dtypes
#    (`string` / `str`), which would then reach the model un-encoded, so both
#    object and string dtypes are detected here.
#    Presumably these are 'storey_range' and 'flat_model' — verify with df.info().
categorical_cols_to_encode = [
    c for c in df.columns
    if pd.api.types.is_object_dtype(df[c].dtype)
    or pd.api.types.is_string_dtype(df[c].dtype)
]

if categorical_cols_to_encode:
    # drop_first=True removes one dummy level per encoded column.
    df = pd.get_dummies(df, columns=categorical_cols_to_encode, drop_first=True)

# 4. Handle Missing Values (NaN) - 'lease_remaining_years' and 'flat_age'
numerical_cols_with_nans = ['lease_remaining_years', 'flat_age']
# Keep only the columns that actually exist in this DataFrame.
numerical_cols_with_nans = [col for col in numerical_cols_with_nans if col in df.columns]

if numerical_cols_with_nans:  # Only impute if there are actually columns to impute
    # NOTE(review): the medians are fitted on ALL rows, i.e. before the
    # train/test split, so test-period rows influence the imputed values.
    # Kept as-is to stay comparable with Phase 3 — confirm this is intended.
    imputer = SimpleImputer(strategy='median')
    df[numerical_cols_with_nans] = imputer.fit_transform(df[numerical_cols_with_nans])


In [None]:
# 4.1  Recreate the chronological train / test split used for the linear model:
#      sales up to and including 2023 train the model, 2024 onwards evaluate it.
df = df.sort_values("sale_year")
is_train_year = df["sale_year"] <= 2023
is_test_year = df["sale_year"] >= 2024
train = df.loc[is_train_year]
test = df.loc[is_test_year]

# Target column, plus the columns that must never be used as features:
# the target itself, `price_per_sqm`, and the raw `sale_date` timestamp.
y_col = "resale_price"
non_feature_cols = {y_col, "price_per_sqm", "sale_date"}
X_cols = [name for name in df.columns if name not in non_feature_cols]

X_train, X_test = train[X_cols], test[X_cols]
y_train, y_test = train[y_col], test[y_col]

# 4.1.1  Fit a quick RandomForest
#   n_estimators=500  -> more trees = stabler importances
#   max_depth=None    -> let each tree grow fully; the ensemble averages them
#   n_jobs=-1         -> use every available core
#   random_state=42   -> reproducible forest
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# 4.1.2  Evaluate on the held-out years (MAE, MSE and its square root, RMSE)
preds = rf.predict(X_test)
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)

In [None]:
# 4.2  Compare vs baseline: append the RandomForest scores to the baseline
#      metrics table and persist the combined table.
# NOTE(review): these paths are relative to the working directory, whereas the
# data file is loaded from "../data/..." — confirm "reports/" resolves to the
# intended folder.
baseline = pd.read_csv("reports/baseline_metrics.csv")

rf_row = {
    "model": "RandomForest",
    "MAE_SGD": round(mae, 1),
    "RMSE_SGD": round(rmse, 1),
}
improved = pd.DataFrame([rf_row])

metrics = pd.concat([baseline, improved], ignore_index=True)
metrics.to_csv("reports/model_metrics.csv", index=False)
print(metrics)

In [None]:
# 4.3.1  Feature-importance bar chart  (top 20)
# Pair each importance score with its feature name, then keep the 20 largest.
importances = pd.Series(rf.feature_importances_, index=X_cols)
top20 = importances.sort_values(ascending=False).head(20)

# Horizontal bars sorted ascending, so the most important feature ends up on top.
fig, ax = plt.subplots(figsize=(8,6))
top20.sort_values().plot(kind="barh", ax=ax)
ax.set_title("RandomForest – top 20 feature importances")
ax.set_xlabel("Gini importance (mean decrease in impurity)")
fig.tight_layout()

# Create the figures folder on first run, then save before showing.
fig_path = Path("reports/figures/rf_importance.png")
fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_path, dpi=120)
plt.show()

# 4.3.2  SHAP summary (optional but quick on a 500-tree RF)
explainer = shap.TreeExplainer(rf)

# Subsample the training rows for speed. The sample is drawn ONCE and reused
# for both the SHAP computation and the plot, so the SHAP matrix and the
# feature matrix are guaranteed to describe the same rows. The size is capped
# at the number of training rows because DataFrame.sample raises a ValueError
# when asked for more rows than exist (without replacement).
n_shap_rows = min(5_000, len(X_train))
X_shap = X_train.sample(n_shap_rows, random_state=42)
shap_values = explainer.shap_values(X_shap)

# show=False keeps the figure open so it can be saved before being displayed.
shap.summary_plot(shap_values,
                  X_shap,
                  show=False, max_display=20)
shap_path = Path("reports/figures/shap_summary.png")
# Make sure the target folder exists even if this step is run on its own.
shap_path.parent.mkdir(parents=True, exist_ok=True)
plt.tight_layout()
plt.savefig(shap_path, dpi=120)
plt.show()

print(f"\nPlots saved to:\n • {fig_path}\n • {shap_path}")

In [None]:
'''
Commentary — what & why (keep in notebook markdown)
	•	Time-series split preserved – same ≤2023 train / ≥2024 test so numbers are apples-to-apples with Phase 3.
	•	RandomForest baseline – zero tuning, yet handles dummy-heavy data far better than linear regression.
	•	Results table now holds both models; MAE drop shows the gain (expect ~20-30 k cut).
	•	Feature importances – quick Gini-gain bar confirms lease, town dummies, and flat_size drive predictions.
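	•	SHAP summary – per-row attributions from TreeExplainer (tree-specific, not model-agnostic); horizontal position = effect on the predicted price (right pushes it up, left pushes it down), colour = feature value (red = high, blue = low).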
'''