In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Project root setup
# Plain scripts define __file__; notebook kernels do not, so the fallback uses
# the parent of the current working directory instead.
# NOTE(review): the fallback presumably expects the kernel to start in a
# sub-directory of the project (e.g. notebooks/) - confirm.
try:
    PROJECT_ROOT = Path(__file__).resolve().parent.parent
except NameError:
    PROJECT_ROOT = Path('.').resolve().parent

# Load data and model
# squeeze("columns") turns a single-column frame into a Series but, unlike a
# bare squeeze(), never collapses a one-row file all the way down to a scalar.
X_train = pd.read_csv(PROJECT_ROOT / "data/processed/X_train_processed.csv")
y_train = pd.read_csv(PROJECT_ROOT / "data/processed/y_train_processed.csv").squeeze("columns")
X_test = pd.read_csv(PROJECT_ROOT / "data/processed/X_test_processed.csv")
y_test = pd.read_csv(PROJECT_ROOT / "data/processed/y_test_processed.csv").squeeze("columns")

# Load original categorical test data
df_test_original_categorical = pd.read_csv(PROJECT_ROOT / "data/processed/df_test_original_categorical.csv")

# Load trained model
# Fail early, with an actionable message, when the model artifact has not been
# produced yet (the guard has to run before the load to be of any use).
model_path = PROJECT_ROOT / "models/random_forest_model.joblib"
if not model_path.exists():
    raise FileNotFoundError("Model file missing - run 04_tree_model.ipynb first")
# NOTE: joblib.load unpickles the file; only load artifacts produced by this project.
rf = joblib.load(model_path)

# Load baseline MAE
# Takes the first 'RandomForest' row of the metrics report as the baseline.
metrics_df = pd.read_csv(PROJECT_ROOT / "reports/model_metrics.csv")
mae = metrics_df.loc[metrics_df['model'] == 'RandomForest', 'MAE_SGD'].iloc[0]

In [2]:
# Error Analysis

# Score the test features, then build a fresh frame (X_test itself is left
# untouched) that carries the truth, the prediction and the absolute error
# next to the feature columns.
test_preds = rf.predict(X_test)
test_df = X_test.assign(
    actual=y_test.values,
    predicted=test_preds,
    abs_error=lambda frame: (frame["actual"] - frame["predicted"]).abs(),
)

def reconstruct_category(df, prefix):
    """Rebuild one categorical column from its one-hot encoded dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the dummy columns; column labels must be strings.
    prefix : str
        Leading text shared by the dummy columns (e.g. ``'town_'``).

    Returns
    -------
    pandas.Series
        Indexed like ``df``. Each value is the label of the row's largest
        dummy column with the leading ``prefix`` removed. Rows whose dummy
        columns sum to zero carry no category information and are NaN. If no
        column starts with ``prefix`` the whole Series is NaN and is named
        after the prefix without its trailing underscores.
    """
    # Find all columns that start with the given prefix
    category_dummy_cols = [col for col in df.columns if col.startswith(prefix)]
    if not category_dummy_cols:
        # If no dummy columns are found for this prefix, return NaNs
        return pd.Series(np.nan, index=df.index, name=prefix.rstrip('_'))

    dummies = df[category_dummy_cols]
    # Slice the prefix off the front only: str.replace would also delete later
    # occurrences inside the category name ("town_new_town_x" -> "new_x") and
    # may interpret the prefix as a regular expression.
    labels = dummies.idxmax(axis=1).str[len(prefix):]
    # idxmax falls back to the first dummy column when every dummy is 0, which
    # would silently assign a wrong category; mark those rows as missing.
    has_category = dummies.sum(axis=1) != 0
    return labels.where(has_category)


# Recover readable town / flat_type labels from their dummy columns
test_df["town"] = reconstruct_category(df=test_df, prefix="town_")
test_df["flat_type"] = reconstruct_category(df=test_df, prefix="flat_type_")

# 5.1.2 Top-20 worst absolute errors
# Columns to show: the error figures plus the reconstructed categories.
display_cols = [
    "actual", "predicted", "abs_error",
    "sale_year", "town", "flat_type",
]
worst20 = test_df.sort_values(by="abs_error", ascending=False).iloc[:20]
print("\nTop-20 worst absolute errors")
print(worst20.loc[:, display_cols].to_string())


Top-20 worst absolute errors
          actual     predicted      abs_error  sale_year        town flat_type
34522  1400000.0  7.799399e+05  620060.056667       2025   toa_payoh    5_room
34523  1400000.0  8.069343e+05  593065.713333       2025   toa_payoh    5_room
34526  1360000.0  7.780368e+05  581963.250000       2025   toa_payoh    5_room
34524  1360000.0  7.828199e+05  577180.100000       2025   toa_payoh    5_room
34543  1370000.0  8.038904e+05  566109.566667       2025   toa_payoh    5_room
34525  1342000.0  7.770663e+05  564933.709143       2025   toa_payoh    5_room
34518  1340000.0  7.844338e+05  555566.150267       2025   toa_payoh    5_room
34527  1335000.0  7.800265e+05  554973.482476       2025   toa_payoh    5_room
34521  1320000.0  7.898874e+05  530112.591200       2025   toa_payoh    5_room
34517  1330000.0  8.041716e+05  525828.413600       2025   toa_payoh    5_room
17793  1050000.0  5.615727e+05  488427.320000       2024       bedok    5_room
34519  1268000.0  7.80

In [3]:
# 5.1.3 Error slices – town & flat_type
def mae_by(group_col, top_n=10):
    """Return the groups of ``test_df`` with the highest mean absolute error.

    Reads the module-level ``test_df`` and its ``abs_error`` column.

    Parameters
    ----------
    group_col : str
        Column of ``test_df`` to group by (e.g. ``"town"``).
    top_n : int or None, default 10
        Number of worst groups to keep; ``None`` keeps every group.

    Returns
    -------
    pandas.Series
        Mean ``abs_error`` per group, named ``"error"``, sorted from worst to
        best and cut to ``top_n`` entries.
    """
    # Group the existing column directly instead of copying the whole frame
    # just to alias it; the rename keeps the Series name callers already see.
    ranked = (test_df
              .groupby(group_col)["abs_error"]
              .mean()
              .rename("error")
              .sort_values(ascending=False))
    return ranked if top_n is None else ranked.head(top_n)

# Print each heading followed by its ranking, one per line.
print("\nWorst towns by MAE", mae_by("town"), sep="\n")

print("\nWorst flat_types by MAE", mae_by("flat_type"), sep="\n")


Worst towns by MAE
town
bishan             93900.855922
bukit_timah        90761.063945
kallang_whampoa    85994.850514
serangoon          84270.885236
toa_payoh          79984.550668
central_area       75347.186024
queenstown         75182.501554
geylang            70670.526312
sengkang           68454.167353
bukit_merah        67913.570122
Name: error, dtype: float64

Worst flat_types by MAE
flat_type
multi_generation    158238.371215
executive            77346.377792
5_room               69192.965932
4_room               64254.267024
3_room               42910.146757
2_room               30828.514638
Name: error, dtype: float64


In [5]:
# Guard: stop with an actionable message when the trained model is absent.
# NOTE(review): the first cell already loads this file, so this guard only
# helps if it runs before that load.
if not PROJECT_ROOT.joinpath("models/random_forest_model.joblib").exists():
    raise FileNotFoundError("Model file missing - run 04_tree_model.ipynb first")

# Safe category reconstruction
def safe_reconstruct_category(df, prefix):
    """Rebuild a categorical column from one-hot dummies, never returning NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the dummy columns; column labels must be strings.
    prefix : str
        Leading text shared by the dummy columns (e.g. ``'town_'``).

    Returns
    -------
    pandas.Series
        Indexed like ``df``. Each value is the label of the row's largest
        dummy column with the leading ``prefix`` removed, or ``'unknown'``
        when the row's dummies sum to zero or no dummy column exists.
    """
    cols = [col for col in df.columns if col.startswith(prefix)]
    if not cols:
        return pd.Series('unknown', index=df.index)

    dummies = df[cols]
    # Slice the prefix off the front only: str.replace would also delete later
    # occurrences inside the category name ("town_new_town_x" -> "new_x") and
    # may interpret the prefix as a regular expression.
    result = dummies.idxmax(axis=1).str[len(prefix):]
    # Handle all-zero rows: idxmax would otherwise report the first dummy column
    return result.mask(dummies.sum(axis=1) == 0, 'unknown')


In [7]:
# ---------------------------------------------------------------
# 5.2 Tiny improvement – log-transform price
# ---------------------------------------------------------------
print("\nEvaluating log-transformed target model...")

# Train a second forest on log(1 + price) instead of the raw price.
y_train_log = np.log1p(y_train)
rf_log = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42).fit(
    X_train, y_train_log
)

# Undo the log1p with expm1 so the error is measured in SGD again.
preds_log = np.expm1(rf_log.predict(X_test))
mae_log = mean_absolute_error(y_test, preds_log)

# A positive delta means the log-target model beats the stored baseline MAE.
print(f"\nBaseline MAE : {mae:,.0f} SGD")
print(f"Log-target MAE: {mae_log:,.0f} SGD  (Δ {mae-mae_log:,.0f})")

print("\nPhase 5: Error analysis complete.")


Evaluating log-transformed target model...

Baseline MAE : 59,963 SGD
Log-target MAE: 61,160 SGD  (Δ -1,197)

Phase 5: Error analysis complete.
