In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Resolve the project root. `__file__` is not defined inside a notebook
# kernel, so the NameError branch falls back to the parent of the current
# working directory.
# NOTE(review): the fallback assumes the kernel is started from a direct
# child of the project root (e.g. a notebooks/ folder) - confirm.
try:
    PROJECT_ROOT = Path(__file__).resolve().parent.parent
except NameError:
    PROJECT_ROOT = Path('.').resolve().parent

# Load the processed train/test splits. `.squeeze()` turns a single-column
# target frame into a Series.
X_train = pd.read_csv(PROJECT_ROOT / "data/processed/X_train_processed.csv")
y_train = pd.read_csv(PROJECT_ROOT / "data/processed/y_train_processed.csv").squeeze()
X_test = pd.read_csv(PROJECT_ROOT / "data/processed/X_test_processed.csv")
y_test = pd.read_csv(PROJECT_ROOT / "data/processed/y_test_processed.csv").squeeze()

# Original categorical test data, kept alongside the encoded test features.
df_test_original_categorical = pd.read_csv(PROJECT_ROOT / "data/processed/df_test_original_categorical.csv")

# Check for the model artifact BEFORE loading it, so a missing file produces
# an actionable message instead of a low-level error from the loader.
model_path = PROJECT_ROOT / "models/random_forest_model.joblib"
if not model_path.exists():
    raise FileNotFoundError("Model file missing - run 04_tree_model.ipynb first")
# NOTE: joblib.load unpickles the file; only load artifacts you trust.
rf = joblib.load(model_path)

# Baseline MAE of the RandomForest model as recorded in the metrics report.
metrics_df = pd.read_csv(PROJECT_ROOT / "reports/model_metrics.csv")
baseline_rows = metrics_df.loc[metrics_df['model'] == 'RandomForest', 'MAE_SGD']
if baseline_rows.empty:
    # Without this guard `.iloc[0]` would raise an uninformative IndexError.
    raise ValueError("No 'RandomForest' row found in reports/model_metrics.csv")
mae = baseline_rows.iloc[0]

In [None]:
# Error analysis: build a per-row view of the test set with the model's
# prediction and its absolute error.

test_preds = rf.predict(X_test)

# `assign` returns a new frame, so X_test itself is left untouched.
test_df = X_test.assign(
    actual=y_test.values,
    predicted=test_preds,
    abs_error=lambda frame: (frame["actual"] - frame["predicted"]).abs(),
)

def reconstruct_category(df, prefix):
    """Rebuild a single categorical column from its one-hot encoded dummies.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the dummy columns, each named ``prefix + category``.
    prefix : str
        Leading text shared by the dummy columns, e.g. ``'town_'``.

    Returns
    -------
    pandas.Series
        Aligned to ``df.index`` and named ``prefix.rstrip('_')``. Each value
        is the name of the row's largest dummy column with the leading
        ``prefix`` removed. The value is NaN when ``df`` has no column
        starting with ``prefix``, or when every dummy in the row is zero.
    """
    series_name = prefix.rstrip('_')

    # Collect every column that starts with the prefix.
    category_dummy_cols = [col for col in df.columns if col.startswith(prefix)]
    if not category_dummy_cols:
        # Nothing to reconstruct from: return NaNs.
        return pd.Series(np.nan, index=df.index, name=series_name)

    dummies = df[category_dummy_cols]

    # Slice the prefix off instead of using str.replace(prefix, ''): replace
    # also deletes later occurrences of the prefix text inside a category
    # name, and on pandas < 2.0 it interprets the prefix as a regex.
    labels = dummies.idxmax(axis=1).str[len(prefix):]

    # idxmax reports the first column for an all-zero row, which would
    # silently assign that row a category it does not have. Blank those rows.
    all_zero = dummies.eq(0).all(axis=1)
    return labels.mask(all_zero).rename(series_name)


# Restore readable town / flat_type labels from their one-hot columns.
for cat_col, dummy_prefix in (("town", "town_"), ("flat_type", "flat_type_")):
    test_df[cat_col] = reconstruct_category(test_df, dummy_prefix)

# 5.1.2 The 20 test rows with the largest absolute error.
# The last two display columns are the reconstructed categoricals.
display_cols = [
    "actual",
    "predicted",
    "abs_error",
    "sale_year",
    "town",
    "flat_type",
]
ranked_by_error = test_df.sort_values("abs_error", ascending=False)
worst20 = ranked_by_error.head(20)
print("\nTop-20 worst absolute errors")
print(worst20[display_cols].to_string())

In [None]:
# 5.1.3 I'll slice the errors by town and flat_type.
def mae_by(group_col, df=None, top_n=10):
    """Return the groups with the highest mean absolute error.

    Parameters
    ----------
    group_col : str
        Column to group by.
    df : pandas.DataFrame, optional
        Frame holding ``group_col`` and an ``abs_error`` column. Defaults to
        the notebook-level ``test_df``.
    top_n : int, default 10
        Number of worst groups to return.

    Returns
    -------
    pandas.Series
        Mean ``abs_error`` per group, sorted descending, limited to
        ``top_n`` entries and named ``error``.
    """
    # Only fall back to the global frame when no frame is supplied.
    frame = test_df if df is None else df
    return (frame
            .groupby(group_col)["abs_error"]
            .mean()
            .sort_values(ascending=False)
            .head(top_n)
            # Keep the result name callers previously saw in printed output.
            .rename("error"))

# Report the highest-MAE groups for each reconstructed categorical.
# A single print with sep="\n" emits the heading and the table on
# consecutive lines.
print("\nWorst towns by MAE", mae_by("town"), sep="\n")

print("\nWorst flat_types by MAE", mae_by("flat_type"), sep="\n")

In [None]:
# Guard: stop with an actionable message when the trained model artifact
# is not on disk.
model_file = PROJECT_ROOT / "models/random_forest_model.joblib"
model_missing = not model_file.exists()
if model_missing:
    raise FileNotFoundError("Model file missing - run 04_tree_model.ipynb first")

# I'll create a safe category reconstruction function.
def safe_reconstruct_category(df, prefix):
    """Rebuild a categorical column from one-hot dummies, labelling gaps 'unknown'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the dummy columns, each named ``prefix + category``.
    prefix : str
        Leading text shared by the dummy columns, e.g. ``'town_'``.

    Returns
    -------
    pandas.Series
        Aligned to ``df.index``. Each value is the name of the row's largest
        dummy column with the leading ``prefix`` removed. The value is
        ``'unknown'`` when ``df`` has no column starting with ``prefix``, or
        when the row's dummies sum to zero.
    """
    cols = [col for col in df.columns if col.startswith(prefix)]
    if not cols:
        return pd.Series('unknown', index=df.index)

    dummies = df[cols]

    # Slice the prefix off instead of using str.replace(prefix, ''): replace
    # also deletes later occurrences of the prefix text inside a category
    # name, and on pandas < 2.0 it interprets the prefix as a regex.
    result = dummies.idxmax(axis=1).str[len(prefix):]

    # idxmax reports the first column for an all-zero row, so overwrite
    # those rows explicitly.
    result[dummies.sum(axis=1) == 0] = 'unknown'
    return result


In [None]:
# ---------------------------------------------------------------
# 5.2    Small experiment: train the forest on log1p(price).
# ---------------------------------------------------------------
print("\nEvaluating log-transformed target model...")

# Fit a fresh forest on the log1p-transformed training target.
y_train_log = np.log1p(y_train)
rf_log = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    n_estimators=100,
).fit(X_train, y_train_log)

# Predict on the log scale, then invert with expm1 so the MAE is measured
# in the original target units.
log_scale_preds = rf_log.predict(X_test)
preds_log = np.expm1(log_scale_preds)
mae_log = mean_absolute_error(y_test, preds_log)

# Compare against the baseline MAE loaded from the metrics report.
print(f"\nBaseline MAE : {mae:,.0f} SGD")
print(f"Log-target MAE: {mae_log:,.0f} SGD  (Δ {mae-mae_log:,.0f})")

print("\nPhase 5: Error analysis complete.")