# Random Forest Regression on kc_house_data_encoded_tree_step3

Workflow:
1. Setup and Imports
2. Model Hypothesis
3. Load Dataset
4. Prepare Features and Target
5. Train/Test Split
6. Baseline Random Forest (sklearn)
7. Cross Validation
8. Convert RF to PyTorch Model (running on GPU/MPS)
9. GPU Prediction (MPS)
10. Test Set Evaluation (GPU)
11. Feature Importance (followed by Optuna hyperparameter tuning)
12. Summary


# 1. SETUP AND IMPORTS

In [None]:
import pandas as pd
import numpy as np
import torch
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from hummingbird.ml import convert

# Apply the whitegrid seaborn theme to all figures drawn afterwards.
sns.set(style="whitegrid")

# Seed for the randomized steps in this notebook (e.g. the KFold shuffle).
RANDOM_STATE = 42

# Use the Apple MPS backend when PyTorch reports it as available; otherwise CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Using device:", device)


def print_metrics(y_true, y_pred, label=""):
    """Print RMSE, MAE and R^2 for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.
        label: Text shown in the ``--- label ---`` header line.
    """
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    report = [
        f"--- {label} ---",
        f"RMSE: {rmse:.4f}",
        f"MAE : {mean_absolute_error(y_true, y_pred):.4f}",
        # The trailing newline leaves a blank line after each report.
        f"R^2 : {r2_score(y_true, y_pred):.4f}\n",
    ]
    print("\n".join(report))



: 

# 2. MODEL PERFORMANCE HYPOTHESIS
Hypothesis:
- A Random Forest Regressor on the tree-encoded dataset should capture
  nonlinear relationships and interactions between features more effectively
  than a linear model on the same log_price target.
- We expect:
    - Higher test R^2 than the linear baseline (potentially > 0.80),
    - Lower RMSE in log_price space,
    - Better robustness to outliers and complex patterns.
- Hyperparameter tuning (n_estimators, max_depth, etc.) should yield
  modest but meaningful improvements in cross-validated performance.



# 3. LOAD ENCODED DATASET

In [None]:
# Load the encoded dataset; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="../data/kc_house_data_encoded_tree_step3.csv")

# Quick sanity check: dimensions plus the first five rows.
print("Dataset shape:", df.shape)
print(df.head(n=5))

# 4. PREPARE FEATURES AND TARGET


In [None]:
target_col = "log_price"

# Sanity check: make sure the target exists. An explicit raise is used instead
# of `assert`, because assert statements are removed when Python runs with -O.
if target_col not in df.columns:
    raise KeyError(f"{target_col} not found in dataset columns.")

# Features are every column except the target; the target is a NumPy array.
X = df.drop(columns=[target_col])
y = df[target_col].values

# List any object-dtype columns that are still present among the features.
object_cols = X.select_dtypes(include=["object"]).columns.tolist()
print("Object columns in X (should ideally be none for tree-encoded data):", object_cols)

print("X shape:", X.shape)
print("y shape:", y.shape)

# 5. TRAIN/TEST SPLIT


In [None]:
# Hold out 20% of the rows as the test set. The seed comes from the shared
# RANDOM_STATE constant rather than a repeated literal.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# 6. BASELINE RANDOM FOREST (CPU TRAINED)


In [None]:
# Baseline forest, trained with scikit-learn using all available cores.
# The seed comes from the shared RANDOM_STATE constant rather than a literal.
rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=40,
    min_samples_leaf=1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

rf.fit(X_train, y_train)

# Score on both splits so the train/test gap is visible.
train_pred = rf.predict(X_train)
test_pred = rf.predict(X_test)

print_metrics(y_train, train_pred, "Baseline RF Train")
print_metrics(y_test, test_pred, "Baseline RF Test")


# 7. CROSS VALIDATION 

In [None]:
from sklearn.model_selection import cross_validate

kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Evaluate the baseline estimator `rf` (the name `baseline_rf` was never defined).
# cross_validate clones the estimator for each fold and scores all three
# metrics from a single fit per fold, instead of refitting once per metric.
cv_results = cross_validate(
    rf,
    X,
    y,
    cv=kf,
    scoring=("neg_root_mean_squared_error", "neg_mean_absolute_error", "r2"),
)

# The error scorers are negated by scikit-learn; flip the sign back.
cv_rmse_baseline = -cv_results["test_neg_root_mean_squared_error"]
cv_mae_baseline = -cv_results["test_neg_mean_absolute_error"]
cv_r2_baseline = cv_results["test_r2"]

print("=== BASELINE RANDOM FOREST 5-FOLD CV ===")
print(f"RMSE: mean={cv_rmse_baseline.mean():.4f}, std={cv_rmse_baseline.std():.4f}")
print(f"MAE : mean={cv_mae_baseline.mean():.4f}, std={cv_mae_baseline.std():.4f}")
print(f"R^2 : mean={cv_r2_baseline.mean():.4f}, std={cv_r2_baseline.std():.4f}\n")


# 8. CONVERT SKLEARN RF → PYTORCH RF (FOR GPU INFERENCE)


In [None]:
print("\nConverting Random Forest to PyTorch (MPS)...")

# Pass a NumPy array rather than the DataFrame so Hummingbird treats it as a
# single input, and let convert() place the model on the selected device.
hb_model = convert(rf, "pytorch", X_train.values, device=device)

print("Conversion complete. Model is now on:", device)


# 9. GPU PREDICTION (MPS)


In [None]:
from hummingbird.ml import convert

print("\nConverting Random Forest to PyTorch (MPS)...")

# A NumPy array (not the DataFrame) is passed so Hummingbird sees one input.
hb_model = convert(
    rf,
    "pytorch",
    X_train.values,
    device=device,
)
print("Conversion complete. Model is now on:", device)

print("\nRunning predictions on MPS...")
# Same rule at inference time: pass the raw array; predict() returns a NumPy array.
gpu_preds = hb_model.predict(X_test.values)

print_metrics(y_true=y_test, y_pred=gpu_preds, label="MPS RF Test (log_price)")


# 10. TEST SET EVALUATION 


In [None]:
# Figure 1: predicted vs. actual, with the identity line as a reference.
plt.figure(figsize=(6, 6))
sns.scatterplot(x=y_test, y=gpu_preds, alpha=0.4)
diag_bounds = [y_test.min(), y_test.max()]
plt.plot(diag_bounds, diag_bounds, "r--")
plt.title("Actual vs Predicted (MPS Random Forest)")
plt.xlabel("Actual log_price")
plt.ylabel("Predicted log_price")
plt.tight_layout()
plt.show()

# Figure 2: distribution of the residuals (actual minus predicted).
residuals = y_test - gpu_preds

plt.figure(figsize=(6, 4))
sns.histplot(residuals, bins=40, kde=True)
plt.title("Residual Distribution (MPS RF)")
plt.show()


# 11. FEATURE IMPORTANCE


In [None]:
# Pair each feature name with the fitted forest's importance, highest first.
importances = rf.feature_importances_
feat_imp = pd.DataFrame(
    {"feature": X.columns, "importance": importances}
).sort_values("importance", ascending=False)

top_features = feat_imp.head(20)

print("Top 20 features:")
print(top_features)

# Horizontal bar chart of the same top-20 rows.
plt.figure(figsize=(8, 6))
sns.barplot(data=top_features, x="importance", y="feature")
plt.title("Top 20 Feature Importances")
plt.tight_layout()
plt.show()

In [None]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from hummingbird.ml import convert

print("------------------------------------------------")
print("1. STARTING OPTUNA OPTIMIZATION")
print("------------------------------------------------")

# Carve a validation split out of the TRAINING data for hyperparameter search.
# Tuning against the test set would leak it into model selection and make the
# final test-set comparison optimistic.
X_tune_train, X_tune_val, y_tune_train, y_tune_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=RANDOM_STATE
)


# 1. Define the Objective Function
def objective(trial):
    """Fit a forest with trial-suggested hyperparameters and return validation R^2.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        R^2 of the fitted model on the held-out validation split.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_float('max_features', 0.5, 0.9),
        'n_jobs': -1,
        'random_state': RANDOM_STATE,
    }

    # Train on the tuning-train part only; the test set is never touched here.
    model = RandomForestRegressor(**params)
    model.fit(X_tune_train, y_tune_train)

    preds = model.predict(X_tune_val)
    return r2_score(y_tune_val, preds)


# 2. Run Optimization
# Create a study to MAXIMIZE R^2; the sampler is seeded so reruns are reproducible.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=15)  # 15 trials to keep it relatively fast

print("\n------------------------------------------------")
print(f"BEST PARAMS FOUND: {study.best_params}")
print("------------------------------------------------")

# 3. Train Final Model with Best Parameters
# best_params holds only the sampled hyperparameters, so n_jobs and
# random_state are supplied again here. The final fit uses the full training set.
print("\nTraining Final Model with Best Parameters...")
best_rf = RandomForestRegressor(
    **study.best_params, n_jobs=-1, random_state=RANDOM_STATE
)
best_rf.fit(X_train, y_train)

# 4. Convert to MPS (GPU) for Inference
# convert() already places the model on `device`, so no extra .to() call is needed.
print("Converting to PyTorch (MPS)...")
hb_tuned = convert(best_rf, "pytorch", X_train.values, device=device)

# 5. Predict & Evaluate (first and only use of the test set for the tuned model)
tuned_preds = hb_tuned.predict(X_test.values)

print("\n")
print("========================================")
print("       RESULTS COMPARISON")
print("========================================")

# Use your previous 'test_pred' (CPU) or 'gpu_preds' (MPS) from Step 6/9 for comparison
baseline_r2 = r2_score(y_test, gpu_preds)
tuned_r2 = r2_score(y_test, tuned_preds)

print(f"Baseline R^2 : {baseline_r2:.5f}")
print(f"Optuna R^2   : {tuned_r2:.5f}")
print("----------------------------------------")
print(f"Improvement  : {tuned_r2 - baseline_r2:.5f}")
print("========================================")

# 12. SUMMARY


In [None]:
print("\n=== SUMMARY ===")
# One line per model: test-set R^2 for the sklearn and converted predictions.
for summary_label, summary_pred in (
    ("Baseline RF Test R^2:", test_pred),
    ("MPS RF Test R^2:", gpu_preds),
):
    print(summary_label, r2_score(y_test, summary_pred))