# 03 - Model Training and Hyperparameter Tuning

## Objectives
- Load cleaned data
- Split into train/test sets
- Train Random Forest Regressor
- Perform hyperparameter tuning with GridSearchCV
- Evaluate model performance
- Save trained model for deployment

In [1]:
import os
# Current working directory of the kernel process, kept as a string.
current_dir: str = os.getcwd()
# Bare expression so the notebook displays the path as this cell's output.
current_dir

'/Users/sararosati/Desktop/vscode-projects/Heritage-Housing/Heritage-housing/jupyter_notebooks'

In [2]:
# Make the project root the working directory so that relative paths such as
# 'outputs/X_y_cleaned.csv' resolve. The move is guarded on the folder name:
# an unconditional chdir to the parent climbs one extra level every time this
# cell is re-run (or when the kernel is already started in the project root).
NOTEBOOK_DIR_NAME = "jupyter_notebooks"  # folder this notebook lives in

if os.path.basename(os.getcwd()) == NOTEBOOK_DIR_NAME:
    os.chdir(os.path.dirname(os.getcwd()))
    print("You set a new current directory")
else:
    print("Current directory left unchanged")

# Refresh the name so it always reflects the directory actually in effect.
current_dir = os.getcwd()
current_dir

You set a new current directory


'/Users/sararosati/Desktop/vscode-projects/Heritage-Housing/Heritage-housing'

## Section 1: Import Libraries and Load Data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib

# Notebook-wide plotting defaults: seaborn white-grid theme, 12x6 figures
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("All libraries imported successfully!")

All libraries imported successfully!


In [4]:
# Read the cleaned table produced by the previous notebook
df_cleaned = pd.read_csv('outputs/X_y_cleaned.csv')

# Target is the SalePrice column; every remaining column is a feature
y = df_cleaned['SalePrice']
X = df_cleaned.drop(columns='SalePrice')

print("Data loaded successfully!")
print(f"X shape: {X.shape}", f"y shape: {y.shape}", sep="\n")
print("\nFirst 5 rows of X:")
print(X.head())

Data loaded successfully!
X shape: (1460, 21)
y shape: (1460,)

First 5 rows of X:
   1stFlrSF  2ndFlrSF  BedroomAbvGr  BsmtExposure  BsmtFinSF1  BsmtFinType1  \
0       856     854.0           3.0             3         706             2   
1      1262       0.0           3.0             1         978             0   
2       920     866.0           3.0             2         486             2   
3       961       0.0           3.0             3         216             0   
4      1145       0.0           4.0             0         655             2   

   BsmtUnfSF  GarageArea  GarageFinish  GarageYrBlt  ...  KitchenQual  \
0        150         548             1       2003.0  ...            2   
1        284         460             1       1976.0  ...            3   
2        434         608             1       2001.0  ...            2   
3        540         642             2       1998.0  ...            2   
4        490         836             1       2000.0  ...            2   

   

## Section 2: Train/Test Split

In [5]:
# Hold out 20% of the rows as a test set. The integer random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Sanity check: the two parts must add up to the full dataset.
assert len(X_train) + len(X_test) == len(X), "train/test sizes do not add up"

print("Train/Test Split Summary:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Report the shares actually obtained rather than hard-coding "80%"/"20%",
# so the summary cannot drift out of sync if test_size is changed.
train_share = len(X_train) / len(X)
test_share = len(X_test) / len(X)
print(f"\nTrain set size: {len(X_train)} cases ({train_share:.0%})")
print(f"Test set size: {len(X_test)} cases ({test_share:.0%})")

print("\n" + "="*50)
print("Data split successfully!")

Train/Test Split Summary:
X_train shape: (1168, 21)
X_test shape: (292, 21)
y_train shape: (1168,)
y_test shape: (292,)

Train set size: 1168 cases (80%)
Test set size: 292 cases (20%)

Data split successfully!


## Section 3: Model Training - Random Forest Baseline

In [6]:
# Baseline Random Forest: 100 trees, every other hyperparameter left at the
# library default. It is fitted on the training split and scored on both
# splits.

# Minimum acceptable test R². Kept in one place so the printed target, the
# pass/fail check and the reported shortfall can never disagree.
R2_TARGET = 0.75

print("Training baseline Random Forest Regressor...")
print("This may take a minute...\n")

rf_baseline = RandomForestRegressor(
    n_estimators=100,
    random_state=42,  # fixed seed so the fitted forest is reproducible
    n_jobs=-1         # build trees in parallel on all available cores
)

rf_baseline.fit(X_train, y_train)

# Predictions on both splits: the train score is only used to expose the
# gap to the test score, the test score is what the target is checked against.
y_pred_train_baseline = rf_baseline.predict(X_train)
y_pred_test_baseline = rf_baseline.predict(X_test)

# Evaluate
r2_train_baseline = r2_score(y_train, y_pred_train_baseline)
r2_test_baseline = r2_score(y_test, y_pred_test_baseline)
mae_test_baseline = mean_absolute_error(y_test, y_pred_test_baseline)
# RMSE is the square root of the MSE, so it is in the same units as the target.
rmse_test_baseline = np.sqrt(mean_squared_error(y_test, y_pred_test_baseline))

print("="*50)
print("BASELINE MODEL PERFORMANCE")
print("="*50)
print(f"R² Score (Train): {r2_train_baseline:.4f}")
print(f"R² Score (Test):  {r2_test_baseline:.4f}")
print(f"MAE (Test): ${mae_test_baseline:,.2f}")
print(f"RMSE (Test): ${rmse_test_baseline:,.2f}")
print(f"\nTarget R² Score: ≥ {R2_TARGET}")
if r2_test_baseline >= R2_TARGET:
    print("✅ TARGET ACHIEVED!")
else:
    print(f"⚠️  Need improvement: {R2_TARGET - r2_test_baseline:.4f}")

Training baseline Random Forest Regressor...
This may take a minute...

BASELINE MODEL PERFORMANCE
R² Score (Train): 0.9801
R² Score (Test):  0.8936
MAE (Test): $18,023.90
RMSE (Test): $28,569.06

Target R² Score: ≥ 0.75
✅ TARGET ACHIEVED!
