<a href="https://colab.research.google.com/github/Taweilo/usa-house-price-prediction/blob/main/hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install optuna
! pip install catboost
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.3.2

Found existing installation: scikit-learn 1.3.2
Uninstalling scikit-learn-1.3.2:
  Successfully uninstalled scikit-learn-1.3.2
Collecting scikit-learn==1.3.2
  Using cached scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Using cached scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.3.2


In [None]:
import re
import warnings

import numpy as np
import pandas as pd

import catboost as cb
import lightgbm as lgb
import optuna
import xgboost
from catboost import CatBoostRegressor, Pool
from lightgbm import log_evaluation, early_stopping
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from xgboost import XGBRegressor, callback

warnings.filterwarnings("ignore")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



## Import data

In [None]:
# Read the training CSV from the project's GitHub repository and discard the "Id" column
# (presumably a plain row identifier) so it is never used as a feature.
file_path = 'https://raw.githubusercontent.com/Taweilo/house-price-prediction/main/Data/train.csv'
df = pd.read_csv(file_path).drop(columns=["Id"])
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## Data preparation

In [None]:
# Names of every non-numeric (object or categorical) column, in frame order.
cat_cols = list(df.select_dtypes(include=['object', 'category']).columns)

In [None]:
# Convert every text column of `df` into an unordered pandas categorical so the
# tree libraries can consume it natively.
# Reference: https://www.kaggle.com/code/ambrosm/pss4e8-eda-which-makes-sense
cat_features = df.select_dtypes('object').columns

for feature in cat_features:
    # Sort the observed (non-missing) levels so the category order is deterministic.
    categories = sorted(df[feature].dropna().unique())
    dtype = pd.CategoricalDtype(categories=categories, ordered=False)

    # astype() already maps missing values (and anything outside `categories`) to NaN,
    # so no manual masking of the column is needed beforehand.
    df[feature] = df[feature].astype(dtype)

In [None]:
# Target vector and feature matrix: everything except the sale price is a predictor.
y = df["SalePrice"]
X = df.drop(columns=['SalePrice'])

In [None]:
# Hold out 20% of the rows, then cut that hold-out in half:
# the result is an 80% train / 10% validation / 10% test partition.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Sanity check: report the feature/target shapes of every partition.
for _split_name, _split_X, _split_y in (
    ("Train:", X_train, y_train),
    ("Validation:", X_val, y_val),
    ("Test:", X_test, y_test),
):
    print(_split_name, _split_X.shape, _split_y.shape)

Train: (1168, 79) (1168,)
Validation: (146, 79) (146,)
Test: (146, 79) (146,)


## Modeling

- XGB

In [None]:
# Record the XGBoost version used for this run (xgboost is imported in the imports cell).
print(xgboost.__version__)

2.1.3


In [None]:
# Set logging to ERROR to suppress INFO messages from Optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)


def objective(trial):
    """Optuna objective for XGBoost.

    Samples one hyperparameter set and scores it with shuffled 5-fold
    cross-validation on ``X_train`` / ``y_train``.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-fold ``neg_mean_squared_error`` scores
        (negative MSE, so higher is better).
    """
    params = {
        # Dict keys equal the Optuna parameter names and XGBRegressor's keyword
        # arguments, so `study.best_params` can be unpacked into XGBRegressor as-is.
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 10, log=True),  # Match LGBM's reg_lambda
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 10, log=True),  # Match LGBM's reg_alpha
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 1.0, log=True),  # Match LGBM
        "max_depth": trial.suggest_int("max_depth", 2, 50),  # Match LGBM
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-4, 100, log=True),  # Match LGBM
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),  # Match LGBM
        "tree_method": "hist",
        "device": "cuda",  # Histogram tree building on the GPU; needs a GPU runtime
        "random_state": 42,
    }

    # enable_categorical lets XGBoost consume the pandas categorical columns directly.
    model = XGBRegressor(**params, enable_categorical=True)

    # Fixed, shuffled folds so every trial is scored on identical splits.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

    return scores.mean()


# Maximizing the negative MSE is the same as minimizing the MSE.
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print("Best parameters found by Optuna: ", best_params)

Best parameters found by Optuna:  {'reg_lambda': 0.12688228439958346, 'reg_alpha': 0.000889636209268654, 'colsample_bytree': 0.613956213715118, 'subsample': 0.7536161508442696, 'learning_rate': 0.0531454528384571, 'max_depth': 3, 'min_child_weight': 0.1476017401560544, 'n_estimators': 912}


In [None]:
# Refit XGBoost with the tuned hyperparameters. Training stops once the validation
# RMSE has not improved for 10 consecutive rounds.
xgb_model = XGBRegressor(
    enable_categorical=True,
    random_state=42,
    eval_metric="rmse",
    early_stopping_rounds=10,
    **best_params,
)

eval_set = [(X_val, y_val)]
xgb_model.fit(X_train, y_train, eval_set=eval_set, verbose=0)

# Score on the validation split. The same split drove early stopping above,
# so these numbers are an optimistic estimate.
xgb_val_pred = xgb_model.predict(X_val)
xgb_mse, xgb_r2 = (
    mean_squared_error(y_val, xgb_val_pred),
    r2_score(y_val, xgb_val_pred),
)

print(f"MSE on validation set: {xgb_mse}")
print(f"R² on validation set: {xgb_r2}")

MSE on validation set: 440113553.4553324
R² on validation set: 0.9248783987198703


- LightGBM

In [None]:
# Set logging to ERROR to suppress INFO messages from Optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)


def objective(trial):
    """Optuna objective for LightGBM.

    Samples one hyperparameter set and scores it with shuffled 5-fold
    cross-validation on ``X_train`` / ``y_train``.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-fold ``neg_mean_squared_error`` scores
        (negative MSE, so higher is better).
    """
    params = {
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 10, log=True),  # Match XGBoost's lambda
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 10, log=True),  # Match XGBoost's alpha
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 1.0, log=True),  # Match XGBoost
        "max_depth": trial.suggest_int("max_depth", 2, 50),  # Match XGBoost
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-4, 100, log=True),  # Match XGBoost
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),  # Match XGBoost
        "num_leaves": trial.suggest_int("num_leaves", 2, 500),  # Unique to LGBM
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),  # Unique to LGBM
        # Column and row subsampling are sampled once, under LightGBM's own names.
        # `colsample_bytree` / `subsample` are aliases of these two parameters; sampling
        # them as well only adds search dimensions whose values LightGBM ignores.
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),  # XGB's colsample_bytree
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),  # XGB's subsample
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),  # Unique to LGBM
        "cat_smooth": trial.suggest_float("cat_smooth", 1, 100, log=True),  # Unique to LGBM
        "random_state": 42,
        "verbose": -1,
    }

    model = lgb.LGBMRegressor(**params)

    # Fixed, shuffled folds so every trial is scored on identical splits.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

    return scores.mean()


# Maximizing the negative MSE is the same as minimizing the MSE.
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print("Best parameters found by Optuna: ", best_params)

Best parameters found by Optuna:  {'reg_lambda': 2.0246287271463874, 'reg_alpha': 0.008333484840564514, 'colsample_bytree': 0.5299443544711497, 'subsample': 0.5421615536336953, 'learning_rate': 0.09941176488664437, 'max_depth': 39, 'min_child_weight': 15.96738750769411, 'n_estimators': 855, 'num_leaves': 229, 'min_data_in_leaf': 44, 'feature_fraction': 0.5652203298763192, 'bagging_fraction': 0.826706041753661, 'bagging_freq': 2, 'cat_smooth': 4.350099519287858}


In [None]:
# Refit LightGBM with the tuned hyperparameters. Progress is logged every 150 rounds and
# training stops once the validation metric has not improved for 10 rounds.
callbacks = [log_evaluation(period=150), early_stopping(stopping_rounds=10)]
eval_set = [(X_val, y_val)]  # Validation split monitored by the early-stopping callback

lgb_model = lgb.LGBMRegressor(random_seed=42, **best_params)
lgb_model.fit(X_train, y_train, eval_set=eval_set, eval_metric='rmse', callbacks=callbacks)

# Score on the validation split. The same split drove early stopping above,
# so these numbers are an optimistic estimate.
lgb_val_pred = lgb_model.predict(X_val)
lgb_mse, lgb_r2 = (
    mean_squared_error(y_val, lgb_val_pred),
    r2_score(y_val, lgb_val_pred),
)
print(f"MSE on validation set: {lgb_mse}")
print(f"r2 on validation set: {lgb_r2}")

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[76]	valid_0's rmse: 19586.7	valid_0's l2: 3.8364e+08
MSE on validation set: 383640298.198284
r2 on validation set: 0.9345176414359925


- CatBoost

In [None]:
# Categorical columns of the feature matrix: first by name, then by positional index.
cat_features = list(X.select_dtypes(include=["object", "category"]).columns)
cat_feature_indices = list(map(X.columns.get_loc, cat_features))

In [None]:
def _with_string_categoricals(frame):
    """Return a copy of ``frame`` whose ``cat_features`` columns are cast to ``str``."""
    converted = frame.copy()
    converted[cat_features] = converted[cat_features].astype(str)
    return converted


# CatBoost gets its own copies of the three splits with string-typed categorical columns;
# the original frames are left untouched for the other models.
# NOTE(review): astype(str) presumably turns missing values into the literal string 'nan',
# which would make "missing" its own category -- confirm this is intended.
X_cat_train, X_cat_val, X_cat_test = map(
    _with_string_categoricals, (X_train, X_val, X_test)
)

In [None]:
def objective(trial):
    """Optuna objective for CatBoost with fold-level pruning.

    Samples one hyperparameter set and scores it with shuffled 5-fold
    cross-validation on ``X_cat_train`` / ``y_train``. The running mean score is
    reported to Optuna after each fold, so an unpromising trial can be pruned
    before the remaining folds are trained.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-fold negative MSE scores (higher is better).

    Raises:
        optuna.exceptions.TrialPruned: when the pruner stops the trial early.
    """
    params = {
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.5, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1, 10),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "cat_features": cat_features,  # Use categorical features
        "verbose": 0,
        "random_seed": 42,
        "task_type": "GPU",  # Use GPU instead of CPU
        "nan_mode": "Min",  # Numeric NaNs are treated as smaller than every other value
    }

    # Fixed, shuffled folds so every trial (and every pruning step) sees identical splits.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    last_fold = kf.get_n_splits() - 1
    fold_scores = []

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_cat_train)):
        # A fresh, unfitted model per fold, as cross_val_score would use.
        model = cb.CatBoostRegressor(**params)
        model.fit(X_cat_train.iloc[train_idx], y_train.iloc[train_idx])

        fold_pred = model.predict(X_cat_train.iloc[valid_idx])
        fold_scores.append(-mean_squared_error(y_train.iloc[valid_idx], fold_pred))

        # Report the running mean so the pruner compares like with like across trials.
        trial.report(float(np.mean(fold_scores)), step=fold)
        # Pruning after the final fold would save nothing and would discard a
        # fully evaluated trial, so the check is skipped there.
        if fold < last_fold and trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return float(np.mean(fold_scores))  # Mean negative MSE over all folds


# Maximizing the negative MSE is the same as minimizing the MSE.
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print("Best parameters found by Optuna: ", best_params)

[I 2025-02-09 10:22:36,395] A new study created in memory with name: no-name-69d39cbd-998f-47e7-bd41-d65e1d4ee2fb
[I 2025-02-09 10:30:52,705] Trial 0 finished with value: -857456250.1195538 and parameters: {'max_depth': 10, 'learning_rate': 0.1481582188478661, 'n_estimators': 943, 'l2_leaf_reg': 0.06721480123848561, 'bagging_temperature': 0.14461214642268172, 'random_strength': 4.122967267004657, 'border_count': 117}. Best is trial 0 with value: -857456250.1195538.
[I 2025-02-09 10:31:47,836] Trial 1 finished with value: -802957526.3835422 and parameters: {'max_depth': 7, 'learning_rate': 0.13738790902552409, 'n_estimators': 262, 'l2_leaf_reg': 0.0020639836512623814, 'bagging_temperature': 0.2643008977125484, 'random_strength': 4.978602478650439, 'border_count': 255}. Best is trial 1 with value: -802957526.3835422.
[I 2025-02-09 10:42:05,334] Trial 2 finished with value: -880919009.1047513 and parameters: {'max_depth': 11, 'learning_rate': 0.016345358932732118, 'n_estimators': 734, 'l2

Best parameters found by Optuna:  {'max_depth': 5, 'learning_rate': 0.1295643155523358, 'n_estimators': 304, 'l2_leaf_reg': 0.015683846010523354, 'bagging_temperature': 0.1863686770153567, 'random_strength': 4.385386925403732, 'border_count': 91}


In [None]:
# Refit CatBoost with the tuned hyperparameters. Progress is logged every 200 iterations
# and training stops once the validation metric has not improved for 10 iterations.
# NOTE(review): unlike the tuning objective, this model is built without task_type="GPU" --
# confirm that refitting on the default device is intended.
cat_model = cb.CatBoostRegressor(
    random_seed=42,
    verbose=200,
    early_stopping_rounds=10,
    **best_params,
)
cat_model.fit(
    X_cat_train,
    y_train,
    eval_set=(X_cat_val, y_val),
    cat_features=cat_features,
)

# Score on the validation split. The same split drove early stopping above,
# so these numbers are an optimistic estimate.
cat_val_pred = cat_model.predict(X_cat_val)
cat_mse, cat_r2 = (
    mean_squared_error(y_val, cat_val_pred),
    r2_score(y_val, cat_val_pred),
)

print(f"MSE on validation set: {cat_mse}")
print(f"R2 on validation set: {cat_r2}")

0:	learn: 71744.3115264	test: 71274.2183807	best: 71274.2183807 (0)	total: 10.6ms	remaining: 3.21s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 21226.59237
bestIteration = 108

Shrink model to first 109 iterations.
MSE on validation set: 450568223.55186427
R2 on validation set: 0.9230939238898156
