CatBoost Regression Modeling

In [23]:
import pickle
import pathlib
import numpy as np
import pandas as pd
import catboost

In [24]:
# The data directory is expected at ../data relative to the current working directory.
DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

# Path of the pickled modeling dataset (presumably the cleaned, feature-engineered
# Ames data, judging by the file name -- confirm against the producing notebook).
clean_data_path = DATA_DIR.joinpath('processed', 'ames_clean_eng.pkl')

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted step of this project.
with clean_data_path.open('rb') as file:
    model_data = pickle.load(file)

/Users/femcdias/Documents/Insper/Semestre 4/Machine Learning/Projeto1/ames/data


In [25]:
# Separate the predictors from the regression target ('SalePrice').
# Both are explicit copies, so later changes do not touch model_data.
X = model_data.drop(columns='SalePrice').copy()
y = model_data['SalePrice'].copy()

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
from catboost import CatBoostRegressor

# Baseline: CatBoost with library-default hyperparameters; verbose=0 keeps
# the training log out of the notebook output.
model = CatBoostRegressor(verbose=0)
model.fit(X_train, y_train)

# Baseline predictions for the held-out test rows.
ypred = model.predict(X_test)


In [28]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the baseline predictions on the test split.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=ypred))
print(f'RMSE: {RMSE}')

# Convert the RMSE into a relative error via 10**RMSE - 1.
# NOTE(review): this reading assumes the target is log10-transformed -- confirm
# against the preprocessing step that produced the dataset.
error_percentage = (10**RMSE - 1) * 100
print(f'Average error is {error_percentage:.2f}%')

RMSE: 0.04598116542355027
Average error is 11.17%


In [66]:
# Hyperparameter tuning through grid search
from sklearn.model_selection import GridSearchCV

# Silent CatBoost estimator handed to the grid search.
model = CatBoostRegressor(verbose=0)

# Every entry lists exactly one value, so this "grid" holds a single
# candidate; the search below therefore only cross-validates that candidate.
best_params_already_tuned = dict(
    bagging_temperature=[0],
    border_count=[32],
    depth=[6],
    iterations=[4000],
    l2_leaf_reg=[0.75],
    model_size_reg=[0.5],
    learning_rate=[0.05],
    od_wait=[100],
    random_strength=[3],
    random_seed=[42],
)

# Wider grid kept for reference only (presumably the one the values above
# were selected from; "infinito" in the name is Portuguese for "infinite").
# param_grid_pre_tuned_infinito = {
#     'depth': [3, 5, 6],
#     'learning_rate': [0.05, 0.1, 0.2],
#     'iterations': [100, 4000, 9000],
#     'l2_leaf_reg': [1, 2, 3],
#     'border_count': [32, 128],
#     'bagging_temperature': [0, 5],
#     'random_strength': [0.5, 1, 3],
#     'model_size_reg': [0.5, 0.75, 1],
#     'od_wait': [100, 200, 500],
#     'random_seed': [42],
# }

# 3-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=best_params_already_tuned,
    cv=3,
    n_jobs=-1,
    verbose=0,
)
grid_search.fit(X_train, y_train)

# Keep the selected parameter set (printed for the record) and the fitted
# best estimator exposed by the search.
best_params_catboost = grid_search.best_params_
print(best_params_catboost)
best_model_catboost = grid_search.best_estimator_

{'bagging_temperature': 0, 'border_count': 32, 'depth': 6, 'iterations': 4000, 'l2_leaf_reg': 0.75, 'learning_rate': 0.05, 'model_size_reg': 0.5, 'od_wait': 100, 'random_seed': 42, 'random_strength': 3}


In [67]:
# Score the tuned model on the same held-out test rows as the baseline.
y_pred_catboost = best_model_catboost.predict(X_test)

# Root-mean-squared error of the tuned model's predictions.
RMSE_catboost = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_catboost))
print(f'RMSE: {RMSE_catboost}')

# Same RMSE-to-percentage conversion as used for the baseline.
# NOTE(review): assumes a log10-transformed target -- confirm upstream.
error_percentage_catboost = (10**RMSE_catboost - 1) * 100
print(f'Average error is {error_percentage_catboost:.2f}%')

RMSE: 0.04430943781385159
Average error is 10.74%
