CatBoost Regression Modeling

In [1]:
import pickle
import pathlib
import numpy as np
import pandas as pd
import catboost

In [2]:
# Locate the data folder relative to the current working directory
# (one level up, then into 'data') and show the resolved location.
DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

# Load the cleaned dataset.
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
clean_data_path = DATA_DIR.joinpath('processed', 'ames_clean.pkl')
data = pickle.loads(clean_data_path.read_bytes())

/Users/femcdias/Documents/Insper/Semestre 4/Machine Learning/Projeto1/ames/data


In [3]:
# Work on a copy so the loaded `data` frame stays untouched.
model_data = data.copy()

# Split the categorical columns by kind: ordered categories are encoded as
# integer codes, unordered ones are one-hot encoded further down.
categorical_columns = []
ordinal_columns = []
for col in model_data.select_dtypes('category').columns:
    if model_data[col].cat.ordered:
        ordinal_columns.append(col)
    else:
        categorical_columns.append(col)

# Replace every ordered categorical with integer codes; sort=True keeps the
# codes in the sorted order of the values.
# NOTE(review): pd.factorize labels missing values as -1 by default - confirm
# the cleaned data has no NaNs in these columns.
for col in ordinal_columns:
    codes, _ = pd.factorize(model_data[col], sort=True)
    model_data[col] = codes

# One-hot encode the remaining (unordered) categorical columns. With
# drop_first=True the first level of each variable gets no dummy column.
model_data = pd.get_dummies(model_data, drop_first=True)
model_data.info()

# Report which dummy columns were generated from each unordered variable.
for cat in categorical_columns:
    dummies = [
        f'"{col}"'
        for col in model_data.columns
        if col.startswith(cat + "_")
    ]
    dummies_str = ', '.join(dummies)
    print(f'From column "{cat}" we made {dummies_str}\n')

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, 0 to 2929
Columns: 165 entries, Lot.Frontage to Exterior_Other
dtypes: bool(119), float64(34), int64(12)
memory usage: 1.4 MB
From column "MS.SubClass" we made "MS.SubClass_30", "MS.SubClass_50", "MS.SubClass_60", "MS.SubClass_70", "MS.SubClass_80", "MS.SubClass_85", "MS.SubClass_90", "MS.SubClass_120", "MS.SubClass_160", "MS.SubClass_190", "MS.SubClass_Other"

From column "MS.Zoning" we made "MS.Zoning_RH", "MS.Zoning_RL", "MS.Zoning_RM"

From column "Land.Contour" we made "Land.Contour_HLS", "Land.Contour_Low", "Land.Contour_Lvl"

From column "Lot.Config" we made "Lot.Config_CulDSac", "Lot.Config_FR2", "Lot.Config_FR3", "Lot.Config_Inside"

From column "Neighborhood" we made "Neighborhood_BrDale", "Neighborhood_BrkSide", "Neighborhood_ClearCr", "Neighborhood_CollgCr", "Neighborhood_Crawfor", "Neighborhood_Edwards", "Neighborhood_Gilbert", "Neighborhood_IDOTRR", "Neighborhood_MeadowV", "Neighborhood_Mitchel", "Neighborhood_NAm

In [4]:
# Features: every column except the target.
X = model_data.drop('SalePrice', axis='columns').copy()

# Target: the SalePrice column, copied so it is independent of model_data.
y = model_data.loc[:, 'SalePrice'].copy()

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
from catboost import CatBoostRegressor

# Baseline: CatBoost with its default hyperparameters and training logs
# silenced, fitted on the training split.
model = CatBoostRegressor(verbose=0)
model.fit(X=X_train, y=y_train)

# Baseline predictions for the held-out test rows.
ypred = model.predict(data=X_test)


In [15]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the baseline predictions on the test split.
baseline_mse = mean_squared_error(y_true=y_test, y_pred=ypred)
RMSE = np.sqrt(baseline_mse)
print(f'RMSE: {RMSE}')

# Express the RMSE as a percentage via 10**RMSE.
# NOTE(review): this conversion presumes SalePrice is stored on a log10
# scale - confirm against the data-cleaning step.
error_percentage = (10**RMSE - 1) * 100
print(f'Average error is {error_percentage:.2f}%')

RMSE: 0.04596950899383926
Average error is 11.17%


In [8]:
# Hyperparameter tuning through an exhaustive grid search
from sklearn.model_selection import GridSearchCV

# Base estimator with training logs silenced; the search fits one copy of it
# per candidate and fold.
model = CatBoostRegressor(verbose=0)

# Best parameters found by an earlier tuning run, kept for reference:
#   {'bagging_temperature': 0, 'border_count': 32, 'depth': 6,
#    'iterations': 500, 'l2_leaf_reg': 1, 'learning_rate': 0.05,
#    'random_strength': 1}

# 3 * 3 * 2 * 3 * 2 * 2 * 3 = 648 candidate combinations.
param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 500],
    'l2_leaf_reg': [1, 5, 9],
    'border_count': [32, 128],
    'bagging_temperature': [0, 5],
    'random_strength': [0.5, 1, 2]
}

# Rank candidates by cross-validated RMSE, the same metric this notebook
# reports on the test split, instead of relying on the estimator's default
# scorer. With cv=3 this means 648 * 3 = 1944 fits plus the final refit.
# NOTE(review): n_jobs=-1 runs fits in parallel while CatBoost may also use
# several threads per fit - consider limiting one of them if CPUs are
# oversubscribed.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=0,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and the model refitted with it on the whole
# training split.
best_params_catboost = grid_search.best_params_
print(best_params_catboost)
best_model_catboost = grid_search.best_estimator_

{'bagging_temperature': 0, 'border_count': 32, 'depth': 6, 'iterations': 500, 'l2_leaf_reg': 1, 'learning_rate': 0.05, 'random_strength': 1}


In [13]:
# Predict the test split with the model selected by the grid search.
y_pred_catboost = best_model_catboost.predict(X_test)

# Root-mean-squared error of the tuned model on the test split.
tuned_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_catboost)
RMSE_catboost = np.sqrt(tuned_mse)
print(f'RMSE: {RMSE_catboost}')

# Express the RMSE as a percentage via 10**RMSE.
# NOTE(review): this conversion presumes SalePrice is stored on a log10
# scale - confirm against the data-cleaning step.
error_percentage_catboost = (10**RMSE_catboost - 1) * 100
print(f'Average error is {error_percentage_catboost:.2f}%')

RMSE: 0.046184052293911766
Average error is 11.22%
