In [2]:
# Target log transformation

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder


In [3]:
def prepare_dataset(df, one_hot=False, fillna=True):
    """Convert object columns to pandas categoricals, optionally one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is processed and returned.
    one_hot : bool, default False
        When True, every ``category`` column (including ones that were already
        categorical on input) is replaced by one-hot indicator columns
        produced by ``OneHotEncoder``.
    fillna : bool, default True
        When True, missing values in the converted columns are replaced by
        the literal category ``"Missing"``. When False they are left as NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``.

    Notes
    -----
    The one-hot encoder is created and fitted inside this call, on the given
    frame only. Two frames encoded by separate calls can therefore end up
    with different indicator columns.
    """
    # Work on a copy so the caller's frame keeps its original dtypes.
    df = df.copy()

    object_columns = df.select_dtypes(include=['object']).columns
    for key in object_columns:
        column = df[key].astype('category')
        if fillna:
            # add_categories raises ValueError if the category already exists,
            # which happens when the data itself contains the value "Missing".
            if "Missing" not in column.cat.categories:
                column = column.cat.add_categories("Missing")
            column = column.fillna("Missing")
        df[key] = column
    if not one_hot:
        return df

    categorical_columns = df.select_dtypes(include=['category']).columns
    if len(categorical_columns) == 0:
        # Nothing to encode; avoid fitting the encoder on a zero-column frame.
        return df

    encoder = OneHotEncoder(sparse_output=False)
    encoded_features = encoder.fit_transform(df[categorical_columns])
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(categorical_columns.to_list()),
        # Reuse the source index so the axis=1 concat below aligns row by row
        # even when df does not carry a default RangeIndex.
        index=df.index,
    )
    return pd.concat([df.drop(categorical_columns, axis=1), encoded_df], axis=1)

In [4]:
# Load the training set, drop the row identifier and convert object columns
# to categoricals (missing values become the "Missing" category).
train_file_path = "../data/house-prices/train.csv"
target_column = 'SalePrice'
dataset_df = prepare_dataset(
    pd.read_csv(train_file_path).drop(columns='Id'),
    fillna=True,
)

# Split the target off the feature frame and model it on the natural-log scale.
y = np.log(dataset_df.pop(target_column))

In [8]:
# Grid search over tree depth and learning rate
from sklearn.model_selection import GridSearchCV

# Candidate values: 5 depths x 9 learning rates x 1 iteration budget
param_grid = dict(
    depth=[4, 5, 6, 7, 8],
    learning_rate=[0.01, 0.03, 0.05, 0.08, 0.1, 0.12, 0.15, 0.18, 0.2],
    iterations=[1000],
)

# Base estimator whose parameters are varied by the search
model = CatBoostRegressor()

# cv=None falls back to the default 5-fold cross validation
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=None,
    verbose=2,
)

# Extra keyword arguments are forwarded to the estimator's fit() for every
# candidate: the categorical column names and a log line every 200 iterations.
grid_search.fit(
    dataset_df,
    y,
    cat_features=dataset_df.select_dtypes(include=['category']).columns.to_list(),
    verbose=200,
)

# best_score_ is a negated MSE, so flip the sign before taking the square root
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", (-grid_search.best_score_) ** 0.5)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
0:	learn: 0.3997156	total: 196ms	remaining: 3m 15s
200:	learn: 0.1806922	total: 10.4s	remaining: 41.3s
400:	learn: 0.1375961	total: 20.9s	remaining: 31.3s
600:	learn: 0.1225262	total: 31.1s	remaining: 20.6s
800:	learn: 0.1141174	total: 41.4s	remaining: 10.3s
999:	learn: 0.1079667	total: 51.5s	remaining: 0us
[CV] END .......depth=4, iterations=1000, learning_rate=0.01; total time=  51.8s
0:	learn: 0.3901106	total: 34.4ms	remaining: 34.4s
200:	learn: 0.1760092	total: 10.4s	remaining: 41.4s
400:	learn: 0.1331497	total: 20.7s	remaining: 30.9s
600:	learn: 0.1193744	total: 30.8s	remaining: 20.5s
800:	learn: 0.1126035	total: 40.8s	remaining: 10.1s
999:	learn: 0.1079057	total: 51.5s	remaining: 0us
[CV] END .......depth=4, iterations=1000, learning_rate=0.01; total time=  51.7s
0:	learn: 0.3942021	total: 55.4ms	remaining: 55.3s
200:	learn: 0.1767404	total: 10.3s	remaining: 41.1s
400:	learn: 0.1341839	total: 20.7s	remaining: 30.9s
600

In [5]:
# Final model: fixed hyper-parameters, fitted on the complete training frame.
# NOTE(review): depth / learning_rate / iterations are literals here rather
# than values read from a search result - confirm they are the intended ones.
categorical_feature_names = dataset_df.select_dtypes(include=['category']).columns.to_list()

model = CatBoostRegressor(
    iterations=5000,
    learning_rate=0.08,
    depth=6,
    cat_features=categorical_feature_names,
    verbose=200,  # print a progress line every 200 iterations
)

# Fit on all rows; y is the log-transformed target
model.fit(dataset_df, y)

0:	learn: 0.3787746	total: 307ms	remaining: 25m 35s
200:	learn: 0.0856170	total: 24.3s	remaining: 9m 41s
400:	learn: 0.0646964	total: 48.6s	remaining: 9m 17s
600:	learn: 0.0512697	total: 1m 13s	remaining: 9m
800:	learn: 0.0422379	total: 1m 34s	remaining: 8m 13s
1000:	learn: 0.0352846	total: 1m 57s	remaining: 7m 48s
1200:	learn: 0.0297889	total: 2m 18s	remaining: 7m 18s
1400:	learn: 0.0251030	total: 2m 41s	remaining: 6m 55s
1600:	learn: 0.0213352	total: 3m 2s	remaining: 6m 26s
1800:	learn: 0.0184435	total: 3m 24s	remaining: 6m 3s
2000:	learn: 0.0160680	total: 3m 47s	remaining: 5m 40s
2200:	learn: 0.0139790	total: 4m 10s	remaining: 5m 18s
2400:	learn: 0.0122665	total: 4m 32s	remaining: 4m 55s
2600:	learn: 0.0110854	total: 4m 54s	remaining: 4m 31s
2800:	learn: 0.0099428	total: 5m 18s	remaining: 4m 9s
3000:	learn: 0.0090341	total: 5m 42s	remaining: 3m 48s
3200:	learn: 0.0081762	total: 6m 4s	remaining: 3m 24s
3400:	learn: 0.0074293	total: 6m 27s	remaining: 3m 2s
3600:	learn: 0.0067749	total

<catboost.core.CatBoostRegressor at 0x7f6b5d836dd0>

In [6]:
# Pair each training column with the importance score reported by the fitted
# model and list them from most to least important.
feature_importances = model.get_feature_importance()
importance_table = pd.DataFrame(
    {'Feature': dataset_df.columns, 'Importance': feature_importances}
)
print(importance_table.sort_values('Importance', ascending=False))

         Feature  Importance
16   OverallQual   17.472277
45     GrLivArea   14.205994
37   TotalBsmtSF    4.581659
56   FireplaceQu    4.113695
52   KitchenQual    3.913788
..           ...         ...
47  BsmtHalfBath    0.013107
21      RoofMatl    0.006144
4         Street    0.002258
74       MiscVal    0.001903
8      Utilities    0.000150

[79 rows x 2 columns]


In [9]:
# Read the test set and keep the row identifiers aside so they are not
# treated as a feature.
test_file_path = "../data/house-prices/test.csv"
test_data = pd.read_csv(test_file_path)
ids = test_data.pop('Id')

# Same preprocessing as the training frame: categoricals with "Missing" fill,
# no one-hot encoding.
test_data = prepare_dataset(test_data, one_hot=False, fillna=True)
test_data.head(n=3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,Inside,...,120,0,Missing,MnPrv,Missing,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,Corner,...,0,0,Missing,Missing,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,Inside,...,0,0,Missing,MnPrv,Missing,0,3,2010,WD,Normal


In [11]:
# Build the submission file from the sample template.
sample_submission_df = pd.read_csv('../data/house-prices/sample_submission.csv')

# Predictions are attached to the template rows by position, so the template
# must list exactly the same Ids, in the same order, as the test set the
# predictions were computed from. Fail loudly instead of writing a silently
# misaligned submission.
assert np.array_equal(sample_submission_df['Id'].to_numpy(), ids.to_numpy()), \
    "sample_submission.csv Ids do not match the Ids of test.csv"

# The model was trained on log(SalePrice); exp() maps predictions back to prices.
sample_submission_df['SalePrice'] = np.exp(model.predict(test_data))
sample_submission_df.to_csv('../working/catboost-cv-log-iter5000.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,121209.319749
1,1462,164330.053613
2,1463,185092.610925
3,1464,199428.110962
4,1465,186339.811064
