In [1]:
from pathlib import Path

import pandas as pd
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


In [2]:
def prepare_dataset(df, missing_label="Missing"):
    """Convert text columns to categoricals with an explicit missing-value label.

    Every ``object``-dtype column is cast to ``category`` and its NA entries
    are replaced by ``missing_label``. The input frame is left untouched; the
    work is done on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    missing_label : str, default "Missing"
        Category used in place of NA values.

    Returns
    -------
    tuple
        ``(prepared_df, categorical_columns)`` where ``categorical_columns``
        is the Index of all ``category``-dtype columns in ``prepared_df``.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()
    for key in df.select_dtypes(include=['object']).columns:
        column = df[key].astype('category')
        # add_categories raises ValueError if the label is already a category,
        # which happens when the data itself contains the label as a value.
        if missing_label not in column.cat.categories:
            column = column.cat.add_categories(missing_label)
        df[key] = column.fillna(missing_label)
    categorical_columns = df.select_dtypes(include=['category']).columns
    return df, categorical_columns

In [3]:
# Read the training CSV, report its dimensions, and preview the first rows.
train_file_path = "data/house-prices/train.csv"
dataset_df = pd.read_csv(filepath_or_buffer=train_file_path)
print("Full train dataset shape is {}".format(dataset_df.shape))
dataset_df.head(n=3)

Full train dataset shape is (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [4]:
# Label missing categorical values and collect the categorical column names,
# then preview only those columns.
dataset_df, categorical_columns = prepare_dataset(df=dataset_df)
dataset_df.loc[:, categorical_columns].head(n=10)

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
1,RL,Pave,Missing,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
2,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
3,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,Missing,Missing,Missing,WD,Abnorml
4,RL,Pave,Missing,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
5,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,...,Attchd,Unf,TA,TA,Y,Missing,MnPrv,Shed,WD,Normal
6,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
7,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Shed,WD,Normal
8,RM,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,...,Detchd,Unf,Fa,TA,Y,Missing,Missing,Missing,WD,Abnorml
9,RL,Pave,Missing,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,...,Attchd,RFn,Gd,TA,Y,Missing,Missing,Missing,WD,Normal


In [5]:
# Separate the target from the features.
# dataset_df is not modified here, so this cell can be re-run safely
# (popping the target column would raise KeyError on a second run).
target_column = 'SalePrice'
y = dataset_df[target_column]
# 'Id' is dropped from the features together with the target.
X = dataset_df.drop(columns=['Id', target_column])

In [6]:
# CatBoost training parameters used for the cross-validation run
params = dict(
    iterations=1000,
    depth=6,
    loss_function='RMSE',
    verbose=200,
    early_stopping_rounds=50,
)

# Settings for the cross-validation procedure itself
cv_params = dict(
    fold_count=5,             # how many folds to use
    shuffle=True,             # shuffle the rows before they are split into folds
    partition_random_seed=0,  # seed for that shuffling
    stratified=False,         # no stratified sampling
    plot=True,                # draw the metric curves while training
)

In [7]:
# Wrap features and target in a CatBoost Pool, naming the categorical columns
pool_data = Pool(data=X, label=y, cat_features=categorical_columns.to_list())

# Run cross-validation; every entry of cv_params (fold_count, shuffle,
# partition_random_seed, stratified, plot) is forwarded as a keyword argument.
cv_results = cv(pool=pool_data, params=params, **cv_params)

# Show the cross-validation results
print(cv_results)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 191655.8001431	test: 194801.4789916	best: 194801.4789916 (0)	total: 138ms	remaining: 2m 17s
200:	learn: 21665.9476024	test: 29790.6289848	best: 29790.6289848 (200)	total: 15.8s	remaining: 1m 2s
400:	learn: 16166.1062903	test: 26405.8589363	best: 26405.8589363 (400)	total: 31.8s	remaining: 47.5s
600:	learn: 13465.6417160	test: 25652.3909561	best: 25652.3909561 (600)	total: 48.3s	remaining: 32.1s
800:	learn: 11410.6290921	test: 25326.0068387	best: 25326.0068387 (800)	total: 1m 4s	remaining: 16.1s
999:	learn: 9915.3679869	test: 25128.0093982	best: 25125.7908877 (996)	total: 1m 21s	remaining: 0us

bestTest = 25125.79089
bestIteration = 996

Training on fold [1/5]
0:	learn: 191961.4288738	test: 194413.9678392	best: 194413.9678392 (0)	total: 88ms	remaining: 1m 27s
200:	learn: 20629.1848262	test: 27029.1179947	best: 27029.1179947 (200)	total: 15.6s	remaining: 1m 1s
400:	learn: 14718.5804019	test: 25678.7221359	best: 25678.7221359 (400)	total: 32.3s	remaining: 

In [6]:
# Grid search over tree depth, learning rate and number of boosting iterations

# Base estimator; the searched hyperparameters are supplied by the grid below
model = CatBoostRegressor()

# Candidate values: 3 x 3 x 3 = 27 combinations
param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 500, 1000]
}

# Configure GridSearchCV
# When cv=None, default is 5-fold cross validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=None, scoring='neg_mean_squared_error', verbose=2)

# Extra keyword arguments are forwarded to the estimator's fit():
# the categorical column names and CatBoost's logging period.
grid_search.fit(X, y, cat_features=categorical_columns.to_list(), verbose=200)

# Best parameters and best score.
# best_score_ is a negated mean squared error, so negate it and take the
# square root to report it on the RMSE scale.
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", (-grid_search.best_score_) ** 0.5)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
0:	learn: 80152.8606922	total: 66.3ms	remaining: 6.56s
99:	learn: 48961.3040980	total: 778ms	remaining: 0us
[CV] END ........depth=4, iterations=100, learning_rate=0.01; total time=   0.8s
0:	learn: 78514.9687236	total: 16.5ms	remaining: 1.63s
99:	learn: 47567.0671572	total: 666ms	remaining: 0us
[CV] END ........depth=4, iterations=100, learning_rate=0.01; total time=   0.7s
0:	learn: 76877.3102851	total: 18.3ms	remaining: 1.82s
99:	learn: 46559.0335290	total: 661ms	remaining: 0us
[CV] END ........depth=4, iterations=100, learning_rate=0.01; total time=   0.7s
0:	learn: 80713.0572734	total: 19.2ms	remaining: 1.91s
99:	learn: 48921.5965406	total: 693ms	remaining: 0us
[CV] END ........depth=4, iterations=100, learning_rate=0.01; total time=   0.7s
0:	learn: 78286.3423412	total: 15.6ms	remaining: 1.55s
99:	learn: 46952.6925053	total: 647ms	remaining: 0us
[CV] END ........depth=4, iterations=100, learning_rate=0.01; total time= 

In [8]:
# Final regressor with fixed hyperparameters; categorical columns are declared
# on the estimator so fit() only needs the features and the target.
model = CatBoostRegressor(
    cat_features=categorical_columns.to_list(),
    depth=4,
    learning_rate=0.05,
    iterations=1000,
    verbose=200,
)

# Fit on the full feature matrix and target
model.fit(X, y)

0:	learn: 76999.4225144	total: 65.1ms	remaining: 1m 4s
200:	learn: 19505.4212971	total: 8.71s	remaining: 34.6s
400:	learn: 16590.3198499	total: 17.2s	remaining: 25.7s
600:	learn: 14940.6049306	total: 25.9s	remaining: 17.2s
800:	learn: 13493.0420810	total: 34.4s	remaining: 8.55s
999:	learn: 12331.5963840	total: 43.1s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7fc164263dd0>

In [9]:
# Pair each feature name with its importance and list them from highest to lowest
feature_importances = model.get_feature_importance()
print(
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

         Feature  Importance
16   OverallQual   24.347809
45     GrLivArea   11.958646
37   TotalBsmtSF    5.981899
33    BsmtFinSF1    5.022012
60    GarageCars    3.929818
..           ...         ...
73   MiscFeature    0.003274
47  BsmtHalfBath    0.002181
74       MiscVal    0.000809
38       Heating    0.000621
8      Utilities    0.000546

[79 rows x 2 columns]


In [11]:
# Read the test CSV and set the Id column aside; it is not a model feature
test_file_path = "data/house-prices/test.csv"
test_data = pd.read_csv(test_file_path)
ids = test_data['Id']
test_data = test_data.drop(columns='Id')

# Same categorical preparation as for the training data; the returned
# column list is not needed here.
test_data, _ = prepare_dataset(test_data)
test_data.head(n=3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,Inside,...,120,0,Missing,MnPrv,Missing,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,Corner,...,0,0,Missing,Missing,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,Inside,...,0,0,Missing,MnPrv,Missing,0,3,2010,WD,Normal


In [12]:
# Predict sale prices for the test rows and pair them with their Ids
preds = model.predict(test_data)
output = pd.DataFrame({'Id': ids}).assign(SalePrice=preds.squeeze())

output.head()

Unnamed: 0,Id,SalePrice
0,1461,119826.252238
1,1462,164152.877002
2,1463,186901.44099
3,1464,195226.430084
4,1465,193014.675126


In [13]:
# Build the submission from the sample-submission template
sample_submission_df = pd.read_csv('data/house-prices/sample_submission.csv')

# Predictions are assigned by position, so the template must list the same
# Ids in the same order as the test rows that were scored.
assert sample_submission_df['Id'].tolist() == ids.tolist(), \
    "sample_submission Ids do not match the test set Ids"
sample_submission_df['SalePrice'] = model.predict(test_data)

# Create the output directory first so to_csv does not fail when it is missing
submission_path = Path('working/catboost-cv.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
sample_submission_df.to_csv(submission_path, index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,119826.252238
1,1462,164152.877002
2,1463,186901.44099
3,1464,195226.430084
4,1465,193014.675126
