In [1]:
# Tree-ensemble regression with CatBoost (note: CatBoost fits gradient-boosted trees, not a random forest)

import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import os
# Expose only the GPU with index 6 to CUDA (hard-coded; adjust for the machine in use).
os.environ.update({'CUDA_VISIBLE_DEVICES': '6'})

def prepare_dataset(df):
    """Return a copy of ``df`` whose text columns are NA-free categoricals.

    Every ``object``-dtype column is converted to ``category`` dtype and its
    missing values are replaced by the literal category ``"Missing"``. The
    ``"Missing"`` category is registered on each converted column even when
    the column has no missing values.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.

    Returns:
        A ``(prepared_df, categorical_columns)`` tuple, where
        ``categorical_columns`` is the ``Index`` of every ``category``-dtype
        column in the result (including columns that were already categorical
        on input).
    """
    # Work on a copy so re-running a cell never sees a half-converted frame.
    prepared = df.copy()
    for key in prepared.select_dtypes(include=['object']).columns:
        column = prepared[key].astype('category')
        # add_categories raises ValueError when the label already exists,
        # which happens if the raw data itself contains the string "Missing".
        if "Missing" not in column.cat.categories:
            column = column.cat.add_categories("Missing")
        prepared[key] = column.fillna("Missing")
    categorical_columns = prepared.select_dtypes(include=['category']).columns
    return prepared, categorical_columns

In [2]:
# Read the training split and report its dimensions before previewing it.
train_file_path = "data/house-prices/train.csv"
dataset_df = pd.read_csv(train_file_path)
shape_message = "Full train dataset shape is {}".format(dataset_df.shape)
print(shape_message)
# Last expression of the cell: rendered as a table of the first three rows.
dataset_df.head(3)

Full train dataset shape is (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [3]:
# Turn the text columns into categoricals (NA -> "Missing") and keep the list
# of categorical column names for the model.
(dataset_df, categorical_columns) = prepare_dataset(dataset_df)
# Preview only the categorical columns.
dataset_df.loc[:, categorical_columns].head(10)

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
1,RL,Pave,Missing,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
2,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
3,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,Missing,Missing,Missing,WD,Abnorml
4,RL,Pave,Missing,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
5,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,...,Attchd,Unf,TA,TA,Y,Missing,MnPrv,Shed,WD,Normal
6,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
7,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Shed,WD,Normal
8,RM,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,...,Detchd,Unf,Fa,TA,Y,Missing,Missing,Missing,WD,Abnorml
9,RL,Pave,Missing,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,...,Attchd,RFn,Gd,TA,Y,Missing,Missing,Missing,WD,Normal


In [4]:
# Prepare dataset: split into target (y) and features (X).
# The target is selected rather than pop()-ed: popping would remove the column
# from dataset_df and make a second run of this cell fail with KeyError.
target_column = 'SalePrice'
y = dataset_df[target_column]
# 'Id' is dropped together with the target; X is a new frame and dataset_df
# itself is left unchanged.
X = dataset_df.drop(columns=['Id', target_column])

In [5]:
# Configure the CatBoost regressor. CatBoost trains gradient-boosted trees;
# boosting_type='Plain' selects its classic boosting scheme, not a
# random-forest mode.
catboost_params = dict(
    iterations=3000,             # number of trees
    depth=6,                     # depth of each tree
    boosting_type='Plain',
    bootstrap_type='Poisson',    # row-sampling scheme; requires task_type='GPU'
    task_type='GPU',
    subsample=0.9,               # sample rate used by the bootstrap
    verbose=100,                 # log training progress every 100 iterations
    cat_features=categorical_columns.to_list(),
)
# No random_seed is passed, so CatBoost's default seed applies.
model = CatBoostRegressor(**catboost_params)

# Fit on the full training set; as the cell's last expression, the fitted
# model object is also displayed.
model.fit(X, y)

Learning rate set to 0.022029
0:	learn: 78258.8342291	total: 52.8ms	remaining: 2m 38s
100:	learn: 29644.3000123	total: 5.32s	remaining: 2m 32s
200:	learn: 21281.6559990	total: 11.3s	remaining: 2m 37s
300:	learn: 18314.0267202	total: 17.3s	remaining: 2m 35s
400:	learn: 16687.7984936	total: 22.8s	remaining: 2m 28s
500:	learn: 15511.8152235	total: 27.8s	remaining: 2m 18s
600:	learn: 14572.1774516	total: 33.6s	remaining: 2m 14s
700:	learn: 13870.4655927	total: 39.1s	remaining: 2m 8s
800:	learn: 13199.8683065	total: 43.6s	remaining: 1m 59s
900:	learn: 12613.9864795	total: 47.4s	remaining: 1m 50s
1000:	learn: 11999.8366966	total: 52.4s	remaining: 1m 44s
1100:	learn: 11434.9199181	total: 58s	remaining: 1m 40s
1200:	learn: 10963.0767324	total: 1m 3s	remaining: 1m 34s
1300:	learn: 10489.8794049	total: 1m 8s	remaining: 1m 29s
1400:	learn: 10080.8809878	total: 1m 14s	remaining: 1m 24s
1500:	learn: 9665.6734467	total: 1m 19s	remaining: 1m 19s
1600:	learn: 9286.1186442	total: 1m 25s	remaining: 1m 1

<catboost.core.CatBoostRegressor at 0x7f220bade910>

In [6]:
# Rank the input columns by the importance scores reported by the fitted model.
feature_importances = model.get_feature_importance()
importance_table = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)
importance_table = importance_table.sort_values(by='Importance', ascending=False)
print(importance_table)

         Feature  Importance
16   OverallQual   21.458480
45     GrLivArea   17.153099
37   TotalBsmtSF    5.644861
33    BsmtFinSF1    4.968758
42      1stFlrSF    4.419964
..           ...         ...
73   MiscFeature    0.013106
44  LowQualFinSF    0.011376
4         Street    0.005609
74       MiscVal    0.000865
8      Utilities    0.000840

[79 rows x 2 columns]


In [7]:
# Load the test split and set its Id column aside so it is not used as a feature.
test_file_path = "data/house-prices/test.csv"
test_data = pd.read_csv(test_file_path)
ids = test_data['Id']
test_data = test_data.drop(columns=['Id'])

# Apply the same categorical preparation as for the training data; the
# returned column list is not needed here.
test_data, _ = prepare_dataset(test_data)
test_data.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,Inside,...,120,0,Missing,MnPrv,Missing,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,Corner,...,0,0,Missing,Missing,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,Inside,...,0,0,Missing,MnPrv,Missing,0,3,2010,WD,Normal


In [8]:
# Build the submission file: one predicted SalePrice per test-set Id.
sample_submission_df = pd.read_csv('data/house-prices/sample_submission.csv')

# Predictions are assigned by position, so the template rows must be in the
# same order as the test rows; fail loudly instead of writing a misaligned file.
if sample_submission_df['Id'].tolist() != ids.tolist():
    raise ValueError("sample_submission.csv Ids do not match the Ids of test.csv")
sample_submission_df['SalePrice'] = model.predict(test_data)

# to_csv does not create missing directories, so make sure the target exists.
submission_path = 'working/catboost-rf-poisson-subsample09-iter3000.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sample_submission_df.to_csv(submission_path, index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,117525.778059
1,1462,164646.564414
2,1463,189288.738346
3,1464,195433.785009
4,1465,180462.706093


# Cross Validation

In [5]:
# Grid Search
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters; every combination is tried (1 x 3 x 2 x 3 = 18).
param_grid = {
    'bootstrap_type': ['Poisson'],
    'depth': [4, 6, 8],
    'iterations': [500, 1000],
    'subsample': [0.6, 0.7, 0.8],
}

# Base estimator shared by all candidates. boosting_type='Plain' is CatBoost's
# classic gradient-boosting scheme (not a random-forest mode); bootstrap_type
# is supplied through the grid instead of being fixed here.
model = CatBoostRegressor(
    cat_features=categorical_columns.to_list(),
    verbose=200,              # log training progress every 200 iterations
    task_type='GPU',          # the Poisson bootstrap in the grid requires GPU
    boosting_type='Plain',
)

# cv=None lets GridSearchCV fall back to its default 5-fold cross validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=None,
    verbose=2,
)

# Run the search on the full training data.
grid_search.fit(X, y)

# best_score_ is a negated MSE: flip the sign, then take the square root.
best_mse = -grid_search.best_score_
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", best_mse ** 0.5)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Learning rate set to 0.062354
0:	learn: 77476.3105634	total: 23.1ms	remaining: 11.5s
200:	learn: 17998.7794776	total: 3.77s	remaining: 5.61s
400:	learn: 14637.4835475	total: 7.04s	remaining: 1.74s
499:	learn: 13452.7459457	total: 8.76s	remaining: 0us
[CV] END bootstrap_type=Poisson, depth=4, iterations=500, subsample=0.6; total time=   9.3s
Learning rate set to 0.062354
0:	learn: 76120.9652470	total: 21.2ms	remaining: 10.6s
200:	learn: 18169.3439482	total: 3.56s	remaining: 5.3s
400:	learn: 14881.1533505	total: 7.08s	remaining: 1.75s
499:	learn: 13826.1860271	total: 8.93s	remaining: 0us
[CV] END bootstrap_type=Poisson, depth=4, iterations=500, subsample=0.6; total time=  10.3s
Learning rate set to 0.062354
0:	learn: 74371.7949660	total: 17.2ms	remaining: 8.57s
200:	learn: 17717.8246848	total: 3.12s	remaining: 4.64s
400:	learn: 14173.1904820	total: 6.44s	remaining: 1.59s
499:	learn: 12998.2186345	total: 8.22s	remaining: 0us
[CV