In [1]:
# Random forest with catboost

import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def prepare_dataset(df):
    # For categorical features, replace NA with "Missing"
    categorical_columns = df.select_dtypes(include=['object']).columns
    for key in categorical_columns:
        df[key] = df[key].astype('category').cat.add_categories("Missing").fillna("Missing")
    categorical_columns = df.select_dtypes(include=['category']).columns
    return df, categorical_columns

In [2]:
train_file_path = "data/house-prices/train.csv"
dataset_df = pd.read_csv(train_file_path)
print("Full train dataset shape is {}".format(dataset_df.shape))
dataset_df.head(3)

Full train dataset shape is (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [3]:
dataset_df, categorical_columns = prepare_dataset(dataset_df)
dataset_df[categorical_columns].head(10)

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
1,RL,Pave,Missing,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
2,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
3,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,Missing,Missing,Missing,WD,Abnorml
4,RL,Pave,Missing,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
5,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,...,Attchd,Unf,TA,TA,Y,Missing,MnPrv,Shed,WD,Normal
6,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
7,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Shed,WD,Normal
8,RM,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,...,Detchd,Unf,Fa,TA,Y,Missing,Missing,Missing,WD,Abnorml
9,RL,Pave,Missing,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,...,Attchd,RFn,Gd,TA,Y,Missing,Missing,Missing,WD,Normal


In [4]:
# Prepare dataset
target_column = 'SalePrice'
y = dataset_df.pop(target_column)
X = dataset_df.drop('Id', axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Initialize CatBoostRegressor
model = CatBoostRegressor(
    iterations=1000,                # Number of trees
    depth=6,                       # Depth of each tree
    boosting_type='Plain',         # Use random forest mode
    bootstrap_type='Poisson',    # Random sampling of data points
    task_type='GPU',               # needed for Poisson
    subsample=0.66,                # Fraction of data to sample
    random_seed=42,                # Seed for reproducibility
    verbose=100,                    # Output training progress every 100 iterations
    cat_features=categorical_columns.to_list()
)

# Train the model
model.fit(X_train, y_train)



Learning rate set to 0.041224
0:	learn: 75109.2652257	total: 902ms	remaining: 15m 1s
100:	learn: 22358.7289466	total: 1m 15s	remaining: 11m 11s
200:	learn: 17156.9700426	total: 2m 34s	remaining: 10m 13s
300:	learn: 14535.6923694	total: 3m 44s	remaining: 8m 41s
400:	learn: 12817.8538498	total: 4m 59s	remaining: 7m 27s
500:	learn: 11495.0634217	total: 6m 13s	remaining: 6m 11s
600:	learn: 10402.2674030	total: 7m 31s	remaining: 4m 59s
700:	learn: 9377.4892454	total: 8m 48s	remaining: 3m 45s
800:	learn: 8524.2766551	total: 10m 3s	remaining: 2m 29s
900:	learn: 7751.2390030	total: 11m 19s	remaining: 1m 14s
999:	learn: 7116.4676477	total: 12m 35s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7f32039af710>

In [15]:
# Make predictions
predictions = model.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 660696070.7809154


In [16]:
feature_importances = model.get_feature_importance()
print(pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False))

         Feature  Importance
16   OverallQual   22.378857
45     GrLivArea   15.596854
37   TotalBsmtSF    4.596000
33    BsmtFinSF1    4.247072
42      1stFlrSF    3.790216
..           ...         ...
73   MiscFeature    0.013882
4         Street    0.009293
8      Utilities    0.003388
47  BsmtHalfBath    0.001965
74       MiscVal    0.000061

[79 rows x 2 columns]


In [18]:
test_file_path = "data/house-prices/test.csv"
test_data = pd.read_csv(test_file_path)
ids = test_data.pop('Id')

test_data, _ = prepare_dataset(test_data)
test_data.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,Inside,...,120,0,Missing,MnPrv,Missing,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,Corner,...,0,0,Missing,Missing,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,Inside,...,0,0,Missing,MnPrv,Missing,0,3,2010,WD,Normal


In [19]:
preds = model.predict(test_data)
output = pd.DataFrame({'Id': ids,
                       'SalePrice': preds.squeeze()})

output.head()

Unnamed: 0,Id,SalePrice
0,1461,120546.779999
1,1462,167760.192834
2,1463,187146.883549
3,1464,195195.432913
4,1465,187951.324836


In [20]:
sample_submission_df = pd.read_csv('data/house-prices/sample_submission.csv')
sample_submission_df['SalePrice'] = model.predict(test_data)
sample_submission_df.to_csv('working/catboost-rf-poisson.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,120546.779999
1,1462,167760.192834
2,1463,187146.883549
3,1464,195195.432913
4,1465,187951.324836


# Cross Validation

In [5]:
# Grid Search
from sklearn.model_selection import GridSearchCV

# Define the model
model = CatBoostRegressor(
    boosting_type='Plain',         # Use random forest mode
    # bootstrap_type='Poisson',    # Random sampling of data points
    task_type='CPU',               # needed for Poisson
    verbose=200,                    # Output training progress every 100 iterations
    cat_features=categorical_columns.to_list()
)

# Set up the parameter grid
param_grid = {
    'bootstrap_type': ['Bernoulli'],
    'depth': [4, 6, 8],
    'iterations': [100, 500, 1000],
    'subsample': [0.4,0.5,0.6,0.7,0.8],
}

# Configure GridSearchCV
# When cv=None, default is 5-fold cross validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                           cv=None, scoring='neg_mean_squared_error', verbose=2)

# Fit GridSearchCV
grid_search.fit(X, y)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", (-grid_search.best_score_) ** 0.5)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
Learning rate set to 0.272794
0:	learn: 69903.1641739	total: 70.7ms	remaining: 7s
99:	learn: 16951.7647358	total: 667ms	remaining: 0us
[CV] END bootstrap_type=Bernoulli, depth=4, iterations=100, subsample=0.4; total time=   0.7s
Learning rate set to 0.272794
0:	learn: 67387.4337981	total: 19ms	remaining: 1.88s
99:	learn: 16512.6470568	total: 624ms	remaining: 0us
[CV] END bootstrap_type=Bernoulli, depth=4, iterations=100, subsample=0.4; total time=   0.7s
Learning rate set to 0.272794
0:	learn: 66703.5027234	total: 20.6ms	remaining: 2.04s
99:	learn: 15899.3044560	total: 628ms	remaining: 0us
[CV] END bootstrap_type=Bernoulli, depth=4, iterations=100, subsample=0.4; total time=   0.7s
Learning rate set to 0.272794
0:	learn: 67729.3535928	total: 20.1ms	remaining: 1.99s
99:	learn: 17092.8048114	total: 647ms	remaining: 0us
[CV] END bootstrap_type=Bernoulli, depth=4, iterations=100, subsample=0.4; total time=   0.7s
Learning rate s

KeyboardInterrupt: 