# CatBoost

In [None]:
# @title
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [18]:
import numpy as np
import catboost
from catboost.datasets import titanic
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score


## 1. Data Preparation

### 1.1 Data Loading

In [6]:
# load data:
# titanic() comes from catboost.datasets; its return value is unpacked into a
# training frame and a test frame.
train_df, test_df = titanic()

In [7]:
# Replace every missing entry (in all columns of both frames) with the
# sentinel value -999. Rebinding the names instead of mutating in place keeps
# the cell explicit about what it produces.
train_df = train_df.fillna(-999)
test_df = test_df.fillna(-999)

In [8]:
# Separate the label column from the feature matrix:
#   y - the "Survived" column (the target),
#   X - every other column of the training frame.
y = train_df["Survived"]
X = train_df.drop(columns="Survived")

In [9]:
# types of features:
# Displayed so the reader can see which columns are float and which are not;
# the next cell derives the categorical feature indices from these dtypes.
X.dtypes

Unnamed: 0,0
PassengerId,int64
Pclass,int64
Name,object
Sex,object
Age,float64
SibSp,int64
Parch,int64
Ticket,object
Fare,float64
Cabin,object


In [14]:
# Positional indices of the columns treated as categorical: every column
# whose dtype is not float (i.e. both the integer and the object columns).
categorical_features_indices = np.flatnonzero(X.dtypes != float)

In [17]:
# Split the labelled data: 75% for training, the remainder for validation.
# The fixed random_state makes the split reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.75, random_state=42
)

# The second frame returned by titanic() serves as the test feature matrix.
X_test = test_df


## 2. CatBoost Basics

In [26]:
# Baseline classifier: evaluated with F1, seeded for reproducibility, and
# configured with verbose=200 to control how often training progress is logged.
model = CatBoostClassifier(eval_metric="F1", random_seed=42, verbose=200)

In [27]:
# Train on the training split, telling CatBoost which columns are categorical
# and passing the validation split as the evaluation set.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
)

Learning rate set to 0.028683
0:	learn: 0.6831683	test: 0.6849315	best: 0.6849315 (0)	total: 11.9ms	remaining: 11.9s
200:	learn: 0.8322440	test: 0.7453416	best: 0.7625000 (150)	total: 1.13s	remaining: 4.49s
400:	learn: 0.8736617	test: 0.7500000	best: 0.7625000 (150)	total: 2.37s	remaining: 3.54s
600:	learn: 0.9152542	test: 0.7088608	best: 0.7625000 (150)	total: 3.65s	remaining: 2.43s
800:	learn: 0.9372385	test: 0.7215190	best: 0.7625000 (150)	total: 4.93s	remaining: 1.22s
999:	learn: 0.9632653	test: 0.7125000	best: 0.7625000 (150)	total: 7.01s	remaining: 0us

bestTest = 0.7625
bestIteration = 150

Shrink model to first 151 iterations.


<catboost.core.CatBoostClassifier at 0x7a1fe1173140>

In [28]:
# predictions:
# Predict with the fitted baseline model on the test features; the bare
# `predictions` expression on the last line displays the result as the
# cell output.
predictions = model.predict(X_test)
predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

## 3. CatBoost Features

In [23]:
# Shared hyper-parameters for the models in this section. Later cells copy
# this mapping and override individual entries.
params = dict(
    iterations=500,
    learning_rate=0.1,
    eval_metric=metrics.Accuracy(),
    random_seed=42,
    logging_level='Silent',
    use_best_model=False,
)


In [29]:
# Wrap each split in a Pool for convenience. A Pool stores all information
# about a dataset (features, labels, categorical feature indices, weights and
# much more).
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_features_indices,
)
validate_pool = Pool(
    data=X_validation,
    label=y_validation,
    cat_features=categorical_features_indices,
)

In [30]:
# Train with the shared params as-is (use_best_model = False), then report
# accuracy on the validation split.
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

simple_accuracy = accuracy_score(y_validation, model.predict(X_validation))
print('Simple model validation accuracy: {:.4}'.format(simple_accuracy))


Simple model validation accuracy: 0.7982


In [31]:
# Same configuration as the shared params, except use_best_model = True.
# Report accuracy on the validation split for comparison with the simple model.
best_model_params = {**params, 'use_best_model': True}

best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool)

best_accuracy = accuracy_score(y_validation, best_model.predict(X_validation))
print('Best model validation accuracy: {:.4}'.format(best_accuracy))

Best model validation accuracy: 0.8251


### 3.1 Early Stopping

In [32]:
# Shared params plus overfitting-detector settings for early stopping
# (see the CatBoost documentation for the exact meaning of od_type / od_wait).
earlystop_params = {**params, 'od_type': 'Iter', 'od_wait': 40}

earlystop_model = CatBoostClassifier(**earlystop_params)
# fit() is the last expression, so the fitted model's repr is the cell output.
earlystop_model.fit(train_pool, eval_set=validate_pool)

<catboost.core.CatBoostClassifier at 0x7a1fe1170b00>

In [33]:
# Accuracy of the early-stopped model on the validation split.
earlystop_accuracy = accuracy_score(
    y_validation, earlystop_model.predict(X_validation)
)
print('Early-stopped model validation accuracy: {:.4}'.format(earlystop_accuracy))

Early-stopped model validation accuracy: 0.8072


### 3.2 Feature Importance

In [34]:
# Column names of the training features, and the importance scores that
# best_model reports for the training pool.
feature_names = X_train.columns
feature_importances = best_model.get_feature_importance(data=train_pool)

In [35]:
# Pair each score with its feature name and list them from most to least
# important. Sorting the (score, name) tuples in reverse means ties on score
# are broken by name, also in descending order.
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for importance, feature in ranked_features:
    print('{}: {}'.format(feature, importance))

Sex: 42.17474818544473
Pclass: 16.294343982697637
Ticket: 9.797084805047552
Parch: 7.53610285333354
Age: 7.041597602831637
Fare: 4.9866064486621715
Cabin: 4.836829329849025
Embarked: 3.6878977676281557
SibSp: 3.6447890245054957
PassengerId: 0.0
Name: 0.0
