**Implementing the CatBoost algorithm**

Boosting is a technique that improves weak learners into a strong one by training them in a sequential manner.

In [None]:
! pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c1/c1c4707013f9e2f8a96899dd3a87f66c9167d6d776a6dc8fe7ec8678d446/catboost-0.24.3-cp36-none-manylinux1_x86_64.whl (66.3MB)
[K     |████████████████████████████████| 66.3MB 102kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.3


In [None]:
!pip install shap



In [None]:
# Import required libraries
import catboost
from catboost import CatBoostClassifier
from catboost import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import shap
from catboost import Pool

In [None]:
# Fetch the Amazon dataset bundled with catboost.datasets;
# the call returns a (train, test) pair of frames.
(train_data, test_data) = datasets.amazon()

In [None]:
# Check the training set shape as (rows, columns)
train_data.shape

(32769, 10)

In [None]:
# Check the test set shape as (rows, columns)
test_data.shape

(58921, 10)

In [None]:
# Preview the first rows of the training set (label column ACTION plus the feature columns)
train_data.head()

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [None]:
# Split the training frame into the label y (the ACTION column)
# and the feature matrix x (every remaining column).
y = train_data['ACTION']
# `columns=` already names the axis; the extra `axis=1` was redundant and ignored by pandas.
x = train_data.drop(columns='ACTION')

In [None]:
# Test features: the test frame without its 'id' column
x_test = test_data.drop(columns=['id'])

In [None]:
# Hold out 30% of the labelled rows for validation; the fixed
# random_state keeps the split identical across runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=1,
)

In [None]:
# Baseline CatBoost classifier: no cat_features are passed to this model.
params = dict(
    loss_function='Logloss',
    eval_metric='AUC',  # metric reported on the eval set
    verbose=200,        # log every 200th iteration
    random_seed=1,
)

# NOTE(review): this rebinds the name `catboost`, which until now referred to
# the imported module; after this cell it refers to the fitted model instead.
catboost = CatBoostClassifier(**params)

# use_best_model=True keeps only the iterations up to the best eval-set score;
# plot=True renders the interactive training chart. The trailing `;` hides the repr.
catboost.fit(
    x_train,
    y_train,
    eval_set=(x_valid, y_valid),
    use_best_model=True,
    plot=True,
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.068702
0:	test: 0.5411584	best: 0.5411584 (0)	total: 92.5ms	remaining: 1m 32s
200:	test: 0.8054496	best: 0.8054496 (200)	total: 3.31s	remaining: 13.2s
400:	test: 0.8232289	best: 0.8235644 (398)	total: 6.49s	remaining: 9.7s
600:	test: 0.8346123	best: 0.8347460 (591)	total: 9.75s	remaining: 6.47s
800:	test: 0.8401571	best: 0.8401571 (800)	total: 13.1s	remaining: 3.25s
999:	test: 0.8438098	best: 0.8440074 (989)	total: 16.4s	remaining: 0us

bestTest = 0.8440073802
bestIteration = 989

Shrink model to first 990 iterations.


In [None]:
# Build the positional indices of every feature column in x.
# The list is stored as `catboost_features` -- the name the print below and the
# later Pool / classifier cells read. It was previously assigned to the
# misspelled `catboost_feature`, which raised NameError on a fresh kernel.
catboost_feature_names = x.columns
catboost_features = [x.columns.get_loc(col) for col in catboost_feature_names]
print(catboost_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8]


In [None]:
# Wrap the train and validation splits in CatBoost Pool objects,
# flagging the columns listed in catboost_features as categorical.
train_data_1 = Pool(x_train, label=y_train, cat_features=catboost_features)
valid_data_1 = Pool(x_valid, label=y_valid, cat_features=catboost_features)

In [None]:
# CatBoost classifier that treats the columns in catboost_features as categorical.
params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    cat_features=catboost_features,
    verbose=200,  # log every 200th iteration
    random_seed=1,
)

catboost_1 = CatBoostClassifier(**params)

# use_best_model=True keeps only the iterations up to the best eval-set score;
# plot=True renders the interactive training chart. The trailing `;` hides the repr.
catboost_1.fit(
    x_train,
    y_train,
    eval_set=(x_valid, y_valid),
    use_best_model=True,
    plot=True,
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.068702
0:	test: 0.5069213	best: 0.5069213 (0)	total: 91.2ms	remaining: 1m 31s
200:	test: 0.8891873	best: 0.8891977 (197)	total: 11.6s	remaining: 46.3s
400:	test: 0.8919557	best: 0.8922049 (341)	total: 24.5s	remaining: 36.7s
600:	test: 0.8898214	best: 0.8922049 (341)	total: 37.8s	remaining: 25.1s
800:	test: 0.8893703	best: 0.8922049 (341)	total: 51.3s	remaining: 12.7s
999:	test: 0.8881854	best: 0.8922049 (341)	total: 1m 5s	remaining: 0us

bestTest = 0.8922049414
bestIteration = 341

Shrink model to first 342 iterations.


In [None]:
# Check feature importance of the categorical-feature model;
# prettified=True returns a table of feature ids and their importances.
catboost_1.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,RESOURCE,20.791085
1,MGR_ID,16.569638
2,ROLE_DEPTNAME,15.833254
3,ROLE_FAMILY_DESC,10.670041
4,ROLE_ROLLUP_2,9.430817
5,ROLE_TITLE,8.46062
6,ROLE_ROLLUP_1,7.305282
7,ROLE_FAMILY,6.04127
8,ROLE_CODE,4.897993
