# CatBoost

In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

### Read Dataset

In [4]:
df = pd.read_csv("AER_credit_card_data.csv")
df

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.5200,0.033270,124.983300,yes,no,3,54,1,12
1,yes,0,33.25000,2.4200,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5000,0.004156,15.000000,yes,no,4,58,1,5
3,yes,0,30.50000,2.5400,0.065214,137.869200,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.503300,yes,no,2,64,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1314,yes,0,33.58333,4.5660,0.002146,7.333333,yes,no,0,94,1,19
1315,no,5,23.91667,3.1920,0.000376,0.000000,no,no,3,12,1,5
1316,yes,0,40.58333,4.6000,0.026513,101.298300,yes,no,2,1,1,2
1317,yes,0,32.83333,3.7000,0.008999,26.996670,no,yes,0,60,1,7


##### Check if there are any null values  

In [5]:
df.isnull().sum()

card           0
reports        0
age            0
income         0
share          0
expenditure    0
owner          0
selfemp        0
dependents     0
months         0
majorcards     0
active         0
dtype: int64

In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
reports,1319.0,0.456406,1.345267,0.0,0.0,0.0,0.0,14.0
age,1319.0,33.213103,10.142783,0.166667,25.41667,31.25,39.41667,83.5
income,1319.0,3.365376,1.693902,0.21,2.24375,2.9,4.0,13.5
share,1319.0,0.068732,0.094656,0.000109,0.002316,0.038827,0.093617,0.90632
expenditure,1319.0,185.057071,272.218917,0.0,4.583333,101.2983,249.0358,3099.505
dependents,1319.0,0.993935,1.247745,0.0,0.0,1.0,2.0,6.0
months,1319.0,55.267627,66.271746,0.0,12.0,30.0,72.0,540.0
majorcards,1319.0,0.817286,0.386579,0.0,1.0,1.0,1.0,1.0
active,1319.0,6.996967,6.305812,0.0,2.0,6.0,11.0,46.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   card         1319 non-null   object 
 1   reports      1319 non-null   int64  
 2   age          1319 non-null   float64
 3   income       1319 non-null   float64
 4   share        1319 non-null   float64
 5   expenditure  1319 non-null   float64
 6   owner        1319 non-null   object 
 7   selfemp      1319 non-null   object 
 8   dependents   1319 non-null   int64  
 9   months       1319 non-null   int64  
 10  majorcards   1319 non-null   int64  
 11  active       1319 non-null   int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 123.8+ KB


##### Show Categorical Data Description

In [8]:
df.describe(include='O').T

Unnamed: 0,count,unique,top,freq
card,1319,2,yes,1023
owner,1319,2,no,738
selfemp,1319,2,no,1228


##### Encode Categorical Data

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
def encode(dataframe):
    lec = LabelEncoder()
    for j in dataframe.columns:
        if(dataframe[j].dtype == 'object'):
            dataframe[j] = lec.fit_transform(dataframe[j])
            
encode(df)

In [11]:
df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,1,0,37.66667,4.52,0.03327,124.9833,1,0,3,54,1,12
1,1,0,33.25,2.42,0.005217,9.854167,0,0,3,34,1,13
2,1,0,33.66667,4.5,0.004156,15.0,1,0,4,58,1,5
3,1,0,30.5,2.54,0.065214,137.8692,0,0,0,25,1,7
4,1,0,32.16667,9.7867,0.067051,546.5033,1,0,2,64,1,5


In [13]:
X = df.drop('card', axis=1)
y = df['card']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [14]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-win_amd64.whl (77.3 MB)
Collecting graphviz
  Downloading graphviz-0.17-py3-none-any.whl (18 kB)
Collecting plotly
  Downloading plotly-5.3.1-py2.py3-none-any.whl (23.9 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly, graphviz, catboost
Successfully installed catboost-1.0.3 graphviz-0.17 plotly-5.3.1 tenacity-8.0.1


In [15]:
from catboost import CatBoostRegressor

In [16]:
catb = CatBoostRegressor()
catb_model = catb.fit(X_train, y_train)

Learning rate set to 0.040871
0:	learn: 0.4018555	total: 151ms	remaining: 2m 31s
1:	learn: 0.3883765	total: 153ms	remaining: 1m 16s
2:	learn: 0.3750567	total: 155ms	remaining: 51.5s
3:	learn: 0.3627154	total: 157ms	remaining: 39.1s
4:	learn: 0.3500679	total: 158ms	remaining: 31.5s
5:	learn: 0.3388565	total: 160ms	remaining: 26.5s
6:	learn: 0.3277735	total: 162ms	remaining: 22.9s
7:	learn: 0.3167823	total: 163ms	remaining: 20.2s
8:	learn: 0.3065445	total: 164ms	remaining: 18.1s
9:	learn: 0.2971303	total: 166ms	remaining: 16.4s
10:	learn: 0.2880773	total: 168ms	remaining: 15.1s
11:	learn: 0.2789853	total: 169ms	remaining: 13.9s
12:	learn: 0.2704076	total: 170ms	remaining: 12.9s
13:	learn: 0.2627273	total: 172ms	remaining: 12.1s
14:	learn: 0.2549371	total: 173ms	remaining: 11.4s
15:	learn: 0.2475974	total: 174ms	remaining: 10.7s
16:	learn: 0.2406935	total: 176ms	remaining: 10.2s
17:	learn: 0.2338362	total: 177ms	remaining: 9.67s
18:	learn: 0.2273030	total: 179ms	remaining: 9.23s
19:	learn

### Predict

In [17]:
y_pred = catb_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

0.13000786162156555

### Model Tuning

In [18]:
catb_grid = {
    'iterations': [200,500,1000,2000],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'depth': [3,4,5,6,7,8] }

In [19]:
catb = CatBoostRegressor()
catb_cv_model = GridSearchCV(catb, catb_grid, cv=5, n_jobs = -1, verbose = 2)

In [20]:
catb_cv_model.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
0:	learn: 0.4050925	total: 805us	remaining: 160ms
1:	learn: 0.3943046	total: 1.8ms	remaining: 178ms
2:	learn: 0.3836545	total: 2.63ms	remaining: 173ms
3:	learn: 0.3743387	total: 3.61ms	remaining: 177ms
4:	learn: 0.3649379	total: 4.45ms	remaining: 174ms
5:	learn: 0.3559634	total: 5.29ms	remaining: 171ms
6:	learn: 0.3469930	total: 6.08ms	remaining: 168ms
7:	learn: 0.3380015	total: 6.84ms	remaining: 164ms
8:	learn: 0.3299683	total: 7.66ms	remaining: 162ms
9:	learn: 0.3216011	total: 8.58ms	remaining: 163ms
10:	learn: 0.3136473	total: 9.31ms	remaining: 160ms
11:	learn: 0.3060649	total: 10.2ms	remaining: 160ms
12:	learn: 0.2986612	total: 11ms	remaining: 159ms
13:	learn: 0.2913795	total: 11.9ms	remaining: 157ms
14:	learn: 0.2843471	total: 12.7ms	remaining: 157ms
15:	learn: 0.2773521	total: 13.6ms	remaining: 156ms
16:	learn: 0.2712990	total: 14.4ms	remaining: 155ms
17:	learn: 0.2650679	total: 15.1ms	remaining: 153ms
18:	learn: 0.258

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostRegressor object at 0x000001FFF5AAE348>,
             n_jobs=-1,
             param_grid={'depth': [3, 4, 5, 6, 7, 8],
                         'iterations': [200, 500, 1000, 2000],
                         'learning_rate': [0.01, 0.03, 0.05, 0.1]},
             verbose=2)

In [21]:
catb_cv_model.best_params_

{'depth': 3, 'iterations': 200, 'learning_rate': 0.03}

In [23]:
catb_tuned = CatBoostRegressor(iterations = 200, 
                               learning_rate = 0.03, 
                               depth = 3)

catb_tuned = catb_tuned.fit(X_train,y_train)

0:	learn: 0.4050925	total: 529us	remaining: 105ms
1:	learn: 0.3943046	total: 1.23ms	remaining: 122ms
2:	learn: 0.3836545	total: 1.82ms	remaining: 120ms
3:	learn: 0.3743387	total: 2.43ms	remaining: 119ms
4:	learn: 0.3649379	total: 2.95ms	remaining: 115ms
5:	learn: 0.3559634	total: 3.48ms	remaining: 113ms
6:	learn: 0.3469930	total: 4.01ms	remaining: 111ms
7:	learn: 0.3380015	total: 4.57ms	remaining: 110ms
8:	learn: 0.3299683	total: 5.1ms	remaining: 108ms
9:	learn: 0.3216011	total: 5.72ms	remaining: 109ms
10:	learn: 0.3136473	total: 6.2ms	remaining: 106ms
11:	learn: 0.3060649	total: 6.73ms	remaining: 105ms
12:	learn: 0.2986612	total: 7.27ms	remaining: 105ms
13:	learn: 0.2913795	total: 7.82ms	remaining: 104ms
14:	learn: 0.2843471	total: 8.46ms	remaining: 104ms
15:	learn: 0.2773521	total: 9.17ms	remaining: 105ms
16:	learn: 0.2712990	total: 9.76ms	remaining: 105ms
17:	learn: 0.2650679	total: 10.5ms	remaining: 106ms
18:	learn: 0.2589122	total: 11.1ms	remaining: 105ms
19:	learn: 0.2528514	tota

In [24]:
y_pred = catb_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

0.13166987001922814