In [1]:
import catboost as ctb
import hyperopt
import graphviz
from sklearn import model_selection
import pandas as pd
import numpy as np
import datetime
import sys
sys.path.append('..')

## Read (feature, target) file

In [2]:
df_feature = pd.read_csv('./datasets/refined_features.csv', encoding='utf8' )
df_target = pd.read_csv('./datasets/refined_targets.csv', encoding='utf8' )
df_predict = pd.read_csv('./datasets/refined_predicts.csv', encoding='utf8' )

In [3]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(df_feature, df_target, test_size=0.3)

In [4]:
P_train = ctb.Pool(x_train, y_train, cat_features=[x for x in range(0, len(x_train.columns))])
P_test = ctb.Pool(x_test, y_test, cat_features=[x for x in range(0, len(x_test.columns))])
P_predict = ctb.Pool(df_predict, cat_features=[x for x in range(0, len(df_predict.columns))])

# Hyperparameter tuning using Hyperopt

In [5]:
def objective(params):
    model = ctb.CatBoostClassifier(iterations=3000,
                                   learning_rate=params['learning_rate'],
                                   max_depth=int(params['max_depth']),
                                   loss_function='Logloss',
                                   use_best_model=True,
                                   eval_metric='Logloss',
                                   random_seed=123,
                                   verbose=False,
                                  )
    print(model.get_params())
    res = ctb.cv(params=model.get_params(), dtrain=P_train, num_boost_round=300, nfold=10,
                 early_stopping_rounds=100, verbose=True
                 )
    return np.min(res['test-Logloss-mean'])

In [None]:
from numpy.random import RandomState

params_space = {
    'max_depth': hyperopt.hp.randint('max_depth', 10),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
    
}

trials = hyperopt.Trials()
best = hyperopt.fmin(fn=objective,
                     space=params_space,
                     algo=hyperopt.tpe.suggest,
                     max_evals=100,
                     trials=trials,
                     rstate=RandomState(123))

{'max_depth': 4, 'eval_metric': 'Logloss', 'verbose': False, 'use_best_model': True, 'random_seed': 123, 'loss_function': 'Logloss', 'learning_rate': 0.38214701126985373, 'iterations': 3000}
0:	learn: 0.1979862	test: 0.1979734	best: 0.1979734 (0)	total: 5.66s	remaining: 28m 12s

1:	learn: 0.0745416	test: 0.0743158	best: 0.0743158 (1)	total: 10.7s	remaining: 26m 31s

2:	learn: 0.0494403	test: 0.0490770	best: 0.0490770 (2)	total: 15.6s	remaining: 25m 42s

3:	learn: 0.0439457	test: 0.0434381	best: 0.0434381 (3)	total: 20.9s	remaining: 25m 46s

4:	learn: 0.0403242	test: 0.0396442	best: 0.0396442 (4)	total: 25s	remaining: 24m 36s

5:	learn: 0.0378788	test: 0.0370823	best: 0.0370823 (5)	total: 30s	remaining: 24m 32s

6:	learn: 0.0365727	test: 0.0356919	best: 0.0356919 (6)	total: 34s	remaining: 23m 44s

7:	learn: 0.0357873	test: 0.0348861	best: 0.0348861 (7)	total: 39.2s	remaining: 23m 51s

8:	learn: 0.0354203	test: 0.0345194	best: 0.0345194 (8)	total: 43.8s	remaining: 23m 35s

9:	learn: 0.03

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



296:	learn: 0.0378252	test: 0.0372680	best: 0.0372680 (296)	total: 7m 40s	remaining: 4.65s

297:	learn: 0.0378243	test: 0.0372665	best: 0.0372665 (297)	total: 7m 41s	remaining: 3.1s

298:	learn: 0.0378230	test: 0.0372665	best: 0.0372665 (297)	total: 7m 43s	remaining: 1.55s

299:	learn: 0.0378224	test: 0.0372652	best: 0.0372652 (299)	total: 7m 44s	remaining: 0us

{'max_depth': 6, 'eval_metric': 'Logloss', 'verbose': False, 'use_best_model': True, 'random_seed': 123, 'loss_function': 'Logloss', 'learning_rate': 0.327590450627555, 'iterations': 3000}
0:	learn: 0.2366768	test: 0.2366588	best: 0.2366588 (0)	total: 5.1s	remaining: 25m 23s 

1:	learn: 0.0897853	test: 0.0897590	best: 0.0897590 (1)	total: 10.6s	remaining: 26m 14s

2:	learn: 0.0526859	test: 0.0525497	best: 0.0525497 (2)	total: 16.9s	remaining: 27m 49s

3:	learn: 0.0428584	test: 0.0425348	best: 0.0425348 (3)	total: 24.4s	remaining: 30m 5s 

4:	learn: 0.0395125	test: 0.0390399	best: 0.0390399 (4)	total: 30.6s	remaining: 30m 5s 

5

In [None]:
print(best)

In [None]:
model = ctb.CatBoostClassifier(iterations=5000,
                               learning_rate=best['learning_rate'],
                               max_depth=int(best['max_depth']),
                               loss_function='Logloss',
                               use_best_model=True,
                               eval_metric='Logloss',
                               random_seed=123,
                               verbose=False,
                               )



In [None]:
model.fit(P_train, eval_set=P_test, plot=True, early_stopping_rounds=1000)

In [None]:
preds_class = model.predict(P_predict)

In [None]:

preds_class = pd.DataFrame(preds_class, columns=['fraud_ind']).astype(int)

In [None]:
txkey = pd.read_csv('../dataset/test.csv', encoding='utf8')['txkey']
results = pd.concat([txkey, preds_class], axis=1)
results.to_csv('./submits/ctb_hp_refined_features_0002.csv', header=True, index=None)