In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import ensemble
from scipy.stats import spearmanr

import time
import warnings
warnings.filterwarnings("ignore")

import catboost as cb
# from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold

import utils

In [2]:
# Load the training table, discard the row identifier, and split the label
# column off so that `df_train` holds only feature columns.
train_path = '../data/train.csv'
df_train = pd.read_csv(train_path).drop(columns=['ID_code'])
target = df_train.pop('target')

In [3]:
# Load the test table; keep the identifiers aside for the submission file and
# leave only the remaining columns in `df_test`.
test_file = '../data/test.csv'
df_test = pd.read_csv(test_file)
test_id = df_test['ID_code']
df_test = df_test.drop(columns=['ID_code'])

In [4]:
# Training parameters handed to `cb.train` in the cross-validation cell.
# NOTE(review): `colsample_bytree` does not appear among CatBoost's documented
# parameter names/aliases as far as I can tell (`colsample_bylevel` is the
# documented alias of `rsm`) -- confirm it is accepted and has an effect.
params = dict(
    max_depth=2,
    colsample_bytree=0.3,
    learning_rate=0.01,
    objective='Logloss',
    eval_metric='AUC',
    n_jobs=8,
    colsample_bylevel=0.03,
)



In [5]:
%%time
# Number of cross-validation folds. This name is also the divisor used when
# averaging the per-fold test predictions, so it must keep this value for the
# whole cell.
fold_n = 5

folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=30)
# Accumulates the mean of the per-fold predictions on the test set.
y_pred_cb = np.zeros(len(df_test))

# The loop index is deliberately NOT called `fold_n`: reusing that name
# overwrote the fold count with 0, 1, 2, ... and made the averaging step
# divide by the fold index (division by zero on the first fold).
for fold_idx, (train_index, valid_index) in enumerate(folds.split(df_train, target)):
    print('Fold', fold_idx, 'started at', time.ctime())
    dtrain = cb.Pool(df_train.iloc[train_index], label=target.iloc[train_index])
    dvalid = cb.Pool(df_train.iloc[valid_index], label=target.iloc[valid_index])

    # Train on this fold's training split; the held-out split is the eval set
    # that drives early stopping.
    model_cb = cb.train(dtrain, params, num_boost_round=10000,
                        evals=dvalid, verbose=300, early_stopping_rounds=200)

    # Each fold contributes an equal 1/fold_n share to the final prediction.
    # NOTE(review): `predict` is called with its default prediction type --
    # confirm raw scores (rather than probabilities) are what the submission
    # should contain.
    y_pred_cb += model_cb.predict(df_test) / fold_n

Fold 0 started at Wed Mar  6 05:16:45 2019
0:	test: 0.5104196	best: 0.5104196 (0)	total: 149ms	remaining: 24m 52s
300:	test: 0.8295888	best: 0.8300199 (249)	total: 22.8s	remaining: 12m 13s
600:	test: 0.8384848	best: 0.8389167 (598)	total: 44.6s	remaining: 11m 37s
900:	test: 0.8426173	best: 0.8426173 (900)	total: 1m 7s	remaining: 11m 19s
1200:	test: 0.8465968	best: 0.8465968 (1200)	total: 1m 29s	remaining: 10m 55s
1500:	test: 0.8496976	best: 0.8497143 (1498)	total: 1m 52s	remaining: 10m 35s
1800:	test: 0.8532707	best: 0.8533178 (1796)	total: 2m 14s	remaining: 10m 12s
2100:	test: 0.8561757	best: 0.8561757 (2100)	total: 2m 36s	remaining: 9m 47s
2400:	test: 0.8596557	best: 0.8596557 (2400)	total: 2m 58s	remaining: 9m 25s
2700:	test: 0.8627867	best: 0.8627926 (2699)	total: 3m 21s	remaining: 9m 3s
3000:	test: 0.8662198	best: 0.8662198 (3000)	total: 3m 43s	remaining: 8m 41s
3300:	test: 0.8687705	best: 0.8687705 (3300)	total: 4m 5s	remaining: 8m 18s
3600:	test: 0.8711878	best: 0.8711878 (3600)

Fold 3 started at Wed Mar  6 05:55:44 2019
0:	test: 0.5077281	best: 0.5077281 (0)	total: 105ms	remaining: 17m 27s
300:	test: 0.8258094	best: 0.8258094 (300)	total: 20s	remaining: 10m 43s
600:	test: 0.8325041	best: 0.8327398 (597)	total: 42.9s	remaining: 11m 11s
900:	test: 0.8361859	best: 0.8361955 (888)	total: 1m 6s	remaining: 11m 8s
1200:	test: 0.8403472	best: 0.8403472 (1200)	total: 1m 28s	remaining: 10m 49s
1500:	test: 0.8431561	best: 0.8431561 (1500)	total: 1m 51s	remaining: 10m 29s
1800:	test: 0.8468158	best: 0.8468158 (1800)	total: 2m 13s	remaining: 10m 7s
2100:	test: 0.8502182	best: 0.8502182 (2100)	total: 2m 36s	remaining: 9m 47s
2400:	test: 0.8534490	best: 0.8534490 (2400)	total: 2m 58s	remaining: 9m 24s
2700:	test: 0.8561329	best: 0.8561329 (2700)	total: 3m 20s	remaining: 9m 2s
3000:	test: 0.8593046	best: 0.8593096 (2997)	total: 3m 43s	remaining: 8m 40s
3300:	test: 0.8620085	best: 0.8620085 (3300)	total: 4m 10s	remaining: 8m 28s
3600:	test: 0.8646398	best: 0.8646398 (3600)	to

In [8]:
# Pair every test identifier with its averaged prediction and write the
# result out as the submission CSV (without the DataFrame index).
submission_cb = pd.DataFrame({"ID_code": test_id}).assign(target=y_pred_cb)
submission_cb.to_csv('../results/submission_cb.csv', index=False)