In [1]:
import math
import pandas as pd
from itertools import islice
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import logging
from tqdm import tqdm
import category_encoders
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, scale, MinMaxScaler, Normalizer, QuantileTransformer, PowerTransformer, StandardScaler
from scipy.stats import boxcox
import math
from sklearn.preprocessing import KBinsDiscretizer
from catboost import Pool, CatBoostClassifier

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
test_path = '../data/test.csv'

df_test = pd.read_csv(test_path)
df_test.drop(['ID_code'], axis=1, inplace=True)
df_test = df_test.values

unique_samples = []
unique_count = np.zeros_like(df_test)
for feature in tqdm(range(df_test.shape[1])):
    _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] += 1

real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]

del df_test

100%|██████████| 200/200 [00:03<00:00, 51.94it/s]


In [3]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

test_real = test.iloc[real_samples_indexes]

features = train.drop(['ID_code', 'target'], axis = 1).columns.tolist()

data = train.append(test_real)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [4]:
num_round = 1000000
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

model = CatBoostClassifier(loss_function="Logloss",
                           eval_metric="AUC",
                           task_type="GPU",
                           learning_rate=0.01,
                           iterations=70000,
                           l2_leaf_reg=50,
                           random_seed=42,
                           od_type="Iter",
                           depth=5,
                           early_stopping_rounds=15000,
                           border_count=64
                          )

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, train.target.values)):
    print("Fold {}".format(fold_))
    X_train, y_train = train.iloc[trn_idx][features], train.iloc[trn_idx]['target']
    X_valid, y_valid = train.iloc[val_idx][features], train.iloc[val_idx]['target']
    
    for col in tqdm(features):
        gr = data[col].value_counts()
        gr_bin = data.groupby(col)[col].count()>1
        
        X_train[col + '_un'] = X_train[col].map(gr).astype('category').cat.codes
        X_valid[col + '_un'] = X_valid[col].map(gr).astype('category').cat.codes
        test[col + '_un'] = test[col].map(gr).astype('category').cat.codes
        
        X_train[col + '_un_bin'] = X_train[col].map(gr_bin).astype('category').cat.codes
        X_valid[col + '_un_bin'] = X_valid[col].map(gr_bin).astype('category').cat.codes
        test[col + '_un_bin'] = test[col].map(gr_bin).astype('category').cat.codes
        
        X_train[col + '_raw_mul'] = X_train[col] * X_train[col + '_un_bin']
        X_valid[col + '_raw_mul'] = X_valid[col] * X_valid[col + '_un_bin']
        test[col + '_raw_mul'] = test[col] * test[col + '_un_bin']
        
        X_train[col + '_raw_mul_2'] = X_train[col] * X_train[col + '_un']
        X_valid[col + '_raw_mul_2'] = X_valid[col] * X_valid[col + '_un']
        test[col + '_raw_mul_2'] = test[col] * test[col + '_un']
        
        X_train[col + '_raw_mul_3'] = X_train[col + '_un_bin'] * X_train[col + '_un']
        X_valid[col + '_raw_mul_3'] = X_valid[col + '_un_bin'] * X_valid[col + '_un']
        test[col + '_raw_mul_3'] = test[col + '_un_bin'] * test[col + '_un']


    _train = Pool(X_train, label=y_train)
    _valid = Pool(X_valid, label=y_valid)
    clf = model.fit(_train,
                    eval_set=_valid,
                    use_best_model=True,
                    verbose=5000)
    pred = clf.predict_proba(X_valid)[:,1]
    oof[val_idx] = pred
    print( "  auc = ", roc_auc_score(y_valid, pred) )
    predictions += clf.predict_proba(test.drop('ID_code', axis=1))[:,1] / folds.n_splits
print("CV score: {:<8.5f}".format(roc_auc_score(train.target, oof)))

Fold 0


100%|██████████| 200/200 [00:21<00:00,  9.33it/s]


0:	learn: 0.5728201	test: 0.5700304	best: 0.5700304 (0)	total: 12.3ms	remaining: 14m 20s
5000:	learn: 0.9240082	test: 0.9066963	best: 0.9066982 (4999)	total: 41.3s	remaining: 8m 56s
10000:	learn: 0.9398253	test: 0.9137069	best: 0.9137069 (10000)	total: 1m 25s	remaining: 8m 30s
15000:	learn: 0.9484921	test: 0.9158431	best: 0.9158431 (15000)	total: 2m 8s	remaining: 7m 51s
20000:	learn: 0.9548300	test: 0.9168046	best: 0.9168074 (19996)	total: 2m 52s	remaining: 7m 10s
25000:	learn: 0.9598748	test: 0.9173313	best: 0.9173395 (24929)	total: 3m 33s	remaining: 6m 24s
30000:	learn: 0.9642778	test: 0.9176749	best: 0.9176757 (29989)	total: 4m 14s	remaining: 5m 38s
35000:	learn: 0.9681751	test: 0.9179857	best: 0.9180108 (34582)	total: 4m 55s	remaining: 4m 55s
40000:	learn: 0.9717056	test: 0.9181272	best: 0.9181358 (39144)	total: 5m 37s	remaining: 4m 12s
45000:	learn: 0.9748630	test: 0.9183064	best: 0.9183152 (44872)	total: 6m 19s	remaining: 3m 30s
50000:	learn: 0.9777284	test: 0.9183868	best: 0.918

100%|██████████| 200/200 [00:19<00:00, 10.40it/s]


0:	learn: 0.5807576	test: 0.5756657	best: 0.5756657 (0)	total: 9.93ms	remaining: 11m 35s
5000:	learn: 0.9235764	test: 0.9086463	best: 0.9086494 (4998)	total: 43.7s	remaining: 9m 28s
10000:	learn: 0.9397349	test: 0.9150919	best: 0.9150919 (10000)	total: 1m 29s	remaining: 8m 54s
15000:	learn: 0.9483761	test: 0.9166624	best: 0.9166624 (15000)	total: 2m 10s	remaining: 8m
20000:	learn: 0.9546379	test: 0.9174297	best: 0.9174350 (19975)	total: 2m 52s	remaining: 7m 10s
25000:	learn: 0.9597359	test: 0.9177300	best: 0.9177377 (24891)	total: 3m 33s	remaining: 6m 24s
30000:	learn: 0.9640619	test: 0.9178941	best: 0.9178941 (30000)	total: 4m 14s	remaining: 5m 38s
35000:	learn: 0.9678469	test: 0.9180148	best: 0.9180195 (34958)	total: 4m 54s	remaining: 4m 54s
40000:	learn: 0.9712924	test: 0.9181183	best: 0.9181221 (39176)	total: 5m 35s	remaining: 4m 11s
45000:	learn: 0.9744330	test: 0.9181766	best: 0.9181819 (44812)	total: 6m 15s	remaining: 3m 28s
50000:	learn: 0.9772761	test: 0.9182482	best: 0.918258

100%|██████████| 200/200 [00:17<00:00, 11.12it/s]


0:	learn: 0.6017445	test: 0.5971054	best: 0.5971054 (0)	total: 11.7ms	remaining: 13m 38s
5000:	learn: 0.9224392	test: 0.9125243	best: 0.9125243 (5000)	total: 39.4s	remaining: 8m 31s
10000:	learn: 0.9383468	test: 0.9197622	best: 0.9197652 (9985)	total: 1m 21s	remaining: 8m 6s
15000:	learn: 0.9470513	test: 0.9217533	best: 0.9217556 (14969)	total: 2m 2s	remaining: 7m 30s
20000:	learn: 0.9533288	test: 0.9227132	best: 0.9227149 (19996)	total: 2m 43s	remaining: 6m 48s
25000:	learn: 0.9585161	test: 0.9232219	best: 0.9232269 (24933)	total: 3m 24s	remaining: 6m 7s
30000:	learn: 0.9629787	test: 0.9235570	best: 0.9235614 (29980)	total: 4m 5s	remaining: 5m 26s
35000:	learn: 0.9669530	test: 0.9237791	best: 0.9237806 (34993)	total: 4m 45s	remaining: 4m 45s
40000:	learn: 0.9704924	test: 0.9238660	best: 0.9238660 (40000)	total: 5m 26s	remaining: 4m 4s
45000:	learn: 0.9737311	test: 0.9240093	best: 0.9240111 (44990)	total: 6m 7s	remaining: 3m 23s
50000:	learn: 0.9767018	test: 0.9240926	best: 0.9240928 (

100%|██████████| 200/200 [00:18<00:00, 10.60it/s]


0:	learn: 0.5848160	test: 0.5736620	best: 0.5736620 (0)	total: 10.1ms	remaining: 11m 49s
5000:	learn: 0.9236907	test: 0.9052941	best: 0.9052952 (4999)	total: 40s	remaining: 8m 39s
10000:	learn: 0.9394110	test: 0.9127966	best: 0.9127966 (10000)	total: 1m 21s	remaining: 8m 10s
15000:	learn: 0.9479403	test: 0.9150627	best: 0.9150665 (14961)	total: 2m 3s	remaining: 7m 32s
20000:	learn: 0.9541599	test: 0.9162026	best: 0.9162032 (19999)	total: 2m 44s	remaining: 6m 51s
25000:	learn: 0.9592310	test: 0.9169162	best: 0.9169163 (24999)	total: 3m 25s	remaining: 6m 10s
30000:	learn: 0.9635986	test: 0.9174040	best: 0.9174147 (29936)	total: 4m 6s	remaining: 5m 28s
35000:	learn: 0.9676346	test: 0.9177527	best: 0.9177535 (34980)	total: 4m 47s	remaining: 4m 47s
40000:	learn: 0.9711438	test: 0.9179799	best: 0.9179972 (39818)	total: 5m 28s	remaining: 4m 6s
45000:	learn: 0.9743641	test: 0.9181210	best: 0.9181211 (44998)	total: 6m 9s	remaining: 3m 25s
50000:	learn: 0.9772726	test: 0.9182158	best: 0.9182179 

100%|██████████| 200/200 [00:18<00:00, 11.07it/s]


0:	learn: 0.5787821	test: 0.5719253	best: 0.5719253 (0)	total: 10.3ms	remaining: 11m 58s
5000:	learn: 0.9243197	test: 0.9042088	best: 0.9042088 (5000)	total: 40s	remaining: 8m 39s
10000:	learn: 0.9402627	test: 0.9122181	best: 0.9122181 (10000)	total: 1m 22s	remaining: 8m 12s
15000:	learn: 0.9489300	test: 0.9144688	best: 0.9144688 (15000)	total: 2m 3s	remaining: 7m 33s
20000:	learn: 0.9552233	test: 0.9155124	best: 0.9155124 (20000)	total: 2m 45s	remaining: 6m 53s
25000:	learn: 0.9602112	test: 0.9160678	best: 0.9160678 (25000)	total: 3m 26s	remaining: 6m 11s
30000:	learn: 0.9645241	test: 0.9164454	best: 0.9164455 (29999)	total: 4m 7s	remaining: 5m 29s
35000:	learn: 0.9684306	test: 0.9167426	best: 0.9167433 (34987)	total: 4m 48s	remaining: 4m 48s
40000:	learn: 0.9719731	test: 0.9169307	best: 0.9169307 (40000)	total: 5m 29s	remaining: 4m 6s
45000:	learn: 0.9751093	test: 0.9170274	best: 0.9170300 (44981)	total: 6m 9s	remaining: 3m 25s
50000:	learn: 0.9779670	test: 0.9171646	best: 0.9171658 

In [5]:
sub = pd.DataFrame({"ID_code": test.ID_code.values})
sub["target"] = predictions
sub.to_csv("Range_bins_sub_3.csv", index=False)

In [6]:
sub.head()

Unnamed: 0,ID_code,target
0,test_0,0.04873
1,test_1,0.180953
2,test_2,0.262334
3,test_3,0.161932
4,test_4,0.054634
