In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

import catboost as cb
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

# ф-ии

In [71]:
# CatBoost
def catboost_base(x, x_val, y, y_val):
    cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.001,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": False,
    "max_depth": 6,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42
    }

    model = cb.CatBoostClassifier(**cb_params)
    model.fit(x, y,
        )
    y_pred = model.predict(x_val)
    print("roc_auc_score: ",roc_auc_score(y_val, y_pred))
    print("confusion_matrix: ",confusion_matrix(y_val, y_pred))
    print("accuracy_score: ",accuracy_score(y_val, y_pred))

In [12]:
# XGBoost
def xgb_base(x, x_val, y, y_val):

    params = {
    "booster": "gbtree",
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "learning_rate": 0.01,
    "nthread": 6,
    "seed": 27
    }

    dtrain = xgb.DMatrix(
    x, y
    )
    dvalid = xgb.DMatrix(
    x_val, y_val
    )

    model = xgb.train(
        params=params,
        dtrain=dtrain,
        evals=[(dtrain, "dtrain"), (dvalid, "dvalid")],
        early_stopping_rounds=25,
        num_boost_round=1000,
        verbose_eval=10,
        maximize=True,
    )

In [13]:
def forest_base(x, x_val, y, y_val):
    model = RandomForestClassifier(
    n_estimators=250, max_depth=8, n_jobs=6, random_state=27
    )
    model.fit(x, y)
    y_pred = model.predict(x_val)
    print("roc_auc_score: ",roc_auc_score(y_val, y_pred))
    print("confusion_matrix: ",confusion_matrix(y_val, y_pred))
    print("accuracy_score: ",accuracy_score(y_val, y_pred))

# Чтение данных

In [14]:
path = "./data"
print(os.listdir(path))

train = pd.read_csv(f"{path}/train_new.csv")
train.columns = train.columns.str.lower()
test = pd.read_csv(f"{path}/test.csv")
test.columns = test.columns.str.lower()

print("train.shape = {} rows, {} cols".format(*train.shape))
print("test.shape = {} rows, {} cols".format(*test.shape))

['greeks.csv', 'sample_submission.csv', 'test.csv', 'train.csv', 'train_new.csv']
train.shape = 617 rows, 57 cols
test.shape = 5 rows, 57 cols


In [15]:
for i in train.columns:
    if i != "id":
        train[i] = train[i].fillna(train[i].median())

In [16]:
train.isna().sum()

ab         0
af         0
ah         0
am         0
ar         0
ax         0
ay         0
az         0
bc         0
bd         0
bn         0
bp         0
bq         0
br         0
bz         0
cb         0
cc         0
cd         0
cf         0
ch         0
cl         0
cr         0
cs         0
cu         0
cw         0
da         0
de         0
df         0
dh         0
di         0
dl         0
dn         0
du         0
dv         0
dy         0
eb         0
ee         0
eg         0
eh         0
el         0
ep         0
eu         0
fc         0
fd         0
fe         0
fi         0
fl         0
fr         0
fs         0
gb         0
ge         0
gf         0
gh         0
gi         0
gl         0
class      0
ej_freq    0
dtype: int64

# train test split

In [17]:
train.dtypes[train.dtypes == "object"]

Series([], dtype: object)

In [18]:
x_train, x_valid, y_train, y_valid = train_test_split(train.drop( "class", axis=1), train["class"], test_size = 0.2)
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((493, 56), (124, 56), (493,), (124,))

# Baseline XGBoost-classifier

In [19]:
xgb_base(x_train, x_valid, y_train, y_valid)

[0]	dtrain-auc:0.95183	dvalid-auc:0.88083
[10]	dtrain-auc:0.95468	dvalid-auc:0.88313
[20]	dtrain-auc:0.95490	dvalid-auc:0.88333
[30]	dtrain-auc:0.98179	dvalid-auc:0.92292
[40]	dtrain-auc:0.98405	dvalid-auc:0.92875
[50]	dtrain-auc:0.98695	dvalid-auc:0.92375
[60]	dtrain-auc:0.98866	dvalid-auc:0.91792
[62]	dtrain-auc:0.98881	dvalid-auc:0.91625


# Baseline RandomForestClassifier

In [20]:
forest_base(x_train, x_valid, y_train, y_valid)

roc_auc_score:  0.7916666666666667
confusion_matrix:  [[100   0]
 [ 10  14]]
accuracy_score:  0.9193548387096774


# Baseline CatBoostClassifier

In [21]:
catboost_base(x_train, x_valid, y_train, y_valid)

0:	total: 149ms	remaining: 4m 57s
10:	total: 203ms	remaining: 36.6s
20:	total: 228ms	remaining: 21.5s
30:	total: 252ms	remaining: 16s
40:	total: 275ms	remaining: 13.1s
50:	total: 296ms	remaining: 11.3s
60:	total: 317ms	remaining: 10.1s
70:	total: 339ms	remaining: 9.21s
80:	total: 365ms	remaining: 8.65s
90:	total: 386ms	remaining: 8.11s
100:	total: 407ms	remaining: 7.66s
110:	total: 428ms	remaining: 7.28s
120:	total: 449ms	remaining: 6.97s
130:	total: 470ms	remaining: 6.71s
140:	total: 491ms	remaining: 6.47s
150:	total: 512ms	remaining: 6.27s
160:	total: 532ms	remaining: 6.08s
170:	total: 553ms	remaining: 5.92s
180:	total: 578ms	remaining: 5.81s
190:	total: 600ms	remaining: 5.68s
200:	total: 622ms	remaining: 5.57s
210:	total: 643ms	remaining: 5.45s
220:	total: 663ms	remaining: 5.34s
230:	total: 684ms	remaining: 5.24s
240:	total: 704ms	remaining: 5.14s
250:	total: 724ms	remaining: 5.04s
260:	total: 745ms	remaining: 4.96s
270:	total: 765ms	remaining: 4.88s
280:	total: 789ms	remaining: 4.8

# Обучение с удалением пропусков

In [22]:
path = "./data"
print(os.listdir(path))

train = pd.read_csv(f"{path}/train_new.csv")
train.columns = train.columns.str.lower()
test = pd.read_csv(f"{path}/test.csv")
test.columns = test.columns.str.lower()

print("train.shape = {} rows, {} cols".format(*train.shape))
print("test.shape = {} rows, {} cols".format(*test.shape))

['greeks.csv', 'sample_submission.csv', 'test.csv', 'train.csv', 'train_new.csv']
train.shape = 617 rows, 57 cols
test.shape = 5 rows, 57 cols


In [23]:
train = train.dropna()

## train_test_split

In [24]:
x_train, x_valid, y_train, y_valid = train_test_split(train.drop( "class", axis=1), train["class"], test_size = 0.2)
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((438, 56), (110, 56), (438,), (110,))

## baseline xgb

In [25]:
xgb_base(x_train, x_valid, y_train, y_valid)

[0]	dtrain-auc:0.97813	dvalid-auc:0.84012
[10]	dtrain-auc:0.97999	dvalid-auc:0.83527
[20]	dtrain-auc:0.98059	dvalid-auc:0.83891
[30]	dtrain-auc:0.98620	dvalid-auc:0.87597
[40]	dtrain-auc:0.98714	dvalid-auc:0.87670
[50]	dtrain-auc:0.98780	dvalid-auc:0.88711
[60]	dtrain-auc:0.98839	dvalid-auc:0.90795
[70]	dtrain-auc:0.99204	dvalid-auc:0.91085
[80]	dtrain-auc:0.99156	dvalid-auc:0.91860
[90]	dtrain-auc:0.99209	dvalid-auc:0.92878
[100]	dtrain-auc:0.99434	dvalid-auc:0.93847
[110]	dtrain-auc:0.99526	dvalid-auc:0.94089
[120]	dtrain-auc:0.99623	dvalid-auc:0.94186
[130]	dtrain-auc:0.99708	dvalid-auc:0.94380
[140]	dtrain-auc:0.99786	dvalid-auc:0.94816
[150]	dtrain-auc:0.99804	dvalid-auc:0.94961
[160]	dtrain-auc:0.99833	dvalid-auc:0.95252
[170]	dtrain-auc:0.99865	dvalid-auc:0.95446
[180]	dtrain-auc:0.99890	dvalid-auc:0.95252
[190]	dtrain-auc:0.99922	dvalid-auc:0.95349
[198]	dtrain-auc:0.99932	dvalid-auc:0.95349


## RandomForestClassifier

In [26]:
forest_base(x_train, x_valid, y_train, y_valid)

roc_auc_score:  0.8275193798449612
confusion_matrix:  [[85  1]
 [ 8 16]]
accuracy_score:  0.9181818181818182


## catboost

In [27]:
catboost_base(x_train, x_valid, y_train, y_valid)

0:	total: 4.25ms	remaining: 8.51s
10:	total: 32.4ms	remaining: 5.86s
20:	total: 60.2ms	remaining: 5.67s
30:	total: 87.4ms	remaining: 5.55s
40:	total: 110ms	remaining: 5.23s
50:	total: 135ms	remaining: 5.14s
60:	total: 156ms	remaining: 4.96s
70:	total: 178ms	remaining: 4.83s
80:	total: 198ms	remaining: 4.68s
90:	total: 219ms	remaining: 4.6s
100:	total: 244ms	remaining: 4.58s
110:	total: 264ms	remaining: 4.49s
120:	total: 284ms	remaining: 4.41s
130:	total: 304ms	remaining: 4.34s
140:	total: 324ms	remaining: 4.27s
150:	total: 346ms	remaining: 4.23s
160:	total: 365ms	remaining: 4.17s
170:	total: 386ms	remaining: 4.12s
180:	total: 405ms	remaining: 4.07s
190:	total: 426ms	remaining: 4.04s
200:	total: 447ms	remaining: 4s
210:	total: 466ms	remaining: 3.95s
220:	total: 485ms	remaining: 3.91s
230:	total: 505ms	remaining: 3.87s
240:	total: 525ms	remaining: 3.83s
250:	total: 547ms	remaining: 3.81s
260:	total: 567ms	remaining: 3.78s
270:	total: 587ms	remaining: 3.74s
280:	total: 606ms	remaining: 3.

* лучше всех себя показала модель xgb

# Удаление выбросов

In [37]:
for i in train:
    max_ = train[i].max()
    quantile = train[i].quantile(0.99)
    print(i)
    print("max: ", max_)
    print("quantile 99: ", quantile)
    train[i] = train[i].drop(train[i].loc[train[i]>quantile].index)


ab
max:  4.435374
quantile 99:  2.067405589999999
af
max:  28688.18766
quantile 99:  10150.88023619998
ah
max:  1910.123198
quantile 99:  546.5820654
am
max:  630.51823
quantile 99:  403.5895810999994
ar
max:  178.943634
quantile 99:  33.199239599999885
ax
max:  38.27088
quantile 99:  13.144718429999998
ay
max:  10.315851
quantile 99:  0.22643228999999956
az
max:  38.971568
quantile 99:  22.9370743
bc
max:  1463.693448
quantile 99:  52.5272368599999
bd 
max:  53060.59924
quantile 99:  10320.462955399991
bn
max:  28.9542
quantile 99:  28.248
bp
max:  2447.81055
quantile 99:  976.4940900599984
bq
max:  344.644105
quantile 99:  344.644105
br
max:  179250.2529
quantile 99:  5245.003213999996
bz
max:  50092.4593
quantile 99:  3098.786244429995
cb
max:  2271.436167
quantile 99:  525.3320549749989
cc
max:  4.1030316
quantile 99:  1.441057152599997
cd 
max:  485.169816
quantile 99:  262.1905745199999
cf
max:  200.967526
quantile 99:  41.83008016999998
ch
max:  0.224074
quantile 99:  0.06862315

In [38]:
train.shape

(548, 57)

##  train_test_split

In [46]:
x_train, x_valid, y_train, y_valid = train_test_split(train.drop("class", axis=1), train["class"], test_size = 0.2)
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((438, 56), (110, 56), (438,), (110,))

In [47]:
xgb_base(x_train, x_valid, y_train, y_valid)

[0]	dtrain-auc:0.94361	dvalid-auc:0.86861
[10]	dtrain-auc:0.94913	dvalid-auc:0.88083
[20]	dtrain-auc:0.97929	dvalid-auc:0.91667
[30]	dtrain-auc:0.98876	dvalid-auc:0.91944
[40]	dtrain-auc:0.98969	dvalid-auc:0.91833
[50]	dtrain-auc:0.99133	dvalid-auc:0.91778
[60]	dtrain-auc:0.99195	dvalid-auc:0.91556
[62]	dtrain-auc:0.99233	dvalid-auc:0.91611


In [41]:
catboost_base(x_train, x_valid, y_train, y_valid)

0:	total: 3.84ms	remaining: 7.67s
10:	total: 33ms	remaining: 5.97s
20:	total: 54ms	remaining: 5.09s
30:	total: 74.2ms	remaining: 4.71s
40:	total: 94.5ms	remaining: 4.51s
50:	total: 114ms	remaining: 4.36s
60:	total: 134ms	remaining: 4.26s
70:	total: 154ms	remaining: 4.19s
80:	total: 175ms	remaining: 4.14s
90:	total: 194ms	remaining: 4.08s
100:	total: 216ms	remaining: 4.06s
110:	total: 237ms	remaining: 4.03s
120:	total: 256ms	remaining: 3.98s
130:	total: 276ms	remaining: 3.94s
140:	total: 296ms	remaining: 3.9s
150:	total: 316ms	remaining: 3.87s
160:	total: 336ms	remaining: 3.83s
170:	total: 356ms	remaining: 3.81s
180:	total: 376ms	remaining: 3.77s
190:	total: 396ms	remaining: 3.75s
200:	total: 418ms	remaining: 3.74s
210:	total: 438ms	remaining: 3.72s
220:	total: 459ms	remaining: 3.69s
230:	total: 478ms	remaining: 3.66s
240:	total: 499ms	remaining: 3.64s
250:	total: 519ms	remaining: 3.62s
260:	total: 539ms	remaining: 3.59s
270:	total: 559ms	remaining: 3.57s
280:	total: 579ms	remaining: 3.

* Уберу признаки которые визуально плохо разделимы
* gi, dl, dh, fs, dy, cu, ay, br

In [42]:
train_new = train.drop(["gi", "dl", "dh", "fs", "dy", "cu", "ay", "br"], axis=1)

In [43]:
x_train, x_valid, y_train, y_valid = train_test_split(train_new.drop("class", axis=1), train_new["class"], test_size = 0.2)
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((438, 48), (110, 48), (438,), (110,))

In [44]:
xgb_base(x_train, x_valid, y_train, y_valid)

[0]	dtrain-auc:0.94464	dvalid-auc:0.89034
[10]	dtrain-auc:0.97606	dvalid-auc:0.89721
[20]	dtrain-auc:0.97872	dvalid-auc:0.90339
[30]	dtrain-auc:0.98046	dvalid-auc:0.89995
[40]	dtrain-auc:0.98859	dvalid-auc:0.91117
[50]	dtrain-auc:0.99429	dvalid-auc:0.91575
[60]	dtrain-auc:0.99615	dvalid-auc:0.91621
[70]	dtrain-auc:0.99742	dvalid-auc:0.91529
[80]	dtrain-auc:0.99869	dvalid-auc:0.91712
[90]	dtrain-auc:0.99898	dvalid-auc:0.91941
[100]	dtrain-auc:0.99920	dvalid-auc:0.91896
[110]	dtrain-auc:0.99945	dvalid-auc:0.91941
[116]	dtrain-auc:0.99953	dvalid-auc:0.91941


In [45]:
catboost_base(x_train, x_valid, y_train, y_valid)

0:	total: 3.29ms	remaining: 6.59s
10:	total: 36.3ms	remaining: 6.56s
20:	total: 64.9ms	remaining: 6.12s
30:	total: 102ms	remaining: 6.49s
40:	total: 123ms	remaining: 5.87s
50:	total: 142ms	remaining: 5.43s
60:	total: 161ms	remaining: 5.12s
70:	total: 181ms	remaining: 4.91s
80:	total: 204ms	remaining: 4.82s
90:	total: 223ms	remaining: 4.68s
100:	total: 243ms	remaining: 4.57s
110:	total: 262ms	remaining: 4.46s
120:	total: 281ms	remaining: 4.37s
130:	total: 302ms	remaining: 4.32s
140:	total: 324ms	remaining: 4.26s
150:	total: 345ms	remaining: 4.22s
160:	total: 364ms	remaining: 4.16s
170:	total: 384ms	remaining: 4.11s
180:	total: 410ms	remaining: 4.12s
190:	total: 431ms	remaining: 4.08s
200:	total: 451ms	remaining: 4.04s
210:	total: 472ms	remaining: 4s
220:	total: 492ms	remaining: 3.96s
230:	total: 512ms	remaining: 3.92s
240:	total: 533ms	remaining: 3.89s
250:	total: 553ms	remaining: 3.85s
260:	total: 574ms	remaining: 3.83s
270:	total: 600ms	remaining: 3.83s
280:	total: 623ms	remaining: 3.

* После удаления стало хуже
  
* Обучение на признаках которые показали наибольшую нелинейную зависимость
'du', 'cr', 'di', 'eh', 'gl', 'fd ', 'bc', 'fl', 'ab', 'bq', 'cc', 'da','af', 'ee', 'cd ', 'de', 'dh', 'dy', 'fe', 'el'

In [52]:
train_new = train[['du', 'cr', 'di', 'eh', 'gl', 'fd ', 'bc', 'fl', 'ab', 'bq', 'cc', 'da',
        'af', 'ee', 'cd ', 'de', 'dh', 'dy', 'fe', 'el', "class"]]
train_new.shape

(548, 21)

In [53]:
x_train, x_valid, y_train, y_valid = train_test_split(train_new.drop("class", axis=1), train_new["class"], test_size = 0.2)
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((438, 20), (110, 20), (438,), (110,))

In [54]:
xgb_base(x_train, x_valid, y_train, y_valid)

[0]	dtrain-auc:0.96581	dvalid-auc:0.88490
[10]	dtrain-auc:0.97811	dvalid-auc:0.87247
[20]	dtrain-auc:0.98157	dvalid-auc:0.89358
[30]	dtrain-auc:0.98239	dvalid-auc:0.89329
[40]	dtrain-auc:0.98347	dvalid-auc:0.89589
[50]	dtrain-auc:0.98500	dvalid-auc:0.90110
[60]	dtrain-auc:0.98610	dvalid-auc:0.90341
[70]	dtrain-auc:0.98609	dvalid-auc:0.91990
[80]	dtrain-auc:0.99262	dvalid-auc:0.91874
[90]	dtrain-auc:0.99533	dvalid-auc:0.91816
[98]	dtrain-auc:0.99644	dvalid-auc:0.91643


In [55]:
catboost_base(x_train, x_valid, y_train, y_valid)

0:	total: 2.61ms	remaining: 5.21s
10:	total: 26.4ms	remaining: 4.77s
20:	total: 52.8ms	remaining: 4.98s
30:	total: 77.3ms	remaining: 4.91s
40:	total: 94.2ms	remaining: 4.5s
50:	total: 111ms	remaining: 4.23s
60:	total: 127ms	remaining: 4.03s
70:	total: 142ms	remaining: 3.86s
80:	total: 159ms	remaining: 3.76s
90:	total: 175ms	remaining: 3.67s
100:	total: 192ms	remaining: 3.62s
110:	total: 209ms	remaining: 3.56s
120:	total: 225ms	remaining: 3.5s
130:	total: 241ms	remaining: 3.44s
140:	total: 257ms	remaining: 3.39s
150:	total: 273ms	remaining: 3.34s
160:	total: 290ms	remaining: 3.31s
170:	total: 306ms	remaining: 3.27s
180:	total: 322ms	remaining: 3.24s
190:	total: 338ms	remaining: 3.2s
200:	total: 353ms	remaining: 3.16s
210:	total: 368ms	remaining: 3.12s
220:	total: 396ms	remaining: 3.19s
230:	total: 413ms	remaining: 3.16s
240:	total: 431ms	remaining: 3.15s
250:	total: 447ms	remaining: 3.11s
260:	total: 462ms	remaining: 3.08s
270:	total: 479ms	remaining: 3.06s
280:	total: 494ms	remaining: 

* Cтало явно еще хуже
* Обучение на признаках которые показали хорошую разделяющую способность по метрике gini
* bq, ab, du, af, fl, bn, bc, cd, cr, gl, da, de

In [57]:
train.columns

Index(['ab', 'af', 'ah', 'am', 'ar', 'ax', 'ay', 'az', 'bc', 'bd ', 'bn', 'bp',
       'bq', 'br', 'bz', 'cb', 'cc', 'cd ', 'cf', 'ch', 'cl', 'cr', 'cs', 'cu',
       'cw ', 'da', 'de', 'df', 'dh', 'di', 'dl', 'dn', 'du', 'dv', 'dy', 'eb',
       'ee', 'eg', 'eh', 'el', 'ep', 'eu', 'fc', 'fd ', 'fe', 'fi', 'fl', 'fr',
       'fs', 'gb', 'ge', 'gf', 'gh', 'gi', 'gl', 'class', 'ej_freq'],
      dtype='object')

In [58]:
train_new = train[["bq", "ab", "du", "af", "fl", "bn", "bc", "cd ", "cr", "gl", "da", "de", "class"]]

In [59]:
x_train, x_valid, y_train, y_valid = train_test_split(train_new.drop("class", axis=1), train_new["class"], test_size = 0.2)
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((438, 12), (110, 12), (438,), (110,))

In [67]:
xgb_base(x_train, x_valid, y_train, y_valid)

[0]	dtrain-auc:0.94899	dvalid-auc:0.84163
[10]	dtrain-auc:0.96376	dvalid-auc:0.85340
[20]	dtrain-auc:0.97064	dvalid-auc:0.85179
[26]	dtrain-auc:0.97154	dvalid-auc:0.85045


In [72]:
catboost_base(x_train, x_valid, y_train, y_valid)

roc_auc_score:  0.7212413055109685
confusion_matrix:  [[86  3]
 [11 10]]
accuracy_score:  0.8727272727272727
