In [133]:
from feature_engine.imputation import MeanMedianImputer
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, precision_score, recall_score, accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression

from tqdm import tqdm_notebook
import tqdm
import warnings
from sklearn.exceptions import ConvergenceWarning
import matplotlib.pyplot as plt
import seaborn as sns
import shap

warnings.simplefilter('ignore')
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [134]:
seed = 22
np.random.seed(seed)

In [135]:
df = pd.read_csv(r'train.tsv', sep='\t', index_col=0)

In [136]:
df = df.replace('?', np.nan)

In [137]:
X = df.drop(columns=['y']).astype(float)
cols = X.columns.tolist()

In [138]:
imp = MeanMedianImputer(imputation_method='median')
imp.fit(X)

In [139]:
X = imp.transform(X)

In [140]:
dct_to = {'N': 0, 'P': 1}
dct_out = {0: 'N', 1: 'P'}
y = df['y']
y = y.map(dct_to)

In [141]:
crs_late = X.corrwith(y).sort_values(ascending=False, key=abs)

In [142]:
for x in crs_late[abs(crs_late) > 0.3].index.tolist():
    for x2 in crs_late[abs(crs_late) > 0.3].index.tolist():
        if x != x2:
            X[f'{x}_div_{x2}'] = X[x] / X[x2]

In [143]:
crs = X.drop(columns=cols).corrwith(y).sort_values(ascending=False, key=abs)

In [144]:
good_divs = crs[abs(crs) > 0.45].index.tolist()
len(good_divs)

107

In [145]:
X = X[cols + good_divs]

In [146]:
X

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x736_div_x291,x186_div_x650,x147_div_x444,x890_div_x836,x736_div_x650,x147_div_x789,x600_div_x229,x215_div_x147,x186_div_x215,x491_div_x759
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,80.0,60.0,-10.0,-20.0,-10.0,150.0,350.0,-510.0,60.0,-10.0,...,0.984848,0.342593,-0.530435,0.503663,-0.150463,-1.095808,2.213115,-0.540984,-1.494949,0.297003
2,70.0,20.0,-20.0,-10.0,-10.0,140.0,360.0,-440.0,-400.0,0.0,...,0.429752,0.390244,-0.391304,0.471287,-0.140921,-0.754491,1.730769,-1.007937,-1.133858,0.149100
3,80.0,60.0,20.0,-10.0,10.0,140.0,190.0,-500.0,880.0,0.0,...,0.571429,0.330623,-0.394322,0.544554,-0.184282,-0.698324,1.200000,-1.192000,-0.818792,0.055556
4,60.0,60.0,10.0,-40.0,0.0,140.0,580.0,-450.0,-200.0,-10.0,...,0.559633,0.304239,-0.462733,0.544554,-0.152120,-0.892216,1.831461,-0.751678,-1.089286,0.206406
5,70.0,60.0,10.0,-10.0,0.0,230.0,350.0,30.0,390.0,0.0,...,0.355372,0.386091,-0.449351,0.823762,-0.103118,-1.362205,1.483516,-0.433526,-2.146667,0.293548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19996,70.0,80.0,10.0,0.0,0.0,180.0,180.0,-600.0,590.0,0.0,...,0.368794,0.287736,-0.391304,0.718812,-0.122642,-0.659686,1.730769,-1.103175,-0.877698,0.149100
19997,70.0,90.0,10.0,0.0,0.0,140.0,540.0,-380.0,60.0,-10.0,...,0.454545,0.192857,-0.257143,0.574113,-0.098214,-0.664773,1.516854,-1.487179,-0.620690,0.113111
19998,60.0,60.0,10.0,40.0,10.0,180.0,320.0,-450.0,60.0,0.0,...,0.419355,0.342697,-0.271540,0.554455,-0.146067,-0.601156,1.258427,-1.519231,-0.772152,0.123667
19999,70.0,60.0,-30.0,0.0,0.0,200.0,300.0,-450.0,60.0,0.0,...,0.241135,0.330623,-0.315068,0.726733,-0.092141,-0.575000,1.000000,-1.121739,-0.945736,0.141463


In [147]:
from collections import Counter

for x in Counter(X.columns):
    if Counter(X.columns)[x] > 1:
        print(x)

In [148]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y)

In [150]:
rfc = RandomForestClassifier(random_state=seed)
rfc.fit(X_train, y_train)

cat = CatBoostClassifier(use_best_model=True, random_state=seed, verbose=500)
cat.fit(X_train, y_train, eval_set=(X_test, y_test))

xgb = XGBClassifier(random_state=seed)
xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=10)

lgbm = LGBMClassifier(random_state=seed, verbose=-1)
lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])

KeyboardInterrupt: 

In [80]:
df_imp = pd.DataFrame()
df_imp['cols'] = X.columns
df_imp['Rfc_imp'] = rfc.feature_importances_
df_imp['Cat_imp'] = cat.feature_importances_
df_imp['XGB_imp'] = xgb.feature_importances_
df_imp['LGBM_imp'] = lgbm.feature_importances_

In [81]:
rfc_rank = df_imp[['Rfc_imp', 'cols']].sort_values(by='Rfc_imp', ascending=False).reset_index(drop=True).reset_index()
rfc_rank = rfc_rank.rename(columns={'index': 'rfc_ind'})
rfc_rank['rfc_ind'] += 1
df_imp = df_imp.merge(rfc_rank)

In [82]:
cat_rank = df_imp[['Cat_imp', 'cols']].sort_values(by='Cat_imp', ascending=False).reset_index(drop=True).reset_index()
cat_rank = cat_rank.rename(columns={'index': 'cat_ind'})
cat_rank['cat_ind'] += 1
df_imp = df_imp.merge(cat_rank)

In [83]:
XGB_rank = df_imp[['XGB_imp', 'cols']].sort_values(by='XGB_imp', ascending=False).reset_index(drop=True).reset_index()
XGB_rank = XGB_rank.rename(columns={'index': 'XGB_ind'})
XGB_rank['XGB_ind'] += 1
df_imp = df_imp.merge(XGB_rank)

In [84]:
LGBM_rank = df_imp[['LGBM_imp', 'cols']].sort_values(by='LGBM_imp', ascending=False).reset_index(drop=True).reset_index()
LGBM_rank = LGBM_rank.rename(columns={'index': 'LGBM_ind'})
LGBM_rank['LGBM_ind'] += 1
df_imp = df_imp.merge(LGBM_rank)

In [85]:
print(f1_score(y_test, rfc.predict(X_test)))
print(f1_score(y_test, cat.predict(X_test)))
print(f1_score(y_test, xgb.predict(X_test)))
print(f1_score(y_test, lgbm.predict(X_test)))

0.8588132125109617
0.8732718894009217
0.8666281755196305
0.8677065280184865


In [86]:
result = permutation_importance(cat, X, y, n_repeats=2, random_state=seed)

In [87]:
df_imp['perm_imp'] = result.importances_mean

In [88]:
perm_rank = df_imp[['perm_imp', 'cols']].sort_values(by='perm_imp', ascending=False).reset_index(drop=True).reset_index()
perm_rank = perm_rank.rename(columns={'index': 'perm_ind'})
perm_rank['perm_ind'] += 1
df_imp = df_imp.merge(perm_rank)

In [89]:
df_imp

Unnamed: 0,cols,Rfc_imp,Cat_imp,XGB_imp,LGBM_imp,rfc_ind,cat_ind,XGB_ind,LGBM_ind,perm_imp,perm_ind
0,x1,0.000051,0.000000,0.000934,0,1060,918,132,854,0.000000,665
1,x2,0.000284,0.010964,0.000321,1,726,792,595,764,0.000000,833
2,x3,0.000326,0.035787,0.000183,5,678,563,782,171,0.000075,368
3,x4,0.000359,0.090118,0.000559,4,628,314,312,269,0.000325,47
4,x5,0.000125,0.006916,0.000409,0,922,830,467,948,0.000000,759
...,...,...,...,...,...,...,...,...,...,...,...
1102,x147_div_x789,0.000539,0.100510,0.000379,3,403,282,513,369,0.000125,204
1103,x600_div_x229,0.003185,0.259524,0.000502,4,65,104,367,266,-0.000025,894
1104,x215_div_x147,0.000722,0.051212,0.002722,2,254,473,27,549,0.000025,485
1105,x186_div_x215,0.001035,0.086891,0.000217,2,181,323,746,538,-0.000150,1062


In [90]:
model_lr = LogisticRegression(random_state=seed)
model_lr.fit(X_train, y_train)

In [91]:
explainer = shap.LinearExplainer(model_lr, shap.maskers.Independent(X_train))
shap_values = explainer(X_test)

feature_names = X_train.columns


rf_resultX = pd.DataFrame(shap_values.values, columns=feature_names)


vals = np.abs(rf_resultX.values).mean(0)

shap_importance = pd.DataFrame(list(zip(feature_names, vals)),
                                  columns=['cols','shap_imp'])
shap_importance.sort_values(by=['shap_imp'],
                               ascending=False, inplace=True)

In [92]:
shap_importance = shap_importance.sort_values(by='shap_imp', ascending=False).reset_index(drop=True).reset_index()
shap_importance = shap_importance.rename(columns={'index': 'shap_ind'})
shap_importance['shap_ind'] += 1
df_imp = df_imp.merge(shap_importance)

In [93]:
df_imp

Unnamed: 0,cols,Rfc_imp,Cat_imp,XGB_imp,LGBM_imp,rfc_ind,cat_ind,XGB_ind,LGBM_ind,perm_imp,perm_ind,shap_ind,shap_imp
0,x1,0.000051,0.000000,0.000934,0,1060,918,132,854,0.000000,665,740,1.960952e-04
1,x2,0.000284,0.010964,0.000321,1,726,792,595,764,0.000000,833,554,1.346673e-03
2,x3,0.000326,0.035787,0.000183,5,678,563,782,171,0.000075,368,598,7.958523e-04
3,x4,0.000359,0.090118,0.000559,4,628,314,312,269,0.000325,47,513,2.220906e-03
4,x5,0.000125,0.006916,0.000409,0,922,830,467,948,0.000000,759,755,1.603133e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1102,x147_div_x789,0.000539,0.100510,0.000379,3,403,282,513,369,0.000125,204,1028,7.885571e-08
1103,x600_div_x229,0.003185,0.259524,0.000502,4,65,104,367,266,-0.000025,894,1014,3.235404e-07
1104,x215_div_x147,0.000722,0.051212,0.002722,2,254,473,27,549,0.000025,485,1032,5.689607e-08
1105,x186_div_x215,0.001035,0.086891,0.000217,2,181,323,746,538,-0.000150,1062,1070,1.370802e-08


In [94]:
df_imp['full_imp'] = df_imp['rfc_ind'] + df_imp['cat_ind'] + df_imp['XGB_ind'] + df_imp['LGBM_ind'] + df_imp['perm_ind'] + df_imp['shap_ind']

In [95]:
final_cols = df_imp.sort_values(by='full_imp').head(200)['cols'].tolist()

In [96]:
X_new = X[final_cols]

In [97]:
final_cols

['x22',
 'x742',
 'x144',
 'x729',
 'x202',
 'x191',
 'x53',
 'x375',
 'x88',
 'x169',
 'x308',
 'x879',
 'x526',
 'x969',
 'x763',
 'x704',
 'x597',
 'x478',
 'x101',
 'x867',
 'x183',
 'x942',
 'x670',
 'x419',
 'x861',
 'x630',
 'x391',
 'x326',
 'x796',
 'x658',
 'x900',
 'x193',
 'x536',
 'x63',
 'x706',
 'x286',
 'x552',
 'x882',
 'x218',
 'x331',
 'x492',
 'x341',
 'x281',
 'x525',
 'x288',
 'x741',
 'x223',
 'x538',
 'x908',
 'x877',
 'x175',
 'x439',
 'x203',
 'x562',
 'x846',
 'x641',
 'x576',
 'x469',
 'x588',
 'x769',
 'x607',
 'x600_div_x890',
 'x232',
 'x836',
 'x783',
 'x886',
 'x598',
 'x902',
 'x601',
 'x227',
 'x466',
 'x836_div_x227',
 'x866',
 'x236',
 'x818',
 'x176',
 'x327',
 'x82',
 'x407',
 'x836_div_x890',
 'x353',
 'x231',
 'x746',
 'x233',
 'x502',
 'x215',
 'x453',
 'x657',
 'x258',
 'x42',
 'x744',
 'x399',
 'x287',
 'x656',
 'x546',
 'x785',
 'x943',
 'x884',
 'x234',
 'x443',
 'x291',
 'x563',
 'x539',
 'x694',
 'x294',
 'x543',
 'x890',
 'x695',
 'x364'

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, shuffle=True, stratify=y)

In [99]:
rfc = RandomForestClassifier(random_state=seed)
rfc.fit(X_train, y_train)

cat = CatBoostClassifier(use_best_model=True, random_state=seed, verbose=500)
cat.fit(X_train, y_train, eval_set=(X_test, y_test))

xgb = XGBClassifier(random_state=seed)
xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=10)

lgbm = LGBMClassifier(random_state=seed, verbose=-1)
lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])

Learning rate set to 0.060814
0:	learn: 0.6427362	test: 0.6431619	best: 0.6431619 (0)	total: 32.6ms	remaining: 32.5s
500:	learn: 0.1262870	test: 0.2199556	best: 0.2199556 (500)	total: 13.5s	remaining: 13.4s
999:	learn: 0.0739787	test: 0.2187448	best: 0.2184275 (800)	total: 25.8s	remaining: 0us

bestTest = 0.2184275354
bestIteration = 800

Shrink model to first 801 iterations.
[0]	validation_0-logloss:0.47103
[10]	validation_0-logloss:0.25005
[20]	validation_0-logloss:0.23771
[30]	validation_0-logloss:0.23782
[40]	validation_0-logloss:0.23808
[50]	validation_0-logloss:0.23954
[60]	validation_0-logloss:0.24375
[70]	validation_0-logloss:0.24773
[80]	validation_0-logloss:0.25178
[90]	validation_0-logloss:0.25327
[99]	validation_0-logloss:0.25701


In [100]:
print(f1_score(y_test, rfc.predict(X_test)))
print(f1_score(y_test, cat.predict(X_test)))
print(f1_score(y_test, xgb.predict(X_test)))
print(f1_score(y_test, lgbm.predict(X_test)))

0.8579846788450206
0.8722974920726434
0.8589558696279204
0.869942196531792


In [101]:
cat_new = CatBoostClassifier(
    iterations=1500,
    eval_metric='F1',
    learning_rate=0.04,
    random_seed=seed,
    l2_leaf_reg=8,
    depth=8,
    early_stopping_rounds=3
)

In [102]:
cat_new.fit(X_new, y)

0:	learn: 0.7541703	total: 83.9ms	remaining: 2m 5s
1:	learn: 0.7853348	total: 163ms	remaining: 2m 2s
2:	learn: 0.8047802	total: 239ms	remaining: 1m 59s
3:	learn: 0.8144861	total: 317ms	remaining: 1m 58s
4:	learn: 0.8151357	total: 387ms	remaining: 1m 55s
5:	learn: 0.8158290	total: 458ms	remaining: 1m 54s
6:	learn: 0.8196721	total: 527ms	remaining: 1m 52s
7:	learn: 0.8209685	total: 597ms	remaining: 1m 51s
8:	learn: 0.8236132	total: 671ms	remaining: 1m 51s
9:	learn: 0.8246522	total: 740ms	remaining: 1m 50s
10:	learn: 0.8274017	total: 808ms	remaining: 1m 49s
11:	learn: 0.8282756	total: 877ms	remaining: 1m 48s
12:	learn: 0.8285153	total: 947ms	remaining: 1m 48s
13:	learn: 0.8303468	total: 1.01s	remaining: 1m 47s
14:	learn: 0.8324276	total: 1.08s	remaining: 1m 47s
15:	learn: 0.8333185	total: 1.16s	remaining: 1m 47s
16:	learn: 0.8345067	total: 1.24s	remaining: 1m 47s
17:	learn: 0.8348477	total: 1.3s	remaining: 1m 47s
18:	learn: 0.8368820	total: 1.38s	remaining: 1m 47s
19:	learn: 0.8381343	tot

<catboost.core.CatBoostClassifier at 0x23e77ee5450>

In [103]:
test = pd.read_csv("test.tsv", sep='\t', index_col=0)
test = test.replace('?', np.nan)
test = imp.transform(test)

In [116]:
test = test.astype(float)

In [117]:
for x in crs_late[abs(crs_late) > 0.3].index.tolist():
    for x2 in crs_late[abs(crs_late) > 0.3].index.tolist():
        if x != x2:
            test[f'{x}_div_{x2}'] = test[x] / test[x2]

In [125]:
prediction = cat_new.predict(test[final_cols])

In [126]:
submission = pd.read_csv("test.tsv", sep='\t')[["id"]].copy()

In [127]:
submission

Unnamed: 0,id
0,20001
1,20002
2,20003
3,20004
4,20005
...,...
12340,32341
12341,32342
12342,32343
12343,32344


In [128]:
submission["y"] = prediction

In [129]:
submission["y"].value_counts()

y
0    8885
1    3460
Name: count, dtype: int64

In [130]:
submission['y'] = submission['y'].map(dct_out)

In [131]:
submission.to_csv("submission.tsv", sep='\t')

In [132]:
submission

Unnamed: 0,id,y
0,20001,P
1,20002,N
2,20003,N
3,20004,N
4,20005,N
...,...,...
12340,32341,N
12341,32342,N
12342,32343,N
12343,32344,N
