In [1]:
import matplotlib.pyplot as plt

import lightgbm as lgb
import pandas as pd
from catboost import CatBoostClassifier
import catboost
import numpy as np

from datetime import datetime
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import featuretools as ft

from bayes_opt import BayesianOptimization

%matplotlib inline

In [20]:
train = pd.read_csv('data/train.csv').drop("ID_code",axis=1)
test = pd.read_csv('data/test.csv')

In [31]:
for i in range(199):
    for j in range(i + 1, 199):
        x = (train[f'var_{i}'] / train[f'var_{j}'].replace(0, 0.01))[train['target']==1].mean()
        y = (train[f'var_{i}'] / train[f'var_{j}'].replace(0, 0.01))[train['target']==0].mean()
        k = max(x, y) / min(x, y)
        if(k > 100):
            print(i, j, k)
            train[f'var_{i}/{j}'] = train[f'var_{i}'] / train[f'var_{j}']
            test[f'var_{i}/{j}'] = test[f'var_{i}'] / test[f'var_{j}']
        
        

0 1 100.74361769724653
2 35 301.4994652675756
3 178 281.6635632096735
8 36 143.77592502737463
11 120 185.43364766749673
13 45 511.18193220757064
17 40 108.70466949692108
19 80 221.41496533330292
33 54 169.55917790223845
34 84 222.42720449084533
37 67 15831.694506039317
37 145 134.46672911998422
43 117 580.6434029866365
45 127 165.76548778547675
48 80 123.49047771148007
48 189 566.5066412177823
54 127 287.59409772977483
58 154 127.33498437306783
58 164 470.96284060283864
58 196 136.1262894417534
62 190 138.00768757506344
64 140 188.24267343225202
76 178 847.8202106862002
81 115 434.094565612192
83 142 1443.2641821356694
83 192 851.0304391077631
90 185 234.6390029920295
115 117 232.83389679602342
119 139 579.899347316647
123 154 1317.819622597636
133 145 268.19304348020484
137 189 2498.1290311373978
140 141 213.82865784760395
141 196 178.47314983540807
145 184 178.45053185672467
173 189 465.4286043621193
190 193 141.14827049352107


In [10]:
f = [x for x  in train.columns if x != 'target']
columns_cnt = []
cat_features = []
for column in f:
    columns_cnt.append(len(train[column].unique()))
    if (len(train[column].unique()) / len(train) < .03):
        cat_features.append(column)
cat_features

['var_68']

In [22]:
cat_features = ['var_68']

In [23]:
X = train.iloc[:, 1:]
y = train.iloc[:, 0]

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.50)

In [25]:
param = {
   "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : 10,
    "verbosity" : 1,
}

In [26]:
train_dataset = lgb.Dataset(X_train, label=y_train)
val_dataset = lgb.Dataset(X_val, label=y_val)

model = lgb.train(param, train_dataset, 10000, valid_sets=[val_dataset], verbose_eval=200,
                 categorical_feature=cat_features)

New categorical_feature is ['var_68']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[200]	valid_0's auc: 0.859684
[400]	valid_0's auc: 0.873805
[600]	valid_0's auc: 0.881245
[800]	valid_0's auc: 0.882928
[1000]	valid_0's auc: 0.886068
[1200]	valid_0's auc: 0.888134
[1400]	valid_0's auc: 0.890367
[1600]	valid_0's auc: 0.892064
[1800]	valid_0's auc: 0.893675
[2000]	valid_0's auc: 0.894991
[2200]	valid_0's auc: 0.896266
[2400]	valid_0's auc: 0.897259
[2600]	valid_0's auc: 0.898053
[2800]	valid_0's auc: 0.898792
[3000]	valid_0's auc: 0.899471
[3200]	valid_0's auc: 0.899875
[3400]	valid_0's auc: 0.900361
[3600]	valid_0's auc: 0.900724
[3800]	valid_0's auc: 0.901058
[4000]	valid_0's auc: 0.90127
[4200]	valid_0's auc: 0.901483
[4400]	valid_0's auc: 0.901769
[4600]	valid_0's auc: 0.902062
[4800]	valid_0's auc: 0.902157
[5000]	valid_0's auc: 0.902297
[5200]	valid_0's auc: 0.902492
[5400]	valid_0's auc: 0.902655
[5600]	valid_0's auc: 0.902735
[5800]	valid_0's auc: 0.90294
[6000]	valid_0's auc: 0.903108
[6200]	valid_0's auc: 0.90316
[6400]	valid_0's auc: 0.903113
[6600]	valid_0'

In [39]:

# last submitted 
y_test_pred = model.predict(X_test)
score = roc_auc_score(y_test, y_test_pred)
print('AUC: ', score)

AUC:  0.9027052544137998


In [28]:
import pickle
# save the classifier
with open('my_dumped_classifier_best.pkl', 'wb') as fid:
    pickle.dump(model, fid)

In [29]:
with open('my_dumped_classifier_best.pkl', 'rb') as fid:
    gnb_loaded_1 = pickle.load(fid)

# Sumbmissin

In [33]:
sub_df = pd.DataFrame({'ID_code': test['ID_code'].values})
sub_df['target'] = model.predict(test.iloc[:, 1:])
sub_df.to_csv('submission.csv', index=False)

In [38]:
test

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_115/117,var_119/139,var_123/154,var_133/145,var_137/189,var_140/141,var_141/196,var_145/184,var_173/189,var_190/193
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,90.075581,-3.324641,0.495081,2.487241,10.701404,2.426198,-0.486691,0.096367,5.104273,-0.879550
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,0.129205,0.041167,1.094697,-1.391466,4.970954,0.722709,-10.341570,-0.202204,-5.735222,1.048212
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.8950,20.2537,1.5233,...,0.062845,0.398498,1.092321,-59.254268,5.755097,-3.258146,-0.195580,-0.004548,-2.519838,-0.343303
3,test_3,8.5374,-1.3222,12.0220,6.5749,8.8458,3.1744,4.9397,20.5660,3.3755,...,0.349645,0.640917,0.999563,3.758817,9.684600,0.915075,3.327167,0.222552,8.522131,2.672270
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.9890,...,-0.602006,0.875675,3.632071,-4.405699,14.459714,-3.477189,0.700322,-0.407090,1.448441,1.251081
5,test_5,5.9862,-2.2913,8.6058,7.0685,14.2465,-8.6761,4.2467,14.7632,1.8790,...,0.010168,-3.876159,1.527357,0.628241,5.737117,-1.124125,-0.882772,1.776299,0.750859,-0.237832
6,test_6,8.4624,-6.1065,7.3603,8.2627,12.0104,-7.2073,4.1670,13.0809,-4.3004,...,0.327560,-0.031944,0.661148,700.768421,0.214820,0.548941,1.219620,0.000589,-4.742528,1.396964
7,test_7,17.3035,-2.4212,13.3989,8.3998,11.0777,9.6449,5.9596,17.8477,-4.8068,...,0.295269,1.282472,1.441144,2.954699,-1.136998,1.425684,1.057582,0.121406,-5.908603,2.979592
8,test_8,6.9856,0.8402,13.7161,4.7749,8.6784,-13.7607,4.3386,14.5843,2.5883,...,0.269156,-0.056373,0.994655,-100.231343,-119.530259,1.254962,-1.582496,-0.010801,5.594380,95.737569
9,test_9,10.3811,-6.9348,14.6690,9.0941,11.9058,-10.8018,3.4508,20.2816,-1.4112,...,0.147815,0.018267,2.141435,0.536393,13.971994,-0.583835,-2.061061,0.372217,-1.400718,0.378607


In [34]:
test.iloc[:, 1:].shape

(200000, 237)

In [35]:
train.iloc[:, 1:].shape

(200000, 237)