In [1]:
from fastai import *
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from parfit import bestFit
import lightgbm as lgb

In [3]:
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import itertools
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize

In [4]:
TRAIN = 'data/train.csv'
TEST = 'data/test.csv'
SAMPLE = 'data/sample_submission.csv'
train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)

In [4]:
train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [5]:
train.describe()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,10.679914,-1.627622,10.715192,6.796529,11.078333,-5.065317,5.408949,16.54585,0.284162,...,3.23444,7.438408,1.927839,3.331774,17.993784,-0.142088,2.303335,8.908158,15.87072,-3.326537
std,0.300653,3.040051,4.050044,2.640894,2.043319,1.62315,7.863267,0.866607,3.418076,3.332634,...,4.559922,3.023272,1.478423,3.99203,3.135162,1.429372,5.454369,0.921625,3.010945,10.438015
min,0.0,0.4084,-15.0434,2.1171,-0.0402,5.0748,-32.5626,2.3473,5.3497,-10.5055,...,-14.0933,-2.6917,-3.8145,-11.7834,8.6944,-5.261,-14.2096,5.9606,6.2993,-38.8528
25%,0.0,8.45385,-4.740025,8.722475,5.254075,9.883175,-11.20035,4.7677,13.9438,-2.3178,...,-0.058825,5.1574,0.889775,0.5846,15.6298,-1.1707,-1.946925,8.2528,13.8297,-11.208475
50%,0.0,10.52475,-1.60805,10.58,6.825,11.10825,-4.83315,5.3851,16.4568,0.3937,...,3.2036,7.34775,1.9013,3.39635,17.95795,-0.1727,2.4089,8.8882,15.93405,-2.81955
75%,0.0,12.7582,1.358625,12.5167,8.3241,12.261125,0.9248,6.003,19.1029,2.9379,...,6.4062,9.512525,2.9495,6.2058,20.396525,0.8296,6.556725,9.5933,18.064725,4.8368
max,1.0,20.315,10.3768,19.353,13.1883,16.6714,17.2516,8.4477,27.6918,10.1513,...,18.4409,16.7165,8.4024,18.2818,27.9288,4.2729,18.3215,12.0004,26.0791,28.5007


# Random Forest

In [62]:
X = train.drop(['ID_code', 'target'], axis=1)
y = train.target

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
grid = {
    'min_samples_leaf': [25, 200],
    'max_features': [ 'log2', 1],
    'n_estimators': [100, 200],
    'n_jobs': [-1],
    'random_state': [42]
}
paramGrid = ParameterGrid(grid)

In [9]:
best_model, best_score, all_models, all_scores = bestFit(RandomForestClassifier(), paramGrid,
                                                    X_train, y_train, X_test, y_test, # nfolds=5 [optional, instead of validation set]
                                                    metric=roc_auc_score, greater_is_better=True, 
                                                    scoreLabel='AUC')
print(best_model, best_score)

-------------FITTING MODELS-------------


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 51.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 121.2min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 186.3min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 268.9min
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed: 378.7min remaining: 15.8min


-------------SCORING MODELS-------------


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 387.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.9s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=25, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False) 0.8528387049512353


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   16.6s finished


In [13]:
best_model, best_score, all_models, all_scores = bestFit(RandomForestClassifier(), paramGrid,
                                                    X_train, y_train, X_test, y_test, # nfolds=5 [optional, instead of validation set]
                                                    metric=roc_auc_score, greater_is_better=True, 
                                                    scoreLabel='AUC')
print(best_model, best_score)

-------------FITTING MODELS-------------


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   45.0s remaining:  2.2min
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:  1.0min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:  1.2min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:  2.4min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:  2.9min remaining:   58.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  3.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  3.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-------------SCORING MODELS-------------


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.2s remaining:    3.5s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=200, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False) 0.8797524288311014


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.1s finished


In [14]:
y_pred = best_model.predict(X_test)

In [15]:
roc_auc_score(y_test, y_pred)

0.5

In [18]:
print(X_test.shape, y_test.shape)

(50000, 200) (50000,)


In [19]:
print(X_train.shape, y_train.shape)

(150000, 200) (150000,)


# LightGBM

In [74]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0053,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
    'gpu_platform_id': -1,
    
}

In [58]:
trn_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_test, label=y_test)
clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 4000)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.920424	valid_1's auc: 0.898443
[10000]	training's auc: 0.934279	valid_1's auc: 0.901848
[15000]	training's auc: 0.945071	valid_1's auc: 0.902537
[20000]	training's auc: 0.954432	valid_1's auc: 0.902486
Early stopping, best iteration is:
[17581]	training's auc: 0.950025	valid_1's auc: 0.902572


In [36]:
y_pred = clf.predict(X_test)

In [37]:
roc_auc_score(y_test, y_pred)

0.902110115136418

# Stratified KFold

In [55]:
from sklearn.model_selection import StratifiedKFold

In [75]:
skf = StratifiedKFold(n_splits=10)
models = []
for train_idx, test_idx in skf.split(X.values, y.values):
        X_train, y_train = X.iloc[trn_idx][features], y.iloc[trn_idx]
        X_valid, y_valid = X.iloc[val_idx][features], y.iloc[val_idx]
        X_tr, y_tr = augment(X_train.values, y_train.values)
        X_tr = pd.DataFrame(X_tr)
        trn_data = lgb.Dataset(X.values[train_idx], label=y.values[train_idx])
        test_data = lgb.Dataset(X.values[test_idx], label=y.values[test_idx])

        model_lgb     = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, test_data], verbose_eval=5000, early_stopping_rounds = 4000)
        models.append(model_lgb)
        auc = roc_auc_score(y.values[test_idx], model_lgb.predict(X.values[test_idx]))

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.917985	valid_1's auc: 0.894393
[10000]	training's auc: 0.930626	valid_1's auc: 0.899417
[15000]	training's auc: 0.940304	valid_1's auc: 0.9005
[20000]	training's auc: 0.948743	valid_1's auc: 0.900809
[25000]	training's auc: 0.956476	valid_1's auc: 0.900688
Early stopping, best iteration is:
[21855]	training's auc: 0.951706	valid_1's auc: 0.900883
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.91781	valid_1's auc: 0.895085
[10000]	training's auc: 0.930409	valid_1's auc: 0.899045
[15000]	training's auc: 0.940061	valid_1's auc: 0.899826
Early stopping, best iteration is:
[15346]	training's auc: 0.940666	valid_1's auc: 0.899904
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.918368	valid_1's auc: 0.89008
[10000]	training's auc: 0.930832	valid_1's auc: 0.894626
[15000]	training's auc: 0.940553	valid_1's auc: 0.895475
[200

In [88]:
#save models
!touch lgbm_cv10_models.m

In [91]:
import pickle
with open('lgbm_cv10_models.m', 'wb') as handle:
    pickle.dump(models, handle)

In [92]:
with open('lgbm_cv10_models.m', 'rb') as handle:
    k = pickle.load(handle)

In [93]:
k

[<lightgbm.basic.Booster at 0x7fb45c367940>,
 <lightgbm.basic.Booster at 0x7fb45c367470>,
 <lightgbm.basic.Booster at 0x7fb45c367160>,
 <lightgbm.basic.Booster at 0x7fb45c367080>,
 <lightgbm.basic.Booster at 0x7fb45c3df470>,
 <lightgbm.basic.Booster at 0x7fb45c3df6a0>,
 <lightgbm.basic.Booster at 0x7fb45c32f160>,
 <lightgbm.basic.Booster at 0x7fb45c32f240>,
 <lightgbm.basic.Booster at 0x7fb45c32f390>,
 <lightgbm.basic.Booster at 0x7fb45c32f4e0>]

# Prepare cross val answer

In [77]:
codes = test.ID_code
df_test = test.drop('ID_code', axis=1)
answer = [model.predict(df_test) for model in models]

In [84]:
ans = np.array(answer)
a = np.average(ans, axis=0)

In [87]:
answer_df = pd.DataFrame({
    'ID_code' : codes,
    'target' : a
})
answer_df.to_csv('answer_2_lgbm_with_CV10.csv', index=False)

# Prepare answer

In [40]:
test.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


In [42]:
codes = test.ID_code
df_test = test.drop('ID_code', axis=1)
answer_preds = clf.predict(df_test)

In [49]:
answer = {
    'ID_code' : codes,
    'target' : answer_preds
}
answer_df = pd.DataFrame(answer)

In [52]:
answer_df.to_csv('answer_1_simple_gbm.csv', index=False)

# XGBoost

In [None]:
from x