# Imports

In [1]:
from fastai import *
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import itertools
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
import lightgbm as lgb
import pickle
import gc
import xgboost as xgb

# Make data

In [2]:
df_train = pd.read_csv('data/train.csv')
X_train = df_train.drop(['ID_code', 'target'], axis=1)
y_train = df_train.target

In [3]:
Xtr, Xval, ytr, yval  = train_test_split(X_train, y_train, test_size = 0.05)

In [4]:
print(f"{Xtr.shape} {Xval.shape}")

(190000, 200) (10000, 200)


In [5]:
print(f"1: {yval[yval == 1].shape[0]} | 0 : {yval[yval == 0].shape[0]}")

1: 1028 | 0 : 8972


# Train the old LightGBModel

In [6]:
def save_model(models, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(models, handle)
        
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [10]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
    "device" : "gpu",
    'gpu_platform_id' : 0,
    'gpu_device_id' : 0,
}

In [11]:
gc.enable()
skf = StratifiedKFold(n_splits=10)
lgbm_models = []
for train_idx, test_idx in skf.split(Xtr.values, ytr.values):
        gc.collect()
        X_train, y_train = Xtr.iloc[train_idx], ytr.iloc[train_idx]
        X_tr, y_tr = augment(X_train.values, y_train.values)
        X_tr = pd.DataFrame(X_tr)
        trn_data = lgb.Dataset(X_tr, label=y_tr)
        test_data = lgb.Dataset(Xtr.values[test_idx], label=ytr.values[test_idx])
        model_lgb     = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, test_data], verbose_eval=5000, early_stopping_rounds = 4000)
        lgbm_models.append(model_lgb)
        auc = roc_auc_score(ytr.values[test_idx], model_lgb.predict(Xtr.values[test_idx]))

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.912646	valid_1's auc: 0.903208
[10000]	training's auc: 0.923005	valid_1's auc: 0.907249
[15000]	training's auc: 0.931071	valid_1's auc: 0.907831
[20000]	training's auc: 0.938295	valid_1's auc: 0.907875
Early stopping, best iteration is:
[17002]	training's auc: 0.934041	valid_1's auc: 0.907989
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.91372	valid_1's auc: 0.898341
[10000]	training's auc: 0.923905	valid_1's auc: 0.90175
[15000]	training's auc: 0.931813	valid_1's auc: 0.902068
Early stopping, best iteration is:
[15420]	training's auc: 0.932426	valid_1's auc: 0.902109
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.913414	valid_1's auc: 0.898391
[10000]	training's auc: 0.923549	valid_1's auc: 0.901734
[15000]	training's auc: 0.931467	valid_1's auc: 0.902197
[20000]	training's auc: 0.938578	valid_1's auc: 0.902162
Ea

In [13]:
save_model(lgbm_models, 'latest_lgb_models.m')

# XGB

In [None]:
skf = StratifiedKFold(n_splits=10)
xgb_models = []
for k, (train_idx, test_idx) in enumerate(skf.split(Xtr.values, ytr.values)):
        print(f"Fold: {k}")
        gc.collect()
        model_xgb =  xgb.XGBClassifier(max_depth=2,
                              colsample_bytree=0.7,
                              n_estimators=20000,
                              scale_pos_weight = 9,
                              learning_rate=0.02,
                              objective='binary:logistic', 
                              verbosity =1,
                              eval_metric  = 'auc',
                              tree_method='gpu_hist',
                               verbose = True,
                              n_jobs=-1)
        
        X_train, y_train = Xtr.iloc[train_idx], ytr.iloc[train_idx]
        X_tr, y_tr = augment(X_train.values, y_train.values)
        bst = model_xgb.fit(X_tr, y_tr,  eval_set = [(Xtr.values[test_idx], ytr.values[test_idx])],
                            early_stopping_rounds=200,
                            verbose= True, 
                            eval_metric = 'auc')
        xgb_models.append(model_xgb)                                

In [None]:
save_model(xgb_models, 'latest_xgb_models.m')

# Lightgbm #2

In [None]:
new_params = {
    "objective" : "binary", "metric" : "auc", "boosting": 'gbdt', "max_depth" : -1, "num_leaves" : 13,
    "learning_rate" : 0.01, "bagging_freq": 5, "bagging_fraction" : 0.4, "feature_fraction" : 0.05,
    "min_data_in_leaf": 80, "min_sum_heassian_in_leaf": 10, "tree_learner": "serial", "boost_from_average": "false",
    "bagging_seed" : random_state, "verbosity" : 1, "seed": random_state
}
# Try CV 5

# Lightgbm #3

In [None]:
new_params2 = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    "bagging_seed" : random_state,
    "verbosity" : 1,
    "seed": random_state
}
# Try CV 11

# Get preds

In [54]:
def load_model(model_name):
    with open(model_name, 'rb') as handle:
        model = pickle.load(handle)
    return model    

def make_prediction(model, X):
    return model.predict(X)

def get_all_preds(X, model_names):
    all_preds = {}
    for model_name in model_names:
        i = 0
        models = load_model(model_name)
        for model in models:
            i = i + 1
            y_pred = make_prediction(model, X)
            key = str(model_name) + str(i)
            all_preds[key] = y_pred
            print(f"{key} Done with {len(y_pred)} shape")
    return pd.DataFrame(all_preds)      

In [39]:
def get_score(y1, y2):
    return roc_auc_score(y1, y2)

In [None]:
model_names

In [None]:
X_aug = get_all_preds(X_train, model_names)

# Preds|

In [17]:
preds = {}
for i, model in enumerate(lgbm_models):
    key = f"fold{i}"
    print(key)
    y_pred = make_prediction(model, Xval)
    preds[key] = y_pred

fold0
fold1
fold2
fold3
fold4
fold5
fold6
fold7
fold8
fold9


In [28]:
for column in preds.columns:
    print(f"{column} : {roc_auc_score(yval, preds[column].values)}")

fold0 : 0.890064159833186
fold1 : 0.8912337085025441
fold2 : 0.8907837569888853
fold3 : 0.891505631007666
fold4 : 0.8919291275407624
fold5 : 0.8916691314613037
fold6 : 0.891846835203686
fold7 : 0.8922693559383191
fold8 : 0.8915386997333685
fold9 : 0.8923010151773525


In [38]:
average = preds.mean(axis = 1)
roc_auc_score(yval, average)

0.8923989202898425

In [42]:
mnames = ['lb_aug_901_cv10.m']
X_aug = get_all_preds(Xval, mnames)

lb_aug_901_cv10.m1 Done with 10000 shape
lb_aug_901_cv10.m2 Done with 10000 shape
lb_aug_901_cv10.m3 Done with 10000 shape
lb_aug_901_cv10.m4 Done with 10000 shape
lb_aug_901_cv10.m5 Done with 10000 shape
lb_aug_901_cv10.m6 Done with 10000 shape
lb_aug_901_cv10.m7 Done with 10000 shape
lb_aug_901_cv10.m8 Done with 10000 shape
lb_aug_901_cv10.m9 Done with 10000 shape
lb_aug_901_cv10.m10 Done with 10000 shape


In [44]:
for column in X_aug.columns:
    print(f"{column} : {roc_auc_score(yval, X_aug[column].values)}")

lb_aug_901_cv10.m1 : 0.9193087313579124
lb_aug_901_cv10.m2 : 0.9163918529068387
lb_aug_901_cv10.m3 : 0.9157911947416173
lb_aug_901_cv10.m4 : 0.9178648748983
lb_aug_901_cv10.m5 : 0.9161113650596496
lb_aug_901_cv10.m6 : 0.9161777193551577
lb_aug_901_cv10.m7 : 0.9219117279699403
lb_aug_901_cv10.m8 : 0.9154303661542785
lb_aug_901_cv10.m9 : 0.9151270012542262
lb_aug_901_cv10.m10 : 0.9200904543491119


In [46]:
!ls data/*csv

data/sample_submission.csv  data/test.csv  data/train.csv


# Generate answer

In [48]:
dfTest = pd.read_csv('data/test.csv')

In [49]:
dfTest.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


In [56]:
def make_answer(X, models, disk = True):
    codes = X.ID_code
    X = X.drop('ID_code', axis = 1)
    y_preds = get_all_preds(X, models, disk)
    answer_df = pd.DataFrame({
    'ID_code' : codes,
    'target' : answer_df.mean(axis = 1)
    })
    return answer_df

In [None]:
answer_df = make_answer(dfTest, lgbm_models, disk=False)

<lightgbm.basic.Booster object at 0x7f0ca00c02b0>1 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0ca00c02b0>2 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0ca00c02b0>3 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0ca00c02b0>4 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0ca00c02b0>5 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0ca00c02b0>6 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0ca00c02b0>7 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0ca00c02b0>8 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0ca00c02b0>9 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0ca00c02b0>10 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0c646d2550>1 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0c646d2550>2 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7f0c646d2550>3 Done with 200000 shape
<lightgbm.basic.Booster object at 0x7