# LB Shuffle Augment answer

In [1]:
from fastai import *
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from parfit import bestFit
import lightgbm as lgb
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import itertools
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
# Input CSV locations, relative to the notebook's working directory.
TRAIN, TEST, SAMPLE = (
    'data/train.csv',
    'data/test.csv',
    'data/sample_submission.csv',
)

# Load the training and test tables (SAMPLE is only named here, not read).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN, TEST))

In [6]:
# Labels, and the feature table with the identifier and label columns removed.
y = train['target']
X = train.drop(columns=['ID_code', 'target'])

In [2]:
from sklearn.model_selection import StratifiedKFold

In [3]:
def _shuffled_copies(rows, n_copies):
    """Return ``n_copies`` stacked copies of ``rows`` with each column permuted on its own.

    Every copy keeps each column's multiset of values but breaks the pairing
    between columns, because a new permutation is drawn for every column.
    Permutations come from NumPy's global RNG (``np.random.shuffle``).

    Returns an array of shape ``(n_copies * len(rows), rows.shape[1])``; when
    ``n_copies <= 0`` the result has zero rows.
    """
    copies = []
    for _ in range(n_copies):
        shuffled = rows.copy()
        ids = np.arange(shuffled.shape[0])
        for c in range(shuffled.shape[1]):
            # Re-shuffle the index vector for each column so columns are permuted independently.
            np.random.shuffle(ids)
            # Index a single column instead of materialising the whole permuted matrix.
            shuffled[:, c] = shuffled[ids, c]
        copies.append(shuffled)
    if not copies:
        # np.vstack([]) raises ValueError; return a well-formed zero-row block instead.
        return np.empty((0, rows.shape[1]), dtype=rows.dtype)
    return np.vstack(copies)


def augment(x,y,t=2):
    """Augment a binary-labelled data set with column-shuffled copies of its rows.

    Rows with ``y > 0`` are replicated ``t`` times and rows with ``y == 0``
    are replicated ``t // 2`` times. In every replica each column is permuted
    independently within its class, so new rows mix feature values taken from
    different original rows of the same class.

    Parameters
    ----------
    x : 2-D numpy array, one row per sample.
    y : 1-D numpy array of labels aligned with the rows of ``x``.
    t : number of shuffled copies of the positive rows (default 2). Values
        below 2 yield no negative copies, and ``t <= 0`` adds no rows.

    Returns
    -------
    (x_aug, y_aug) : the original rows followed by the positive replicas
        (label 1) and then the negative replicas (label 0).

    Notes
    -----
    Uses NumPy's global RNG; call ``np.random.seed`` beforehand for repeatable output.
    """
    xs = _shuffled_copies(x[y > 0], t)
    xn = _shuffled_copies(x[y == 0], t // 2)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [4]:
# LightGBM training parameters passed to lgb.train for every fold.
# Meanings noted below follow the LightGBM parameter reference.
param = dict(
    bagging_freq=5,               # re-draw the bagging subsample every 5 iterations
    bagging_fraction=0.335,       # fraction of rows used in each bagging subsample
    boost_from_average='false',   # passed as a string, exactly as LightGBM receives it
    boost='gbdt',
    feature_fraction=0.041,       # fraction of features sampled per tree
    learning_rate=0.0083,
    max_depth=-1,                 # -1 means no depth limit
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=-1,
)

In [9]:
# 10-fold stratified cross-validation: one LightGBM model per fold, trained on the
# augmented training split and evaluated on the untouched held-out split.
# NOTE: augment() shuffles with NumPy's global RNG, which is not seeded in this
# notebook, so the augmented rows (and the exact scores) differ between runs.
skf = StratifiedKFold(n_splits=10)
models = []
fold_aucs = []  # held-out AUC per fold; previously computed and then discarded
for train_idx, test_idx in skf.split(X.values, y.values):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.values[test_idx], y.values[test_idx]

    # Only the training split is augmented; the held-out rows stay as they are.
    X_tr, y_tr = augment(X_train.values, y_train.values)
    X_tr = pd.DataFrame(X_tr)
    trn_data = lgb.Dataset(X_tr, label=y_tr)
    test_data = lgb.Dataset(X_valid, label=y_valid)

    # Large round cap; training actually ends through early stopping.
    model_lgb = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, test_data], verbose_eval=5000, early_stopping_rounds = 4000)
    models.append(model_lgb)

    # Score at the best iteration, the same way the test-set predictions are made.
    auc = roc_auc_score(y_valid, model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration))
    fold_aucs.append(auc)
    print('fold %d: held-out AUC = %.6f' % (len(fold_aucs), auc))

print('CV AUC: mean = %.6f, std = %.6f' % (np.mean(fold_aucs), np.std(fold_aucs)))

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.911936	valid_1's auc: 0.896492
[10000]	training's auc: 0.9219	valid_1's auc: 0.900367
[15000]	training's auc: 0.929507	valid_1's auc: 0.900799
Early stopping, best iteration is:
[14063]	training's auc: 0.928155	valid_1's auc: 0.900877
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.912908	valid_1's auc: 0.896762
[10000]	training's auc: 0.922734	valid_1's auc: 0.900175
[15000]	training's auc: 0.930334	valid_1's auc: 0.900483
[20000]	training's auc: 0.937161	valid_1's auc: 0.900366
Early stopping, best iteration is:
[16413]	training's auc: 0.932313	valid_1's auc: 0.900573
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.912846	valid_1's auc: 0.891986
[10000]	training's auc: 0.922749	valid_1's auc: 0.895467
[15000]	training's auc: 0.930317	valid_1's auc: 0.895507
Early stopping, best iteration is:
[15350]	training's auc: 


### Create answer

In [11]:
# Keep the identifier column for the output file; all remaining columns are model inputs.
codes = test['ID_code']
df_test = test.drop(columns='ID_code')

# One prediction vector per fold model, each taken at that model's best iteration.
answer = [
    fold_model.predict(df_test, num_iteration=fold_model.best_iteration)
    for fold_model in models
]

In [12]:
# Stack the per-model predictions (one row per model), then take the unweighted
# mean across models for every test row.
ans = np.array(answer)
a = ans.mean(axis=0)

In [13]:
# Two-column table (identifier, averaged prediction) written to CSV without the index.
answer_df = pd.DataFrame(dict(ID_code=codes, target=a))
answer_df.to_csv('answer_9_lb_aug.csv', index=False)