In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

from numba import jit

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'

df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

np.random.seed(42)

Weird AF stuff. split the training set by target, randomly shuffle each feature, put the two sets back together, 

    - only augment train in each fold, don't touch valid and test.
    - upsample positive instances.
    - repeat and bagging.
    
All that augmentation does is add some observations by sampling with replacement from the original dataset and append the data to the original dataset.

In [3]:
@jit
def augment(x, y, t = 2):
    xs, xn = [], []
    for i in range(t):
        mask = y > 0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [4]:
params = {
            "objective" : "binary",
            "metric" : "auc",
            "boosting": 'gbdt',
            "max_depth" : -1, 
            "num_leaves" : 13,
            "learning_rate" : 0.01,
            "bagging_freq": 5,
            "bagging_fraction" : 0.4,
            "feature_fraction" : 0.05,
            "min_data_in_leaf": 80,
            "min_sum_heassian_in_leaf": 10,
            "tree_learner": "serial",
            "boost_from_average": "false",
            "bagging_seed" : 42,
            "verbosity" : 1,
            "seed": 42
        }

In [5]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof = df_train[['ID_code', 'target']]
oof['predict'] = 0
predictions = df_test[['ID_code']]
val_aucs = []

In [6]:
features = [col for col in df_train.columns if col not in ['target', 'ID_code']]
x_test = df_test[features].values

for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train, df_train['target'])):
    x_train, y_train = df_train.iloc[trn_idx][features], df_train.iloc[trn_idx]['target']
    x_valid, y_valid = df_train.iloc[val_idx][features], df_train.iloc[val_idx]['target']
    
    N = 3
    p_valid, yp = 0, 0
    for i in range(N):
        X_t, y_t = augment(x_train.values, y_train.values)  #NOTE: augmentation is done only on train set (not validation)
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label = y_t)
        val_data = lgb.Dataset(x_valid, label = y_valid)
        
        evals_result = {}
        lgb_clf = lgb.train(params, trn_data, 10000,#100000
                            valid_sets = [trn_data, val_data],
                            early_stopping_rounds = 100, #1000
                            verbose_eval = 5000, evals_result = evals_result)
        
        p_valid += lgb_clf.predict(x_valid)
        yp += lgb_clf.predict(x_test)
    
    oof['predict'][val_idx] = p_valid/N
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    predictions['fold{}'.format(fold+1)] = yp/N


Training until validation scores don't improve for 100 rounds.
[5000]	training's auc: 0.916515	valid_1's auc: 0.897858
Early stopping, best iteration is:
[6487]	training's auc: 0.920835	valid_1's auc: 0.899256
Training until validation scores don't improve for 100 rounds.
[5000]	training's auc: 0.916167	valid_1's auc: 0.898052
Early stopping, best iteration is:
[8899]	training's auc: 0.926169	valid_1's auc: 0.900157
Training until validation scores don't improve for 100 rounds.
[5000]	training's auc: 0.916843	valid_1's auc: 0.897906
Early stopping, best iteration is:
[7796]	training's auc: 0.924302	valid_1's auc: 0.899978
Training until validation scores don't improve for 100 rounds.
[5000]	training's auc: 0.916153	valid_1's auc: 0.89883
Early stopping, best iteration is:
[7688]	training's auc: 0.923455	valid_1's auc: 0.900574
Training until validation scores don't improve for 100 rounds.
[5000]	training's auc: 0.915725	valid_1's auc: 0.899309
Early stopping, best iteration is:
[6951]	

In [9]:
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
all_auc = roc_auc_score(oof['target'], oof['predict'])
print("Mean auc: %.9f, std: %.9f. All auc: %.9f." % (mean_auc, std_auc, all_auc))

predictions['target'] = np.mean(predictions[[col for col in predictions.columns 
                                             if col not in ['ID_code', 'target']]].values, axis=1)

submission = pd.DataFrame({"ID_code":df_test["ID_code"].values})
submission["target"] = predictions['target']
submission.head()

Mean auc: 0.901128060, std: 0.002999787. All auc: 0.901056032.


Unnamed: 0,ID_code,target
0,test_0,0.134829
1,test_1,0.304282
2,test_2,0.244035
3,test_3,0.26508
4,test_4,0.065086


In [10]:
submission.to_csv("../results/augment_lgb.csv", index=False)