In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
import scikitplot as skplt
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import pickle
import gc
from helpers import save_model, lgb_trainer, lgb_trainer_no_aug, test
import multiprocessing

In [5]:
def load_dataframe(dataset):
    return pd.read_csv(dataset)
with multiprocessing.Pool() as pool:
    otrain_df, otest_df = pool.map(load_dataframe, ['data/train.csv', 'data/test.csv'])

### Helpers

In [2]:
def save_model(models, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(models, handle)
        
def disarrange(a, axis=-1):
    """
    Shuffle `a` in-place along the given axis.

    Apply numpy.random.shuffle to the given axis of `a`.
    Each one-dimensional slice is shuffled independently.
    """
    b = a.swapaxes(axis, -1)
    # Shuffle `b` in-place along the last axis.  `b` is a view of `a`,
    # so `a` is shuffled in place, too.
    shp = b.shape[:-1]
    for ndx in np.ndindex(shp):
        np.random.shuffle(b[ndx])
    return

def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        disarrange(x1,axis=0)
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        disarrange(x1,axis=0)
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

def lgb_trainer(X, y, params, n_folds):
    skf = StratifiedKFold(n_splits=n_folds)
    models = []
    for train_idx, test_idx in skf.split(X.values, y.values):
            gc.collect()
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_tr, y_tr = augment(X_train.values, y_train.values)
            X_tr = pd.DataFrame(X_tr)
            trn_data = lgb.Dataset(X_tr, label=y_tr)
            test_data = lgb.Dataset(X.values[test_idx], label=y.values[test_idx])
            model_lgb     = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, test_data], verbose_eval=5000, early_stopping_rounds = 4000)
            models.append(model_lgb)
            auc = roc_auc_score(y.values[test_idx], model_lgb.predict(X.values[test_idx]))
    return models

def lgb_trainer_no_aug(X, y, params, n_folds):
    skf = StratifiedKFold(n_splits=n_folds)
    models = []
    for train_idx, test_idx in skf.split(X.values, y.values):
            gc.collect()
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_tr, y_tr = X_train.values, y_train.values
            X_tr = pd.DataFrame(X_tr)
            trn_data = lgb.Dataset(X_tr, label=y_tr)
            test_data = lgb.Dataset(X.values[test_idx], label=y.values[test_idx])
            model_lgb     = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, test_data], verbose_eval=5000, early_stopping_rounds = 4000)
            models.append(model_lgb)
            auc = roc_auc_score(y.values[test_idx], model_lgb.predict(X.values[test_idx]))
    return models

def test(X, y, models):
    preds = pd.DataFrame({})
    for i, model in enumerate(models):
        preds[str(i)] = model.predict(X)
        print(f"Fold: {i} \t Score: {roc_auc_score(y, preds[str(i)].values)}")
    averaged_preds = preds.mean(axis=1)
    print(f"Score: {roc_auc_score(y, averaged_preds)}")
    return averaged_preds, preds

param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.01,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
}

# Augment #1 - Squares

In [4]:
whole = train_df.append(test_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [5]:
def augment_squares(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_square'
        df[key] = np.square(df[column])
    return df   

In [6]:
whole = augment_squares(whole)
train_df, test_df = whole[:200000], whole[200000:]
y_train = train_df.target
train_df = train_df.drop(['target', 'ID_code'], axis=1)
Xtr, Xval, ytr, yval  = train_test_split(train_df, y_train, test_size = 0.05)

In [8]:
rank_models = lgb_trainer_no_aug(Xtr, ytr, param, n_folds = 5)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.930218	valid_1's auc: 0.900855
[10000]	training's auc: 0.950958	valid_1's auc: 0.902792
Early stopping, best iteration is:
[10371]	training's auc: 0.952166	valid_1's auc: 0.902857
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.931839	valid_1's auc: 0.891377
[10000]	training's auc: 0.952127	valid_1's auc: 0.893871
[15000]	training's auc: 0.967181	valid_1's auc: 0.893717
Early stopping, best iteration is:
[11134]	training's auc: 0.955876	valid_1's auc: 0.893933
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.930154	valid_1's auc: 0.901229
[10000]	training's auc: 0.950674	valid_1's auc: 0.903684
Early stopping, best iteration is:
[10767]	training's auc: 0.953312	valid_1's auc: 0.903802
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.93154	valid_1's auc: 0.894204
[10000]	training'

In [18]:
q, w = test(Xval, yval, rank_models)

Fold: 0 	 Score: 0.9011898326312805
Fold: 1 	 Score: 0.9018027909059605
Fold: 2 	 Score: 0.9008703628374002
Fold: 3 	 Score: 0.8995685094888662
Fold: 4 	 Score: 0.8996896069395307
Score: 0.9020395219259979


# Augment #2 - (x-mean)^^2

In [14]:
def augment_squares_prime(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_square'
        df[key] = np.square(df[column] - df[column].mean())
    return df   

In [15]:
whole = train_df.append(test_df)
whole = augment_squares_prime(whole)
train_df, test_df = whole[:200000], whole[200000:]
y_train = train_df.target
train_df = train_df.drop(['target', 'ID_code'], axis=1)
Xtr, Xval, ytr, yval  = train_test_split(train_df, y_train, test_size = 0.05)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [16]:
rank_models = lgb_trainer_no_aug(Xtr, ytr, param, n_folds = 5)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.930282	valid_1's auc: 0.895956
[10000]	training's auc: 0.952013	valid_1's auc: 0.900068
[15000]	training's auc: 0.967219	valid_1's auc: 0.900571
Early stopping, best iteration is:
[14174]	training's auc: 0.965024	valid_1's auc: 0.90065
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.93053	valid_1's auc: 0.895697
[10000]	training's auc: 0.952317	valid_1's auc: 0.899372
[15000]	training's auc: 0.967456	valid_1's auc: 0.899599
Early stopping, best iteration is:
[15378]	training's auc: 0.968447	valid_1's auc: 0.899663
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.931103	valid_1's auc: 0.892626
[10000]	training's auc: 0.952853	valid_1's auc: 0.89593
[15000]	training's auc: 0.96773	valid_1's auc: 0.895715
Early stopping, best iteration is:
[11402]	training's auc: 0.957474	valid_1's auc: 0.896116
Training until validation 

In [17]:
q, w = test(Xval, yval, rank_models)

Fold: 0 	 Score: 0.9011898326312805
Fold: 1 	 Score: 0.9018027909059605
Fold: 2 	 Score: 0.9008703628374002
Fold: 3 	 Score: 0.8995685094888662
Fold: 4 	 Score: 0.8996896069395307
Score: 0.9020395219259979


# Exponent

In [7]:
def augment_exponent(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_square'
        df[key] = np.exp(df[column] - df[column].mean())
    return df   

def augment_exponent_prime(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_square'
        df[key] = np.exp(df[column])
    return df 

def augment_squares_prime(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_square'
        df[key] = np.square(df[column] - df[column].mean())
    return df   

def augment_squares(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_square'
        df[key] = np.square(df[column])
    return df   

In [8]:
whole = otrain_df.append(otest_df)
whole = augment_exponent(whole)
train_df, test_df = whole[:200000], whole[200000:]
y_train = train_df.target
train_df = train_df.drop(['target', 'ID_code'], axis=1)
Xtr, Xval, ytr, yval  = train_test_split(train_df, y_train, test_size = 0.05)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [11]:
rank_models = lgb_trainer_no_aug(Xtr, ytr, param, n_folds = 5)
q, w = test(Xval, yval, rank_models)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.931132	valid_1's auc: 0.89762
[10000]	training's auc: 0.951039	valid_1's auc: 0.899225
Early stopping, best iteration is:
[9834]	training's auc: 0.950496	valid_1's auc: 0.89929
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.931052	valid_1's auc: 0.895564
[10000]	training's auc: 0.95098	valid_1's auc: 0.897643
Early stopping, best iteration is:
[9849]	training's auc: 0.950481	valid_1's auc: 0.897668
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.929996	valid_1's auc: 0.902342
[10000]	training's auc: 0.94997	valid_1's auc: 0.904284
Early stopping, best iteration is:
[10374]	training's auc: 0.951237	valid_1's auc: 0.90432
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.930996	valid_1's auc: 0.897699
[10000]	training's auc: 0.950765	valid_1's auc: 0.899544
Early stopping, best it

# Apply all

In [3]:
def augment_exponent(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_exp'
        df[key] = np.exp(df[column] - df[column].mean())
    return df   

def augment_exponent_prime(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_expp'
        df[key] = np.exp(df[column])
    return df 

def augment_squares_prime(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_squarep'
        df[key] = np.square(df[column] - df[column].mean())
    return df   

def augment_squares(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_square'
        df[key] = np.square(df[column])
    return df   

def augment_log_prime(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_logp'
        df[key] = np.log(df[column] - df[column].mean())
    return df   

def augment_log(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_log'
        df[key] = np.log(df[column])
    return df   

def augment_row_wise(df):
    cols = [col for col in df.columns]
    df['average'] = df[cols].mean(axis=1)
    df['std'] = df[cols].std(axis=1)
    return df   

In [4]:
def load_dataframe(dataset):
    return pd.read_csv(dataset)
with multiprocessing.Pool() as pool:
    otrain_df, otest_df = pool.map(load_dataframe, ['data/train.csv', 'data/test.csv'])

In [5]:
def apply_all(otrain_df, otest_df, augmentations, param, n_folds):
    all_models = []
    for augmentation in augmentations:
        if augmentation == 'Normal':
            print(f"Doing augmentation: {augmentation}")
            whole = otrain_df.append(otest_df)
        else:    
            print(f"Doing augmentation: {augmentation.__name__}")
            whole = augmentation(otrain_df.append(otest_df))
        train_df, test_df = whole[:200000], whole[200000:]
        y_train = train_df.target
        train_df = train_df.drop(['target', 'ID_code'], axis=1)
        models = lgb_trainer_no_aug(train_df, y_train, param, n_folds = n_folds)
        models2 = lgb_trainer(train_df, y_train, param, n_folds = n_folds)
        all_models += models
        all_models += models2
    return all_models    

In [21]:
augmentations = [augment_row_wise, 'Normal', augment_log, augment_log_prime, augment_exponent, 
                                         augment_exponent_prime, augment_squares, augment_squares_prime
                                        ]

In [22]:
all_models = apply_all(otrain_df, otest_df, augmentations, param, 3)

Doing augmentation: <function augment_row_wise at 0x7f81841c9598>


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.933098	valid_1's auc: 0.89652


KeyboardInterrupt: 