In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
import scikitplot as skplt
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import pickle
import gc
from helpers import save_model, lgb_trainer, lgb_trainer_no_aug, test
import multiprocessing

In [2]:
def load_dataframe(dataset):
    return pd.read_csv(dataset)
with multiprocessing.Pool() as pool:
    train_df, test_df = pool.map(load_dataframe, ['data/train.csv', 'data/test.csv'])

### Helpers

In [3]:
def save_model(models, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(models, handle)
        
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)
    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

def lgb_trainer(X, y, params, n_folds):
    skf = StratifiedKFold(n_splits=n_folds)
    models = []
    for train_idx, test_idx in skf.split(X.values, y.values):
            gc.collect()
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_tr, y_tr = augment(X_train.values, y_train.values)
            X_tr = pd.DataFrame(X_tr)
            trn_data = lgb.Dataset(X_tr, label=y_tr)
            test_data = lgb.Dataset(X.values[test_idx], label=y.values[test_idx])
            model_lgb     = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, test_data], verbose_eval=5000, early_stopping_rounds = 4000)
            models.append(model_lgb)
            auc = roc_auc_score(y.values[test_idx], model_lgb.predict(X.values[test_idx]))
    return models

def lgb_trainer_no_aug(X, y, params, n_folds):
    skf = StratifiedKFold(n_splits=n_folds)
    models = []
    for train_idx, test_idx in skf.split(X.values, y.values):
            gc.collect()
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_tr, y_tr = X_train.values, y_train.values
            X_tr = pd.DataFrame(X_tr)
            trn_data = lgb.Dataset(X_tr, label=y_tr)
            test_data = lgb.Dataset(X.values[test_idx], label=y.values[test_idx])
            model_lgb     = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, test_data], verbose_eval=5000, early_stopping_rounds = 4000)
            models.append(model_lgb)
            auc = roc_auc_score(y.values[test_idx], model_lgb.predict(X.values[test_idx]))
    return models

def test(X, y, models):
    preds = pd.DataFrame({})
    for i, model in enumerate(models):
        preds[str(i)] = model.predict(X)
        print(f"Fold: {i} \t Score: {roc_auc_score(y, preds[str(i)].values)}")
    averaged_preds = preds.mean(axis=1)
    print(f"Score: {roc_auc_score(y, averaged_preds)}")
    return averaged_preds, preds

# Augment rank

In [4]:
whole = train_df.append(test_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [5]:
def augment_ranks(df):
    for column in df.columns:
        if column in ['ID_code', 'target']:
            continue
        key = column + '_rank'
        df[key] = np.argsort(df[column])
    return df   

In [6]:
whole = augment_ranks(whole)

In [7]:
whole.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_10,var_100,var_101,var_102,var_103,var_104,...,var_90_rank,var_91_rank,var_92_rank,var_93_rank,var_94_rank,var_95_rank,var_96_rank,var_97_rank,var_98_rank,var_99_rank
0,train_0,0.0,8.9255,-6.7863,2.9252,9.4763,13.3102,26.5376,1.4403,14.71,...,26612,107906,165151,144266,371497,83672,85889,367237,154549,220487
1,train_1,0.0,11.5006,-4.1473,-0.4032,-13.695,8.4068,35.4734,1.7093,15.1866,...,6534,59339,145133,102186,225371,91243,130521,361110,260836,238168
2,train_2,0.0,8.6093,-2.7457,-0.3249,-0.3939,12.6317,14.8863,1.3854,15.0284,...,61117,155651,138496,51928,241797,191880,198398,143353,378978,355199
3,train_3,0.0,11.0604,-2.1518,2.3061,-19.8592,22.5316,18.6129,1.3512,9.3291,...,55152,92527,169836,390680,132410,232483,131761,198207,293197,72482
4,train_4,0.0,9.8369,-1.4834,-9.4458,-22.9264,12.3562,17.341,1.694,7.1179,...,148063,86626,192857,208448,364423,129932,41602,148130,295811,92898


In [8]:
tdf = whole[200000:]

In [9]:
tdf, testdf = whole[:200000], whole[200000:]

In [10]:
y_train = tdf.target
tdf = tdf.drop(['target', 'ID_code'], axis=1)

In [11]:
Xtr, Xval, ytr, yval  = train_test_split(tdf, y_train, test_size = 0.2, random_state=42)

In [16]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.001,
    'max_depth': 4,
    'metric':'auc',
    'min_data_in_leaf': 100,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 10,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
    'seed':42,
}

In [17]:
rank_models = lgb_trainer_no_aug(Xtr, ytr, param, n_folds = 2)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.906994	valid_1's auc: 0.876464
[10000]	training's auc: 0.914307	valid_1's auc: 0.881109


KeyboardInterrupt: 

In [17]:
q, w = test(Xval, yval, rank_models)

Fold: 0 	 Score: 0.8979343920178489
Fold: 1 	 Score: 0.8989173868095288
Score: 0.9009706772952017


In [17]:
q, w = test(Xval, yval, rank_models)

Fold: 0 	 Score: 0.8997925361059268
Fold: 1 	 Score: 0.9014075481477227
Fold: 2 	 Score: 0.9010311820861489
Fold: 3 	 Score: 0.8997016035382924
Fold: 4 	 Score: 0.9013254155059885
Fold: 5 	 Score: 0.9010606279920454
Fold: 6 	 Score: 0.9004653919785967
Fold: 7 	 Score: 0.9014594451466209
Fold: 8 	 Score: 0.9017516478143297
Fold: 9 	 Score: 0.9008035122083742
Score: 0.9019027132089482


In [19]:
rank_models = lgb_trainer(Xtr, ytr, param, n_folds = 10)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.912828	valid_1's auc: 0.900054
[10000]	training's auc: 0.925068	valid_1's auc: 0.902264
[15000]	training's auc: 0.934735	valid_1's auc: 0.90222
Early stopping, best iteration is:
[11294]	training's auc: 0.927695	valid_1's auc: 0.902475
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.912384	valid_1's auc: 0.899655
[10000]	training's auc: 0.924822	valid_1's auc: 0.902414
[15000]	training's auc: 0.934502	valid_1's auc: 0.902163
Early stopping, best iteration is:
[11307]	training's auc: 0.927468	valid_1's auc: 0.902561
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.913259	valid_1's auc: 0.894817
[10000]	training's auc: 0.925386	valid_1's auc: 0.897191
Early stopping, best iteration is:
[10958]	training's auc: 0.927331	valid_1's auc: 0.897374
Training until validation scores don't improve for 4000 rounds.
[5000]	training'

In [20]:
q, w = test(Xval, yval, rank_models)

Fold: 0 	 Score: 0.9012618980921195
Fold: 1 	 Score: 0.9028719460731515
Fold: 2 	 Score: 0.9033342806415959
Fold: 3 	 Score: 0.9019429897928757
Fold: 4 	 Score: 0.9008802295110931
Fold: 5 	 Score: 0.9020246711563589
Fold: 6 	 Score: 0.9020896552245443
Fold: 7 	 Score: 0.9018609699707041
Fold: 8 	 Score: 0.9025413847540833
Fold: 9 	 Score: 0.9009602185811338
Score: 0.9028675461102015


In [32]:
Xval.shape

(10000, 400)

In [34]:
test_df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_10,var_100,var_101,var_102,var_103,var_104,...,var_90_rank,var_91_rank,var_92_rank,var_93_rank,var_94_rank,var_95_rank,var_96_rank,var_97_rank,var_98_rank,var_99_rank
0,test_0,,11.0656,7.7798,-2.0248,-9.2198,17.3089,30.9548,1.4918,12.8721,...,283352,7357,136022,153965,202685,380958,384304,14213,247572,368780
1,test_1,,8.5304,1.2543,-1.3809,-1.7257,15.4712,35.602,1.657,13.0783,...,57818,384042,254703,226045,324501,41135,14516,332780,96953,387184
2,test_2,,5.4827,-10.3581,-4.7057,-3.5065,14.1663,28.0256,1.3935,10.8257,...,79010,34644,232915,316034,380224,127703,379528,307586,137103,347787
3,test_3,,8.5374,-1.3222,0.0095,1.7021,2.5363,3.8763,1.5173,13.4083,...,83965,160008,288365,346486,334643,12387,278900,27442,311438,175989
4,test_4,,11.7058,-0.1327,5.1025,-14.3858,17.863,23.2274,1.4375,14.4838,...,193746,55131,355825,374237,91741,44310,272654,17443,29860,365488


In [21]:
save_model(rank_models, 'rank_aug_models.m')

In [35]:
def make_answer(models, answer_qdf):
    test_labels = answer_qdf.ID_code
    answer_qdf = answer_qdf.drop(['ID_code','target'], axis=1)
    y_preds = {}
    for i, model in enumerate(models):
            print(f"On fold: {i}")
            y_preds[str(i)] = model.predict(answer_qdf)
    y_preds = pd.DataFrame(y_preds)
    answer_df = pd.DataFrame({
        'ID_code' : test_labels,
        'target' : y_preds.mean(axis = 1),
        })
    return answer_df 

In [36]:
answer = make_answer(rank_models, test_df)

On fold: 0
On fold: 1
On fold: 2
On fold: 3
On fold: 4
On fold: 5
On fold: 6
On fold: 7
On fold: 8
On fold: 9


In [37]:
answer.head()

Unnamed: 0,ID_code,target
0,test_0,0.157681
1,test_1,0.266446
2,test_2,0.248564
3,test_3,0.283618
4,test_4,0.060358


In [38]:
!ls *.csv

answer_10_gpu_lgbm.csv
answer_11_gpu_lgbm_and_pred_best.csv
answer_12_lgbm_bin_trainednotonval.csv
answer_13_lgbm_bin_trainednotonval.csv
answer_14_my_scores_blended.csv
answer_15_probability_no_aug.csv
answer_16_probability_lolThiswontWork.csv
answer_17_kendall_correlation_inverse_weighted_blend.csv
answer_18_blend_average.csv
answer_19_kendall_correlation_inverse_weighted_blend_with_larger_difference.csv
answer_1_simple_gbm.csv
answer_2_lgbm_with_CV10.csv
answer_3_lgbm_with_augmented_data_smote_and_shiz.csv
answer_4_lgbm_with_augmented_data_smote_and_shiz_and_super_learner.csv
answer_5_lgbm_with_only_random_augmented_data.csv
answer_6_lgbm_with_best_and_random_averaged.csv
answer_7_lgbm_ORbwRandomAndBest.csv
answer_8_lgbm_AveragedORbwRandomAndBest.csv
answer_9_lb_aug.csv
init3.csv
paugTest.csv
paugTrain.csv
test_augmented.csv
train_augmented.csv


In [39]:
answer.to_csv('answer_20_lgbm_rank_augmented.csv', index=False)

In [40]:
!ls

 adasyn_aug_lgbm_models_cv10.m
 answer_10_gpu_lgbm.csv
 answer_11_gpu_lgbm_and_pred_best.csv
 answer_12_lgbm_bin_trainednotonval.csv
 answer_13_lgbm_bin_trainednotonval.csv
 answer_14_my_scores_blended.csv
 answer_15_probability_no_aug.csv
 answer_16_probability_lolThiswontWork.csv
 answer_17_kendall_correlation_inverse_weighted_blend.csv
 answer_18_blend_average.csv
 answer_19_kendall_correlation_inverse_weighted_blend_with_larger_difference.csv
 answer_1_simple_gbm.csv
 answer_20_lgbm_rank_augmented.csv
 answer_2_lgbm_with_CV10.csv
 answer_3_lgbm_with_augmented_data_smote_and_shiz.csv
 answer_4_lgbm_with_augmented_data_smote_and_shiz_and_super_learner.csv
 answer_5_lgbm_with_only_random_augmented_data.csv
 answer_6_lgbm_with_best_and_random_averaged.csv
 answer_7_lgbm_ORbwRandomAndBest.csv
 answer_8_lgbm_AveragedORbwRandomAndBest.csv
 answer_9_lb_aug.csv
'argsort feature augment.ipynb'
'Augmented Data - Answer generate.ipynb'
'Augmented LightGBM model .ipynb'


In [41]:
x = pd.read_csv('stacking/2019-03-20_15_43_sub.csv')
x.head()

Unnamed: 0,ID_code,target
0,test_0,0.139434
1,test_1,0.270568
2,test_2,0.235634
3,test_3,0.296368
4,test_4,0.060134
