In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
import scikitplot as skplt
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import pickle
import gc

# Build probability-encoded DataFrames

In [None]:
def augment_probs(mydf):
    info = {}
    for var in mydf.columns:
        if var in ['ID_code', 'target']:
            continue
        IQR = mydf[var].quantile([0.75]).values - mydf[var].quantile([0.25]).values 
        n = mydf.shape[0] 
        bin_size = 2.5 * IQR / (n ** (1/3)) 
        bin_number = int((mydf[var].max() - mydf[var].min())/bin_size)
        key = 'prob_' + var
        mydf[key] = pd.cut(mydf[var], bins = bin_number, labels = range(bin_number)).astype('float')
        # df_map1 = mydf[mydf['target'] == 0].groupby(key)['target'].count()
        df_map2 = mydf[mydf['target'] == 1].groupby(key)['target'].count()
        df_map3 = mydf.groupby(key)['target'].count()
        df_map_prob = (df_map2 / df_map3).fillna(0)
        mydf[key] = mydf[key].map(df_map_prob)
        info[var] = {
            'bin' : bin_number,
            'map' : df_map_prob,
        }
    return mydf, info

def test_augment_probs(mydf, info):
    for var in mydf.columns:
        if var in ['ID_code', 'target']:
            continue
        bin_number = info[var]['bin']
        pmap = info[var]['map']
        key = 'prob_' + var
        mydf[key] = pd.cut(mydf[var], bins = bin_number, labels = range(bin_number)).astype('float')
        mydf[key] = mydf[key].map(pmap)
    return mydf

In [None]:
# Fit the per-bin positive-rate encoding and save the augmented training frame.
# augment_probs adds the prob_* columns to `mydf` in place and returns it, so
# `aug_df` and `mydf` refer to the same object afterwards.
# NOTE(review): `mydf` is not defined in any visible cell — presumably the raw
# training CSV; load it explicitly here so Restart & Run All works.
aug_df, info = augment_probs(mydf)
aug_df.to_csv('paugTrain.csv', index=False)

In [None]:
# Apply the bin -> positive-rate maps in `info` to the raw test set and save it.
# NOTE(review): the output name 'paugTest' has no '.csv' extension, unlike
# 'paugTrain.csv' — confirm this is intended before anything reads it back.
testmydf = pd.read_csv('data/test.csv')
testdf = test_augment_probs(testmydf, info)
testdf.to_csv('paugTest', index=False)

# Train on the probability-encoded DataFrames

In [6]:
# Read the augmented training table and separate the label column from the
# feature matrix (the ID column is not a feature).
df_train = pd.read_csv('paugTrain.csv')
y_train = df_train['target']
X_train = df_train.drop(columns=['ID_code', 'target'])

In [7]:
# Hold out 5% of the rows for validation. The seed is fixed so the split —
# and therefore every hold-out score reported below — is reproducible.
Xtr, Xval, ytr, yval = train_test_split(X_train, y_train, test_size=0.05, random_state=42)

In [8]:
# Sanity check: (rows, columns) of the training part of the split.
Xtr.shape

(190000, 400)

### Functions

In [11]:
def save_model(models, filename):
    """Pickle ``models`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with open(filename, 'wb') as out_file:
        pickle.dump(models, out_file)
        
def _shuffle_columns_independently(rows):
    """Return a copy of ``rows`` in which every column is permuted on its own."""
    block = rows.copy()
    ids = np.arange(block.shape[0])
    for c in range(block.shape[1]):
        # ``ids`` is re-shuffled (not reset) for every column, so each column
        # gets a different permutation.
        np.random.shuffle(ids)
        # Index only the column being replaced; indexing the whole matrix
        # first would copy every column on every iteration.
        block[:, c] = block[ids, c]
    return block


def augment(x, y, t=2):
    """Oversample by appending column-shuffled copies of each class.

    ``t`` shuffled copies of the rows with ``y > 0`` and ``t // 2`` shuffled
    copies of the rows with ``y == 0`` are appended to the data. Within a
    copy, every column is permuted independently among the rows of that
    class, so the new rows mix feature values from different original rows.

    Parameters
    ----------
    x : numpy.ndarray, 2-D
        Feature matrix (rows are samples). Not modified.
    y : numpy.ndarray, 1-D
        Labels aligned with the rows of ``x``.
    t : int, default 2
        Number of positive copies; ``t // 2`` negative copies are added.
        Values below 2 (no negative copies) and 0 (no copies at all) are
        accepted.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``x`` followed by the positive then the negative copies, and the
        matching labels (``y`` followed by ones and zeros).

    Notes
    -----
    Uses the global NumPy random state (``np.random.shuffle``); seed it with
    ``np.random.seed`` for reproducible output.
    """
    positive_rows = x[y > 0]
    negative_rows = x[y == 0]

    # Positives first, then negatives: keeps the order of random draws stable.
    pos_copies = [_shuffle_columns_independently(positive_rows) for _ in range(t)]
    neg_copies = [_shuffle_columns_independently(negative_rows) for _ in range(t // 2)]

    # np.vstack rejects an empty list, which happens for t < 2 (and t == 0).
    empty = np.empty((0, x.shape[1]), dtype=x.dtype)
    xs = np.vstack(pos_copies) if pos_copies else empty
    xn = np.vstack(neg_copies) if neg_copies else empty

    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x, xs, xn])
    y = np.concatenate([y, ys, yn])
    return x, y

def lgb_trainer(X, y, params, n_folds):
    """Train one LightGBM model per stratified fold on shuffle-augmented data.

    For every fold, the training part is oversampled with ``augment`` and a
    booster is trained with the fold's held-out part as validation set
    (up to 1,000,000 rounds, early stopping after 4000 rounds, progress
    logged every 5000 rounds).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary labels aligned with ``X``.
    params : dict
        LightGBM parameters passed to ``lgb.train``.
    n_folds : int
        Number of stratified folds.

    Returns
    -------
    list
        The trained boosters, one per fold, in fold order.
    """
    skf = StratifiedKFold(n_splits=n_folds)
    models = []
    for train_idx, test_idx in skf.split(X.values, y.values):
        gc.collect()
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        # Only the training part is augmented; the validation part stays as-is.
        X_tr, y_tr = augment(X_train.values, y_train.values)
        X_tr = pd.DataFrame(X_tr)
        trn_data = lgb.Dataset(X_tr, label=y_tr)
        test_data = lgb.Dataset(X.values[test_idx], label=y.values[test_idx])
        # Use the ``params`` argument, not the notebook-level ``param`` global.
        model_lgb = lgb.train(
            params,
            trn_data,
            1000000,
            valid_sets=[trn_data, test_data],
            verbose_eval=5000,
            early_stopping_rounds=4000,
        )
        models.append(model_lgb)
    return models

def lgb_trainer_no_aug(X, y, params, n_folds):
    """Train one LightGBM model per stratified fold, without augmentation.

    For every fold, a booster is trained on the fold's training part with the
    held-out part as validation set (up to 1,000,000 rounds, early stopping
    after 4000 rounds, progress logged every 5000 rounds).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary labels aligned with ``X``.
    params : dict
        LightGBM parameters passed to ``lgb.train``.
    n_folds : int
        Number of stratified folds.

    Returns
    -------
    list
        The trained boosters, one per fold, in fold order.
    """
    skf = StratifiedKFold(n_splits=n_folds)
    models = []
    for train_idx, test_idx in skf.split(X.values, y.values):
        gc.collect()
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        # Rebuilt from the raw values, so the training frame has positional
        # column labels.
        X_tr = pd.DataFrame(X_train.values)
        y_tr = y_train.values
        trn_data = lgb.Dataset(X_tr, label=y_tr)
        test_data = lgb.Dataset(X.values[test_idx], label=y.values[test_idx])
        # Use the ``params`` argument, not the notebook-level ``param`` global.
        model_lgb = lgb.train(
            params,
            trn_data,
            1000000,
            valid_sets=[trn_data, test_data],
            verbose_eval=5000,
            early_stopping_rounds=4000,
        )
        models.append(model_lgb)
    return models

def test(X, y, models):
    """Score every model, and their averaged prediction, by ROC AUC.

    Prints one line per model and one line for the row-wise mean of all
    model predictions.

    Returns
    -------
    (pandas.Series, pandas.DataFrame)
        The averaged predictions and the per-model predictions (one column
        per model, named ``'0'``, ``'1'``, ...).
    """
    per_model = {}
    for fold, model in enumerate(models):
        column = str(fold)
        per_model[column] = model.predict(X)
        fold_score = roc_auc_score(y, per_model[column])
        print(f"Fold: {fold} \t Score: {fold_score}")
    preds = pd.DataFrame(per_model)
    averaged_preds = preds.mean(axis=1)
    ensemble_score = roc_auc_score(y, averaged_preds)
    print(f"Score: {ensemble_score}")
    return averaged_preds, preds

### Train

In [13]:
# LightGBM parameters used for both training runs below
# (binary objective, AUC as the evaluation metric).
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
}

In [14]:
# 10-fold training where each fold's training part is shuffle-augmented.
first_models_v = lgb_trainer(Xtr, ytr, param, n_folds = 10)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.953788	valid_1's auc: 0.903553
[10000]	training's auc: 0.965129	valid_1's auc: 0.91489
[15000]	training's auc: 0.970238	valid_1's auc: 0.917575
[20000]	training's auc: 0.974018	valid_1's auc: 0.918367
[25000]	training's auc: 0.977409	valid_1's auc: 0.918371
Early stopping, best iteration is:
[21669]	training's auc: 0.97518	valid_1's auc: 0.91843
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.953962	valid_1's auc: 0.899077
[10000]	training's auc: 0.965241	valid_1's auc: 0.911142
[15000]	training's auc: 0.970298	valid_1's auc: 0.915007
[20000]	training's auc: 0.97407	valid_1's auc: 0.916235
[25000]	training's auc: 0.977441	valid_1's auc: 0.916481
Early stopping, best iteration is:
[24699]	training's auc: 0.977248	valid_1's auc: 0.916558
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.953783	valid_1's auc: 0.905273
[100

In [15]:
# Persist the fold models trained with augmentation.
save_model(first_models_v, "probs_model_augmented_on_val.m")

In [16]:
# Same 10-fold training without augmentation, for comparison.
first_models_no_aug = lgb_trainer_no_aug(Xtr, ytr, param, n_folds = 10)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.942318	valid_1's auc: 0.913375
[10000]	training's auc: 0.959361	valid_1's auc: 0.919047
[15000]	training's auc: 0.970405	valid_1's auc: 0.919307
Early stopping, best iteration is:
[12679]	training's auc: 0.965561	valid_1's auc: 0.919548
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.942287	valid_1's auc: 0.909312
[10000]	training's auc: 0.959412	valid_1's auc: 0.917216
[15000]	training's auc: 0.970512	valid_1's auc: 0.91774
Early stopping, best iteration is:
[13026]	training's auc: 0.966445	valid_1's auc: 0.917904
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.942102	valid_1's auc: 0.914823
[10000]	training's auc: 0.959238	valid_1's auc: 0.920966
[15000]	training's auc: 0.970352	valid_1's auc: 0.92161
Early stopping, best iteration is:
[13324]	training's auc: 0.966863	valid_1's auc: 0.921742
Training until validatio

In [17]:
# Persist the fold models trained without augmentation.
save_model(first_models_no_aug, "probs_model_no_augmented.m")

In [18]:
# Hold-out AUC of each augmented-training fold model and of their average.
# q = averaged predictions, w = per-model predictions.
q, w = test(Xval, yval, first_models_v)

Fold: 0 	 Score: 0.9238241618755103
Fold: 1 	 Score: 0.9234295391503152
Fold: 2 	 Score: 0.9234469537189564
Fold: 3 	 Score: 0.9233404238577011
Fold: 4 	 Score: 0.9236695807044859
Fold: 5 	 Score: 0.9242265244089904
Fold: 6 	 Score: 0.9239748731423924
Fold: 7 	 Score: 0.923447168713631
Fold: 8 	 Score: 0.9239153196175331
Fold: 9 	 Score: 0.9232223917813556
Score: 0.9244281894137482


In [19]:
# Hold-out AUC of each non-augmented fold model and of their average.
# Overwrites q and w from the previous evaluation.
q, w = test(Xval, yval, first_models_no_aug)

Fold: 0 	 Score: 0.9241793330779197
Fold: 1 	 Score: 0.9244944077735195
Fold: 2 	 Score: 0.9246285644504585
Fold: 3 	 Score: 0.9246700584226529
Fold: 4 	 Score: 0.9243303668368136
Fold: 5 	 Score: 0.9241090298193314
Fold: 6 	 Score: 0.9236623783828875
Fold: 7 	 Score: 0.9248365717981165
Fold: 8 	 Score: 0.9244765632155292
Fold: 9 	 Score: 0.9238271718009545
Score: 0.9252236697097013


# Make answer

In [None]:
def make_answer(models, test_path='paugTest'):
    """Build the submission frame by averaging the models' test predictions.

    Parameters
    ----------
    models : iterable
        Trained models exposing ``predict``.
    test_path : str, default 'paugTest'
        CSV with an ``ID_code`` column plus the feature columns the models
        were trained on. The default is the augmented test file written
        earlier in this notebook: the models here are trained on the columns
        of 'paugTrain.csv', so the raw 'data/test.csv' does not have the
        matching feature set. Pass another path for models trained on other
        features.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID_code`` and ``target`` (row-wise mean of all model
        predictions).
    """
    answer_qdf = pd.read_csv(test_path)
    test_labels = answer_qdf['ID_code']
    features = answer_qdf.drop(columns=['ID_code'])
    y_preds = {}
    for i, model in enumerate(models):
        print(f"On fold: {i}")
        y_preds[str(i)] = model.predict(features)
    y_preds = pd.DataFrame(y_preds)
    answer_df = pd.DataFrame({
        'ID_code': test_labels,
        'target': y_preds.mean(axis=1),
    })
    return answer_df