In [8]:
import gc
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import StratifiedKFold


In [28]:
def build_model_input(data_dir='.'):
    """Load the preprocessed model inputs from CSV files.

    The files are the cleaned dataset written by a separate preprocessing
    notebook and must be named ``data.csv``, ``test.csv``, ``target.csv``
    and ``skids.csv``.

    Parameters
    ----------
    data_dir : str or path-like, default '.'
        Directory that contains the four CSV files. The default reads from
        the current working directory, as the function always did.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data, test, y, ids)``, each exactly as parsed by ``pd.read_csv``.
    """
    base_dir = Path(data_dir)

    data = pd.read_csv(base_dir / 'data.csv')
    test = pd.read_csv(base_dir / 'test.csv')
    y = pd.read_csv(base_dir / 'target.csv')
    ids = pd.read_csv(base_dir / 'skids.csv')

    # Quick sanity check: data, y and ids should share the same row count.
    print('Shapes : ', data.shape, test.shape, y.shape, ids.shape)

    return data, test, y, ids

In [29]:
def train_model(data_, test_, y_, folds_, ids_=None):
    """Train one LightGBM classifier per CV fold and collect its predictions.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Training rows. Every column except ``SK_ID_CURR`` is used as a feature.
    test_ : pandas.DataFrame
        Test rows. Must contain ``SK_ID_CURR`` and every feature column of
        ``data_``. The frame is not modified.
    y_ : array-like
        Target labels, one per row of ``data_``. A single-column DataFrame is
        accepted and flattened to 1-D.
    folds_ : cross-validator
        Object exposing ``split(X, y)`` and ``n_splits``.
    ids_ : array-like, optional
        Row identifiers for ``data_`` written to the out-of-fold frame. When
        omitted, ``data_['SK_ID_CURR']`` is used if present; otherwise a
        notebook-level ``ids`` variable (legacy behaviour); otherwise the
        index of ``data_``.

    Returns
    -------
    tuple
        ``(oof_preds, df_oof_preds, test_preds, feature_importance_df, auc)``:
        out-of-fold probabilities, the same as a frame with columns
        ``SK_ID_CURR``/``TARGET``/``PREDICTION``, fold-averaged test
        predictions with columns ``SK_ID_CURR``/``TARGET``, per-fold feature
        importances, and the AUC over all out-of-fold predictions.

    Raises
    ------
    ValueError
        If ``y_`` or the resolved ids do not have one entry per training row.
    """
    n_train = data_.shape[0]

    # Flatten the target once: an (n, 1) frame makes scikit-learn emit a
    # DataConversionWarning on every fit and is awkward to index by position.
    y_arr = np.asarray(y_).ravel()
    if y_arr.shape[0] != n_train:
        raise ValueError('y_ has %d entries but data_ has %d rows'
                         % (y_arr.shape[0], n_train))

    if ids_ is None:
        if 'SK_ID_CURR' in data_.columns:
            ids_ = data_['SK_ID_CURR']
        else:
            # Legacy fallback: this function used to read a notebook-level
            # `ids` variable directly. Prefer passing `ids_` explicitly.
            ids_ = globals().get('ids', data_.index)
    id_arr = np.asarray(ids_).ravel()
    if id_arr.shape[0] != n_train:
        raise ValueError('ids_ has %d entries but data_ has %d rows'
                         % (id_arr.shape[0], n_train))

    oof_preds = np.zeros(n_train)
    sub_preds = np.zeros(test_.shape[0])
    fold_importances = []

    feats = [f for f in data_.columns if f not in ['SK_ID_CURR']]

    # LightGBM parameters found by Bayesian optimization
    lgbm_params = dict(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.03,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1,
    )

    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_, y_arr)):
        trn_x, trn_y = data_[feats].iloc[trn_idx], y_arr[trn_idx]
        val_x, val_y = data_[feats].iloc[val_idx], y_arr[val_idx]

        clf = LGBMClassifier(**lgbm_params)

        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords
        # match the LightGBM version this notebook was run with; newer
        # releases expect callbacks instead - confirm before upgrading.
        clf.fit(
            trn_x,
            trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric='auc',
            verbose=100,
            early_stopping_rounds=100  #30
        )

        oof_preds[val_idx] = clf.predict_proba(
            val_x, num_iteration=clf.best_iteration_)[:, 1]
        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += clf.predict_proba(
            test_[feats],
            num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits

        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importances_,
            "fold": n_fold + 1,
        }))

        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    full_auc = roc_auc_score(y_arr, oof_preds)
    print('Full AUC score %.6f' % full_auc)

    # Concatenate once instead of growing a frame inside the loop.
    if fold_importances:
        feature_importance_df = pd.concat(fold_importances, axis=0)
    else:
        feature_importance_df = pd.DataFrame(
            columns=["feature", "importance", "fold"])

    # Build the submission on a new frame so the caller's `test_` is untouched.
    test_preds = test_[['SK_ID_CURR']].assign(TARGET=sub_preds)

    df_oof_preds = pd.DataFrame(
        {'SK_ID_CURR': id_arr, 'TARGET': y_arr, 'PREDICTION': oof_preds},
        columns=['SK_ID_CURR', 'TARGET', 'PREDICTION'])

    return oof_preds, df_oof_preds, test_preds, feature_importance_df, full_auc

In [30]:
def display_importances(feature_importance_df_, top_n=50,
                        out_file='lgbm_importances-01.png'):
    """Plot the features with the highest mean importance and save the figure.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        One row per (feature, fold) with at least the columns ``feature``
        and ``importance``.
    top_n : int, default 50
        Number of features to show, ranked by importance averaged over folds.
    out_file : str, default 'lgbm_importances-01.png'
        Path the figure is written to.
    """
    mean_importance = (
        feature_importance_df_
        .groupby("feature")["importance"]
        .mean()
        .sort_values(ascending=False)
    )
    top_features = list(mean_importance.head(top_n).index)

    best_features = feature_importance_df_.loc[
        feature_importance_df_["feature"].isin(top_features)]

    plt.figure(figsize=(8, 10))
    # Pass the order explicitly so bars are ranked by the same mean used to
    # select them; sorting raw rows would rank by the largest single-fold
    # value instead.
    sns.barplot(
        x="importance",
        y="feature",
        data=best_features,
        order=top_features)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig(out_file)

In [31]:
def display_roc_curve(y_, oof_preds_, folds_idx_):
    """Plot per-fold and overall ROC curves and save them to 'roc_curve-01.png'.

    Parameters
    ----------
    y_ : pandas.Series or pandas.DataFrame
        True labels; indexed positionally with ``.iloc``.
    oof_preds_ : numpy.ndarray
        Out-of-fold predicted scores aligned with ``y_``.
    folds_idx_ : iterable of (train_idx, val_idx)
        Fold index pairs; only the validation indices are used.
    """
    plt.figure(figsize=(6, 6))

    fold_scores = []
    for n_fold, (_, val_idx) in enumerate(folds_idx_):
        fold_y, fold_pred = y_.iloc[val_idx], oof_preds_[val_idx]
        fpr, tpr, _ = roc_curve(fold_y, fold_pred)
        score = roc_auc_score(fold_y, fold_pred)
        fold_scores.append(score)
        plt.plot(
            fpr,
            tpr,
            lw=1,
            alpha=0.3,
            label='ROC fold %d (AUC = %0.4f)' % (n_fold + 1, score))

    # Diagonal reference line (labelled 'Luck').
    plt.plot(
        [0, 1], [0, 1],
        linestyle='--',
        lw=2,
        color='r',
        label='Luck',
        alpha=.8)

    # Curve over all out-of-fold predictions; the +/- term is the standard
    # deviation of the per-fold AUCs.
    fpr, tpr, _ = roc_curve(y_, oof_preds_)
    score = roc_auc_score(y_, oof_preds_)
    plt.plot(
        fpr,
        tpr,
        color='b',
        # Raw string: '\p' is not a valid escape sequence in a normal literal.
        label=r'Avg ROC (AUC = %0.4f $\pm$ %0.4f)' % (score, np.std(fold_scores)),
        lw=2,
        alpha=.8)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('LightGBM ROC Curve')
    plt.legend(loc="lower right")
    plt.tight_layout()

    plt.savefig('roc_curve-01.png')

In [32]:
def display_precision_recall(y_, oof_preds_, folds_idx_):
    """Plot per-fold and overall precision-recall curves and save the figure.

    Recall is on the x-axis and precision on the y-axis. The figure is
    written to 'recall_precision_curve-01.png'.

    Parameters
    ----------
    y_ : pandas.Series or pandas.DataFrame
        True labels; indexed positionally with ``.iloc``.
    oof_preds_ : numpy.ndarray
        Out-of-fold predicted scores aligned with ``y_``.
    folds_idx_ : iterable of (train_idx, val_idx)
        Fold index pairs; only the validation indices are used.
    """
    plt.figure(figsize=(6, 6))

    fold_scores = []
    for n_fold, (_, val_idx) in enumerate(folds_idx_):
        fold_y, fold_pred = y_.iloc[val_idx], oof_preds_[val_idx]
        # Use the precision-recall curve here (not the ROC curve) so the fold
        # lines match the axes and the average-precision score in the legend.
        precision, recall, _ = precision_recall_curve(fold_y, fold_pred)
        score = average_precision_score(fold_y, fold_pred)
        fold_scores.append(score)
        plt.plot(recall, precision, lw=1, alpha=0.3,
                 label='PR fold %d (AP = %0.4f)' % (n_fold + 1, score))

    # Curve over all out-of-fold predictions; the +/- term is the standard
    # deviation of the per-fold average-precision scores.
    precision, recall, _ = precision_recall_curve(y_, oof_preds_)
    score = average_precision_score(y_, oof_preds_)
    # x = recall, y = precision, consistent with the axis labels below.
    plt.plot(recall, precision, color='b',
             label=r'Avg PR (AP = %0.4f $\pm$ %0.4f)' % (score, np.std(fold_scores)),
             lw=2, alpha=.8)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('LightGBM Recall / Precision')
    plt.legend(loc="best")
    plt.tight_layout()

    plt.savefig('recall_precision_curve-01.png')

In [33]:
if __name__ == '__main__':
    gc.enable()

    # Load the preprocessed inputs.
    data, test, y, ids = build_model_input()

    # Seeded, shuffled stratified splitter; the fixed random_state makes the
    # second split() call below yield the same folds used for training.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)

    # Fit one model per fold; collect out-of-fold and test predictions.
    (oof_preds, df_oof_preds, test_preds,
     importances, score) = train_model(data, test, y, folds)

    # Timestamp and compact score string; neither is used in the file names
    # below.
    now = datetime.now()
    score = str(round(score, 6)).replace('.', '')

    # Write the test-set submission and the out-of-fold predictions.
    sub_file = 'submission_average-LGB.csv'
    oof_file = 'train_LGB.csv'
    test_preds.to_csv(sub_file, index=False)
    df_oof_preds.to_csv(oof_file, index=False)

    # Rebuild the fold indices for the diagnostic plots.
    folds_idx = list(folds.split(data, y))
    display_importances(importances)
    display_roc_curve(y, oof_preds, folds_idx)
    display_precision_recall(y, oof_preds, folds_idx)

Shapes :  (307511, 380) (48744, 381) (307511, 1) (307511, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's binary_logloss: 0.240724	training's auc: 0.782792	valid_1's binary_logloss: 0.24601	valid_1's auc: 0.7636
[200]	training's binary_logloss: 0.231756	training's auc: 0.804278	valid_1's binary_logloss: 0.241362	valid_1's auc: 0.775064
[300]	training's binary_logloss: 0.226364	training's auc: 0.817779	valid_1's binary_logloss: 0.239861	valid_1's auc: 0.779072
[400]	training's binary_logloss: 0.222104	training's auc: 0.828676	valid_1's binary_logloss: 0.239086	valid_1's auc: 0.781321
[500]	training's binary_logloss: 0.218469	training's auc: 0.83778	valid_1's binary_logloss: 0.238682	valid_1's auc: 0.782572
[600]	training's binary_logloss: 0.215028	training's auc: 0.846512	valid_1's binary_logloss: 0.238486	valid_1's auc: 0.783087
[700]	training's binary_logloss: 0.211859	training's auc: 0.854156	valid_1's binary_logloss: 0.238349	valid_1's auc: 0.783454
[800]	training's binary_logloss: 0.20888	training's auc: 0.

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's binary_logloss: 0.240624	training's auc: 0.782689	valid_1's binary_logloss: 0.2466	valid_1's auc: 0.76116
[200]	training's binary_logloss: 0.231639	training's auc: 0.804353	valid_1's binary_logloss: 0.241863	valid_1's auc: 0.77344
[300]	training's binary_logloss: 0.226157	training's auc: 0.818032	valid_1's binary_logloss: 0.240297	valid_1's auc: 0.777886
[400]	training's binary_logloss: 0.22174	training's auc: 0.829292	valid_1's binary_logloss: 0.239694	valid_1's auc: 0.779502
[500]	training's binary_logloss: 0.217937	training's auc: 0.838916	valid_1's binary_logloss: 0.239351	valid_1's auc: 0.780334
[600]	training's binary_logloss: 0.214583	training's auc: 0.847301	valid_1's binary_logloss: 0.239348	valid_1's auc: 0.780253
Early stopping, best iteration is:
[574]	training's binary_logloss: 0.21543	training's auc: 0.845218	valid_1's binary_logloss: 0.239302	valid_1's auc: 0.780451
Fold  2 AUC : 0.780451


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's binary_logloss: 0.240832	training's auc: 0.782249	valid_1's binary_logloss: 0.245691	valid_1's auc: 0.764165
[200]	training's binary_logloss: 0.232226	training's auc: 0.802954	valid_1's binary_logloss: 0.240735	valid_1's auc: 0.77706
[300]	training's binary_logloss: 0.226929	training's auc: 0.816545	valid_1's binary_logloss: 0.238835	valid_1's auc: 0.782194
[400]	training's binary_logloss: 0.222753	training's auc: 0.827407	valid_1's binary_logloss: 0.237756	valid_1's auc: 0.785191
[500]	training's binary_logloss: 0.219279	training's auc: 0.836324	valid_1's binary_logloss: 0.237236	valid_1's auc: 0.786591
[600]	training's binary_logloss: 0.215892	training's auc: 0.844983	valid_1's binary_logloss: 0.23693	valid_1's auc: 0.787385
[700]	training's binary_logloss: 0.212665	training's auc: 0.85288	valid_1's binary_logloss: 0.236745	valid_1's auc: 0.78794
[800]	training's binary_logloss: 0.209832	training's auc: 0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's binary_logloss: 0.240392	training's auc: 0.78354	valid_1's binary_logloss: 0.247374	valid_1's auc: 0.758824
[200]	training's binary_logloss: 0.231439	training's auc: 0.805312	valid_1's binary_logloss: 0.242919	valid_1's auc: 0.770301
[300]	training's binary_logloss: 0.226054	training's auc: 0.818744	valid_1's binary_logloss: 0.241391	valid_1's auc: 0.774523
[400]	training's binary_logloss: 0.221849	training's auc: 0.829285	valid_1's binary_logloss: 0.240748	valid_1's auc: 0.77627
[500]	training's binary_logloss: 0.21825	training's auc: 0.838441	valid_1's binary_logloss: 0.240415	valid_1's auc: 0.777055
[600]	training's binary_logloss: 0.214799	training's auc: 0.846921	valid_1's binary_logloss: 0.240226	valid_1's auc: 0.777498
[700]	training's binary_logloss: 0.211647	training's auc: 0.854564	valid_1's binary_logloss: 0.240124	valid_1's auc: 0.77784
[800]	training's binary_logloss: 0.20863	training's auc: 0.

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's binary_logloss: 0.241135	training's auc: 0.781407	valid_1's binary_logloss: 0.244608	valid_1's auc: 0.768551
[200]	training's binary_logloss: 0.232187	training's auc: 0.803322	valid_1's binary_logloss: 0.239838	valid_1's auc: 0.779493
[300]	training's binary_logloss: 0.226711	training's auc: 0.817057	valid_1's binary_logloss: 0.238203	valid_1's auc: 0.7835
[400]	training's binary_logloss: 0.222529	training's auc: 0.827837	valid_1's binary_logloss: 0.237493	valid_1's auc: 0.785313
[500]	training's binary_logloss: 0.218777	training's auc: 0.837338	valid_1's binary_logloss: 0.237088	valid_1's auc: 0.786291
[600]	training's binary_logloss: 0.215469	training's auc: 0.845585	valid_1's binary_logloss: 0.236809	valid_1's auc: 0.787119
[700]	training's binary_logloss: 0.212385	training's auc: 0.853075	valid_1's binary_logloss: 0.236721	valid_1's auc: 0.787302
[800]	training's binary_logloss: 0.209456	training's auc:

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [9]:
# Per-fold validation AUC from the out-of-fold predictions.
# Relies on `folds_idx`, `y` and `oof_preds` created by the training cell.
# Only the score is needed here, so the ROC curve itself is not computed.
roc_arr = [
    roc_auc_score(y.iloc[val_idx], oof_preds[val_idx])
    for _, val_idx in folds_idx
]
        

In [10]:
# Show the list of per-fold validation AUC scores.
roc_arr

[0.7832825854418263,
 0.7809480948239104,
 0.7879534405019654,
 0.7781764674564283,
 0.7870822948142525]