## 3. LightGBM_GSCV1


Reference:
- https://www.kaggle.com/ogrellier/good-fun-with-ligthgbm/code

## Run name

In [1]:
import time

# Every artefact of this run is tagged "<project>_<step>_<local timestamp>".
project_name = 'HomeCreditDefaultRisk'
step_name = 'LightGBM_GSCV1'
# strftime() formats the current local time when no time tuple is passed.
time_str = time.strftime("%Y%m%d_%H%M%S")
run_name = '_'.join([project_name, step_name, time_str])
print('run_name: ' + run_name)
# Wall-clock start, used at the end of the notebook to report elapsed time.
t0 = time.time()

run_name: HomeCreditDefaultRisk_LightGBM_GSCV1_20180603_204528


## Important params

## Import PKGs

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display
import seaborn as sns

import os
import sys
import gc
import math
import tqdm
import shutil
import zipfile
import pickle
import h5py
# import cv2
from PIL import Image

from tqdm import tqdm
import multiprocessing

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.utils import shuffle

# Number of logical CPUs reported by the OS, and a fresh seed for this run.
cpu_amount = multiprocessing.cpu_count()
random_num = np.random.randint(10000)

# NOTE: the value shown under "cpu_amount" is one less than the detected count.
print('cpu_amount: %s' % (cpu_amount - 1))
print('random_num: %s' % random_num)

  from ._conv import register_converters as _register_converters


cpu_amount: 3
random_num: 6612


In [3]:
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold

import xgboost
# from xgboost import plot_importance

## Project folders

In [4]:
# All project folders live directly under the current working directory.
cwd = os.getcwd()
feature_folder, input_folder, output_folder, model_folder = [
    os.path.join(cwd, sub_dir) for sub_dir in ('feature', 'input', 'output', 'model')
]

# Raw competition files, all expected inside the input folder.
application_test_csv_file = os.path.join(input_folder, 'application_test.csv')
application_train_csv_file = os.path.join(input_folder, 'application_train.csv')
bureau_csv_file = os.path.join(input_folder, 'bureau.csv')
bureau_balance_csv_file = os.path.join(input_folder, 'bureau_balance.csv')
credit_card_balance_csv_file = os.path.join(input_folder, 'credit_card_balance.csv')
installments_payments_csv_file = os.path.join(input_folder, 'installments_payments.csv')
POS_CASH_balance_csv_file = os.path.join(input_folder, 'POS_CASH_balance.csv')
previous_application_csv_file = os.path.join(input_folder, 'previous_application.csv')
sample_submission_csv_file = os.path.join(input_folder, 'sample_submission.csv')

# Echo the resolved paths so a wrong working directory is spotted early.
for csv_file in (
    application_test_csv_file,
    application_train_csv_file,
    bureau_csv_file,
    bureau_balance_csv_file,
    credit_card_balance_csv_file,
    installments_payments_csv_file,
    POS_CASH_balance_csv_file,
    previous_application_csv_file,
    sample_submission_csv_file,
):
    print(csv_file)

D:\bitbucket\kaggle\home-credit-default-risk\input\application_test.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\application_train.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\bureau.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\bureau_balance.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\credit_card_balance.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\installments_payments.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\POS_CASH_balance.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\previous_application.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\sample_submission.csv


## Load data

In [5]:
def build_model_input():
    """Build the train/test feature tables from the raw Home Credit CSV files.

    Each auxiliary table (bureau joined with bureau_balance,
    previous_application, POS_CASH_balance, credit_card_balance and
    installments_payments) is one-hot encoded where it has categorical
    columns, averaged per ``SK_ID_CURR`` and left-joined onto the application
    train/test tables.  File locations are read from the module-level
    ``*_csv_file`` variables.

    Returns:
        tuple: ``(data, test, y, id_test)`` where ``data`` is the training
        frame without ``TARGET``, ``test`` is the test frame, ``y`` is the
        ``TARGET`` column of the training file and ``id_test`` is the
        ``SK_ID_CURR`` column of the test file.
    """
    buro_bal = pd.read_csv(bureau_balance_csv_file)
    print('Buro bal shape : ', buro_bal.shape)

    print('transform to dummies')
    buro_bal = pd.concat([buro_bal, pd.get_dummies(buro_bal.STATUS, prefix='buro_bal_status')], axis=1).drop('STATUS', axis=1)

    print('Counting buros')
    buro_counts = buro_bal[['SK_ID_BUREAU', 'MONTHS_BALANCE']].groupby('SK_ID_BUREAU').count()
    buro_bal['buro_count'] = buro_bal['SK_ID_BUREAU'].map(buro_counts['MONTHS_BALANCE'])

    print('averaging buro bal')
    # numeric_only=True everywhere below: several frames still carry their raw
    # categorical (object) columns next to the dummies, and recent pandas
    # raises on them instead of silently dropping them from the mean.
    avg_buro_bal = buro_bal.groupby('SK_ID_BUREAU').mean(numeric_only=True)

    avg_buro_bal.columns = ['avg_buro_' + f_ for f_ in avg_buro_bal.columns]
    del buro_bal
    gc.collect()

    print('Read Bureau')
    buro = pd.read_csv(bureau_csv_file)

    print('Go to dummies')
    buro_credit_active_dum = pd.get_dummies(buro.CREDIT_ACTIVE, prefix='ca_')
    buro_credit_currency_dum = pd.get_dummies(buro.CREDIT_CURRENCY, prefix='cu_')
    buro_credit_type_dum = pd.get_dummies(buro.CREDIT_TYPE, prefix='ty_')

    buro_full = pd.concat([buro, buro_credit_active_dum, buro_credit_currency_dum, buro_credit_type_dum], axis=1)

    del buro_credit_active_dum, buro_credit_currency_dum, buro_credit_type_dum
    gc.collect()

    print('Merge with buro avg')
    buro_full = buro_full.merge(right=avg_buro_bal.reset_index(), how='left', on='SK_ID_BUREAU', suffixes=('', '_bur_bal'))

    print('Counting buro per SK_ID_CURR')
    nb_bureau_per_curr = buro_full[['SK_ID_CURR', 'SK_ID_BUREAU']].groupby('SK_ID_CURR').count()
    # The id column is overwritten with the per-applicant row count so that
    # its mean becomes a "number of records" feature.
    buro_full['SK_ID_BUREAU'] = buro_full['SK_ID_CURR'].map(nb_bureau_per_curr['SK_ID_BUREAU'])

    print('Averaging bureau')
    avg_buro = buro_full.groupby('SK_ID_CURR').mean(numeric_only=True)
    print(avg_buro.head())

    del buro, buro_full
    gc.collect()

    print('Read prev')
    prev = pd.read_csv(previous_application_csv_file)

    prev_cat_features = [
        f_ for f_ in prev.columns if prev[f_].dtype == 'object'
    ]

    print('Go to dummies')
    if prev_cat_features:
        # One concat over all dummy frames instead of growing a frame column
        # block by column block inside the loop.
        prev_dum = pd.concat(
            [pd.get_dummies(prev[f_], prefix=f_).astype(np.uint8) for f_ in prev_cat_features],
            axis=1,
        )
    else:
        prev_dum = pd.DataFrame()

    prev = pd.concat([prev, prev_dum], axis=1)

    del prev_dum
    gc.collect()

    print('Counting number of Prevs')
    nb_prev_per_curr = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
    prev['SK_ID_PREV'] = prev['SK_ID_CURR'].map(nb_prev_per_curr['SK_ID_PREV'])

    print('Averaging prev')
    avg_prev = prev.groupby('SK_ID_CURR').mean(numeric_only=True)
    print(avg_prev.head())
    del prev
    gc.collect()

    print('Reading POS_CASH')
    pos = pd.read_csv(POS_CASH_balance_csv_file)

    print('Go to dummies')
    pos = pd.concat([pos, pd.get_dummies(pos['NAME_CONTRACT_STATUS'])], axis=1)

    print('Compute nb of prevs per curr')
    nb_prevs = pos[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
    pos['SK_ID_PREV'] = pos['SK_ID_CURR'].map(nb_prevs['SK_ID_PREV'])

    print('Go to averages')
    avg_pos = pos.groupby('SK_ID_CURR').mean(numeric_only=True)

    del pos, nb_prevs
    gc.collect()

    print('Reading CC balance')
    cc_bal = pd.read_csv(credit_card_balance_csv_file)

    print('Go to dummies')
    cc_bal = pd.concat([cc_bal, pd.get_dummies(cc_bal['NAME_CONTRACT_STATUS'], prefix='cc_bal_status_')], axis=1)

    nb_prevs = cc_bal[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
    cc_bal['SK_ID_PREV'] = cc_bal['SK_ID_CURR'].map(nb_prevs['SK_ID_PREV'])

    print('Compute average')
    avg_cc_bal = cc_bal.groupby('SK_ID_CURR').mean(numeric_only=True)
    avg_cc_bal.columns = ['cc_bal_' + f_ for f_ in avg_cc_bal.columns]

    del cc_bal, nb_prevs
    gc.collect()

    print('Reading Installments')
    inst = pd.read_csv(installments_payments_csv_file)
    nb_prevs = inst[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
    inst['SK_ID_PREV'] = inst['SK_ID_CURR'].map(nb_prevs['SK_ID_PREV'])

    avg_inst = inst.groupby('SK_ID_CURR').mean(numeric_only=True)
    avg_inst.columns = ['inst_' + f_ for f_ in avg_inst.columns]

    # Release the raw installments frame before the application tables load.
    del inst, nb_prevs
    gc.collect()

    print('Read data and test')
    data = pd.read_csv(application_train_csv_file)
    test = pd.read_csv(application_test_csv_file)
    print('Shapes : ', data.shape, test.shape)

    id_test = test['SK_ID_CURR']

    y = data['TARGET']
    del data['TARGET']

    categorical_feats = [
        f for f in data.columns if data[f].dtype == 'object'
    ]
    for f_ in categorical_feats:
        # Integer-encode on the training values; test values that are not in
        # the training vocabulary get -1 from get_indexer.
        data[f_], indexer = pd.factorize(data[f_])
        test[f_] = indexer.get_indexer(test[f_])

    # Join order matters: overlapping column names receive pandas' default
    # _x/_y suffixes, so the aggregates are merged in a fixed sequence.
    for avg_ in (avg_buro, avg_prev, avg_pos, avg_cc_bal, avg_inst):
        right = avg_.reset_index()
        data = data.merge(right=right, how='left', on='SK_ID_CURR')
        test = test.merge(right=right, how='left', on='SK_ID_CURR')

    del avg_buro, avg_prev, avg_pos, avg_cc_bal, avg_inst, avg_, right
    gc.collect()

    return data, test, y, id_test

In [6]:
# Load and aggregate every input table; returns train features, test features,
# the training target and the test ids (in that order).
x_data, x_test, y_data, id_test = build_model_input()

Buro bal shape :  (27299925, 3)
transform to dummies
Counting buros
averaging buro bal
Read Bureau
Go to dummies
Merge with buro avg
Counting buro per SK_ID_CURR
Averaging bureau
            SK_ID_BUREAU  DAYS_CREDIT  CREDIT_DAY_OVERDUE  \
SK_ID_CURR                                                  
100001               7.0  -735.000000                 0.0   
100002               8.0  -874.000000                 0.0   
100003               4.0 -1400.750000                 0.0   
100004               2.0  -867.000000                 0.0   
100005               3.0  -190.666667                 0.0   

            DAYS_CREDIT_ENDDATE  DAYS_ENDDATE_FACT  AMT_CREDIT_MAX_OVERDUE  \
SK_ID_CURR                                                                   
100001                82.428571        -825.500000                     NaN   
100002              -349.000000        -697.500000                1681.029   
100003              -544.500000       -1097.333333                   0.000   
100

[5 rows x 163 columns]
Reading POS_CASH
Go to dummies
Compute nb of prevs per curr
Go to averages
Reading CC balance
Go to dummies
Compute average
Reading Installments
Read data and test
Shapes :  (307511, 122) (48744, 121)


## Get feature

In [7]:
# id_data = train_csv['SK_ID_CURR']
# id_test = test_csv['SK_ID_CURR']

# useless_features = []
# x_data = train_csv.drop(columns=['SK_ID_CURR'] + useless_features)
# x_test = test_csv.drop(columns=['SK_ID_CURR'] + useless_features)

In [8]:
# train_csv.loc[2][:20]

In [9]:
# plt.hist(x_data[['EXT_SOURCE_1']], bins=100, normed=True)
# plt.xlabel(('x'))
# plt.ylabel('EXT_SOURCE_1')
# plt.show()

In [10]:
# log_columns = ['EXT_SOURCE_1']

# for data_set in [x_data, x_test]:
#     data_set = data_set[log_columns].apply(lambda x: np.log(x + 1))

In [11]:
# plt.hist(x_data[['EXT_SOURCE_1']], bins=100, normed=True)
# plt.xlabel(('x'))
# plt.ylabel('EXT_SOURCE_1')
# plt.show()

In [None]:
# Hold out 5% of the rows for validation.  With shuffle=False the split is
# positional (validation rows are taken from the end of the frame), so
# random_state is not expected to influence the result -- see the
# train_test_split documentation.  No extra shuffling is applied afterwards.
x_train, x_val, y_train, y_val = train_test_split(
    x_data,
    y_data,
    test_size=0.05,
    random_state=random_num,
    shuffle=False,
)

for split_part in (x_train, y_train, x_val, y_val):
    print(split_part.shape)

(292135, 380)
(292135,)
(15376, 380)
(15376,)


## Train

In [None]:
%%time
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

# Search space: only tree depth and the per-tree column sampling ratio are
# varied (3 x 2 = 6 candidates); every other setting is fixed below.
param_grid = {
    'max_depth': [6, 7, 8],
    'colsample_bytree': [0.2, 0.8],
}

# Fixed estimator settings shared by all candidates.  max_depth and
# colsample_bytree given here are overridden by the grid for each candidate.
base_params = dict(
    boosting_type='gbdt',
    num_leaves=300,
    max_depth=-1,
    learning_rate=0.03,
    n_estimators=4000,
    subsample_for_bin=500,
    objective='binary',
    class_weight=None,
    min_split_gain=0.01,
    min_child_weight=2,
    min_child_samples=10,
    subsample=0.9,
    colsample_bytree=0.2,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=random_num,
    n_jobs=-1,
    silent=False,
)
clf = lgb.LGBMClassifier(**base_params)

# 3-fold CV scored by ROC AUC.  The search itself runs sequentially
# (n_jobs=1) while each LightGBM fit uses all cores (n_jobs=-1 above).
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=1,
    verbose=2,
)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] colsample_bytree=0.2, max_depth=6 ...............................
[CV] ................ colsample_bytree=0.2, max_depth=6, total=11.7min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 13.7min remaining:    0.0s


[CV] colsample_bytree=0.2, max_depth=6 ...............................


In [None]:
%%time

print('*' * 80)

# --- Training split: accuracy at a 0.5 threshold and ROC AUC ---------------
y_train_proba = grid_search.predict_proba(x_train)
print(y_train.shape)
print(y_train_proba.shape)
print(y_train_proba[:10])
train_positive_proba = y_train_proba[:, 1]  # probability of class 1
y_train_pred = (train_positive_proba >= 0.5).astype(int)
acc_train = accuracy_score(y_train, y_train_pred)
roc_train = roc_auc_score(y_train, train_positive_proba)
print('acc_train: %.4f \t roc_train: %.4f' % (acc_train, roc_train))

# --- Validation split: same metrics on the held-out rows --------------------
y_val_proba = grid_search.predict_proba(x_val)
print(y_val.shape)
print(y_val_proba.shape)
print(y_val_proba[:10])
val_positive_proba = y_val_proba[:, 1]  # probability of class 1
y_val_pred = (val_positive_proba >= 0.5).astype(int)
print(y_val.shape)
print(y_val_pred.shape)
acc_val = accuracy_score(y_val, y_val_pred)
roc_val = roc_auc_score(y_val, val_positive_proba)
print('acc_val:   %.4f \t roc_val:   %.4f' % (acc_val, roc_val))

In [None]:
# Raw per-candidate results (timings, per-fold scores, mean/std scores).
print(grid_search.cv_results_)
print('*' * 60)
# `grid_scores_` is no longer available on GridSearchCV in current
# scikit-learn releases; print the same mean/std/params summary from
# `cv_results_` instead so this cell does not fail after the long search.
for mean_score, std_score, candidate in zip(
        grid_search.cv_results_['mean_test_score'],
        grid_search.cv_results_['std_test_score'],
        grid_search.cv_results_['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.scorer_)
print('*' * 60)
# Introspection helpers: what kind of object the refit best model is.
print(type(grid_search.best_estimator_))
print(dir(grid_search.best_estimator_))

In [None]:
# Tabular view of the grid-search results (one row per parameter candidate).
cv_results = pd.DataFrame(grid_search.cv_results_)
display(cv_results)

In [None]:
# Pull importances and feature names from the booster of the best estimator.
best_booster = grid_search.best_estimator_.booster_
fe_times = best_booster.feature_importance()
fe_name = best_booster.feature_name()
print(fe_times)
print(fe_name)

In [None]:
# Column order is pinned explicitly so positional access to the rows
# (feature first, importance second) does not depend on dict ordering.
importance_score = pd.DataFrame(
    data={'feature': fe_name, 'importance': fe_times},
    columns=['feature', 'importance'],
)
display(importance_score.head())

# One bar per feature, most important first; the tall figure leaves room for
# every feature label.
plt.figure(figsize=(18,60))
sns.barplot(x="importance", y="feature", data=importance_score.sort_values(by="importance", ascending=False))
# fe_times/fe_name are read from grid_search.best_estimator_, i.e. a single
# model -- nothing is averaged over folds, so the title must not claim that.
plt.title('LightGBM Features (best estimator)')
plt.tight_layout()

In [None]:
# Rank features by importance, show the top 20 names, then dump the complete
# "importance<TAB>feature" list.
importance_score = importance_score.sort_values(by='importance', ascending=False)
display(importance_score['feature'][:20])
for importance_value, feature_label in zip(importance_score['importance'], importance_score['feature']):
    print('%s\t%s' % (importance_value, feature_label))

## Predict

In [None]:
# Tag the run with the validation ROC AUC scaled to an integer and
# zero-padded to at least four digits.
auc_tag = str(int(roc_val * 10000)).zfill(4)
run_name_acc = '_'.join([run_name, auc_tag])
print(run_name_acc)

In [None]:
# Class probabilities for the test rows from the fitted grid search; the shape
# and the first rows are printed as a sanity check.
y_test_proba = grid_search.predict_proba(x_test)

print(y_test_proba.shape)
print(y_test_proba[:10])

In [None]:
def save_proba(y_val_proba, y_val, y_test_proba, id_test, file_name):
    """Store validation/test probabilities, validation labels and test ids.

    The four arrays are written as the HDF5 datasets ``y_val_proba``,
    ``y_val``, ``y_test_proba`` and ``id_test``.  An existing file at
    ``file_name`` is replaced, and the target folder is created if missing.

    Args:
        y_val_proba: predicted probabilities for the validation rows.
        y_val: true labels of the validation rows.
        y_test_proba: predicted probabilities for the test rows.
        id_test: identifiers of the test rows.
        file_name: path of the HDF5 file to write.
    """
    print(id_test[:5])
    # Create the destination folder so a missing directory does not lose the
    # results of a long training run.
    target_dir = os.path.dirname(file_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    if os.path.exists(file_name):
        os.remove(file_name)
        print('File removed: %s' % file_name)
    # Open explicitly for writing: without a mode argument newer h5py releases
    # open read-only, which fails for a file that does not exist yet.
    with h5py.File(file_name, 'w') as h:
        h.create_dataset('y_val_proba', data=y_val_proba)
        h.create_dataset('y_val', data=y_val)
        h.create_dataset('y_test_proba', data=y_test_proba)
        h.create_dataset('id_test', data=id_test)
    print('File saved:   %s' % file_name)

def load_proba(file_name):
    """Read back the four datasets stored in the HDF5 file ``file_name``.

    Returns:
        tuple: ``(y_val_proba, y_val, y_test_proba, id_test)`` as NumPy arrays.
    """
    dataset_keys = ('y_val_proba', 'y_val', 'y_test_proba', 'id_test')
    with h5py.File(file_name, 'r') as h:
        # Materialise every dataset while the file is still open.
        y_val_proba, y_val, y_test_proba, id_test = [
            np.array(h[key]) for key in dataset_keys
        ]
    print('File loaded:  %s' % file_name)
    print(id_test[:5])

    return y_val_proba, y_val, y_test_proba, id_test


# Persist the positive-class probabilities, then read them straight back so
# the submission is built from exactly what was stored on disk.
y_proba_file = os.path.join(model_folder, 'proba_%s.p' % run_name_acc)
save_proba(y_val_proba[:, 1], y_val, y_test_proba[:, 1], id_test, y_proba_file)
y_val_proba_true, y_val, y_test_proba_true, id_test = load_proba(y_proba_file)

for loaded_array in (y_val_proba_true, y_val, y_test_proba_true):
    print(loaded_array.shape)
print(len(id_test))

In [None]:
# %%time
# Write the submission file: one row per test id with its predicted
# probability.  The output folder is created first so that a missing
# directory cannot fail the run at its very last step.
os.makedirs(output_folder, exist_ok=True)
submission_csv_file = os.path.join(output_folder, 'pred_%s.csv' % run_name_acc)
print(submission_csv_file)
submission_csv = pd.DataFrame(
    { 'SK_ID_CURR': id_test , 'TARGET': y_test_proba_true },
    columns=['SK_ID_CURR', 'TARGET'],  # pin the column order of the CSV
)
submission_csv.to_csv(submission_csv_file, index = False)
display(submission_csv.head())

In [None]:
# Closing summary: elapsed wall-clock time since t0, the seed used and the
# final run tag.
elapsed_seconds = time.time() - t0
print('Time cost: %.2f s' % elapsed_seconds)

print('random_num: ', random_num)
print(run_name_acc)
print('Done!')