## 4. Ensemble


Reference：
- https://www.kaggle.com/ogrellier/good-fun-with-ligthgbm/code

## Run name

In [1]:
import time

project_name = 'HomeCreditDefaultRisk'
step_name = 'Ensemble'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = project_name + '_' + step_name + '_' + time_str
print('run_name: ' + run_name)
t0 = time.time()

run_name: HomeCreditDefaultRisk_Ensemble_20180522_171304


## Important params

## Import PKGs

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display
import seaborn as sns

import os
import sys
import gc
import math
import tqdm
import shutil
import zipfile
import pickle
import h5py
# import cv2
from PIL import Image

from tqdm import tqdm
import multiprocessing

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.utils import shuffle

random_num = np.random.randint(10000)
cpu_amount = multiprocessing.cpu_count()

print('cpu_amount: %s' % (cpu_amount - 2))
print('random_num: %s' % random_num)

  from ._conv import register_converters as _register_converters


cpu_amount: 2
random_num: 8252


In [3]:
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold

import xgboost
# from xgboost import plot_importance

## Project folders

In [4]:
cwd = os.getcwd()
feature_folder = os.path.join(cwd, 'feature')
input_folder = os.path.join(cwd, 'input')
output_folder = os.path.join(cwd, 'output')
model_folder = os.path.join(cwd, 'model')

application_test_csv_file = os.path.join(input_folder, 'application_test.csv')
application_train_csv_file = os.path.join(input_folder, 'application_train.csv')
bureau_csv_file = os.path.join(input_folder, 'bureau.csv')
bureau_balance_csv_file = os.path.join(input_folder, 'bureau_balance.csv')
credit_card_balance_csv_file = os.path.join(input_folder, 'credit_card_balance.csv')
installments_payments_csv_file = os.path.join(input_folder, 'installments_payments.csv')
POS_CASH_balance_csv_file = os.path.join(input_folder, 'POS_CASH_balance.csv')
previous_application_csv_file = os.path.join(input_folder, 'previous_application.csv')
sample_submission_csv_file = os.path.join(input_folder, 'sample_submission.csv')

print(application_test_csv_file)
print(application_train_csv_file)
print(bureau_csv_file)
print(bureau_balance_csv_file)
print(credit_card_balance_csv_file)
print(installments_payments_csv_file)
print(POS_CASH_balance_csv_file)
print(previous_application_csv_file)
print(sample_submission_csv_file)

D:\bitbucket\kaggle\home-credit-default-risk\input\application_test.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\application_train.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\bureau.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\bureau_balance.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\credit_card_balance.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\installments_payments.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\POS_CASH_balance.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\previous_application.csv
D:\bitbucket\kaggle\home-credit-default-risk\input\sample_submission.csv


## Load data

## Load predict probability files

In [5]:
ori_proba_files = [
    { 'file_name': 'proba_HomeCreditDefaultRisk_XGBoost_GSCV_20180522_103529_7186.p'},
    { 'file_name': 'proba_HomeCreditDefaultRisk_LightGBM_GSCV_20180522_110454_7617.p'},
    { 'file_name': 'proba_HomeCreditDefaultRisk_LightGBM_GSCV_20180522_110454_7617.p'},
]

for file in ori_proba_files:
    if os.path.exists(os.path.join(model_folder, file['file_name'])):
        print('File exists: %s' % file['file_name'])
    else:
        print('***File do not exists: %s' % file['file_name'])

File exists: proba_HomeCreditDefaultRisk_XGBoost_GSCV_20180522_103529_7186.p
File exists: proba_HomeCreditDefaultRisk_LightGBM_GSCV_20180522_110454_7617.p
File exists: proba_HomeCreditDefaultRisk_LightGBM_GSCV_20180522_110454_7617.p


In [6]:
def save_proba(y_val_proba, y_val, y_test_proba, id_test, file_name):
    print(id_test[:5])
    if os.path.exists(file_name):
        os.remove(file_name)
        print('File removed: %s' % file_name)
    with h5py.File(file_name) as h:
        h.create_dataset('y_val_proba', data=y_val_proba)
        h.create_dataset('y_val', data=y_val)
        h.create_dataset('y_test_proba', data=y_test_proba)
        h.create_dataset('id_test', data=id_test)
    print('File saved:   %s' % file_name)

def load_proba(file_name):
    with h5py.File(file_name, 'r') as h:
        y_val_proba = np.array(h['y_val_proba'])
        y_val = np.array(h['y_val'])
        y_test_proba = np.array(h['y_test_proba'])
        id_test = np.array(h['id_test'])
    print('File loaded:  %s' % file_name)
    print(id_test[:5])
    
    return y_val_proba, y_val, y_test_proba, id_test


y_proba_file = os.path.join(model_folder, ori_proba_files[0]['file_name'])
# save_proba(
#     y_val_proba, 
#     y_val, 
#     y_test_proba, 
#     id_test,
#     y_proba_file
# )
y_val_proba_true, y_val, y_test_proba_true, id_test = load_proba(y_proba_file)


y_val_probas = []
y_vas = []
y_test_probas = []
for file in ori_proba_files:
    y_proba_file = os.path.join(model_folder, file['file_name'])
    y_val_proba, y_val, y_test_proba, id_test = load_proba(y_proba_file)
    y_val_probas.append(y_val_proba)
    y_test_probas.append(y_test_proba)

print(len(y_val_probas))

File loaded:  D:\bitbucket\kaggle\home-credit-default-risk\model\proba_HomeCreditDefaultRisk_XGBoost_GSCV_20180522_103529_7186.p
[100001 100005 100013 100028 100038]
File loaded:  D:\bitbucket\kaggle\home-credit-default-risk\model\proba_HomeCreditDefaultRisk_XGBoost_GSCV_20180522_103529_7186.p
[100001 100005 100013 100028 100038]
File loaded:  D:\bitbucket\kaggle\home-credit-default-risk\model\proba_HomeCreditDefaultRisk_LightGBM_GSCV_20180522_110454_7617.p
[100001 100005 100013 100028 100038]
File loaded:  D:\bitbucket\kaggle\home-credit-default-risk\model\proba_HomeCreditDefaultRisk_LightGBM_GSCV_20180522_110454_7617.p
[100001 100005 100013 100028 100038]
3


In [7]:
%%time
def get_mean(probas):
    probas_newaxises = []
    for p in probas:
        print('*', end='')
#         print(p.shape)
        print(p[:, np.newaxis].shape)
        probas_newaxises.append(p[:, np.newaxis])
    probas_newaxis = np.concatenate(probas_newaxises, axis=-1)
    print(probas_newaxis.shape)
    probas_mean = np.mean(probas_newaxis, axis=-1)
    print('probas_mean.shape: \t', probas_mean.shape)
    return probas_mean

y_test_proba_mean = get_mean(y_test_probas)

*(48744, 1)
*(48744, 1)
*(48744, 1)
(48744, 3)
probas_mean.shape: 	 (48744,)
Wall time: 34 ms


In [8]:
# %%time
submission_csv_file = os.path.join(output_folder, 'pred.csv')
print(submission_csv_file)
submission_csv = pd.DataFrame({ 'SK_ID_CURR': id_test , 'TARGET': y_test_proba_true })
submission_csv.to_csv(submission_csv_file, index = False)
display(submission_csv.head())

D:\bitbucket\kaggle\home-credit-default-risk\output\pred.csv


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.479796
1,100005,0.504281
2,100013,0.320612
3,100028,0.363076
4,100038,0.54773


In [10]:
print('Time cost: %.2f s' % (time.time() - t0))

print('random_num: ', random_num)
print('Done!')

Time cost: 8.68 s
random_num:  8252
Done!
