# Testing selected models with Kaggle's test data

Now that a model has been selected, trained, and tuned, Kaggle's test data can be preprocessed in the same way as the training data and prepared for prediction. 

First, packages need to be imported: 

In [1]:
import pandas as pd
import numpy as np
import joblib
import json
import sys
import os
sys.path.append(os.path.abspath('../functions'))
from functions import clean_column_names

Next, each table must be imported.

In [4]:
def _read_table(csv_path):
    """Read one comma-separated table from disk into a DataFrame."""
    return pd.read_csv(csv_path, sep=',')


# Application tables.
train = _read_table('../tables/application_train.csv')
test = _read_table('../tables/application_test.csv')
# Bureau tables (bureau_balance is keyed by SK_ID_BUREAU further down).
bureau = _read_table('../tables/bureau.csv')
bureau_balance = _read_table('../tables/bureau_balance.csv')
# Tables that are aggregated per SK_ID_PREV and joined to previous_application further down.
POS_CASH_balance = _read_table('../tables/POS_CASH_balance.csv')
installments_payments = _read_table('../tables/installments_payments.csv')
credit_card_balance = _read_table('../tables/credit_card_balance.csv')
previous_application = _read_table('../tables/previous_application.csv')


# Merging tables: 

Now, all tables are merged as seen in the previous notebook: 


Preprocessing tables: 

In [3]:
def _zero_safe_ratio(numerator, denominator):
    """Divide two Series; zero denominators, infinities and NaN results become 0."""
    quotient = numerator / denominator.replace(0, np.nan)
    return quotient.replace([np.inf, -np.inf], np.nan).fillna(0)


# 365243 is turned into NaN in these DAYS_* columns
# (presumably a placeholder for "no date" -- TODO confirm against the data description).
days_cols = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
]
for day_col in days_cols:
    previous_application[day_col] = previous_application[day_col].replace(365243, np.nan)

# Difference between the actual payment day and the scheduled instalment day.
installments_payments['late_payment_lag'] = (
    installments_payments['DAYS_ENTRY_PAYMENT'] - installments_payments['DAYS_INSTALMENT']
)

# Ratio features; the statement order is kept so new columns are appended in the same order.
credit_card_balance['max_drawings_receivable_ratio'] = _zero_safe_ratio(
    credit_card_balance['AMT_DRAWINGS_ATM_CURRENT'],
    credit_card_balance['AMT_TOTAL_RECEIVABLE'],
)
bureau['credit_usage'] = _zero_safe_ratio(
    bureau['AMT_CREDIT_SUM'], bureau['AMT_CREDIT_SUM_DEBT']
)
POS_CASH_balance['credit_term_ratio'] = _zero_safe_ratio(
    POS_CASH_balance['CNT_INSTALMENT_FUTURE'], POS_CASH_balance['CNT_INSTALMENT']
)
bureau['bureau_debt_over_limit'] = _zero_safe_ratio(
    bureau['AMT_CREDIT_SUM_DEBT'], bureau['AMT_CREDIT_SUM_LIMIT']
)

# Unlike the ratios above, the zero -> NaN replacement here is applied to the quotient,
# not to the denominator; that form is kept as written.
first_drawing_over_due = (
    previous_application['DAYS_FIRST_DRAWING'] / previous_application['DAYS_FIRST_DUE']
)
previous_application['first_dibursement_ratio'] = (
    first_drawing_over_due
    .replace(0, np.nan)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

Bureau_balance to bureau: 

In [4]:
# Work on copies so the loaded tables stay untouched.
bb = bureau_balance.copy()
bur = bureau.copy()

# Categorical columns: one-hot encode, then count occurrences per SK_ID_BUREAU.
bureau_cat_cols = list(bb.select_dtypes(include=['object']).columns)
bureau_cat_dummies = pd.get_dummies(bb[bureau_cat_cols], prefix=bureau_cat_cols)
bureau_cat_encoded = pd.concat([bb[['SK_ID_BUREAU']], bureau_cat_dummies], axis=1)
bureau_cat_counts = (
    bureau_cat_encoded.groupby('SK_ID_BUREAU').sum().add_prefix('bureau_CNT_')
)

# Numeric columns (without the key): summary statistics per SK_ID_BUREAU.
bb_num_cols = [
    name for name in bb.select_dtypes(exclude=['object']).columns
    if name != 'SK_ID_BUREAU'
]
bb_num_agg = bb.groupby('SK_ID_BUREAU')[bb_num_cols].agg(['mean', 'min', 'max', 'std'])
bb_num_agg.columns = [f'bureau_NUM_{name}_{stat}' for name, stat in bb_num_agg.columns]

# Combine both aggregates and attach them to the bureau rows.
bb_agg = bb_num_agg.merge(bureau_cat_counts, on='SK_ID_BUREAU', how='outer')
bur = bur.merge(bb_agg, on='SK_ID_BUREAU', how='left')


The bureau table is aggregated per applicant (`SK_ID_CURR`) so that it can be joined to the application table, and the final dataframe is saved as a .csv file: 

In [5]:
# Column groups of the enriched bureau table.
bur_cat_cols = list(bur.select_dtypes(include=['object']).columns)
bur_binary_cols = [
    name for name in bur.columns
    if set(bur[name].dropna().unique()) <= {0, 1}
]

bur_num_cols = [
    name for name in bur.select_dtypes(exclude=['object']).columns
    if name not in ('SK_ID_BUREAU', 'SK_ID_CURR')
]
# NOTE(review): removing from the list that is being iterated skips the element right
# after every removal, so a binary column that directly follows another binary column
# stays in bur_num_cols. Kept exactly as written because the aggregated column names
# below derive from this list -- confirm against the training notebook before fixing.
for col in bur_num_cols:
    if col in bur_binary_cols:
        bur_num_cols.remove(col)

# Categorical columns: one-hot encode and sum per SK_ID_BUREAU.
agg_bur_cat_dummies = pd.get_dummies(bur[bur_cat_cols], prefix=bur_cat_cols)
agg_bureau_cat_encoded = pd.concat([bur[['SK_ID_BUREAU']], agg_bur_cat_dummies], axis=1)
agg_bureau_cat_counts = (
    agg_bureau_cat_encoded.groupby('SK_ID_BUREAU').sum()
    .add_prefix('bureau_CNT_')
    .reset_index()
)

# Numeric columns: summary statistics per SK_ID_BUREAU.
bureau_num_agg = bur.groupby('SK_ID_BUREAU')[bur_num_cols].agg(['mean', 'min', 'max', 'std'])
bureau_num_agg.columns = [f'bureau_NUM_{name}_{stat}' for name, stat in bureau_num_agg.columns]
bureau_num_agg = bureau_num_agg.reset_index()

bur_merged = bureau_num_agg.merge(agg_bureau_cat_counts, on='SK_ID_BUREAU', how='outer')
bur_merged.columns = bur_merged.columns.str.replace(' ', '_', regex=False)

# Swap the bureau key for the applicant key, then average per applicant.
bureau_id_map = bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].drop_duplicates()
bureau_merged = bur_merged.merge(bureau_id_map, on='SK_ID_BUREAU', how='left')
bureau_agg = bureau_merged.drop(columns=['SK_ID_BUREAU'])
bureau_final = bureau_agg.groupby('SK_ID_CURR').agg(['mean'])
bureau_final.columns = [f'BUREAU_{name}_{stat}' for name, stat in bureau_final.columns]
bureau_final = bureau_final.reset_index()

bureau_final.to_csv('bureau_final.csv', index=False)

## Merging other tables to previous_application: 

The POS_CASH_balance table is aggregated per `SK_ID_PREV` and joined to the previous_application table: 

In [6]:
pos = POS_CASH_balance.copy()

# Categorical columns: one-hot encode and count per SK_ID_PREV (result stays indexed by SK_ID_PREV).
cat_cols = list(pos.select_dtypes(include=['object']).columns)
cat_dummies = pd.get_dummies(pos[cat_cols], prefix=cat_cols)
cat_encoded = pd.concat([pos[['SK_ID_PREV']], cat_dummies], axis=1)
cat_counts = cat_encoded.groupby('SK_ID_PREV').sum().add_prefix('POS_CNT_')

# Numeric columns without the two id columns: summary statistics per SK_ID_PREV.
num_cols = [
    name for name in pos.select_dtypes(exclude=['object']).columns
    if name not in ('SK_ID_PREV', 'SK_ID_CURR')
]
num_agg = pos.groupby('SK_ID_PREV')[num_cols].agg(['mean', 'min', 'max', 'std'])
num_agg.columns = [f'POS_NUM_{name}_{stat}' for name, stat in num_agg.columns]
num_agg = num_agg.reset_index()

# Combine both aggregates and attach them to a copy of previous_application.
agg_pos = num_agg.merge(cat_counts, on='SK_ID_PREV', how='outer')
previous = previous_application.copy().merge(agg_pos, on='SK_ID_PREV', how='left')

In [7]:
ip = installments_payments.copy()
ip_cat_cols = list(ip.select_dtypes(include=['object']).columns)

# Columns whose non-null values are all 0 or 1.
ip_binary_cols = [
    name for name in ip.columns
    if set(ip[name].dropna().unique()) <= {0, 1}
]

ip_num_cols = [
    name for name in ip.select_dtypes(exclude=['object']).columns
    if name not in ('SK_ID_PREV', 'SK_ID_CURR')
]
# NOTE(review): removing from the list being iterated skips the element following each
# removal, so some binary columns can remain. Kept as written because the aggregated
# column names depend on this list -- confirm against the training notebook before fixing.
for col in ip_num_cols:
    if col in ip_binary_cols:
        ip_num_cols.remove(col)

# Summary statistics per SK_ID_PREV, joined onto the previous-application frame.
ip_agg = ip.groupby('SK_ID_PREV')[ip_num_cols].agg(['mean', 'min', 'max', 'std'])
ip_agg.columns = [f'ip_NUM_{name}_{stat}' for name, stat in ip_agg.columns]
ip_agg = ip_agg.reset_index()
previous = previous.merge(ip_agg, on='SK_ID_PREV', how='left')


The credit_card_balance table is joined to the previous_application table in the same way as previously shown: 

In [8]:
ccb = credit_card_balance.copy()
ccb_cat_cols = list(ccb.select_dtypes(include=['object']).columns)

# Columns whose non-null values are all 0 or 1.
ccb_binary_cols = [
    name for name in ccb.columns
    if set(ccb[name].dropna().unique()) <= {0, 1}
]

ccb_num_cols = [
    name for name in ccb.select_dtypes(exclude=['object']).columns
    if name not in ('SK_ID_PREV', 'SK_ID_CURR')
]
# NOTE(review): removing from the list being iterated skips the element following each
# removal, so some binary columns can remain. Kept as written because the aggregated
# column names depend on this list -- confirm against the training notebook before fixing.
for col in ccb_num_cols:
    if col in ccb_binary_cols:
        ccb_num_cols.remove(col)

# Categorical columns: one-hot encode and count per SK_ID_PREV.
ccb_cat_dummies = pd.get_dummies(ccb[ccb_cat_cols], prefix=ccb_cat_cols)
ccb_cat_encoded = pd.concat([ccb[['SK_ID_PREV']], ccb_cat_dummies], axis=1)
ccb_cat_counts = (
    ccb_cat_encoded.groupby('SK_ID_PREV').sum()
    .add_prefix('ccb_CNT_')
    .reset_index()
)

# Numeric columns: summary statistics per SK_ID_PREV.
ccb_num_agg = ccb.groupby('SK_ID_PREV')[ccb_num_cols].agg(['mean', 'min', 'max', 'std'])
ccb_num_agg.columns = [f'ccb_NUM_{name}_{stat}' for name, stat in ccb_num_agg.columns]
ccb_num_agg = ccb_num_agg.reset_index()

agg_ccb = ccb_num_agg.merge(ccb_cat_counts, on='SK_ID_PREV', how='outer')
agg_ccb.columns = agg_ccb.columns.str.replace(' ', '_', regex=False)
previous = previous.merge(agg_ccb, on='SK_ID_PREV', how='left')


Now, the previous application table is aggregated to be joined to the application table: 

In [9]:
p = previous.copy()

# Per applicant, rows ordered by DAYS_DECISION descending; keep the first one / first three.
prev_sorted = p.sort_values(['SK_ID_CURR', 'DAYS_DECISION'], ascending=[True, False])
sorted_by_applicant = prev_sorted.groupby('SK_ID_CURR')
last_1 = sorted_by_applicant.head(1)
last_3 = sorted_by_applicant.head(3)

p_cat_cols = list(p.select_dtypes(include=['object']).columns)

# Columns whose non-null values are all 0 or 1.
p_binary_cols = [
    name for name in p.columns
    if set(p[name].dropna().unique()) <= {0, 1}
]

p_num_cols = [
    name for name in p.select_dtypes(exclude=['object']).columns
    if name not in ('SK_ID_PREV', 'SK_ID_CURR')
]
# NOTE(review): removing from the list being iterated skips the element following each
# removal, so some binary columns can remain in p_num_cols. Kept as written because the
# LAST1_/LAST3_/p_NUM_ column names depend on this list -- confirm against the training
# notebook before fixing.
for col in p_num_cols:
    if col in p_binary_cols:
        p_num_cols.remove(col)

# Statistics over the first row per applicant in the sorted frame.
agg_last1 = last_1.groupby('SK_ID_CURR')[p_num_cols].agg(['mean', 'std', 'max'])
agg_last1.columns = [f'LAST1_{name}_{stat}' for name, stat in agg_last1.columns]
agg_last1 = agg_last1.reset_index()

# Statistics over the first three rows per applicant in the sorted frame.
agg_last3_raw = last_3.groupby('SK_ID_CURR')[p_num_cols].agg(['mean', 'max', 'std'])
agg_last3_raw.columns = [f'LAST3_{name}_{stat}' for name, stat in agg_last3_raw.columns]
agg_last3 = agg_last3_raw.reset_index()

# Categorical columns: one-hot encode and count per applicant.
p_cat_dummies = pd.get_dummies(p[p_cat_cols], prefix=p_cat_cols)
p_cat_encoded = pd.concat([p[['SK_ID_CURR']], p_cat_dummies], axis=1)
p_cat_counts = (
    p_cat_encoded.groupby('SK_ID_CURR').sum()
    .add_prefix('prev_cat_CNT_')
    .reset_index()
)

# Binary columns: mean and sum per applicant.
p_binary_agg = p.groupby('SK_ID_CURR')[p_binary_cols].agg(['mean', 'sum'])
p_binary_agg.columns = ['prev_' + name + '_' + stat for name, stat in p_binary_agg.columns]
p_binary_agg = p_binary_agg.reset_index()

# Numeric columns over all rows of an applicant.
p_agg = p.groupby('SK_ID_CURR')[p_num_cols].agg(['mean', 'max'])
p_agg.columns = [f'p_NUM_{name}_{stat}' for name, stat in p_agg.columns]
p_aggregate = p_agg.reset_index()

p_num_bin_agg = p_aggregate.merge(p_binary_agg, on='SK_ID_CURR', how='outer')


In [10]:
# Add the LAST1_ aggregates; the outer join keeps applicants present on either side.
temp_merge_2 = pd.merge(p_num_bin_agg, agg_last1, how='outer', on='SK_ID_CURR')


In [11]:
# Add the LAST3_ aggregates.
temp_merge_3 = pd.merge(temp_merge_2, agg_last3, how='outer', on='SK_ID_CURR')

In [12]:
# Add the categorical counts, then make the column names space-free.
p_final_merged = pd.merge(temp_merge_3, p_cat_counts, how='outer', on='SK_ID_CURR')
p_final_merged.columns = p_final_merged.columns.str.replace(' ', '_', regex=False)


The final previous_application table is saved: 

In [13]:
# Written to the working directory, without the row index.
p_final_merged.to_csv(path_or_buf='p_final_merged.csv', index=False)

# Preprocessing: 

Now, the test dataset is processed in the same way as the training data in the previous notebook: 

In [14]:
# 'XNA' strings become missing values everywhere in the test frame.
test.replace('XNA', np.nan, inplace=True)
# 365243 in DAYS_EMPLOYED is turned into NaN
# (presumably a placeholder rather than a real day count -- TODO confirm).
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].replace(365243, np.nan)
# Applicants flagged as not owning a car get a car age of 0 instead of NaN.
test.loc[(test['OWN_CAR_AGE'].isnull()) & (test['FLAG_OWN_CAR'] == 'N'), 'OWN_CAR_AGE'] = 0
# The clipping threshold is taken from the TRAINING data, not from the test set itself:
# a transformation parameter has to be the one the model was fitted with, otherwise the
# same income is capped differently at train and test time. It also makes this cell
# idempotent on re-run.
# NOTE(review): assumes the training notebook used the 99th percentile of the raw
# `train['AMT_INCOME_TOTAL']` -- confirm.
upper_limit = train['AMT_INCOME_TOTAL'].quantile(0.99)
test['AMT_INCOME_TOTAL'] = test['AMT_INCOME_TOTAL'].clip(upper=upper_limit)


In [15]:
# Coarse occupation group -> raw OCCUPATION_TYPE values that belong to it.
# NOTE(review): 'Low-skil Laborers' is spelled with a single "l"; if the data spells it
# 'Low-skill Laborers' those rows map to NaN. Left untouched because it feeds the
# OCCUPATION_TYPE_GROUPED_* model inputs -- confirm against the training notebook.
occupation_groups = {
    'Accountants': ['Accountants'],
    'Service': ['Cleaning staff', 'Cooking staff', 'Waiters/barmen staff'],
    'Core staff': ['Core staff'],
    'Drivers': ['Drivers'],
    'Other': ['HR staff', 'Private service staff', 'Realty agents', 'Secretaries', 'Security staff'],
    'Tech': ['High skill tech staff', 'IT staff'],
    'Laborers': ['Laborers', 'Low-skil Laborers'],
    'Managers': ['Managers'],
    'Medicine': ['Medicine staff'],
    'Sales': ['Sales staff'],
}
# Flatten into the raw-value -> group lookup used by .map(); unmapped values become NaN.
group_map = {
    occupation: group
    for group, members in occupation_groups.items()
    for occupation in members
}

test['OCCUPATION_TYPE_GROUPED'] = test['OCCUPATION_TYPE'].map(group_map)

In [16]:
# Income types that keep their own category; the listed rare ones are pooled into 'Other'.
kept_income_types = ['Commercial associate', 'Pensioner', 'State servant', 'Working']
pooled_income_types = ['Businessman', 'Student', 'Unemployed']

group_map = {income_type: income_type for income_type in kept_income_types}
group_map.update({income_type: 'Other' for income_type in pooled_income_types})

# Values absent from the lookup become NaN.
test['NAME_INCOME_TYPE_GROUPED'] = test['NAME_INCOME_TYPE'].map(group_map)

In [17]:
# Raw ORGANIZATION_TYPE value -> coarse organization group.
# Fixes relative to the previous version of this cell:
#   * several keys carried a trailing space ('Business Entity Type 1 ', 'Electricity ',
#     'Industry: type 12 ', 'Industry: type 4 ', 'Industry: type 2 ', 'Industry: type 1 ',
#     'Services '), so the lookup could never match a value without that space and those
#     rows ended up as NaN; keys are now whitespace-free and the lookup strips the data too;
#   * 'Security Ministries' was listed twice; the duplicate key is removed.
# None of the corrected keys belong to the 'Public Sector', 'Self-employed' or 'Finance'
# groups, which are the only ORGANIZATION_TYPE_GROUPED_* columns in `features`.
# NOTE(review): apply the same key cleanup in the training notebook so that both
# pipelines stay aligned.
# NOTE(review): a plain 'Security' key is absent; the duplicated 'Security Ministries'
# may have been meant for it -- confirm before adding, since it would change the
# 'Public Sector' feature.
group_map = {
    'Business Entity Type 3': 'Business',
    'Business Entity Type 2': 'Business',
    'Business Entity Type 1': 'Business',
    # Only reachable if 'XNA' has not already been replaced by NaN upstream.
    'XNA': 'Unknown',
    'Self-employed': 'Self-employed',
    'Other': 'Other',
    'Medicine': 'Public Sector',
    'Government': 'Public Sector',
    'School': 'Public Sector',
    'Kindergarten': 'Public Sector',
    'Security Ministries': 'Public Sector',
    'Housing': 'Public Sector',
    'Military': 'Public Sector',
    'Police': 'Public Sector',
    'Postal': 'Public Sector',
    'University': 'Public Sector',
    'Emergency': 'Public Sector',
    'Trade: type 7': 'Trade',
    'Trade: type 6': 'Trade',
    'Trade: type 5': 'Trade',
    'Trade: type 4': 'Trade',
    'Trade: type 3': 'Trade',
    'Trade: type 2': 'Trade',
    'Trade: type 1': 'Trade',
    'Electricity': 'Trade',
    'Construction': 'Trade',
    'Transport: type 4': 'Transport',
    'Transport: type 3': 'Transport',
    'Transport: type 2': 'Transport',
    'Transport: type 1': 'Transport',
    'Industry: type 13': 'Industry',
    'Industry: type 12': 'Industry',
    'Industry: type 10': 'Industry',
    'Industry: type 9': 'Industry',
    'Industry: type 8': 'Industry',
    'Industry: type 7': 'Industry',
    'Industry: type 6': 'Industry',
    'Industry: type 5': 'Industry',
    'Industry: type 4': 'Industry',
    'Industry: type 3': 'Industry',
    'Industry: type 2': 'Industry',
    'Industry: type 1': 'Industry',
    'Industry: type 11': 'Industry',
    'Agriculture': 'Industry',
    'Services': 'Service',
    'Hotel': 'Service',
    'Restaurant': 'Service',
    'Cleaning': 'Service',
    'Realtor': 'Service',
    'Legal Services': 'Service',
    'Advertising': 'Other',
    'Religion': 'Other',
    'Culture': 'Other',
    'Bank': 'Finance',
    'Insurance': 'Finance',
    'Telecom': 'Other',
    'Mobile': 'Other'
}

# Strip surrounding whitespace from the data as well, so the lookup matches whether or
# not the raw values carry padding; values absent from the lookup (and NaN) stay NaN.
test['ORGANIZATION_TYPE_GROUPED'] = test['ORGANIZATION_TYPE'].str.strip().map(group_map)

In [18]:
# Housing types that keep their own category; the remaining apartment kinds are pooled.
kept_housing_types = ['House / apartment', 'With parents']
pooled_apartment_types = [
    'Municipal apartment',
    'Office apartment',
    'Rented apartment',
    'Co-op apartment',
]

group_map = {housing: housing for housing in kept_housing_types}
group_map.update({housing: 'Other apartment' for housing in pooled_apartment_types})

# Values absent from the lookup become NaN.
test['NAME_HOUSING_TYPE_GROUPED'] = test['NAME_HOUSING_TYPE'].map(group_map)

In [19]:
# The raw columns are dropped now that their *_GROUPED counterparts exist.
raw_category_cols = [
    'OCCUPATION_TYPE',
    'NAME_INCOME_TYPE',
    'ORGANIZATION_TYPE',
    'NAME_HOUSING_TYPE',
]
test = test.drop(columns=raw_category_cols)

In [20]:
# Object-typed columns and their positions in the frame.
cat_features = list(test.select_dtypes(include='object').columns)
cat_features_index = list(map(test.columns.get_loc, cat_features))

# Missing categories get an explicit 'missing' label before the cast to category dtype.
for cat_col in cat_features:
    test[cat_col] = test[cat_col].fillna('missing').astype('category')

In [21]:
# New feature name -> (numerator column, denominator column).
ratio_specs = {
    'credit_annuity_ratio': ('AMT_CREDIT', 'AMT_ANNUITY'),
    'age_score_ratio': ('DAYS_BIRTH', 'EXT_SOURCE_1'),
    'score_credit_ratio': ('EXT_SOURCE_2', 'AMT_CREDIT'),
    'income_goods_ratio': ('AMT_INCOME_TOTAL', 'AMT_GOODS_PRICE'),
}
for feature_name, (numerator_col, denominator_col) in ratio_specs.items():
    # Zero denominators are masked first; infinities and NaN results end up as 0.
    quotient = test[numerator_col] / test[denominator_col].replace(0, np.nan)
    test[feature_name] = quotient.replace([np.inf, -np.inf], np.nan).fillna(0)

# Row-wise total of FLAG_DOCUMENT_2 .. FLAG_DOCUMENT_21; skipna=False keeps the
# NaN-propagating behaviour of a plain `+` chain.
document_flag_cols = [f'FLAG_DOCUMENT_{number}' for number in range(2, 22)]
test['SUM_FLAG_DOCUMENT'] = test[document_flag_cols].sum(axis=1, skipna=False)

In [22]:
# Every column that is still object- or category-typed gets one-hot encoded next.
categorical_dtypes = ['object', 'category']
cat_features = list(test.select_dtypes(include=categorical_dtypes).columns)

In [23]:
# One indicator column per category level (no level is dropped).
test_encoded = pd.get_dummies(test, columns=cat_features, drop_first=False)
# Every run of characters outside [A-Za-z0-9_] in a column name collapses to one underscore.
sanitized_columns = test_encoded.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
test_encoded.columns = sanitized_columns

In [24]:
# Name sanitizing can make two columns collide; only the first of each name is kept.
is_repeated_name = test_encoded.columns.duplicated()
dupe_columns = test_encoded.columns[is_repeated_name]
test_encoded = test_encoded.loc[:, ~is_repeated_name]


In [25]:
# Boolean columns are converted to integers.
bool_cols = test_encoded.select_dtypes(include='bool').columns
test_encoded = test_encoded.astype({bool_col: int for bool_col in bool_cols})

In [26]:
# Despite its name, `app_train` holds the encoded TEST applications plus bureau features.
app_train = pd.merge(test_encoded, bureau_final, how='left', on='SK_ID_CURR')

In [27]:
# Left join keeps every application row, with or without previous-application features.
test_data = pd.merge(app_train, p_final_merged, how='left', on='SK_ID_CURR')

# Model preparation

The model preparation is performed as in the previous notebook: 

In [28]:
# Normalize the column names with the shared project helper, then index by applicant id.
test_data = clean_column_names(test_data).set_index('SK_ID_CURR')

# Any remaining object columns get an explicit 'missing' level and the category dtype.
cat_features = list(test_data.select_dtypes(include='object').columns)
cat_features_index = list(map(test_data.columns.get_loc, cat_features))
for cat_col in cat_features:
    test_data[cat_col] = test_data[cat_col].fillna('missing').astype('category')

# Infinite values are treated as missing.
test_data = test_data.replace([np.inf, -np.inf], np.nan)


The class-weight dictionary below uses the same values as in the previous notebook: 

In [29]:
# Weight per class label, stored as np.float64 values.
class_weight_dict = {
    label: np.float64(weight)
    for label, weight in ((0, 0.5439092983356032), (1, 6.193554884189325))
}

Using selected features from the previous notebook: 

In [30]:
features = ['EXT_SOURCE_3',
 'EXT_SOURCE_2',
 'EXT_SOURCE_1',
 'DAYS_EMPLOYED',
 'credit_annuity_ratio',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'age_score_ratio',
 'NAME_EDUCATION_TYPE_Higher_education',
 'p_NUM_DAYS_LAST_DUE_1ST_VERSION_max',
 'score_credit_ratio',
 'NAME_FAMILY_STATUS_Married',
 'CODE_GENDER_F',
 'BUREAU_bureau_NUM_AMT_CREDIT_SUM_DEBT_mean_mean',
 'DAYS_BIRTH',
 'p_NUM_ip_NUM_AMT_PAYMENT_min_mean',
 'p_NUM_ip_NUM_late_payment_lag_max_mean',
 'FLAG_OWN_CAR_N',
 'prev_cat_CNT_NAME_CONTRACT_STATUS_Refused',
 'CODE_GENDER_M',
 'BUREAU_bureau_NUM_AMT_CREDIT_MAX_OVERDUE_mean_mean',
 'BUREAU_bureau_CNT_CREDIT_ACTIVE_Closed_mean',
 'prev_cat_CNT_NAME_YIELD_GROUP_low_action',
 'LAST3_ip_NUM_late_payment_lag_max_mean',
 'ORGANIZATION_TYPE_GROUPED_Public_Sector',
 'BUREAU_bureau_NUM_credit_usage_mean_mean',
 'SUM_FLAG_DOCUMENT',
 'AMT_CREDIT',
 'REGION_RATING_CLIENT_W_CITY',
 'p_NUM_AMT_DOWN_PAYMENT_max',
 'prev_cat_CNT_NAME_YIELD_GROUP_high',
 'DAYS_ID_PUBLISH',
 'p_NUM_DAYS_LAST_DUE_1ST_VERSION_mean',
 'p_NUM_ccb_NUM_CNT_DRAWINGS_ATM_CURRENT_mean_max',
 'p_NUM_DAYS_LAST_DUE_max',
 'p_NUM_ip_NUM_AMT_PAYMENT_min_max',
 'p_NUM_ip_NUM_late_payment_lag_max_max',
 'BUREAU_bureau_NUM_DAYS_CREDIT_mean_mean',
 'BUREAU_bureau_NUM_bureau_debt_over_limit_mean_mean',
 'p_NUM_POS_NUM_CNT_INSTALMENT_FUTURE_min_mean',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'FLAG_DOCUMENT_3',
 'p_NUM_POS_NUM_MONTHS_BALANCE_max_max',
 'p_NUM_POS_NUM_CNT_INSTALMENT_FUTURE_mean_mean',
 'BUREAU_bureau_NUM_AMT_CREDIT_SUM_mean_mean',
 'OWN_CAR_AGE',
 'prev_cat_CNT_PRODUCT_COMBINATION_Cash_X_Sell__low',
 'LAST3_ip_NUM_late_payment_lag_max_max',
 'prev_cat_CNT_CODE_REJECT_REASON_XAP',
 'p_NUM_ip_NUM_NUM_INSTALMENT_VERSION_mean_max',
 'NAME_EDUCATION_TYPE_Secondary_secondary_special',
 'BUREAU_bureau_CNT_CREDIT_TYPE_Mortgage_mean',
 'NAME_CONTRACT_TYPE_Cash_loans',
 'LAST3_ip_NUM_AMT_PAYMENT_min_max',
 'REG_CITY_NOT_LIVE_CITY',
 'p_NUM_POS_NUM_CNT_INSTALMENT_std_mean',
 'LAST3_ip_NUM_late_payment_lag_mean_max',
 'DAYS_LAST_PHONE_CHANGE',
 'p_NUM_AMT_ANNUITY_mean',
 'LAST3_ip_NUM_DAYS_INSTALMENT_std_max',
 'prev_cat_CNT_NAME_PRODUCT_TYPE_walk_in',
 'LAST3_POS_NUM_CNT_INSTALMENT_FUTURE_mean_std',
 'BUREAU_bureau_NUM_DAYS_CREDIT_ENDDATE_mean_mean',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'p_NUM_POS_NUM_CNT_INSTALMENT_std_max',
 'p_NUM_ip_NUM_NUM_INSTALMENT_VERSION_std_max',
 'LAST3_ip_NUM_AMT_PAYMENT_min_mean',
 'p_NUM_AMT_DOWN_PAYMENT_mean',
 'LAST1_ip_NUM_late_payment_lag_max_mean',
 'NAME_INCOME_TYPE_GROUPED_Working',
 'DAYS_REGISTRATION',
 'p_NUM_POS_NUM_MONTHS_BALANCE_mean_max',
 'p_NUM_ip_NUM_NUM_INSTALMENT_VERSION_std_mean',
 'BUREAU_bureau_CNT_CREDIT_TYPE_Microloan_mean',
 'p_NUM_CNT_PAYMENT_mean',
 'p_NUM_ip_NUM_AMT_INSTALMENT_min_mean',
 'p_NUM_ip_NUM_AMT_PAYMENT_mean_max',
 'income_goods_ratio',
 'prev_cat_CNT_PRODUCT_COMBINATION_POS_industry_with_interest',
 'p_NUM_ip_NUM_DAYS_ENTRY_PAYMENT_max_max',
 'APARTMENTS_MEDI',
 'LAST3_ip_NUM_late_payment_lag_std_mean',
 'p_NUM_ip_NUM_AMT_INSTALMENT_min_max',
 'AMT_INCOME_TOTAL',
 'LAST3_ccb_NUM_CNT_DRAWINGS_CURRENT_std_mean',
 'p_NUM_ccb_NUM_CNT_DRAWINGS_CURRENT_std_max',
 'BUREAU_bureau_NUM_AMT_CREDIT_SUM_LIMIT_mean_mean',
 'BUREAU_bureau_CNT_CREDIT_ACTIVE_Active_mean',
 'p_NUM_POS_NUM_credit_term_ratio_mean_mean',
 'p_NUM_POS_NUM_CNT_INSTALMENT_FUTURE_min_max',
 'p_NUM_POS_NUM_CNT_INSTALMENT_FUTURE_mean_max',
 'LAST3_ip_NUM_late_payment_lag_std_max',
 'p_NUM_ip_NUM_DAYS_INSTALMENT_std_max',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'prev_cat_CNT_NAME_GOODS_CATEGORY_Furniture',
 'p_NUM_ccb_NUM_CNT_DRAWINGS_ATM_CURRENT_mean_mean',
 'LAST3_DAYS_LAST_DUE_1ST_VERSION_max',
 'p_NUM_RATE_DOWN_PAYMENT_max',
 'LAST3_POS_NUM_SK_DPD_DEF_std_std',
 'p_NUM_ip_NUM_DAYS_INSTALMENT_min_mean',
 'p_NUM_HOUR_APPR_PROCESS_START_max',
 'p_NUM_ccb_NUM_AMT_BALANCE_max_max',
 'p_NUM_POS_CNT_NAME_CONTRACT_STATUS_Active_max',
 'p_NUM_ip_NUM_DAYS_ENTRY_PAYMENT_std_max',
 'LAST3_HOUR_APPR_PROCESS_START_max',
 'LAST3_ip_NUM_DAYS_ENTRY_PAYMENT_std_max',
 'APARTMENTS_AVG',
 'p_NUM_DAYS_FIRST_DRAWING_max',
 'BUREAU_bureau_CNT_CREDIT_TYPE_Car_loan_mean',
 'OCCUPATION_TYPE_GROUPED_Core_staff',
 'p_NUM_RATE_DOWN_PAYMENT_mean',
 'LAST3_RATE_DOWN_PAYMENT_max',
 'prev_cat_CNT_NAME_PRODUCT_TYPE_XNA',
 'p_NUM_AMT_GOODS_PRICE_mean',
 'LAST3_RATE_DOWN_PAYMENT_std',
 'LAST1_ip_NUM_AMT_PAYMENT_min_mean',
 'p_NUM_HOUR_APPR_PROCESS_START_mean',
 'LAST1_AMT_GOODS_PRICE_mean',
 'LAST1_HOUR_APPR_PROCESS_START_mean',
 'p_NUM_CNT_PAYMENT_max',
 'p_NUM_POS_CNT_NAME_CONTRACT_STATUS_Active_mean',
 'LAST3_ip_NUM_DAYS_ENTRY_PAYMENT_max_max',
 'LAST3_POS_NUM_CNT_INSTALMENT_mean_std',
 'p_NUM_DAYS_DECISION_mean',
 'OCCUPATION_TYPE_GROUPED_Drivers',
 'REGION_POPULATION_RELATIVE',
 'BUREAU_bureau_NUM_bureau_CNT_STATUS_1_mean_mean',
 'LAST3_POS_NUM_credit_term_ratio_mean_mean',
 'p_NUM_ip_NUM_DAYS_ENTRY_PAYMENT_std_mean',
 'LAST3_CNT_PAYMENT_mean',
 'LAST1_ip_NUM_late_payment_lag_std_mean',
 'p_NUM_ccb_NUM_CNT_DRAWINGS_ATM_CURRENT_std_mean',
 'prev_cat_CNT_CODE_REJECT_REASON_LIMIT',
 'ORGANIZATION_TYPE_GROUPED_Self_employed',
 'BUREAU_bureau_NUM_bureau_CNT_STATUS_0_mean_mean',
 'LAST3_AMT_ANNUITY_std',
 'p_NUM_ccb_NUM_AMT_CREDIT_LIMIT_ACTUAL_mean_mean',
 'p_NUM_DAYS_TERMINATION_max',
 'LAST3_ip_NUM_DAYS_ENTRY_PAYMENT_std_mean',
 'LAST3_AMT_CREDIT_mean',
 'prev_cat_CNT_NAME_CONTRACT_STATUS_Approved',
 'TOTALAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'LAST3_DAYS_LAST_DUE_1ST_VERSION_std',
 'prev_cat_CNT_NAME_CLIENT_TYPE_New',
 'p_NUM_POS_NUM_SK_DPD_DEF_std_max',
 'LAST3_HOUR_APPR_PROCESS_START_std',
 'p_NUM_ip_NUM_AMT_PAYMENT_max_max',
 'p_NUM_POS_NUM_credit_term_ratio_min_mean',
 'prev_cat_CNT_CHANNEL_TYPE_AP___Cash_loan_',
 'LAST3_CNT_PAYMENT_max',
 'p_NUM_ccb_NUM_AMT_RECEIVABLE_PRINCIPAL_mean_max',
 'BUREAU_bureau_NUM_DAYS_CREDIT_UPDATE_mean_mean',
 'BUREAU_bureau_NUM_bureau_NUM_MONTHS_BALANCE_std_mean_mean',
 'p_NUM_AMT_APPLICATION_max',
 'YEARS_BEGINEXPLUATATION_MODE',
 'LAST3_ip_NUM_late_payment_lag_max_std',
 'p_NUM_DAYS_LAST_DUE_mean',
 'LAST3_AMT_ANNUITY_max',
 'BUREAU_bureau_NUM_DAYS_ENDDATE_FACT_mean_mean',
 'p_NUM_ip_NUM_DAYS_INSTALMENT_std_mean',
 'prev_cat_CNT_CODE_REJECT_REASON_HC',
 'LAST3_POS_NUM_credit_term_ratio_std_mean',
 'prev_cat_CNT_PRODUCT_COMBINATION_Cash_X_Sell__high',
 'LAST3_POS_NUM_SK_DPD_DEF_std_mean',
 'LAST3_RATE_DOWN_PAYMENT_mean',
 'p_NUM_ccb_NUM_CNT_DRAWINGS_CURRENT_mean_mean',
 'p_NUM_ip_NUM_AMT_PAYMENT_max_mean',
 'p_NUM_ccb_NUM_max_drawings_receivable_ratio_std_max',
 'p_NUM_ip_NUM_late_payment_lag_std_mean',
 'p_NUM_ccb_NUM_CNT_DRAWINGS_CURRENT_mean_max',
 'LAST3_POS_NUM_credit_term_ratio_min_std',
 'OCCUPATION_TYPE_GROUPED_Accountants',
 'p_NUM_ccb_NUM_max_drawings_receivable_ratio_mean_mean',
 'p_NUM_DAYS_FIRST_DRAWING_mean',
 'OCCUPATION_TYPE_GROUPED_Laborers',
 'LAST3_AMT_DOWN_PAYMENT_max',
 'LAST1_ip_NUM_late_payment_lag_mean_mean',
 'YEARS_BUILD_MODE',
 'p_NUM_ip_NUM_NUM_INSTALMENT_VERSION_mean_mean',
 'LAST3_ip_NUM_NUM_INSTALMENT_VERSION_std_mean',
 'LAST3_ip_NUM_DAYS_INSTALMENT_min_mean',
 'p_NUM_DAYS_DECISION_max',
 'p_NUM_POS_NUM_CNT_INSTALMENT_FUTURE_std_mean',
 'p_NUM_POS_NUM_credit_term_ratio_std_max',
 'p_NUM_ccb_NUM_max_drawings_receivable_ratio_max_max',
 'prev_cat_CNT_CHANNEL_TYPE_Channel_of_corporate_sales',
 'LAST3_POS_NUM_SK_DPD_DEF_mean_mean',
 'LAST3_ip_NUM_NUM_INSTALMENT_VERSION_mean_mean',
 'p_NUM_ccb_NUM_AMT_BALANCE_mean_mean',
 'p_NUM_ip_NUM_AMT_PAYMENT_mean_mean',
 'LAST3_ip_NUM_AMT_INSTALMENT_min_max',
 'p_NUM_ip_NUM_NUM_INSTALMENT_VERSION_max_max',
 'LAST1_SELLERPLACE_AREA_mean',
 'LAST1_ip_NUM_DAYS_INSTALMENT_std_mean',
 'p_NUM_ip_NUM_late_payment_lag_mean_mean',
 'LAST1_DAYS_TERMINATION_mean',
 'YEARS_BEGINEXPLUATATION_AVG',
 'LAST3_ip_NUM_late_payment_lag_min_std',
 'LAST3_SELLERPLACE_AREA_mean',
 'p_NUM_POS_NUM_SK_DPD_DEF_max_mean',
 'LAST3_ip_NUM_NUM_INSTALMENT_NUMBER_max_max',
 'p_NUM_SELLERPLACE_AREA_max',
 'LIVINGAPARTMENTS_MODE',
 'ORGANIZATION_TYPE_GROUPED_Finance',
 'LAST3_ip_NUM_NUM_INSTALMENT_VERSION_mean_max',
 'BUREAU_bureau_NUM_bureau_CNT_STATUS_C_mean_mean',
 'p_NUM_ip_NUM_AMT_PAYMENT_std_mean',
 'LAST3_DAYS_DECISION_std',
 'LAST3_ip_NUM_AMT_INSTALMENT_mean_max',
 'LAST3_POS_NUM_SK_DPD_mean_std',
 'LAST3_SELLERPLACE_AREA_std',
 'prev_cat_CNT_PRODUCT_COMBINATION_POS_industry_without_interest',
 'p_NUM_ip_NUM_AMT_INSTALMENT_std_mean',
 'p_NUM_ccb_NUM_AMT_CREDIT_LIMIT_ACTUAL_mean_max',
 'p_NUM_ccb_NUM_AMT_DRAWINGS_CURRENT_std_max',
 'p_NUM_ip_NUM_late_payment_lag_mean_max',
 'LAST3_HOUR_APPR_PROCESS_START_mean',
 'p_NUM_POS_NUM_MONTHS_BALANCE_std_mean',
 'LAST3_DAYS_TERMINATION_std',
 'LAST3_AMT_GOODS_PRICE_mean',
 'LAST3_POS_NUM_credit_term_ratio_min_max',
 'LAST3_ip_NUM_AMT_INSTALMENT_max_mean',
 'LAST3_AMT_ANNUITY_mean',
 'LAST3_POS_NUM_SK_DPD_DEF_std_max',
 'p_NUM_POS_NUM_CNT_INSTALMENT_max_mean',
 'p_NUM_ip_NUM_late_payment_lag_std_max',
 'LAST3_ip_NUM_DAYS_INSTALMENT_std_mean',
 'LAST3_POS_NUM_credit_term_ratio_mean_max',
 'LAST3_DAYS_LAST_DUE_max',
 'LAST3_DAYS_TERMINATION_max',
 'LAST3_POS_NUM_CNT_INSTALMENT_FUTURE_mean_mean',
 'LAST3_DAYS_DECISION_mean',
 'p_NUM_ccb_NUM_AMT_PAYMENT_TOTAL_CURRENT_mean_mean',
 'LIVINGAREA_MODE',
 'p_NUM_POS_NUM_MONTHS_BALANCE_std_max',
 'p_NUM_ip_NUM_DAYS_INSTALMENT_mean_max',
 'prev_cat_CNT_NAME_CONTRACT_TYPE_Consumer_loans',
 'LAST1_ip_NUM_DAYS_ENTRY_PAYMENT_max_mean',
 'p_NUM_ip_NUM_DAYS_INSTALMENT_mean_mean',
 'YEARS_BUILD_AVG',
 'p_NUM_POS_NUM_MONTHS_BALANCE_min_mean',
 'LAST3_ccb_NUM_CNT_DRAWINGS_CURRENT_mean_mean',
 'LAST3_POS_NUM_CNT_INSTALMENT_std_mean',
 'FLOORSMAX_AVG',
 'LAST3_ip_NUM_late_payment_lag_mean_mean',
 'LAST3_ip_NUM_AMT_INSTALMENT_min_mean',
 'NAME_INCOME_TYPE_GROUPED_State_servant',
 'p_NUM_POS_NUM_MONTHS_BALANCE_min_max',
 'prev_cat_CNT_NAME_PORTFOLIO_POS',
 'p_NUM_ccb_NUM_CNT_DRAWINGS_ATM_CURRENT_std_max',
 'p_NUM_ip_NUM_AMT_INSTALMENT_max_mean',
 'LAST1_ip_NUM_AMT_INSTALMENT_min_mean',
 'prev_cat_CNT_NAME_YIELD_GROUP_low_normal',
 'LAST3_POS_NUM_CNT_INSTALMENT_std_std',
 'LAST3_ip_NUM_late_payment_lag_std_std',
 'p_NUM_ip_NUM_DAYS_INSTALMENT_max_max',
 'LAST3_POS_NUM_SK_DPD_max_std',
 'LAST3_ip_NUM_late_payment_lag_min_max',
 'LAST1_ip_NUM_NUM_INSTALMENT_VERSION_mean_mean',
 'LAST3_ip_NUM_late_payment_lag_mean_std',
 'LAST1_ip_NUM_AMT_INSTALMENT_std_mean',
 'FLAG_WORK_PHONE',
 'BUREAU_bureau_NUM_bureau_NUM_MONTHS_BALANCE_mean_mean_mean',
 'APARTMENTS_MODE',
 'p_NUM_POS_NUM_CNT_INSTALMENT_FUTURE_std_max',
 'LAST3_ip_NUM_NUM_INSTALMENT_VERSION_std_max',
 'LAST3_POS_NUM_credit_term_ratio_mean_std',
 'LAST3_POS_NUM_MONTHS_BALANCE_std_std',
 'prev_cat_CNT_PRODUCT_COMBINATION_POS_household_without_interest',
 'LAST3_CNT_PAYMENT_std',
 'LANDAREA_AVG',
 'p_NUM_AMT_CREDIT_mean',
 'p_NUM_ccb_NUM_max_drawings_receivable_ratio_std_mean',
 'BUREAU_bureau_NUM_bureau_CNT_STATUS_X_mean_mean',
 'BUREAU_bureau_NUM_AMT_ANNUITY_mean_mean',
 'BUREAU_bureau_CNT_CREDIT_TYPE_Consumer_credit_mean',
 'LAST3_ip_NUM_DAYS_INSTALMENT_min_std',
 'LAST3_AMT_DOWN_PAYMENT_mean',
 'p_NUM_ip_NUM_AMT_INSTALMENT_max_max',
 'BUREAU_bureau_NUM_bureau_NUM_MONTHS_BALANCE_min_mean_mean',
 'LAST1_RATE_DOWN_PAYMENT_mean',
 'p_NUM_POS_NUM_MONTHS_BALANCE_mean_mean',
 'LAST3_ip_NUM_DAYS_INSTALMENT_mean_std',
 'LIVINGAREA_AVG',
 'LAST3_DAYS_FIRST_DUE_mean',
 'LAST3_POS_NUM_MONTHS_BALANCE_min_mean',
 'LAST3_ip_NUM_late_payment_lag_min_mean',
 'p_NUM_POS_NUM_credit_term_ratio_min_max',
 'LAST3_ccb_NUM_AMT_INST_MIN_REGULARITY_std_mean',
 'NONLIVINGAREA_AVG',
 'LAST3_ip_NUM_DAYS_ENTRY_PAYMENT_mean_mean',
 'p_NUM_ccb_NUM_AMT_PAYMENT_CURRENT_max_mean',
 'LAST3_POS_CNT_NAME_CONTRACT_STATUS_Active_max',
 'p_NUM_POS_NUM_SK_DPD_DEF_max_max',
 'LAST1_POS_NUM_credit_term_ratio_mean_mean',
 'BUREAU_bureau_NUM_AMT_CREDIT_SUM_OVERDUE_mean_mean',
 'FLOORSMAX_MEDI',
 'LAST1_POS_NUM_credit_term_ratio_std_mean',
 'LAST3_ip_NUM_NUM_INSTALMENT_NUMBER_std_max',
 'p_NUM_ip_NUM_late_payment_lag_min_max',
 'LAST1_ip_NUM_DAYS_ENTRY_PAYMENT_min_mean',
 'prev_cat_CNT_NAME_SELLER_INDUSTRY_Connectivity',
 'LAST1_POS_NUM_SK_DPD_std_mean',
 'LAST1_ccb_NUM_CNT_DRAWINGS_CURRENT_std_mean',
 'LAST3_POS_CNT_NAME_CONTRACT_STATUS_Active_std',
 'p_NUM_ccb_NUM_AMT_PAYMENT_CURRENT_mean_mean',
 'p_NUM_ccb_NUM_AMT_RECEIVABLE_PRINCIPAL_mean_mean',
 'LAST1_DAYS_LAST_DUE_mean',
 'LAST3_POS_NUM_CNT_INSTALMENT_FUTURE_min_max',
 'p_NUM_SELLERPLACE_AREA_mean',
 'p_NUM_ip_NUM_DAYS_ENTRY_PAYMENT_min_mean',
 'LAST1_POS_NUM_MONTHS_BALANCE_mean_mean',
 'LAST1_ip_NUM_AMT_PAYMENT_max_mean',
 'LAST3_ip_NUM_AMT_PAYMENT_std_mean',
 'p_NUM_POS_NUM_SK_DPD_DEF_std_mean',
 'p_NUM_POS_NUM_CNT_INSTALMENT_mean_max',
 'LAST1_DAYS_LAST_DUE_1ST_VERSION_mean',
 'LAST3_ccb_NUM_max_drawings_receivable_ratio_mean_mean',
 'LAST3_AMT_APPLICATION_std',
 'p_NUM_ip_NUM_late_payment_lag_min_mean',
 'LIVINGAPARTMENTS_AVG',
 'LAST3_POS_NUM_credit_term_ratio_std_std',
 'p_NUM_ccb_NUM_AMT_CREDIT_LIMIT_ACTUAL_min_mean',
 'LAST3_POS_NUM_CNT_INSTALMENT_FUTURE_min_mean',
 'p_NUM_ccb_NUM_CNT_DRAWINGS_CURRENT_max_max',
 'p_NUM_ccb_NUM_AMT_BALANCE_min_max',
 'LAST1_POS_CNT_NAME_CONTRACT_STATUS_Active_mean',
 'LAST1_ip_NUM_AMT_INSTALMENT_mean_mean',
 'LAST3_ccb_NUM_AMT_INST_MIN_REGULARITY_max_mean',
 'p_NUM_ip_NUM_NUM_INSTALMENT_NUMBER_mean_mean',
 'p_NUM_ccb_NUM_CNT_INSTALMENT_MATURE_CUM_std_mean',
 'HOUR_APPR_PROCESS_START',
 'LAST3_ccb_NUM_AMT_BALANCE_mean_max',
 'LAST3_ip_NUM_AMT_INSTALMENT_max_max',
 'LAST3_POS_CNT_NAME_CONTRACT_STATUS_Active_mean',
 'LIVINGAPARTMENTS_MEDI',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'prev_cat_CNT_NAME_GOODS_CATEGORY_Photo___Cinema_Equipment',
 'BASEMENTAREA_AVG',
 'LAST3_DAYS_FIRST_DRAWING_mean',
 'LAST1_ip_NUM_AMT_PAYMENT_std_mean',
 'FLAG_DOCUMENT_11',
 'BASEMENTAREA_MEDI',
 'p_NUM_AMT_CREDIT_max',
 'p_NUM_AMT_APPLICATION_mean',
 'LAST3_ip_NUM_AMT_PAYMENT_mean_max',
 'LAST3_ccb_NUM_CNT_DRAWINGS_ATM_CURRENT_std_mean',
 'BUREAU_bureau_NUM_bureau_NUM_MONTHS_BALANCE_max_mean_mean',
 'p_NUM_ccb_NUM_AMT_DRAWINGS_CURRENT_mean_mean',
 'LAST3_POS_NUM_MONTHS_BALANCE_std_mean',
 'LAST3_POS_NUM_CNT_INSTALMENT_min_mean',
 'LAST1_POS_NUM_CNT_INSTALMENT_FUTURE_std_mean',
 'LAST3_ip_NUM_AMT_PAYMENT_max_std',
 'LAST3_ip_NUM_DAYS_ENTRY_PAYMENT_mean_std',
 'LAST3_POS_CNT_NAME_CONTRACT_STATUS_Signed_mean',
 'p_NUM_ccb_NUM_AMT_RECIVABLE_mean_max',
 'BUREAU_bureau_CNT_CREDIT_TYPE_Credit_card_mean',
 'LAST1_POS_NUM_CNT_INSTALMENT_FUTURE_mean_mean',
 'p_NUM_ip_NUM_NUM_INSTALMENT_NUMBER_mean_max',
 'LAST3_DAYS_LAST_DUE_std',
 'LAST3_ip_NUM_DAYS_INSTALMENT_max_std',
 'LAST3_ip_NUM_AMT_PAYMENT_std_std',
 'p_NUM_POS_NUM_CNT_INSTALMENT_mean_mean',
 'p_NUM_ccb_NUM_AMT_BALANCE_mean_max',
 'LAST1_ip_NUM_DAYS_INSTALMENT_mean_mean',
 'LAST3_POS_NUM_CNT_INSTALMENT_mean_mean',
 'prev_cat_CNT_NAME_CASH_LOAN_PURPOSE_XAP',
 'LAST3_POS_NUM_CNT_INSTALMENT_std_max',
 'LAST1_ip_NUM_NUM_INSTALMENT_VERSION_std_mean',
 'LAST3_ip_NUM_AMT_INSTALMENT_max_std',
 'LAST3_DAYS_LAST_DUE_mean',
 'LAST3_ip_NUM_DAYS_ENTRY_PAYMENT_max_std',
 'LAST1_AMT_ANNUITY_mean',
 'p_NUM_ccb_NUM_AMT_BALANCE_std_mean']

In [31]:
# Restrict the processed test table to the columns named in `features`
# (every row is kept), then preview the first rows.
test_features = test_data.loc[:, features]
test_features.head()

Unnamed: 0_level_0,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,DAYS_EMPLOYED,credit_annuity_ratio,AMT_ANNUITY,AMT_GOODS_PRICE,age_score_ratio,NAME_EDUCATION_TYPE_Higher_education,p_NUM_DAYS_LAST_DUE_1ST_VERSION_max,...,LAST1_ip_NUM_DAYS_INSTALMENT_mean_mean,LAST3_POS_NUM_CNT_INSTALMENT_mean_mean,prev_cat_CNT_NAME_CASH_LOAN_PURPOSE_XAP,LAST3_POS_NUM_CNT_INSTALMENT_std_max,LAST1_ip_NUM_NUM_INSTALMENT_VERSION_std_mean,LAST3_ip_NUM_AMT_INSTALMENT_max_std,LAST3_DAYS_LAST_DUE_mean,LAST3_ip_NUM_DAYS_ENTRY_PAYMENT_max_std,LAST1_AMT_ANNUITY_mean,p_NUM_ccb_NUM_AMT_BALANCE_std_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,0.15952,0.789654,0.752614,-2329.0,27.664697,20560.5,450000.0,-25565.545495,1,-1499.0,...,-1664.0,4.0,1.0,0.0,0.5,,-1619.0,,3951.0,
100005,0.432962,0.291656,0.56499,-4469.0,12.82487,17370.0,180000.0,-31972.235877,0,-376.0,...,,11.7,1.0,0.948683,,,-466.0,,,
100013,0.610991,0.699787,,-4458.0,9.505482,69777.0,630000.0,0.0,1,224.0,...,,20.472222,2.0,4.478343,,249329.619926,-219.0,172.534055,,
100028,0.612704,0.509677,0.525734,-1866.0,32.130726,49018.5,1575000.0,-26583.786845,0,-496.0,...,,7.857143,4.0,2.672612,,,-646.0,,,7894.798137
100038,,0.425687,0.202145,-2191.0,19.506034,32067.0,625500.0,-64508.152622,0,-457.0,...,,12.0,1.0,0.0,,,-457.0,,24463.71,


# Evaluate model

Now, the models can be evaluated with the processed data: 

In [32]:
# Load the persisted models from disk with joblib.
# NOTE(review): model_lgb is read from 'best_catboost_model.pkl', the very same
# file as model_cat, so both names are deserialised from one artifact and any
# predictions made with model_lgb would be expected to match those of model_cat.
# This looks like a copy-paste slip; point model_lgb at the LightGBM pickle --
# TODO confirm that file's name.
model_lgb = joblib.load('best_catboost_model.pkl')
model_cat = joblib.load('best_catboost_model.pkl')
model_voting = joblib.load('voting.pkl')

In [33]:
# Score the test features with each loaded model, keeping only column 1 of
# predict_proba's output. The generator is consumed in order, so the models
# are evaluated lgb -> cat -> voting.
y_pred_lgb, y_pred_cat, y_pred_voting = (
    estimator.predict_proba(test_features)[:, 1]
    for estimator in (model_lgb, model_cat, model_voting)
)

In [34]:
def _as_submission(class_one_proba):
    """Build a two-column frame pairing the index of `test_features` with the given scores.

    The index values go into "SK_ID_CURR" and the scores into "TARGET".
    """
    return pd.DataFrame({
        "SK_ID_CURR": test_features.index,
        "TARGET": class_one_proba,
    })


# One submission frame per model, all built the same way.
submission_lgb = _as_submission(y_pred_lgb)
submission_cat = _as_submission(y_pred_cat)
submission_voting = _as_submission(y_pred_voting)


In [35]:
# Write each submission frame to its own CSV; index=False leaves the row index
# out of the file so only SK_ID_CURR and TARGET are written.
for submission_frame, csv_name in (
    (submission_lgb, 'submission_lgb.csv'),
    (submission_cat, 'submission_cat.csv'),
    (submission_voting, 'submission_voting.csv'),
):
    submission_frame.to_csv(csv_name, index=False)

The LightGBM model obtained a public Kaggle score of 0.78709, which is acceptable for this project. 
The CatBoost model obtained a public score of 0.78803, which is slightly higher than the LightGBM model's score (as expected). 
The voting model obtained the highest public Kaggle score, 0.78879, as expected. 

# Conclusion and improvements

In conclusion, this project has trained a LightGBM, a CatBoost, a hist-gb, and a voting model to predict the default probability of Home Credit's customers. The best performing model was found to be the voting model, which takes and votes on predictions from the three boosted models. It achieved a ROC-AUC (the main evaluation metric) of 0.792 on the validation dataset. When this model was evaluated with the test dataset on Kaggle's platform, it obtained a score of 0.78879. 

However, the fastest and most explainable model is LightGBM, with a ROC-AUC score of 0.790 on the validation dataset and 0.78709 on the test dataset on Kaggle's platform. This LightGBM model was chosen for deployment, as speed and confidence in model reasoning are deemed more important than a slight decrease in performance for Home Credit's organization.

Both of these roc-auc scores are above 0.78, which is the objective score for this project. 

From looking at the SHAP summary plot for this model, key observations include: 

- External credit rating scores are by far the most important features in determining the default probability of a customer. In each instance, the SHAP summary plots show that having high external credit scores leads to a lower likelihood of a customer defaulting on a loan. 
- Sociodemographic factors also influence a customer’s likelihood of defaulting on their Home Credit loan, notably: the length of their employment, age, gender, marriage status, if they own a car, and if they have higher education or not. 
- Finally, economic factors and repayment tendencies describing the customer’s loan influence their default probability, notably: the annuity on their current loan application, the credit/annuity ratio of their current loan application, the amount of debt they had on previous loans at the credit bureau, or their late payment behaviour on previous Home Credit loans. 



Project improvements include:  
- The models could be trained with more recent data, as this Kaggle competition is 8 years old. This would keep the model relevant and up to date, and ensure that Home Credit continues to provide its loan services to the maximum number of trustworthy customers. 
- In addition, infrastructure for more thorough testing and monitoring for model deployment could be implemented, to ensure that the model's performance and input data stay constant over time. 
- It would also be interesting to train other (simpler) model types, such as a logistic regression model with a smaller number of features. As long as the obtained ROC-AUC score does not drop significantly with the simpler model, Home Credit would be able to pinpoint exactly why a customer's loan application is accepted or rejected, and by exactly how much each feature influences the decision. 
- Threshold tuning or model calibration could have been performed to test model reliability.
- Adding a more user friendly and speedy streamlit interface, with model explanations, would provide a more robust and pleasant user experience. 
- Finding more ways to optimize memory usage for model deployment would increase robustness and ease of model deployment. 

# Appendix:

Obtaining the valid_data dataset for use in streamlit model deployment, where all Home Credit customer id values are ensured to be in all tables before aggregation: 

In [None]:
# Read the application table and the two pre-aggregated tables.
train = pd.read_csv("application_train.csv")
bureau = pd.read_csv("bureau_final.csv")
p_merged = pd.read_csv("p_final_merged.csv")

# Collect the customer ids present in each table, then keep only the ids that
# occur in all three.
train_ids = set(train["SK_ID_CURR"])
bureau_ids = set(bureau["SK_ID_CURR"])
p_merged_ids = set(p_merged["SK_ID_CURR"])
valid_ids = train_ids & bureau_ids & p_merged_ids
list_of_ids = list(valid_ids)

In [15]:
# Boolean mask: True for application rows whose customer id is in list_of_ids.
has_valid_id = train["SK_ID_CURR"].isin(list_of_ids)
valid_data = train.loc[has_valid_id]
valid_data.head(20)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
5,100008,0,Cash loans,M,N,Y,0,99000.0,490495.5,27517.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
6,100009,0,Cash loans,F,Y,Y,1,171000.0,1560726.0,41301.0,...,0,0,0,0,0.0,0.0,0.0,1.0,1.0,2.0
7,100010,0,Cash loans,M,Y,Y,0,360000.0,1530000.0,42075.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
8,100011,0,Cash loans,F,N,Y,0,112500.0,1019610.0,33826.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
10,100014,0,Cash loans,F,N,Y,1,112500.0,652500.0,21177.0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
11,100015,0,Cash loans,F,N,Y,0,38419.155,148365.0,10678.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


In [5]:
# Remove the TARGET column from the rows that will be exported.
# errors='ignore' keeps this cell re-runnable: the result is assigned back to
# the same name, so on a second run TARGET is already gone and a plain drop
# would raise KeyError.
valid_data = valid_data.drop(columns=['TARGET'], errors='ignore')
valid_data

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
5,100008,Cash loans,M,N,Y,0,99000.0,490495.5,27517.5,454500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307503,456247,Cash loans,F,N,Y,0,112500.0,345510.0,17770.5,247500.0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,2.0
307505,456249,Cash loans,F,N,Y,0,112500.0,225000.0,22050.0,225000.0,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0
307508,456253,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Save the filtered rows to disk; index=False leaves the row index out of the file.
valid_data.to_csv(path_or_buf="valid_data.csv", index=False)

Obtaining a .json format of a row in the Home Credit dataset: 

In [14]:
# Take the application row for one customer id as a plain dict, remove the
# TARGET entry if present, and serialise the remaining fields to a JSON string.
# NOTE(review): with its default settings json.dumps writes missing float
# values as the bare token NaN, which strict JSON parsers reject -- confirm the
# consumer of this payload accepts it.
is_sample_row = train["SK_ID_CURR"] == 456236
sample = train.loc[is_sample_row].to_dict(orient="records")[0]
sample.pop('TARGET', None)
sample_json = json.dumps(sample)
sample_json

'{"SK_ID_CURR": 456236, "NAME_CONTRACT_TYPE": "Cash loans", "CODE_GENDER": "M", "FLAG_OWN_CAR": "Y", "FLAG_OWN_REALTY": "Y", "CNT_CHILDREN": 0, "AMT_INCOME_TOTAL": 585000.0, "AMT_CREDIT": 1575000.0, "AMT_ANNUITY": 43443.0, "AMT_GOODS_PRICE": 1575000.0, "NAME_TYPE_SUITE": "Unaccompanied", "NAME_INCOME_TYPE": "Working", "NAME_EDUCATION_TYPE": "Secondary / secondary special", "NAME_FAMILY_STATUS": "Married", "NAME_HOUSING_TYPE": "House / apartment", "REGION_POPULATION_RELATIVE": 0.028663, "DAYS_BIRTH": -20965, "DAYS_EMPLOYED": -1618, "DAYS_REGISTRATION": -1764.0, "DAYS_ID_PUBLISH": -4410, "OWN_CAR_AGE": 2.0, "FLAG_MOBIL": 1, "FLAG_EMP_PHONE": 1, "FLAG_WORK_PHONE": 0, "FLAG_CONT_MOBILE": 1, "FLAG_PHONE": 0, "FLAG_EMAIL": 0, "OCCUPATION_TYPE": "Sales staff", "CNT_FAM_MEMBERS": 2.0, "REGION_RATING_CLIENT": 2, "REGION_RATING_CLIENT_W_CITY": 2, "WEEKDAY_APPR_PROCESS_START": "FRIDAY", "HOUR_APPR_PROCESS_START": 10, "REG_REGION_NOT_LIVE_REGION": 0, "REG_REGION_NOT_WORK_REGION": 0, "LIVE_REGION_N