# Home Credit Default

In [78]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective
import warnings
import pickle
import gc

gc.enable()
warnings.filterwarnings("ignore")
np.random.seed(333)

### Load the data

In [79]:
application = pd.read_csv("/Users/danielsaxton/home_credit_default_risk/application_test.csv")
bureau_balance = pd.read_csv("/Users/danielsaxton/home_credit_default_risk/bureau_balance.csv")
bureau = pd.read_csv("/Users/danielsaxton/home_credit_default_risk/bureau.csv")
credit_card = pd.read_csv("/Users/danielsaxton/home_credit_default_risk/credit_card_balance.csv")
installments = pd.read_csv("/Users/danielsaxton/home_credit_default_risk/installments_payments.csv")
pos_cash = pd.read_csv("/Users/danielsaxton/home_credit_default_risk/POS_CASH_balance.csv")
previous_application = pd.read_csv("/Users/danielsaxton/home_credit_default_risk/previous_application.csv")

### Subset to only those records matching with an application

In [80]:
app_key = set(application["SK_ID_CURR"])
bureau = bureau[bureau["SK_ID_CURR"].isin(app_key)]
bur_key = set(bureau["SK_ID_BUREAU"])

In [81]:
bureau_balance = bureau_balance[bureau_balance["SK_ID_BUREAU"].isin(bur_key)]
credit_card = credit_card[credit_card["SK_ID_CURR"].isin(app_key)]
installments = installments[installments["SK_ID_CURR"].isin(app_key)]
pos_cash = pos_cash[pos_cash["SK_ID_CURR"].isin(app_key)]
previous_application = previous_application[previous_application["SK_ID_CURR"].isin(app_key)]
del app_key, bur_key
gc.collect()

184

# Feature construction

## Application

In [82]:
application.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [83]:
application.shape

(48744, 121)

In [84]:
application["ORGANIZATION_TYPE_3"] = application["ORGANIZATION_TYPE"].apply(lambda x: ("type 3" in x) or ("Type 3") in x).astype(int)
application["ORGANIZATION_SELF_EMPLOYED"] = (application["ORGANIZATION_TYPE"] == "Self employed").astype(int)
application["ORGANIZATION_XNA"] = (application["ORGANIZATION_TYPE"] == "XNA").astype(int)
application.drop("ORGANIZATION_TYPE", axis=1, inplace=True)

## Previous application

In [85]:
previous_application.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
13,1397919,321676,Consumer loans,7654.86,53779.5,57564.0,0.0,53779.5,SUNDAY,15,...,Consumer electronics,8.0,low_action,POS household without interest,365243.0,-378.0,-168.0,-168.0,-163.0,1.0
14,2273188,270658,Consumer loans,9644.22,26550.0,27252.0,0.0,26550.0,SATURDAY,10,...,Consumer electronics,3.0,middle,POS household with interest,365243.0,-693.0,-633.0,-633.0,-627.0,0.0
17,1285768,142748,Revolving loans,9000.0,180000.0,180000.0,,180000.0,FRIDAY,13,...,XNA,0.0,XNA,Card X-Sell,-277.0,-257.0,365243.0,365243.0,365243.0,0.0
26,2536650,338725,Cash loans,16708.32,369000.0,369000.0,,369000.0,WEDNESDAY,13,...,Consumer electronics,48.0,middle,Cash X-Sell: middle,365243.0,-1457.0,-47.0,-47.0,-41.0,0.0
31,2191093,182450,Consumer loans,9789.255,100485.0,100485.0,0.0,100485.0,WEDNESDAY,9,...,Consumer electronics,12.0,low_normal,POS household with interest,365243.0,-360.0,-30.0,-270.0,-261.0,0.0


In [86]:
previous_application.shape

(256513, 37)

#### Create synthetic target within previous_application

Use common features between application and previous_application to predict the probability of default for previous applications

* `AMT_ANNUITY`
* `AMT_CREDIT`
* `AMT_GOODS_PRICE`
* `HOUR_APPR_PROCESS_START`
* `NAME_CONTRACT_TYPE`
* `NAME_TYPE_SUITE`
* `WEEKDAY_APPR_PROCESS_START`

### Create synthetic target within previous_application

In [87]:
with open("/Users/danielsaxton/home_credit_default_risk/linear_model.pkl", "rb") as f:
    clf = pickle.load(f)
    
impute = Imputer(strategy="median")
scale = StandardScaler()

In [88]:
cols = ["AMT_ANNUITY", 
        "AMT_CREDIT", 
        "AMT_GOODS_PRICE", 
        "HOUR_APPR_PROCESS_START", 
        "NAME_CONTRACT_TYPE", 
        "NAME_TYPE_SUITE", 
        "WEEKDAY_APPR_PROCESS_START"]

prev_temp = pd.get_dummies(previous_application[cols])

dummy_cols = ["AMT_CREDIT",
              "AMT_GOODS_PRICE",
              "HOUR_APPR_PROCESS_START",
              "NAME_CONTRACT_TYPE_Cash loans",
              "NAME_CONTRACT_TYPE_Revolving loans",
              "NAME_TYPE_SUITE_Children",
              "NAME_TYPE_SUITE_Family",
              "NAME_TYPE_SUITE_Group of people",
              "NAME_TYPE_SUITE_Other_A",
              "NAME_TYPE_SUITE_Other_B",
              "NAME_TYPE_SUITE_Spouse, partner",
              "NAME_TYPE_SUITE_Unaccompanied",
              "WEEKDAY_APPR_PROCESS_START_FRIDAY",
              "WEEKDAY_APPR_PROCESS_START_MONDAY",
              "WEEKDAY_APPR_PROCESS_START_SATURDAY",
              "WEEKDAY_APPR_PROCESS_START_SUNDAY",
              "WEEKDAY_APPR_PROCESS_START_THURSDAY",
              "WEEKDAY_APPR_PROCESS_START_TUESDAY",
              "WEEKDAY_APPR_PROCESS_START_WEDNESDAY"]

prev_temp.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Consumer loans,NAME_CONTRACT_TYPE_Revolving loans,NAME_CONTRACT_TYPE_XNA,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,...,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
13,7654.86,57564.0,53779.5,15,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
14,9644.22,27252.0,26550.0,10,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
17,9000.0,180000.0,180000.0,13,0,0,1,0,0,0,...,0,0,1,1,0,0,0,0,0,0
26,16708.32,369000.0,369000.0,13,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
31,9789.255,100485.0,100485.0,9,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [89]:
previous_application["SYNTHETIC_TARGET"] = clf.predict_proba(scale.fit_transform(impute.fit_transform(prev_temp[dummy_cols])))[:,1]
del prev_temp
gc.collect()

18

### Derived features

In [90]:
previous_application["DAYS_FIRST_DRAWING_SENTINEL"] = (previous_application["DAYS_FIRST_DRAWING"] == 365243).astype(int)
previous_application["DAYS_FIRST_DUE_SENTINEL"] = (previous_application["DAYS_FIRST_DUE"] == 365243).astype(int)
previous_application["DAYS_LAST_DUE_1ST_VERSION_SENTINEL"] = (previous_application["DAYS_LAST_DUE_1ST_VERSION"] == 365243).astype(int)
previous_application["DAYS_LAST_DUE_SENTINEL"] = (previous_application["DAYS_LAST_DUE"] == 365243).astype(int)
previous_application["DAYS_TERMINATION_SENTINEL"] = (previous_application["DAYS_TERMINATION"] == 365243).astype(int)

In [91]:
def previous_agg_func(g):
    d = {"COUNT_PREV_APP": len(g), 
        "MIN_PREV_DAYS_TERMINATION": np.nanmin(g["DAYS_TERMINATION"]), 
        "MAX_PREV_DAYS_TERMINATION": np.nanmax(g["DAYS_TERMINATION"]), 
        "AVG_PREV_DAYS_TERMINATION": np.nanmean(g["DAYS_TERMINATION"]), 
        "RANGE_PREV_DAYS_TERMINATION": np.nanmax(g["DAYS_TERMINATION"]) - np.nanmin(g["DAYS_TERMINATION"]),  
        "MIN_PREV_AMT_CREDIT": np.nanmin(g["AMT_CREDIT"]),
        "MAX_PREV_AMT_CREDIT": np.nanmax(g["AMT_CREDIT"]),
        "AVG_PREV_AMT_CREDIT": np.nanmean(g["AMT_CREDIT"]),
        "MIN_PREV_AMT_CREDIT_WEIGHTED": np.nanmin(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])),
        "MAX_PREV_AMT_CREDIT_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])),
        "AVG_PREV_AMT_CREDIT_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])), 
        "MIN_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmin(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "MAX_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmax(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "AVG_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmean(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmin(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),
        "MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),
        "AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),  
        "MIN_PREV_AMT_ANNUITY": np.nanmin(g["AMT_ANNUITY"]), 
        "MAX_PREV_AMT_ANNUITY": np.nanmin(g["AMT_ANNUITY"]), 
        "AVG_PREV_AMT_ANNUITY": np.nanmean(g["AMT_ANNUITY"]), 
        "MIN_PREV_AMT_ANNUITY_WEIGHTED": np.nanmin(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_AMT_ANNUITY_WEIGHTED": np.nanmin(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_AMT_ANNUITY_WEIGHTED": np.nanmean(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "MIN_DAYS_DECISION": np.nanmin(g["DAYS_DECISION"]), 
        "MAX_DAYS_DECISION": np.nanmax(g["DAYS_DECISION"]), 
        "RANGE_DAYS_DECISION": np.nanmax(g["DAYS_DECISION"]) - np.nanmin(g["DAYS_DECISION"]),
        "SUM_DAYS_LAST_DUE_NULL": np.nansum(g["DAYS_LAST_DUE"].isnull()), 
        "AVG_DAYS_LAST_DUE_NULL": np.nanmean(g["DAYS_LAST_DUE"].isnull()), 
        "AVG_PREV_REQ_AMOUNT_WEIGHTED": np.nanmean(g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_REQ_AMOUNT_WEIGHTED": np.nanmax(g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_REQ_AMOUNT": np.nanmean(g["AMT_APPLICATION"]), 
        "MAX_PREV_REQ_AMOUNT": np.nanmax(g["AMT_APPLICATION"]), 
        "AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED": np.nanmean(g["RATE_DOWN_PAYMENT"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_PROP_APPROVED_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_PROP_APPROVED_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_RATE_DOWNPAYMENT": np.nanmean(g["RATE_DOWN_PAYMENT"]), 
        "AVG_PREV_PROP_APPROVED": np.nanmean(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "MAX_PREV_PROP_APPROVED": np.nanmax(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "MIN_PREV_PROP_APPROVED": np.nanmin(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "AVG_PREV_REQ_AMOUNT": np.nanmean(g["AMT_APPLICATION"]), 
        "MAX_PREV_REQ_AMOUNT": np.nanmax(g["AMT_APPLICATION"]), 
        "AVG_PREV_RATE_DOWNPAYMENT": np.nanmean(g["RATE_DOWN_PAYMENT"]), 
        "AVG_PREV_INT_RATE": np.nanmean(g["RATE_INTEREST_PRIMARY"]), 
        "SUM_PREV_URGENT_NEEDS": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Urgent needs"), 
        "SUM_PREV_REPAIRS": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Repairs"), 
        "SUM_PREV_OTHER": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Other"), 
        "SUM_PREV_LIMIT_REJECT": np.nansum(g["CODE_REJECT_REASON"] == "LIMIT"), 
        "SUM_REFUSED_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Refused"), 
        "SUM_CANC_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Canceled"), 
        "SUM_APPR_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Approved"), 
        "SUM_PREV_HC_REJECT": np.nansum(g["CODE_REJECT_REASON"] == "HC"), 
        "SUM_PREV_INSURE_REQ": np.nansum(g["NFLAG_INSURED_ON_APPROVAL"]), 
        "SUM_NFLAG_INSURED_ON_APPROVAL_NULL": np.nansum(g["NFLAG_INSURED_ON_APPROVAL"].isnull()), 
        "AVG_NFLAG_INSURED_ON_APPROVAL_NULL": np.nanmean(g["NFLAG_INSURED_ON_APPROVAL"].isnull()), 
        "COUNT_PREV_WALK_IN": np.nansum(g["NAME_PRODUCT_TYPE"] == "walk-in"), 
        "COUNT_PREV_HIGH_YIELD": np.nansum(g["NAME_YIELD_GROUP"] == "high"), 
        "COUNT_PREV_LOW_YIELD": np.nansum(g["NAME_YIELD_GROUP"].apply(lambda x: x.startswith("low"))), 
        "AVG_SYNTH_TARGET": np.nanmean(g["SYNTHETIC_TARGET"]), 
        "SUM_SYNTH_TARGET_WEIGHTED": np.nansum(g["SYNTHETIC_TARGET"] / abs(g["DAYS_DECISION"])), 
        "SUM_SYNTH_TARGET": np.nansum(g["SYNTHETIC_TARGET"]), 
        "MAX_SYNTH_TARGET": np.nanmax(g["SYNTHETIC_TARGET"]), 
        "MIN_SYNTH_TARGET": np.nanmin(g["SYNTHETIC_TARGET"]), 
        "RANGE_SYNTH_TARGET": np.nanmax(g["SYNTHETIC_TARGET"]) - np.min(g["SYNTHETIC_TARGET"]), 
        "SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE": np.nansum(g["DAYS_LAST_DUE_1ST_VERSION"] == g["DAYS_LAST_DUE"]), 
        "SUM_DAYS_FIRST_DRAWING_SENTINEL": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"]), 
        "MAX_DAYS_FIRST_DRAWING_SENTINEL": np.nanmax(g["DAYS_FIRST_DRAWING_SENTINEL"]), 
        "SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"] / abs(g["DAYS_DECISION"])), 
        "MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED": np.nanmax(g["DAYS_FIRST_DRAWING_SENTINEL"] / abs(g["DAYS_DECISION"])),         
        "SUM_DAYS_LAST_DUE_LT_FIRST_VERSION": np.nansum(g["DAYS_LAST_DUE"] < g["DAYS_LAST_DUE_1ST_VERSION"]), 
        "SUM_DAYS_FIRST_DRAWING_DAYS_FIRST_DUE_SENTINEL": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"] * g["DAYS_FIRST_DUE_SENTINEL"]), 
        "MAX_DAYS_FIRST_DRAWING_DAYS_DUE_SENTINEL": np.nanmax(g["DAYS_FIRST_DRAWING_SENTINEL"] * g["DAYS_FIRST_DUE_SENTINEL"]), 
        "SUM_DAYS_FIRST_SENTINEL_COMP_DAYS_LAST_SENTINEL": np.nansum((1 - g["DAYS_FIRST_DUE_SENTINEL"]) * g["DAYS_LAST_DUE_1ST_VERSION_SENTINEL"]), 
        "MAX_DAYS_FIRST_SENTINEL_COMP_DAYS_LAST_SENTINEL": np.nansum((1 - g["DAYS_FIRST_DUE_SENTINEL"]) * g["DAYS_LAST_DUE_1ST_VERSION_SENTINEL"])}
    
    return pd.Series(d)

In [92]:
previous_agg = previous_application.groupby("SK_ID_CURR").apply(previous_agg_func).reset_index()
previous_agg.head()

Unnamed: 0,SK_ID_CURR,AVG_DAYS_LAST_DUE_NULL,AVG_NFLAG_INSURED_ON_APPROVAL_NULL,AVG_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_DAYS_TERMINATION,...,SUM_NFLAG_INSURED_ON_APPROVAL_NULL,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,SUM_PREV_LIMIT_REJECT,SUM_PREV_OTHER,SUM_PREV_REPAIRS,SUM_PREV_URGENT_NEEDS,SUM_REFUSED_CONTRACT,SUM_SYNTH_TARGET,SUM_SYNTH_TARGET_WEIGHTED
0,100001,0.0,0.0,3951.0,2.27069,23787.0,6.020501,0.00346,13.67069,-1612.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105484,6.1e-05
1,100005,0.5,0.5,4813.2,6.358256,20076.75,8.342371,0.01102,26.521466,-460.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144306,0.000302
2,100013,0.25,0.25,11478.195,15.897086,146134.125,11.523312,0.016079,177.826452,-710.333333,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.325683,0.000727
3,100028,0.4,0.4,8091.585,6.556266,92920.5,16.733328,0.012038,69.509847,121182.666667,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419282,0.000417
4,100038,0.5,0.5,17782.155,116.957306,300550.5,14.564047,0.09871,2346.918261,-449.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196694,0.001045


### Join features

In [93]:
df = pd.merge(application, previous_agg, how="left", on="SK_ID_CURR")
del previous_application, previous_agg
gc.collect()

187

## Bureau balance

In [None]:
bureau_balance["STATUS"].value_counts()

In [94]:
bureau_balance["STATUS"] = bureau_balance["STATUS"].where(lambda x: x != "X").fillna("0")
bureau_balance["STATUS"].value_counts()

0    4857649
C    4527854
1      74443
5       9806
2       5371
3       1857
4       1149
Name: STATUS, dtype: int64

In [95]:
bureau_balance.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
1303,5718466,0,0
1304,5718466,-1,0
1305,5718466,-2,0
1306,5718466,-3,0
1307,5718466,-4,0


In [96]:
bureau_balance.shape

(9478129, 3)

### Derived features

In [97]:
def bureau_balance_agg_func(g):
    d = {"LEN_BUREAU_BALANCE": np.nansum(g["STATUS"] != "C"), 
        "SUM_CURRENT_BUREAU_BALANCE": np.nansum(g["STATUS"] == "0"), 
        "SUM_DQ_BUREAU_BALANCE": np.nansum(g["STATUS"].isin(["1", "2", "3", "3", "4", "5"])),
        "WORST_DQ_BUREAU_BALANCE": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x))), 
        "AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nansum(abs(g["MONTHS_BALANCE"]) * (g["STATUS"] != "C")) / np.nansum(g["STATUS"] != "C")}
    
    return pd.Series(d)

In [98]:
bureau_balance_agg = bureau_balance.groupby("SK_ID_BUREAU").apply(bureau_balance_agg_func).reset_index()
bureau_balance_agg.head()

Unnamed: 0,SK_ID_BUREAU,AVG_MONTHS_BALANCE_BUREAU_BALANCE,LEN_BUREAU_BALANCE,SUM_CURRENT_BUREAU_BALANCE,SUM_DQ_BUREAU_BALANCE,WORST_DQ_BUREAU_BALANCE
0,5001710,65.0,35.0,35.0,0.0,0.0
1,5001711,1.5,4.0,4.0,0.0,0.0
2,5001712,13.5,10.0,10.0,0.0,0.0
3,5001713,10.5,22.0,22.0,0.0,0.0
4,5001714,7.0,15.0,15.0,0.0,0.0


### Join features

In [99]:
bureau_joined = pd.merge(bureau, 
                         bureau_balance_agg, 
                         how="left", 
                         on="SK_ID_BUREAU")

## Bureau

In [100]:
bureau_joined.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,...,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,AVG_MONTHS_BALANCE_BUREAU_BALANCE,LEN_BUREAU_BALANCE,SUM_CURRENT_BUREAU_BALANCE,SUM_DQ_BUREAU_BALANCE,WORST_DQ_BUREAU_BALANCE
0,261883,5718424,Active,currency 1,-326,0,-142.0,,,0,...,,0.0,Consumer credit,-4,,5.0,11.0,11.0,0.0,0.0
1,261883,5718426,Closed,currency 1,-1039,0,-769.0,-769.0,,0,...,,0.0,Consumer credit,-761,0.0,30.0,9.0,9.0,0.0,0.0
2,261883,5718427,Closed,currency 1,-702,0,28.0,-671.0,,0,...,,0.0,Consumer credit,-659,0.0,23.0,1.0,1.0,0.0,0.0
3,261883,5718428,Closed,currency 1,-378,0,-144.0,-144.0,0.0,0,...,0.0,0.0,Credit card,-144,,9.0,7.0,7.0,0.0,0.0
4,261883,5718429,Closed,currency 1,-671,0,-343.0,-343.0,0.0,0,...,0.0,0.0,Consumer credit,-337,,17.0,11.0,11.0,0.0,0.0


In [101]:
bureau_joined.shape

(251103, 22)

### Derived features

In [102]:
def bureau_agg_func(g):
    d = {
        "AVG_LEN_BUREAU_BALANCE": np.nanmean(g["LEN_BUREAU_BALANCE"]), 
        "SUM_LEN_BUREAU_BALANCE": np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CURRENT": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CURRENT_WEIGHTED": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]) / np.nansum(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]) - np.nanmin(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "SUM_SUM_CURRENT_BUREAU_BALANCE": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]), 

        
        "AVG_PROP_CURRENT": np.nanmean(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT": np.nanmin(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ": np.nanmean(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ": np.nanmax(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]),
        "AVG_PROP_CURRENT_WEIGHTED": np.nanmean(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT_WEIGHTED": np.nanmin(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ_WEIGHTED": np.nanmean(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ_WEIGHTED": np.nanmax(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_CURRENT_WEIGHTED_AMT": np.nanmean(g["AMT_CREDIT_SUM"] * g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT_WEIGHTED_AMT": np.nanmin(g["AMT_CREDIT_SUM"] * g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ_WEIGHTED_AMT": np.nanmean(g["AMT_CREDIT_SUM"] * g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ_WEIGHTED_AMT": np.nanmax(g["AMT_CREDIT_SUM"] * g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]),          
        "MAX_WORST_DQ_BUREAU_BALANCE": np.nanmax(g["WORST_DQ_BUREAU_BALANCE"]), 
        "AVG_WORST_DQ_BUREAU_BALANCE": np.nanmean(g["WORST_DQ_BUREAU_BALANCE"]), 
        "MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED": np.nanmax(g["WORST_DQ_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED": np.nanmean(g["WORST_DQ_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        
        
        "MIN_DAYS_CREDIT_ENDDATE": np.nanmin(g["DAYS_CREDIT_ENDDATE"]), 
        "MAX_DAYS_CREDIT_ENDDATE": np.nanmax(g["DAYS_CREDIT_ENDDATE"]), 
        "SUM_DAYS_CREDIT_ENDDATE": np.nansum(g["DAYS_CREDIT_ENDDATE"]), 
        "SUM_NULL_DAYS_ENDDATE_FACT": np.nansum(g["DAYS_ENDDATE_FACT"].isnull()), 
        "COUNT_BUREAU_RECORDS": len(g), 
        "COUNT_ACTIVE": np.nansum(g["CREDIT_ACTIVE"] == "Active"), 
        "MAX_CREDIT_DAY_OVERDUE_WEIGHTED": np.nanmax(g["CREDIT_DAY_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_CREDIT_DAY_OVERDUE_WEIGHTED": np.nansum(g["CREDIT_DAY_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "MAX_CREDIT_DAY_OVERDUE": np.nanmax(g["CREDIT_DAY_OVERDUE"]), 
        "SUM_CREDIT_DAY_OVERDUE": np.nansum(g["CREDIT_DAY_OVERDUE"]), 
        "DAYS_SINCE_APPLIED": - np.nanmax(g["DAYS_CREDIT"]), 
        "SUM_INVERSE_DAYS_CREDIT": - np.nansum(1 / g["DAYS_CREDIT"]), 
        "MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED": np.nanmax(g["AMT_CREDIT_MAX_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED": np.nansum(g["AMT_CREDIT_MAX_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "MAX_AMT_CREDIT_MAX_OVERDUE": np.nanmax(g["AMT_CREDIT_MAX_OVERDUE"]), 
        "SUM_AMT_CREDIT_MAX_OVERDUE": np.nansum(g["AMT_CREDIT_MAX_OVERDUE"]), 
        "SUM_CNT_CREDIT_PROLONG": np.nansum(g["CNT_CREDIT_PROLONG"]), 
        "SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED": np.nansum(g["AMT_CREDIT_SUM_DEBT"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_AMT_CREDIT_SUM_DEBT": np.nansum(g["AMT_CREDIT_SUM_DEBT"]),
        "BUREAU_UTILIZATION_AVG": np.nanmean(g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]), 
        "BUREAU_UTILIZATION_MAX": np.nanmax(g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]), 
        "BUREAU_PROP_SUM_OVERDUE_AVG": np.nanmean(g["AMT_CREDIT_SUM_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]), 
        "BUREAU_PROP_SUM_OVERDUE_MAX": np.nanmean(g["AMT_CREDIT_SUM_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]),
        "BUREAU_PROP_MAX_OVERDUE_AVG": np.nanmean(g["AMT_CREDIT_MAX_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]), 
        "BUREAU_PROP_MAX_OVERDUE_MAX": np.nanmean(g["AMT_CREDIT_MAX_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]),
        "MAX_DAYS_CREDIT_UPDATE": np.nanmax(g["DAYS_CREDIT_UPDATE"]), 
        "DAYS_CREDIT_RANGE": np.nanmax(g["DAYS_CREDIT"]) - np.nanmin(g["DAYS_CREDIT"]), 
        "TOTAL_AMT_CREDIT_SUM_WEIGHTED": np.nansum(g["AMT_CREDIT_SUM"] / abs(g["DAYS_CREDIT_UPDATE"])),
        "TOTAL_AMT_CREDIT_SUM": np.nansum(g["AMT_CREDIT_SUM"]),
        "COUNT_CREDIT_CARD": np.nansum(g["CREDIT_TYPE"] == "Credit card"), 
        "COUNT_CAR_LOAN": np.nansum(g["CREDIT_TYPE"] == "Car loan"), 
        "COUNT_MORTGAGE": np.nansum(g["CREDIT_TYPE"] == "Mortgage"), 
        "SUM_AMT_ANNUITY": np.nansum(g["AMT_ANNUITY"])}
    
    return pd.Series(d)

In [103]:
bureau_agg = bureau_joined.groupby("SK_ID_CURR").apply(bureau_agg_func).reset_index()
bureau_agg.head()

Unnamed: 0,SK_ID_CURR,AVG_PROP_CURRENT,AVG_PROP_CURRENT_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ,AVG_PROP_DQ_WEIGHTED,AVG_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,BUREAU_PROP_MAX_OVERDUE_AVG,...,SUM_AMT_CREDIT_SUM_DEBT,SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED,SUM_CNT_CREDIT_PROLONG,SUM_CREDIT_DAY_OVERDUE,SUM_CREDIT_DAY_OVERDUE_WEIGHTED,SUM_DAYS_CREDIT_ENDDATE,SUM_INVERSE_DAYS_CREDIT,SUM_NULL_DAYS_ENDDATE_FACT,TOTAL_AMT_CREDIT_SUM,TOTAL_AMT_CREDIT_SUM_WEIGHTED
0,100001,0.992481,0.349547,120775.784672,0.007519,0.000835,282.105263,0.142857,0.015873,,...,596686.5,53216.5875,0.0,0.0,0.0,577.0,0.029363,3.0,1453365.0,100412.66129
1,100005,1.0,0.539216,107036.117647,0.0,0.0,0.0,0.0,0.0,0.0,...,568408.5,50188.368035,0.0,0.0,0.0,1318.0,0.026109,2.0,657126.0,53154.691016
2,100013,0.934538,0.027611,11763.832141,0.065462,0.001329,935.629196,0.75,0.016119,,...,0.0,0.0,0.0,0.0,0.0,-4272.0,0.002409,0.0,2072280.06,9516.034492
3,100028,1.0,0.064056,9514.816939,0.0,0.0,0.0,0.0,0.0,0.0,...,186304.5,2458.279889,0.0,0.0,0.0,23877.0,0.013641,5.0,1520875.08,14667.35831
4,100042,0.984386,0.085411,60135.414884,0.015614,0.000548,376.210147,0.357143,0.014949,0.0,...,3074162.895,262991.394491,0.0,0.0,0.0,8685.0,0.020479,8.0,9540657.0,461427.590883


### Join features

In [104]:
df = pd.merge(df, bureau_agg, how="left", on="SK_ID_CURR")
del bureau_balance, bureau_balance_agg, bureau, bureau_agg, bureau_joined
gc.collect()

226

## Credit card

In [105]:
credit_card.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
5,2646502,380010,-7,82903.815,270000,0.0,0.0,0.0,0.0,4449.105,...,82773.315,82773.315,0.0,0,0.0,0.0,2.0,Active,7,0
6,1079071,171320,-6,353451.645,585000,67500.0,67500.0,0.0,0.0,14684.175,...,351881.145,351881.145,1.0,1,0.0,0.0,6.0,Active,0,0
7,2095912,118650,-7,47962.125,45000,45000.0,45000.0,0.0,0.0,0.0,...,47962.125,47962.125,1.0,1,0.0,0.0,51.0,Active,0,0


In [106]:
credit_card.shape

(612347, 23)

### Derived features

In [107]:
def credit_card_agg_func(g):
    d = {"AVG_BALANCE": np.nanmean(g["AMT_BALANCE"]), 
        "MAX_BALANCE": np.nanmax(g["AMT_BALANCE"]), 
        "SUM_BALANCE": np.nansum(g["AMT_BALANCE"]), 
        "MAX_MONTHS_BALANCE": np.nanmax(abs(g["MONTHS_BALANCE"])), 
        "MIN_MONTHS_BALANCE": np.nanmin(abs(g["MONTHS_BALANCE"])), 
        "RANGE_MONTHS_BALANCE": np.nanmax(g["MONTHS_BALANCE"]) - np.nanmin(g["MONTHS_BALANCE"]), 
        "AVG_UTILIZATION": np.nanmean(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]), 
        "MAX_UTILIZATION": np.nanmax(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]), 
        "AVG_BALANCE_WEIGHTED": np.nanmean(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_BALANCE_WEIGHTED": np.nanmax(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "SUM_BALANCE_WEIGHTED": np.nansum(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "AVG_UTILIZATION_WEIGHTED": np.nanmean(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_UTILIZATION_WEIGHTED": np.nanmax(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_DPD_WEIGHTED": np.nanmax(g["SK_DPD"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_DPD_DEF_WEIGHTED": np.nanmax(g["SK_DPD_DEF"] / abs(g["MONTHS_BALANCE"])), 
        "SUM_CNT_DRAWINGS_CURRENT": np.nansum(g["CNT_DRAWINGS_CURRENT"]), 
        "AVG_CNT_DRAWINGS_CURRENT": np.nanmean(g["CNT_DRAWINGS_CURRENT"]), 
        "MAX_CNT_DRAWINGS_CURRENT": np.nanmax(g["CNT_DRAWINGS_CURRENT"]), 
        "SUM_AMT_DRAWINGS_CURRENT": np.nansum(g["AMT_DRAWINGS_CURRENT"]), 
        "AVG_AMT_DRAWINGS_CURRENT": np.nanmean(g["AMT_DRAWINGS_CURRENT"]), 
        "MAX_AMT_DRAWINGS_CURRENT": np.nanmax(g["AMT_DRAWINGS_CURRENT"]), 
        "MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmin(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
        "AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmean(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
        "MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmax(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]),}
    
    return pd.Series(d)

In [108]:
credit_card_agg = credit_card.groupby("SK_ID_CURR").apply(credit_card_agg_func).reset_index()
credit_card_agg.head()

Unnamed: 0,SK_ID_CURR,AVG_AMT_DRAWINGS_CURRENT,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_BALANCE,AVG_BALANCE_WEIGHTED,AVG_CNT_DRAWINGS_CURRENT,AVG_UTILIZATION,AVG_UTILIZATION_WEIGHTED,MAX_AMT_DRAWINGS_CURRENT,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,...,MAX_MONTHS_BALANCE,MAX_UTILIZATION,MAX_UTILIZATION_WEIGHTED,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,SUM_AMT_DRAWINGS_CURRENT,SUM_BALANCE,SUM_BALANCE_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT
0,100013,5953.125,inf,18159.919219,230.066978,0.239583,0.115301,0.001461,157500.0,inf,...,96.0,1.02489,0.012345,0.0,1.0,95.0,571500.0,1743352.245,22086.429911,23.0
1,100028,6156.400408,inf,8085.058163,1461.966014,2.387755,0.035934,0.006498,22823.55,inf,...,49.0,0.165937,0.165937,0.565555,1.0,48.0,301663.62,396167.85,71636.334672,117.0
2,100042,5923.886786,inf,33356.183036,527.899184,0.380952,0.370624,0.005866,87750.0,inf,...,84.0,1.034649,0.018974,0.965,1.0,83.0,497606.49,2801919.375,44343.531419,32.0
3,100066,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,...,15.0,0.0,0.0,,1.0,14.0,0.0,0.0,0.0,0.0
4,100067,3237.026897,inf,27182.729483,2120.662439,0.666667,0.604061,0.047126,46897.695,inf,...,87.0,1.05738,1.03055,0.0,1.0,86.0,281621.34,2364897.465,184497.632232,58.0


### Join features

In [109]:
df = pd.merge(df, credit_card_agg, how="left", on="SK_ID_CURR")
del credit_card, credit_card_agg
gc.collect()

164

## Installments

In [110]:
installments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
9,1413990,109741,1.0,4,-570.0,-609.0,14308.47,14308.47
13,1723268,197273,1.0,14,-755.0,-800.0,6093.99,6093.99
19,2329072,183463,1.0,11,-1699.0,-1693.0,7283.295,7283.295
23,2631836,140337,0.0,116,-322.0,-322.0,560.88,560.88
24,2702769,102122,1.0,1,-281.0,-286.0,5178.195,5178.195


In [111]:
installments.shape

(2013809, 8)

### Derived features

In [112]:
def installment_agg_func(g):
    d = {"COUNT_UNDERPAYMENT": np.nansum(g["AMT_PAYMENT"] / g["AMT_INSTALMENT"] < 0.5), 
        "SUM_UNDERPAYMENT": np.nansum(g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]), 
        "SUM_UNDERPAYMENT_WEIGHTED": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]) / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MAX_UNDERPAYMENT": np.nanmax(g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]), 
        "AVG_PAYMENT_SIZE_WEIGHTED": np.nanmean(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "AVG_PAYMENT_SIZE": np.nanmean(g["AMT_PAYMENT"]), 
        "MAX_PAYMENT_SIZE_WEIGHTED": np.nanmax(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MAX_PAYMENT_SIZE": np.nanmax(g["AMT_PAYMENT"]), 
        "MIN_PAYMENT_SIZE_WEIGHTED": np.nanmin(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MIN_PAYMENT_SIZE": np.nanmin(g["AMT_PAYMENT"]),
        "SUM_PAYMENT_WEIGHTED": np.nansum(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "SUM_PAYMENT": np.nansum(g["AMT_PAYMENT"]),
        "SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT": np.nansum(g["DAYS_ENTRY_PAYMENT"] > g["DAYS_INSTALMENT"]), 
        "MAX_DAYS_ENTRY_PAYMENT": np.nanmax(g["DAYS_ENTRY_PAYMENT"]), 
        "MIN_DAYS_ENTRY_PAYMENT": np.nanmin(g["DAYS_ENTRY_PAYMENT"]), 
        "RANGE_DAYS_ENTRY_PAYMENT": np.nanmax(g["DAYS_ENTRY_PAYMENT"]) - np.nanmin(g["DAYS_ENTRY_PAYMENT"])}
    
    return pd.Series(d)

In [113]:
installment_agg = installments.groupby("SK_ID_CURR").apply(installment_agg_func).reset_index()
installment_agg.head()

Unnamed: 0,SK_ID_CURR,AVG_PAYMENT_SIZE,AVG_PAYMENT_SIZE_WEIGHTED,COUNT_UNDERPAYMENT,MAX_DAYS_ENTRY_PAYMENT,MAX_PAYMENT_SIZE,MAX_PAYMENT_SIZE_WEIGHTED,MAX_UNDERPAYMENT,MIN_DAYS_ENTRY_PAYMENT,MIN_PAYMENT_SIZE,MIN_PAYMENT_SIZE_WEIGHTED,RANGE_DAYS_ENTRY_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,SUM_PAYMENT,SUM_PAYMENT_WEIGHTED,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_WEIGHTED
0,100001,5885.132143,3.116986,0.0,-1628.0,17397.9,10.686671,0.0,-2916.0,3951.0,1.365586,1288.0,1.0,41195.925,21.8189,0.0,0.0
1,100005,6240.205,11.09417,0.0,-470.0,17656.245,37.566479,0.0,-736.0,4813.2,6.539674,266.0,1.0,56161.845,99.847528,0.0,0.0
2,100013,9740.235774,14.445926,10.0,-14.0,357347.745,1029.820591,23147.82,-2705.0,6.165,0.01181,2691.0,11.0,1509736.545,2239.118537,179437.725,381.708698
3,100028,4356.731549,11.108996,10.0,-29.0,38988.54,387.931034,8505.0,-1785.0,1.17,0.001572,1756.0,12.0,492310.665,1255.316553,70348.23,131.908966
4,100038,11100.3375,18.007955,0.0,-466.0,11100.6,23.81427,0.0,-802.0,11097.45,13.841147,336.0,0.0,133204.05,216.095461,0.0,0.0


### Join features

In [114]:
df = pd.merge(df, installment_agg, how="left", on="SK_ID_CURR")
del installments, installment_agg
gc.collect()

124

## Point of sale cash

In [115]:
pos_cash.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
23,1445556,220181,-41,12.0,12.0,Active,0,0
24,1420240,386836,-41,12.0,11.0,Active,0,0
31,1463929,108990,-41,12.0,12.0,Active,0,0
42,1298869,210084,-41,48.0,0.0,Completed,0,0
47,2027470,391768,-41,48.0,40.0,Active,0,0


In [116]:
pos_cash.shape

(1457983, 8)

### Derived features

In [117]:
def pos_cash_agg_func(g):
    d = {"MIN_CNT_INSTALMENT_DIFF_CNT_INSTALMENT_FUTURE_WEIGHTED": np.nanmin(g["CNT_INSTALMENT"] / g["CNT_INSTALMENT_FUTURE"]), 
        "MAX_CNT_INSTALMENT_DIFF_CNT_INSTALMENT_FUTURE_WEIGHTED": np.nanmin(g["CNT_INSTALMENT"] / g["CNT_INSTALMENT_FUTURE"]), 
        "AVG_CNT_INSTALMENT_DIFF_CNT_INSTALMENT_FUTURE_WEIGHTED": np.nanmean(g["CNT_INSTALMENT"] / g["CNT_INSTALMENT_FUTURE"]), 
        "MAX_POS_DPD": np.nanmax(g["SK_DPD"]), 
        "MAX_POS_DPD_DEF": np.nanmax(g["SK_DPD_DEF"])}
    
    return pd.Series(d)

In [118]:
pos_cash_agg = pos_cash.groupby("SK_ID_CURR").apply(pos_cash_agg_func).reset_index()
pos_cash_agg.head()

Unnamed: 0,SK_ID_CURR,AVG_CNT_INSTALMENT_DIFF_CNT_INSTALMENT_FUTURE_WEIGHTED,MAX_CNT_INSTALMENT_DIFF_CNT_INSTALMENT_FUTURE_WEIGHTED,MAX_POS_DPD,MAX_POS_DPD_DEF,MIN_CNT_INSTALMENT_DIFF_CNT_INSTALMENT_FUTURE_WEIGHTED
0,100001,inf,1.0,7.0,7.0,1.0
1,100005,inf,1.0,0.0,0.0,1.0
2,100013,inf,1.0,18.0,0.0,1.0
3,100028,inf,1.0,0.0,0.0,1.0
4,100038,inf,1.0,0.0,0.0,1.0


### Join features

In [119]:
df = pd.merge(df, pos_cash_agg, how="left", on="SK_ID_CURR")
del pos_cash, pos_cash_agg
gc.collect()

129

# Further construction and preprocessing

In [120]:
df["FLAG_OWN_CAR"] = (df["FLAG_OWN_CAR"] == "Y").astype(int)
df["FLAG_OWN_REALTY"] = (df["FLAG_OWN_REALTY"] == "Y").astype(int)

### Interactions between features

In [121]:
df["AMT_CREDIT_DIV_AMT_INCOME_TOTAL"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_DIV_AMT_GOODS_PRICE"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"]
df["AMT_CREDIT_DIV_SUM_PAYMENT"] = df["AMT_CREDIT"] / df["SUM_PAYMENT"]
df["AMT_GOODS_PRICE_DIV_AMT_INCOME_TOTAL"] = df["AMT_GOODS_PRICE"] / df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_DIV_AMT_ANNUITY"] = df["AMT_CREDIT"] / df["AMT_ANNUITY"]
df["AMT_CREDIT_DIV_AVG_PREV_REQ_AMOUNT"] = df["AMT_CREDIT"] / df["AVG_PREV_REQ_AMOUNT"]
df["AMT_CREDIT_DIV_MAX_PREV_REQ_AMOUNT"] = df["AMT_CREDIT"] / df["MAX_PREV_REQ_AMOUNT"]
df["EXT_SOURCE_PROD"] = df["EXT_SOURCE_1"] * df["EXT_SOURCE_2"] * df["EXT_SOURCE_3"]
df["DAYS_EMPLOYED_DIV_DAYS_BIRTH"] = df["DAYS_EMPLOYED"] / df["DAYS_BIRTH"]
df["AVG_PAYMENT_SIZE_DIV_AMT_INCOME_TOTAL"] = df["AVG_PAYMENT_SIZE"] / df["AMT_INCOME_TOTAL"]
df["AVG_PAYMENT_SIZE_DIV_AMT_CREDIT"] = df["AVG_PAYMENT_SIZE"] / df["AMT_CREDIT"]
df["AVG_PAYMENT_SIZE_DIV_AMT_ANNUITY"] = df["AVG_PAYMENT_SIZE"] / df["AMT_ANNUITY"]
df["DAYS_REGISTRATION_PLUS_DAYS_ID_PUBLISH"] = df["DAYS_REGISTRATION"] + df["DAYS_ID_PUBLISH"]
df["SUM_REFUSED_CONTRACT_DIV_SUM_APPR_CONTRACT"] = df["SUM_REFUSED_CONTRACT"] / df["SUM_APPR_CONTRACT"]
df["MAX_UTILIZATION_DIV_AVG_UTILIZATION"] = df["MAX_UTILIZATION"] / df["AVG_UTILIZATION"]
df["MAX_PREV_REQ_AMOUNT_DIV_AMT_CREDIT"] = df["MAX_PREV_REQ_AMOUNT"] / df["AMT_CREDIT"]
df["AMT_INCOME_TOTAL_DIV_DAYS_BIRTH"] = df["AMT_INCOME_TOTAL"] / df["DAYS_BIRTH"]
df["SUM_DAYS_ID_REG_PHONE"] = df["DAYS_ID_PUBLISH"] + df["DAYS_REGISTRATION"] + df["DAYS_LAST_PHONE_CHANGE"]
df["SUM_REQ_CREDIT"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"] + df["AMT_REQ_CREDIT_BUREAU_QRT"] + df["AMT_REQ_CREDIT_BUREAU_YEAR"]
df["DEF_30_PLUS_60_CNT_SOCIAL_CIRCLE"] = df["DEF_30_CNT_SOCIAL_CIRCLE"] + df["DEF_60_CNT_SOCIAL_CIRCLE"]
df["OWN_CAR_AGE_DIV_DAYS_BIRTH"] = df["OWN_CAR_AGE"] / df["DAYS_BIRTH"]
df["LANDAREA_DIV_TOTALAREA_MODE"] = df["LANDAREA_MODE"] / df["TOTALAREA_MODE"]
df["OWN_CAR_AGE_PLUS_DAYS_BIRTH"] = df["OWN_CAR_AGE"] + df["DAYS_BIRTH"]
df["AMT_ANNUITY_DIV_DAYS_BIRTH"] = df["AMT_ANNUITY"] / df["DAYS_BIRTH"]
df["AMT_ANNUITY_DIV_DAYS_EMPLOYED"] = df["AMT_ANNUITY"] / df["DAYS_EMPLOYED"]
df["AMT_ANNUITY_PROD_DAYS_EMPLOYED"] = df["AMT_ANNUITY"] * df["DAYS_EMPLOYED"]
df["DAYS_REGISTRATION_DIV_DAYS_ID_PUBLISH"] = df["DAYS_REGISTRATION"] / df["DAYS_ID_PUBLISH"]
df["DAYS_REGISTRATION_DIV_DAYS_LAST_PHONE_CHANGE"] = df["DAYS_REGISTRATION"] / df["DAYS_LAST_PHONE_CHANGE"]
df["REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE"] = df["REGION_RATING_CLIENT_W_CITY"] / df["REGION_POPULATION_RELATIVE"]
df["REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE"] = df["REGION_RATING_CLIENT_W_CITY"] * df["REGION_POPULATION_RELATIVE"]
df["SUM_REG_NOT_FLAG"] = df["REG_REGION_NOT_LIVE_REGION"] + df["REG_REGION_NOT_WORK_REGION"] + df["LIVE_REGION_NOT_WORK_REGION"] + df["REG_CITY_NOT_LIVE_CITY"] + df["REG_CITY_NOT_WORK_CITY"] + df["LIVE_CITY_NOT_WORK_CITY"]
df["SUM_AVG_BUILD"] = df["APARTMENTS_AVG"] + df["BASEMENTAREA_AVG"] + df["YEARS_BEGINEXPLUATATION_AVG"] + df["YEARS_BUILD_AVG"] + df["COMMONAREA_AVG"] + df["ELEVATORS_AVG"] + df["ENTRANCES_AVG"] + df["FLOORSMAX_AVG"] + df["FLOORSMIN_AVG"] + df["LANDAREA_AVG"] + df["LIVINGAPARTMENTS_AVG"] + df["LIVINGAREA_AVG"] + df["NONLIVINGAPARTMENTS_AVG"] + df["NONLIVINGAREA_AVG"]
df["SUM_MODE_BUILD"] = df["APARTMENTS_MODE"] + df["BASEMENTAREA_MODE"] + df["YEARS_BEGINEXPLUATATION_MODE"] + df["YEARS_BUILD_MODE"] + df["COMMONAREA_MODE"] + df["ELEVATORS_MODE"] + df["ENTRANCES_MODE"] + df["FLOORSMAX_MODE"] + df["FLOORSMIN_MODE"] + df["LANDAREA_MODE"] + df["LIVINGAPARTMENTS_MODE"] + df["LIVINGAREA_MODE"] + df["NONLIVINGAPARTMENTS_MODE"] + df["NONLIVINGAREA_MODE"]
df["SUM_MEDI_BUILD"] = df["APARTMENTS_MEDI"] + df["BASEMENTAREA_MEDI"] + df["YEARS_BEGINEXPLUATATION_MEDI"] + df["YEARS_BUILD_MEDI"] + df["COMMONAREA_MEDI"] + df["ELEVATORS_MEDI"] + df["ENTRANCES_MEDI"] + df["FLOORSMAX_MEDI"] + df["FLOORSMIN_MEDI"] + df["LANDAREA_MEDI"] + df["LIVINGAPARTMENTS_MEDI"] + df["LIVINGAREA_MEDI"] + df["NONLIVINGAPARTMENTS_MEDI"] + df["NONLIVINGAREA_MEDI"]
df["SUM_DOC_FLAG"] = df["FLAG_DOCUMENT_2"] + df["FLAG_DOCUMENT_3"] + df["FLAG_DOCUMENT_4"] + df["FLAG_DOCUMENT_5"] + df["FLAG_DOCUMENT_6"] + df["FLAG_DOCUMENT_7"] + df["FLAG_DOCUMENT_8"] + df["FLAG_DOCUMENT_9"] + df["FLAG_DOCUMENT_10"] + df["FLAG_DOCUMENT_11"] + df["FLAG_DOCUMENT_12"] + df["FLAG_DOCUMENT_13"] + df["FLAG_DOCUMENT_14"] + df["FLAG_DOCUMENT_15"] + df["FLAG_DOCUMENT_16"] + df["FLAG_DOCUMENT_17"] + df["FLAG_DOCUMENT_18"] + df["FLAG_DOCUMENT_19"] + df["FLAG_DOCUMENT_20"] + df["FLAG_DOCUMENT_21"]
df["CNT_CHILDREN_DIV_DAYS_BIRTH"] = df["CNT_CHILDREN"] / df["DAYS_BIRTH"]
df["CNT_CHILDREN_DIV_REGION_POPULATION_RELATIVE"] = df["CNT_CHILDREN"] / df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_REALTY_PROD_REGION_POPULATION_RELATIVE"] = df["FLAG_OWN_REALTY"] * df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_REALTY_DIV_REGION_POPULATION_RELATIVE"] = df["FLAG_OWN_REALTY"] / df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_CAR_DIV_OWN_CAR_AGE"] = df["FLAG_OWN_CAR"] / df["OWN_CAR_AGE"]
df["EXT_SOURCE_1_DIV_DAYS_BIRTH"] = df["EXT_SOURCE_1"] / df["DAYS_BIRTH"]
df["EXT_SOURCE_1_PROD_DAYS_BIRTH"] = df["EXT_SOURCE_1"] * df["DAYS_BIRTH"]

### Remove infinite values

In [122]:
df.replace([-np.inf, np.inf], np.nan, inplace=True)

### Remove income outliers

In [123]:
df.loc[df["AMT_INCOME_TOTAL"] > 500000, "AMT_INCOME_TOTAL"] = np.nan

### Handle special values for DAYS_EMPLOYED

In [124]:
df["DAYS_EMPLOYED_SENTINEL"] = (df["DAYS_EMPLOYED"] == 365243).astype(int)
df.loc[df["DAYS_EMPLOYED"] > 0, "DAYS_EMPLOYED"] = np.nan

In [125]:
df.shape

(48744, 329)

### Encode categorical features

In [126]:
df = pd.get_dummies(df, dummy_na=True)
df.columns = df.columns.str.replace("\s+", "_")

In [127]:
df.head()

Unnamed: 0,SK_ID_CURR,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone,_brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
0,100001,0,1,0,135000.0,568800.0,20560.5,450000.0,0.01885,-19241,...,0,0,0,0,1,0,0,1,0,0
1,100005,0,1,0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064,...,0,0,0,0,0,0,1,0,0,1
2,100013,1,1,0,202500.0,663264.0,69777.0,630000.0,0.019101,-20038,...,0,0,0,0,0,0,1,0,0,1
3,100028,0,1,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,-13976,...,0,0,0,1,0,0,0,1,0,0
4,100038,1,0,1,180000.0,625500.0,32067.0,625500.0,0.010032,-13040,...,0,0,0,0,0,0,1,0,0,1


In [128]:
df.shape

(48744, 404)

### Write preprocessed data to file

In [129]:
df.to_csv("/Users/danielsaxton/home_credit_default_risk/preprocessed_test.csv", index=False, header=True)

# Modeling

In [143]:
df = pd.read_csv("/Users/danielsaxton/home_credit_default_risk/preprocessed_train.csv")

In [144]:
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

## LightGBM

### Select features

In [133]:
clf = lgb.LGBMClassifier(n_estimators=1000, num_leaves=23, subsample=0.5)
clf.fit(df, y)

lgb_cols = df.columns[clf.feature_importances_ > 0]
len(lgb_cols)

358

### Feature importance

In [74]:
var_imp = pd.DataFrame({"Feature": df.columns, "Importance": clf.feature_importances_})[["Feature", "Importance"]].sort_values("Importance", ascending=False)
var_imp.head(50)

Unnamed: 0,Feature,Importance
30,EXT_SOURCE_2,429
31,EXT_SOURCE_3,404
277,AMT_CREDIT_DIV_AMT_ANNUITY,349
312,EXT_SOURCE_1_DIV_DAYS_BIRTH,280
300,DAYS_REGISTRATION_DIV_DAYS_LAST_PHONE_CHANGE,260
9,DAYS_EMPLOYED,243
8,DAYS_BIRTH,236
226,TOTAL_AMT_CREDIT_SUM,230
204,MAX_DAYS_CREDIT_ENDDATE,226
223,SUM_DAYS_CREDIT_ENDDATE,214


In [75]:
var_imp.query("Importance == 0")

Unnamed: 0,Feature,Importance
358,NAME_HOUSING_TYPE_nan,0
314,DAYS_EMPLOYED_SENTINEL,0
351,NAME_FAMILY_STATUS_nan,0
385,WEEKDAY_APPR_PROCESS_START_nan,0
90,FLAG_DOCUMENT_12,0
88,FLAG_DOCUMENT_10,0
349,NAME_FAMILY_STATUS_Unknown,0
85,FLAG_DOCUMENT_7,0
391,HOUSETYPE_MODE_block_of_flats,0
82,FLAG_DOCUMENT_4,0


In [142]:
var_imp.query("Feature == 'MAX_WORST_DQ_BUREAU_BALANCE'")

Unnamed: 0,Feature,Importance
209,MAX_WORST_DQ_BUREAU_BALANCE,6


### LightGBM

In [None]:
params = {"n_estimators": 20000, 
          "num_leaves": 233, 
          "learning_rate": 0.005, 
          "subsample": 0.5, 
          "colsample_bytree": 0.5, 
          "reg_lambda": 0.9}

lgb_data = lgb.Dataset(data=df[lgb_cols], 
                       label=y)

cv_result = lgb.cv(params=params, 
                   train_set=lgb_data, 
                   nfold=5, 
                   metrics="auc", 
                   early_stopping_rounds=200, 
                   stratified=True, 
                   shuffle=True, 
                   verbose_eval=100, 
                   show_stdv=True, 
                   seed=123)

cv_result = pd.DataFrame(cv_result)

[100]	cv_agg's auc: 0.761327 + 0.00264881
[200]	cv_agg's auc: 0.765148 + 0.00260561
[300]	cv_agg's auc: 0.76791 + 0.00249523
[400]	cv_agg's auc: 0.770759 + 0.00235126
[500]	cv_agg's auc: 0.773099 + 0.00219812
[600]	cv_agg's auc: 0.775225 + 0.0020933


In [None]:
cv_result.tail()

### For submission

In [134]:
lgb_model = lgb.LGBMClassifier(n_estimators=4497, 
                         num_leaves=113, 
                         learning_rate=0.005, 
                         subsample=0.5, 
                         colsample_bytree=0.5, 
                         reg_lambda=0.9)

lgb_model.fit(df[lgb_cols], y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
        learning_rate=0.005, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=4497,
        n_jobs=-1, num_leaves=113, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.9, silent=True, subsample=0.5,
        subsample_for_bin=200000, subsample_freq=1)

## Score test data and generate submission

In [135]:
df = pd.read_csv("/Users/danielsaxton/home_credit_default_risk/preprocessed_test.csv")

In [136]:
df["NAME_INCOME_TYPE_Maternity_leave"] = 0

In [137]:
submission = pd.DataFrame({"SK_ID_CURR": df["SK_ID_CURR"], 
                           "TARGET": lgb_model.predict_proba(df[lgb_cols])[:,1]})

submission.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.025946
1,100005,0.100295
2,100013,0.016056
3,100028,0.039926
4,100038,0.203012


In [138]:
submission.to_csv("/Users/danielsaxton/home_credit_default_risk/submission.csv", index=False)