# Home Credit Default Modeling

In [31]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective
import pickle
import gc

gc.enable()
np.random.seed(235)

path = "/Users/danielsaxton/home_credit_default_risk/"

### Load the data

In [32]:
application = pd.read_csv(path + "application_train.csv")
bureau_balance = pd.read_csv(path + "bureau_balance.csv")
bureau = pd.read_csv(path + "bureau.csv")
credit_card = pd.read_csv(path + "credit_card_balance.csv")
installments = pd.read_csv(path + "installments_payments.csv")
pos_cash = pd.read_csv(path + "POS_CASH_balance.csv")
previous_application = pd.read_csv(path + "previous_application.csv")

### Subset to only those records matching with an application

In [33]:
app_key = set(application["SK_ID_CURR"])
bureau = bureau[bureau["SK_ID_CURR"].isin(app_key)]
bur_key = set(bureau["SK_ID_BUREAU"])

In [34]:
bureau_balance = bureau_balance[bureau_balance["SK_ID_BUREAU"].isin(bur_key)]
credit_card = credit_card[credit_card["SK_ID_CURR"].isin(app_key)]
installments = installments[installments["SK_ID_CURR"].isin(app_key)]
pos_cash = pos_cash[pos_cash["SK_ID_CURR"].isin(app_key)]
previous_application = previous_application[previous_application["SK_ID_CURR"].isin(app_key)]
del app_key, bur_key
gc.collect()

131

# Feature construction

## Application

In [35]:
application.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
application.shape

(307511, 122)

In [37]:
application["ORGANIZATION_TYPE_3"] = application["ORGANIZATION_TYPE"].apply(lambda x: ("type 3" in x) or ("Type 3") in x).astype(int)
application["ORGANIZATION_SELF_EMPLOYED"] = (application["ORGANIZATION_TYPE"] == "Self employed").astype(int)
application["ORGANIZATION_XNA"] = (application["ORGANIZATION_TYPE"] == "XNA").astype(int)
application.drop("ORGANIZATION_TYPE", axis=1, inplace=True)

## Previous application

In [38]:
previous_application.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [39]:
previous_application.shape

(1413701, 37)

### Create synthetic target within previous_application and add sentinel fields

In [41]:
with open(path + "linear_model.pkl", "rb") as f:
    clf = pickle.load(f)
    
impute = Imputer(strategy="median")
scale = StandardScaler()

cols = ["AMT_ANNUITY", 
        "AMT_CREDIT", 
        "AMT_GOODS_PRICE", 
        "HOUR_APPR_PROCESS_START", 
        "NAME_CONTRACT_TYPE", 
        "NAME_TYPE_SUITE", 
        "WEEKDAY_APPR_PROCESS_START"]

prev_temp = pd.get_dummies(previous_application[cols])

dummy_cols = ["AMT_CREDIT",
              "AMT_GOODS_PRICE",
              "HOUR_APPR_PROCESS_START",
              "NAME_CONTRACT_TYPE_Cash loans",
              "NAME_CONTRACT_TYPE_Revolving loans",
              "NAME_TYPE_SUITE_Children",
              "NAME_TYPE_SUITE_Family",
              "NAME_TYPE_SUITE_Group of people",
              "NAME_TYPE_SUITE_Other_A",
              "NAME_TYPE_SUITE_Other_B",
              "NAME_TYPE_SUITE_Spouse, partner",
              "NAME_TYPE_SUITE_Unaccompanied",
              "WEEKDAY_APPR_PROCESS_START_FRIDAY",
              "WEEKDAY_APPR_PROCESS_START_MONDAY",
              "WEEKDAY_APPR_PROCESS_START_SATURDAY",
              "WEEKDAY_APPR_PROCESS_START_SUNDAY",
              "WEEKDAY_APPR_PROCESS_START_THURSDAY",
              "WEEKDAY_APPR_PROCESS_START_TUESDAY",
              "WEEKDAY_APPR_PROCESS_START_WEDNESDAY"]

previous_application["SYNTHETIC_TARGET"] = clf.predict_proba(scale.fit_transform(impute.fit_transform(prev_temp[dummy_cols])))[:,1]
del prev_temp
gc.collect()

previous_application["DAYS_FIRST_DRAWING_SENTINEL"] = (previous_application["DAYS_FIRST_DRAWING"] == 365243).astype(int)
previous_application["DAYS_FIRST_DUE_SENTINEL"] = (previous_application["DAYS_FIRST_DUE"] == 365243).astype(int)
previous_application["DAYS_LAST_DUE_1ST_VERSION_SENTINEL"] = (previous_application["DAYS_LAST_DUE_1ST_VERSION"] == 365243).astype(int)
previous_application["DAYS_LAST_DUE_SENTINEL"] = (previous_application["DAYS_LAST_DUE"] == 365243).astype(int)
previous_application["DAYS_TERMINATION_SENTINEL"] = (previous_application["DAYS_TERMINATION"] == 365243).astype(int)

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Consumer loans,NAME_CONTRACT_TYPE_Revolving loans,NAME_CONTRACT_TYPE_XNA,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,...,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
0,1730.43,17145.0,17145.0,15,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,25188.615,679671.0,607500.0,11,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,15060.735,136444.5,112500.0,11,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,47041.335,470790.0,450000.0,7,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,31924.395,404055.0,337500.0,9,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Derived features

In [44]:
def previous_agg_func(g):
    mask12 = g["DAYS_DECISION"] >= -360
    mask24 = g["DAYS_DECISION"] >= -720
    
    d = {
        "AVG_SYNTH_TARGET_12M": np.nanmean(g["SYNTHETIC_TARGET"].where(mask12)), 
        "AVG_PREV_PROP_APPROVED_12M": np.nanmean((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "AVG_PREV_PROP_APPROVED_24M": np.nanmean((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask24)), 
        "MAX_PREV_PROP_APPROVED_12M": np.nanmax((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "MAX_PREV_PROP_APPROVED_24M": np.nanmax((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask24)), 
        
        
         
        "COUNT_PREV_APP": len(g), 
        "MIN_PREV_DAYS_TERMINATION": np.nanmin(g["DAYS_TERMINATION"]), 
        "MAX_PREV_DAYS_TERMINATION": np.nanmax(g["DAYS_TERMINATION"]), 
        "AVG_PREV_DAYS_TERMINATION": np.nanmean(g["DAYS_TERMINATION"]), 
        "RANGE_PREV_DAYS_TERMINATION": np.nanmax(g["DAYS_TERMINATION"]) - np.nanmin(g["DAYS_TERMINATION"]),  
        "MIN_PREV_AMT_CREDIT": np.nanmin(g["AMT_CREDIT"]),
        "MAX_PREV_AMT_CREDIT": np.nanmax(g["AMT_CREDIT"]),
        "AVG_PREV_AMT_CREDIT": np.nanmean(g["AMT_CREDIT"]),
        "MIN_PREV_AMT_CREDIT_WEIGHTED": np.nanmin(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])),
        "MAX_PREV_AMT_CREDIT_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])),
        "AVG_PREV_AMT_CREDIT_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])), 
        "MIN_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmin(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "MAX_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmax(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "AVG_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmean(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmin(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),
        "MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),
        "AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),  
        "MAX_PREV_AMT_ANNUITY": np.nanmin(g["AMT_ANNUITY"]), 
        "AVG_PREV_AMT_ANNUITY": np.nanmean(g["AMT_ANNUITY"]), 
        "MAX_PREV_AMT_ANNUITY_WEIGHTED": np.nanmin(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_AMT_ANNUITY_WEIGHTED": np.nanmean(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "MIN_DAYS_DECISION": np.nanmin(g["DAYS_DECISION"]), 
        "MAX_DAYS_DECISION": np.nanmax(g["DAYS_DECISION"]), 
        "RANGE_DAYS_DECISION": np.nanmax(g["DAYS_DECISION"]) - np.nanmin(g["DAYS_DECISION"]),
        "SUM_DAYS_LAST_DUE_NULL": np.nansum(g["DAYS_LAST_DUE"].isnull()), 
        "AVG_DAYS_LAST_DUE_NULL": np.nanmean(g["DAYS_LAST_DUE"].isnull()), 
        "AVG_PREV_REQ_AMOUNT_WEIGHTED": np.nanmean(g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_REQ_AMOUNT_WEIGHTED": np.nanmax(g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_REQ_AMOUNT": np.nanmean(g["AMT_APPLICATION"]), 
        "MAX_PREV_REQ_AMOUNT": np.nanmax(g["AMT_APPLICATION"]), 
        "AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED": np.nanmean(g["RATE_DOWN_PAYMENT"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_PROP_APPROVED_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_PROP_APPROVED_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_RATE_DOWNPAYMENT": np.nanmean(g["RATE_DOWN_PAYMENT"]), 
        "AVG_PREV_PROP_APPROVED": np.nanmean(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "MAX_PREV_PROP_APPROVED": np.nanmax(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "MIN_PREV_PROP_APPROVED": np.nanmin(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "AVG_PREV_REQ_AMOUNT": np.nanmean(g["AMT_APPLICATION"]), 
        "MAX_PREV_REQ_AMOUNT": np.nanmax(g["AMT_APPLICATION"]), 
        "AVG_PREV_RATE_DOWNPAYMENT": np.nanmean(g["RATE_DOWN_PAYMENT"]), 
        "AVG_PREV_INT_RATE": np.nanmean(g["RATE_INTEREST_PRIMARY"]), 
        "SUM_PREV_URGENT_NEEDS": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Urgent needs"), 
        "SUM_PREV_REPAIRS": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Repairs"), 
        "SUM_PREV_OTHER": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Other"), 
        "SUM_PREV_LIMIT_REJECT": np.nansum(g["CODE_REJECT_REASON"] == "LIMIT"), 
        "SUM_REFUSED_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Refused"), 
        "SUM_CANC_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Canceled"), 
        "SUM_APPR_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Approved"), 
        "SUM_PREV_HC_REJECT": np.nansum(g["CODE_REJECT_REASON"] == "HC"), 
        "SUM_PREV_INSURE_REQ": np.nansum(g["NFLAG_INSURED_ON_APPROVAL"]), 
        "COUNT_PREV_WALK_IN": np.nansum(g["NAME_PRODUCT_TYPE"] == "walk-in"), 
        "COUNT_PREV_HIGH_YIELD": np.nansum(g["NAME_YIELD_GROUP"] == "high"), 
        "COUNT_PREV_LOW_YIELD": np.nansum(g["NAME_YIELD_GROUP"].apply(lambda x: x.startswith("low"))), 
        "AVG_SYNTH_TARGET": np.nanmean(g["SYNTHETIC_TARGET"]), 
        "SUM_SYNTH_TARGET_WEIGHTED": np.nansum(g["SYNTHETIC_TARGET"] / abs(g["DAYS_DECISION"])), 
        "SUM_SYNTH_TARGET": np.nansum(g["SYNTHETIC_TARGET"]), 
        "MAX_SYNTH_TARGET": np.nanmax(g["SYNTHETIC_TARGET"]), 
        "MIN_SYNTH_TARGET": np.nanmin(g["SYNTHETIC_TARGET"]), 
        "RANGE_SYNTH_TARGET": np.nanmax(g["SYNTHETIC_TARGET"]) - np.min(g["SYNTHETIC_TARGET"]), 
        "SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE": np.nansum(g["DAYS_LAST_DUE_1ST_VERSION"] == g["DAYS_LAST_DUE"]), 
        "SUM_DAYS_FIRST_DRAWING_SENTINEL": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"]), 
        "SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"] / abs(g["DAYS_DECISION"])), 
        "MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED": np.nanmax(g["DAYS_FIRST_DRAWING_SENTINEL"] / abs(g["DAYS_DECISION"])),         
        "SUM_DAYS_LAST_DUE_LT_FIRST_VERSION": np.nansum(g["DAYS_LAST_DUE"] < g["DAYS_LAST_DUE_1ST_VERSION"]), 
        "MAX_DAYS_FIRST_DRAWING_DAYS_DUE_SENTINEL": np.nanmax(g["DAYS_FIRST_DRAWING_SENTINEL"] * g["DAYS_FIRST_DUE_SENTINEL"]), 
        "MAX_DAYS_FIRST_SENTINEL_COMP_DAYS_LAST_SENTINEL": np.nansum((1 - g["DAYS_FIRST_DUE_SENTINEL"]) * g["DAYS_LAST_DUE_1ST_VERSION_SENTINEL"])}
    
    return pd.Series(d)

In [45]:
previous_agg = previous_application.groupby("SK_ID_CURR").apply(previous_agg_func).reset_index()
previous_agg.to_csv(path + "previous_agg.csv", index=False, header=True)
previous_agg.head()

Unnamed: 0,SK_ID_CURR,AVG_DAYS_LAST_DUE_NULL,AVG_NFLAG_INSURED_ON_APPROVAL_NULL,AVG_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_DAYS_TERMINATION,...,SUM_DAYS_LAST_DUE_NULL,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,SUM_PREV_LIMIT_REJECT,SUM_PREV_OTHER,SUM_PREV_REPAIRS,SUM_PREV_URGENT_NEEDS,SUM_REFUSED_CONTRACT,SUM_SYNTH_TARGET,SUM_SYNTH_TARGET_WEIGHTED
0,100002,0.0,0.0,9251.775,15.266955,179055.0,19.353584,0.031937,295.470297,-17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071617,0.000118
1,100003,0.0,0.0,56553.99,70.901357,484191.0,8.677472,0.008318,612.90394,-1047.333333,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.235876,0.000225
2,100004,0.0,0.0,5357.25,6.573313,20106.0,3.753045,0.004605,24.669939,-714.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118435,0.000145
3,100006,0.555556,0.555556,23651.175,96.293912,291695.5,17.767287,0.081751,1358.887335,182481.75,...,5.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.58882,0.002612
4,100007,0.166667,0.166667,12278.805,16.715844,166638.75,12.644075,0.016725,248.03877,72143.8,...,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.567406,0.000622


### Join features

In [46]:
df = pd.merge(application, previous_agg, how="left", on="SK_ID_CURR")
del previous_application, previous_agg
gc.collect()

164

## Bureau balance

In [47]:
bureau_balance["STATUS"].value_counts()

C    7027575
0    4615684
X    2837039
1     155330
5      40528
2      15583
3       5976
4       3897
Name: STATUS, dtype: int64

In [48]:
bureau_balance["STATUS"] = bureau_balance["STATUS"].where(lambda x: x != "X").fillna("0")
bureau_balance["STATUS"].value_counts()

0    7452723
C    7027575
1     155330
5      40528
2      15583
3       5976
4       3897
Name: STATUS, dtype: int64

In [49]:
bureau_balance.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [50]:
bureau_balance.shape

(14701612, 3)

### Derived features

Remove range, use len instead below

In [51]:
def bureau_balance_agg_func(g):
    # mask6 = g["MONTHS_BALANCE"] >= -6
    # mask12 = g["MONTHS_BALANCE"] >= -12
    closed = g["STATUS"] == "C"
    
    d = {"LEN_BUREAU_BALANCE": np.nansum(~closed), 
        "SUM_CLOSED_BUREAU_BALANCE": np.nansum(closed), 
        "SUM_CURRENT_BUREAU_BALANCE": np.nansum(g["STATUS"] == "0"), 
        "SUM_DQ_BUREAU_BALANCE": np.nansum(g["STATUS"].isin(["1", "2", "3", "3", "4", "5"])),
        "WORST_DQ_BUREAU_BALANCE": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x))), 
        "AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nansum(abs(g["MONTHS_BALANCE"]).where(~closed)) / np.nansum(~closed), 
        "MIN_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["MONTHS_BALANCE"].where(~closed)), 
        "MAX_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["MONTHS_BALANCE"].where(~closed))}
    
    return pd.Series(d)

In [52]:
bureau_balance_agg = bureau_balance.groupby("SK_ID_BUREAU").apply(bureau_balance_agg_func).reset_index()
bureau_balance_agg.to_csv(path + "bureau_balance_agg.csv", index=False, header=True)
bureau_balance_agg.head()

Unnamed: 0,SK_ID_BUREAU,AVG_MONTHS_BALANCE_BUREAU_BALANCE,LEN_BUREAU_BALANCE,MAX_MONTHS_BALANCE_BUREAU_BALANCE,MIN_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_MONTHS_BALANCE_BUREAU_BALANCE,SUM_CLOSED_BUREAU_BALANCE,SUM_CURRENT_BUREAU_BALANCE,SUM_DQ_BUREAU_BALANCE,WORST_DQ_BUREAU_BALANCE
0,5008804,14.0,3.0,-13.0,-15.0,2.0,13.0,2.0,1.0,1.0
1,5008805,13.0,3.0,-12.0,-14.0,2.0,12.0,2.0,1.0,1.0
2,5008806,18.0,23.0,-7.0,-29.0,22.0,7.0,23.0,0.0,0.0
3,5008807,83.5,8.0,-80.0,-87.0,7.0,0.0,8.0,0.0,0.0
4,5008808,2.0,5.0,0.0,-4.0,4.0,0.0,5.0,0.0,0.0


### Join features

In [53]:
bureau_joined = pd.merge(bureau, 
                         bureau_balance_agg, 
                         how="left", 
                         on="SK_ID_BUREAU")

## Bureau

In [54]:
bureau_joined.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,...,AMT_ANNUITY,AVG_MONTHS_BALANCE_BUREAU_BALANCE,LEN_BUREAU_BALANCE,MAX_MONTHS_BALANCE_BUREAU_BALANCE,MIN_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_MONTHS_BALANCE_BUREAU_BALANCE,SUM_CLOSED_BUREAU_BALANCE,SUM_CURRENT_BUREAU_BALANCE,SUM_DQ_BUREAU_BALANCE,WORST_DQ_BUREAU_BALANCE
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,...,,,,,,,,,,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,...,,,,,,,,,,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,...,,,,,,,,,,
3,215354,5714465,Active,currency 1,-203,0,,,,0,...,,,,,,,,,,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,...,,,,,,,,,,


In [55]:
bureau_joined.shape

(1465325, 26)

### Derived features

In [56]:
def bureau_agg_func(g):
    # mask6 = g["DAYS_CREDIT"] >= -180
    
    d = {
        "AVG_LEN_BUREAU_BALANCE": np.nanmean(g["LEN_BUREAU_BALANCE"]), 
        "SUM_LEN_BUREAU_BALANCE": np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CURRENT": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CLOSED": np.nansum(g["SUM_CLOSED_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CURRENT_WEIGHTED": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]) / np.nansum(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]) - np.nanmin(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "SUM_SUM_CURRENT_BUREAU_BALANCE": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]), 
        "AVG_PROP_CURRENT": np.nanmean(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ": np.nanmean(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ": np.nanmax(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]),
        "AVG_PROP_CURRENT_WEIGHTED": np.nanmean(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT_WEIGHTED": np.nanmin(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ_WEIGHTED": np.nanmean(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ_WEIGHTED": np.nanmax(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_CURRENT_WEIGHTED_AMT": np.nanmean(g["AMT_CREDIT_SUM"] * g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT_WEIGHTED_AMT": np.nanmin(g["AMT_CREDIT_SUM"] * g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ_WEIGHTED_AMT": np.nanmean(g["AMT_CREDIT_SUM"] * g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ_WEIGHTED_AMT": np.nanmax(g["AMT_CREDIT_SUM"] * g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]),          
        "AVG_WORST_DQ_BUREAU_BALANCE": np.nanmean(g["WORST_DQ_BUREAU_BALANCE"]), 
        "MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED": np.nanmax(g["WORST_DQ_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED": np.nanmean(g["WORST_DQ_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "TOTAL_AMT_CREDIT_SUM_POS_DAYS": np.nansum(g["AMT_CREDIT_SUM"].where(g["DAYS_CREDIT_ENDDATE"] > 0)),
        "SUM_DAYS_CREDIT_ENDDATE_POS_DAYS": np.nansum(g["DAYS_CREDIT_ENDDATE"].where(g["DAYS_CREDIT_ENDDATE"] > 0)), 
        "MAX_LEN_BUREAU_BALANCE": np.nanmax(g["LEN_BUREAU_BALANCE"]), 
        "SUM_LEN_BUREAU_BALANCE": np.nanmax(g["LEN_BUREAU_BALANCE"]), 
        "MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["MIN_MONTHS_BALANCE_BUREAU_BALANCE"]), 
         
         
         
        "MIN_DAYS_CREDIT_ENDDATE": np.nanmin(g["DAYS_CREDIT_ENDDATE"]), 
        "MAX_DAYS_CREDIT_ENDDATE": np.nanmax(g["DAYS_CREDIT_ENDDATE"]), 
        "SUM_DAYS_CREDIT_ENDDATE": np.nansum(g["DAYS_CREDIT_ENDDATE"]), 
        "SUM_NULL_DAYS_ENDDATE_FACT": np.nansum(g["DAYS_ENDDATE_FACT"].isnull()), 
        "COUNT_BUREAU_RECORDS": len(g), 
        "COUNT_ACTIVE": np.nansum(g["CREDIT_ACTIVE"] == "Active"), 
        "MAX_CREDIT_DAY_OVERDUE_WEIGHTED": np.nanmax(g["CREDIT_DAY_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_CREDIT_DAY_OVERDUE_WEIGHTED": np.nansum(g["CREDIT_DAY_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "MAX_CREDIT_DAY_OVERDUE": np.nanmax(g["CREDIT_DAY_OVERDUE"]), 
        "SUM_CREDIT_DAY_OVERDUE": np.nansum(g["CREDIT_DAY_OVERDUE"]), 
        "DAYS_SINCE_APPLIED": - np.nanmax(g["DAYS_CREDIT"]), 
        "SUM_INVERSE_DAYS_CREDIT": - np.nansum(1 / g["DAYS_CREDIT"]), 
        "MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED": np.nanmax(g["AMT_CREDIT_MAX_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED": np.nansum(g["AMT_CREDIT_MAX_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "MAX_AMT_CREDIT_MAX_OVERDUE": np.nanmax(g["AMT_CREDIT_MAX_OVERDUE"]), 
        "SUM_AMT_CREDIT_MAX_OVERDUE": np.nansum(g["AMT_CREDIT_MAX_OVERDUE"]), 
        "SUM_CNT_CREDIT_PROLONG": np.nansum(g["CNT_CREDIT_PROLONG"]), 
        "SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED": np.nansum(g["AMT_CREDIT_SUM_DEBT"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_AMT_CREDIT_SUM_DEBT": np.nansum(g["AMT_CREDIT_SUM_DEBT"]),
        "BUREAU_UTILIZATION_AVG": np.nanmean(g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]), 
        "BUREAU_UTILIZATION_MAX": np.nanmax(g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]), 
        "BUREAU_PROP_SUM_OVERDUE_AVG": np.nanmean(g["AMT_CREDIT_SUM_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]), 
        "BUREAU_PROP_MAX_OVERDUE_AVG": np.nanmean(g["AMT_CREDIT_MAX_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]), 
        "MAX_DAYS_CREDIT_UPDATE": np.nanmax(g["DAYS_CREDIT_UPDATE"]), 
        "RANGE_DAYS_CREDIT_UPDATE": np.nanmax(g["DAYS_CREDIT_UPDATE"]) - np.nanmin(g["DAYS_CREDIT_UPDATE"]), 
        "DAYS_CREDIT_RANGE": np.nanmax(g["DAYS_CREDIT"]) - np.nanmin(g["DAYS_CREDIT"]), 
        "TOTAL_AMT_CREDIT_SUM_WEIGHTED": np.nansum(g["AMT_CREDIT_SUM"] / abs(g["DAYS_CREDIT_UPDATE"])),
        "TOTAL_AMT_CREDIT_SUM": np.nansum(g["AMT_CREDIT_SUM"]),
        "COUNT_CREDIT_CARD": np.nansum(g["CREDIT_TYPE"] == "Credit card"), 
        "COUNT_CAR_LOAN": np.nansum(g["CREDIT_TYPE"] == "Car loan"), 
        "COUNT_MORTGAGE": np.nansum(g["CREDIT_TYPE"] == "Mortgage"), 
        "SUM_AMT_ANNUITY": np.nansum(g["AMT_ANNUITY"])}
    
    return pd.Series(d)

In [57]:
bureau_agg = bureau_joined.groupby("SK_ID_CURR").apply(bureau_agg_func).reset_index()
bureau_agg.to_csv(path + "bureau_agg.csv", index=False, header=True)
bureau_agg.head()

Unnamed: 0,SK_ID_CURR,AVG_LEN_BUREAU_BALANCE,AVG_PROP_CURRENT,AVG_PROP_CURRENT_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ,AVG_PROP_DQ_WEIGHTED,AVG_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,...,SUM_DAYS_CREDIT_ENDDATE,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,SUM_INVERSE_DAYS_CREDIT,SUM_LEN_BUREAU_BALANCE,SUM_NULL_DAYS_ENDDATE_FACT,SUM_RANGE_MONTHS_BALANCE_BUREAU_BALANCE,SUM_SUM_CURRENT_BUREAU_BALANCE,TOTAL_AMT_CREDIT_SUM,TOTAL_AMT_CREDIT_SUM_POS_DAYS,TOTAL_AMT_CREDIT_SUM_WEIGHTED
0,100002,10.875,0.716964,0.109328,4863.768166,0.283036,0.010476,1617.905476,0.75,0.027542,...,-2094.0,927.0,0.017755,87.0,2.0,19.0,60.0,865055.565,638235.0,69432.89321
1,100003,,,,,,,,,,...,-2178.0,1216.0,0.003938,0.0,1.0,,0.0,1017400.5,810000.0,19188.078259
2,100004,,,,,,,,,,...,-977.0,0.0,0.003205,0.0,0.0,,0.0,189037.8,0.0,386.044202
3,100007,,,,,,,,,,...,-783.0,0.0,0.00087,0.0,0.0,,0.0,146250.0,0.0,186.781609
4,100008,,,,,,,,,,...,-1174.0,471.0,0.014644,0.0,1.0,,0.0,468445.5,267606.0,16948.724416


### Join features

In [58]:
df = pd.merge(df, bureau_agg, how="left", on="SK_ID_CURR")
del bureau_balance, bureau_balance_agg, bureau, bureau_agg, bureau_joined
gc.collect()

226

## Credit card

In [59]:
credit_card.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0
8,2181852,367360,-4,291543.075,292500,90000.0,289339.425,0.0,199339.425,130.5,...,286831.575,286831.575,3.0,8,0.0,5.0,3.0,Active,0,0
9,1235299,203885,-5,201261.195,225000,76500.0,111026.7,0.0,34526.7,6338.34,...,197224.695,197224.695,3.0,9,0.0,6.0,38.0,Active,0,0


In [60]:
credit_card.shape

(3227965, 23)

### Derived features

In [61]:
def credit_card_agg_func(g):
    mask6 = g["MONTHS_BALANCE"] >= -6
    mask12 = g["MONTHS_BALANCE"] >= -12
    
    d = {
        "DIFF_AVG_BALANCE_6M_12M": np.nanmean(g["AMT_BALANCE"].where(mask6)) - np.nanmean(g["AMT_BALANCE"].where(mask6 ^ mask12)),
        "AVG_BALANCE_6M": np.nanmean(g["AMT_BALANCE"].where(mask6)),
        "AVG_UTILIZATION_6M": np.nanmean((g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]).where(mask6)),
         
         
         
        "AVG_BALANCE": np.nanmean(g["AMT_BALANCE"]), 
        "MAX_BALANCE": np.nanmax(g["AMT_BALANCE"]), 
        "SUM_BALANCE": np.nansum(g["AMT_BALANCE"]), 
        "MAX_MONTHS_BALANCE": np.nanmax(abs(g["MONTHS_BALANCE"])), 
        "MIN_MONTHS_BALANCE": np.nanmin(abs(g["MONTHS_BALANCE"])), 
        "RANGE_MONTHS_BALANCE": np.nanmax(g["MONTHS_BALANCE"]) - np.nanmin(g["MONTHS_BALANCE"]), 
        "AVG_UTILIZATION": np.nanmean(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]), 
        "MAX_UTILIZATION": np.nanmax(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]), 
        "AVG_BALANCE_WEIGHTED": np.nanmean(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_BALANCE_WEIGHTED": np.nanmax(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "SUM_BALANCE_WEIGHTED": np.nansum(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "AVG_UTILIZATION_WEIGHTED": np.nanmean(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_UTILIZATION_WEIGHTED": np.nanmax(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_DPD_WEIGHTED": np.nanmax(g["SK_DPD"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_DPD_DEF_WEIGHTED": np.nanmax(g["SK_DPD_DEF"] / abs(g["MONTHS_BALANCE"])), 
        "SUM_CNT_DRAWINGS_CURRENT": np.nansum(g["CNT_DRAWINGS_CURRENT"]), 
        "AVG_CNT_DRAWINGS_CURRENT": np.nanmean(g["CNT_DRAWINGS_CURRENT"]), 
        "MAX_CNT_DRAWINGS_CURRENT": np.nanmax(g["CNT_DRAWINGS_CURRENT"]), 
        "SUM_AMT_DRAWINGS_CURRENT": np.nansum(g["AMT_DRAWINGS_CURRENT"]), 
        "AVG_AMT_DRAWINGS_CURRENT": np.nanmean(g["AMT_DRAWINGS_CURRENT"]), 
        "MAX_AMT_DRAWINGS_CURRENT": np.nanmax(g["AMT_DRAWINGS_CURRENT"]), 
        "MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmin(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
        "AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmean(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
        "MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmax(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"])}
    
    return pd.Series(d)

In [62]:
credit_card_agg = credit_card.groupby("SK_ID_CURR").apply(credit_card_agg_func).reset_index()
credit_card_agg.to_csv(path + "credit_card_agg.csv", index=False, header=True)
credit_card_agg.head()

Unnamed: 0,SK_ID_CURR,AVG_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT_6M,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_BALANCE,AVG_BALANCE_6M,AVG_BALANCE_WEIGHTED,AVG_CNT_DRAWINGS_CURRENT,AVG_UTILIZATION,AVG_UTILIZATION_6M,...,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MIN_MONTHS_BALANCE,NUM_CREDIT_CARD,RANGE_MONTHS_BALANCE,STD_BALANCE_12M,STD_BALANCE_6M,SUM_AMT_DRAWINGS_CURRENT,SUM_BALANCE,SUM_BALANCE_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT
0,100006,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100011,2432.432432,0.0,inf,54482.111149,0.0,891.528045,0.054054,0.302678,0.0,...,1.0,2.0,1.0,73.0,0.0,0.0,180000.0,4031676.225,65973.075311,4.0
2,100021,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,2.0,1.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100023,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,4.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100036,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,2.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0


### Join features

In [63]:
df = pd.merge(df, credit_card_agg, how="left", on="SK_ID_CURR")
del credit_card, credit_card_agg
gc.collect()

164

## Installments

In [64]:
installments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [65]:
installments.shape

(11591592, 8)

### Derived features

In [66]:
def installment_agg_func(g):
    mask6 = g["DAYS_ENTRY_PAYMENT"] >= -180
    mask12 = g["DAYS_ENTRY_PAYMENT"] >= -360
    
    d = {
        "SUM_UNDERPAYMENT_12M": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask12)), 
        "SUM_UNDERPAYMENT_6M": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask6)), 
        "MAX_PAYMENT_SIZE_6M": np.nanmax(g["AMT_PAYMENT"].where(mask6)), 
        "MAX_PAYMENT_SIZE_12M": np.nanmax(g["AMT_PAYMENT"].where(mask12)), 
        "MIN_PAYMENT_SIZE_6M": np.nanmin(g["AMT_PAYMENT"].where(mask6)),
        "MAX_ABS_DAYS_INSTALMENT": np.nanmax(abs(g["DAYS_INSTALMENT"])), 
        
        
        "COUNT_UNDERPAYMENT": np.nansum(g["AMT_PAYMENT"] / g["AMT_INSTALMENT"] < 0.5), 
        "SUM_UNDERPAYMENT": np.nansum(g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]), 
        "SUM_UNDERPAYMENT_WEIGHTED": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]) / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MAX_UNDERPAYMENT": np.nanmax(g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]), 
        "AVG_PAYMENT_SIZE_WEIGHTED": np.nanmean(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "AVG_PAYMENT_SIZE": np.nanmean(g["AMT_PAYMENT"]), 
        "MAX_PAYMENT_SIZE_WEIGHTED": np.nanmax(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MAX_PAYMENT_SIZE": np.nanmax(g["AMT_PAYMENT"]), 
        "MIN_PAYMENT_SIZE_WEIGHTED": np.nanmin(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MIN_PAYMENT_SIZE": np.nanmin(g["AMT_PAYMENT"]),
        "SUM_PAYMENT_WEIGHTED": np.nansum(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "SUM_PAYMENT": np.nansum(g["AMT_PAYMENT"]),
        "SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT": np.nansum(g["DAYS_ENTRY_PAYMENT"] > g["DAYS_INSTALMENT"]), 
        "MAX_DAYS_ENTRY_PAYMENT": np.nanmax(g["DAYS_ENTRY_PAYMENT"]), 
        "MIN_DAYS_ENTRY_PAYMENT": np.nanmin(g["DAYS_ENTRY_PAYMENT"]), 
        "RANGE_DAYS_ENTRY_PAYMENT": np.nanmax(g["DAYS_ENTRY_PAYMENT"]) - np.nanmin(g["DAYS_ENTRY_PAYMENT"])}
    
    return pd.Series(d)

In [67]:
installment_agg = installments.groupby("SK_ID_CURR").apply(installment_agg_func).reset_index()
installment_agg.to_csv(path + "installment_agg.csv", index=False, header=True)
installment_agg.head()

Unnamed: 0,SK_ID_CURR,AVG_PAYMENT_SIZE,AVG_PAYMENT_SIZE_WEIGHTED,COUNT_UNDERPAYMENT,MAX_DAYS_ENTRY_PAYMENT,MAX_PAYMENT_SIZE,MAX_PAYMENT_SIZE_6M,MAX_PAYMENT_SIZE_WEIGHTED,MAX_UNDERPAYMENT,MIN_DAYS_ENTRY_PAYMENT,MIN_PAYMENT_SIZE,MIN_PAYMENT_SIZE_6M,MIN_PAYMENT_SIZE_WEIGHTED,RANGE_DAYS_ENTRY_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,SUM_PAYMENT,SUM_PAYMENT_WEIGHTED,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_6M,SUM_UNDERPAYMENT_WEIGHTED
0,100002,11559.247105,95.448632,0.0,-49.0,53093.745,53093.745,1083.545816,0.0,-587.0,9251.775,9251.775,15.761116,538.0,0.0,219625.695,1813.524009,0.0,0.0,0.0
1,100003,64754.586,100.798053,0.0,-544.0,560835.36,,1030.947353,0.0,-2324.0,6662.97,,2.899015,1780.0,0.0,1618864.65,2519.951327,0.0,0.0,0.0
2,100004,7096.155,9.434878,0.0,-727.0,10573.965,,14.544656,0.0,-795.0,5357.25,,6.738679,68.0,0.0,21288.465,28.304633,0.0,0.0,0.0
3,100006,62947.088438,543.096731,0.0,-12.0,691786.89,691786.89,3975.786724,0.0,-575.0,2482.92,29027.52,4.318122,563.0,0.0,1007153.415,8689.547693,0.0,0.0,0.0
4,100007,12214.060227,49.833434,3.0,-14.0,22678.785,16037.64,1145.545714,22655.655,-2318.0,0.18,16037.64,0.000125,2304.0,16.0,806127.975,3289.00667,29857.365,0.0,25.402727


### Join features

In [68]:
df = pd.merge(df, installment_agg, how="left", on="SK_ID_CURR")
del installments, installment_agg
gc.collect()

124

## Point of sale cash

In [69]:
pos_cash.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [70]:
pos_cash.shape

(8543375, 8)

### Derived features

In [71]:
def pos_cash_agg_func(g):
    d = {"MAX_POS_DPD": np.nanmax(g["SK_DPD"]), 
        "MAX_POS_DPD_DEF": np.nanmax(g["SK_DPD_DEF"]), 
        "NUM_POS_CASH": g["SK_ID_PREV"].nunique()}
    
    return pd.Series(d)

In [72]:
pos_cash_agg = pos_cash.groupby("SK_ID_CURR").apply(pos_cash_agg_func).reset_index()
pos_cash_agg.to_csv(path + "pos_cash_agg.csv", index=False, header=True)
pos_cash_agg.head()

Unnamed: 0,SK_ID_CURR,MAX_POS_DPD,MAX_POS_DPD_DEF,NUM_POS_CASH
0,100002,0,0,1
1,100003,0,0,3
2,100004,0,0,1
3,100006,0,0,3
4,100007,0,0,5


### Join features

In [73]:
df = pd.merge(df, pos_cash_agg, how="left", on="SK_ID_CURR")
del pos_cash, pos_cash_agg
gc.collect()

96

# Further construction and preprocessing

In [84]:
df["TOTAL_AMT_CREDIT_SUM_DIV_SUM_DAYS_CREDIT_ENDDATE"] = df["TOTAL_AMT_CREDIT_SUM"] / df["SUM_DAYS_CREDIT_ENDDATE"]
df["TOTAL_AMT_CREDIT_SUM_POS_DAYS_DIV_SUM_DAYS_CREDIT_ENDDATE_POS_DAYS"] = df["TOTAL_AMT_CREDIT_SUM_POS_DAYS"] / df["SUM_DAYS_CREDIT_ENDDATE_POS_DAYS"]
df["MAX_ABS_DAYS_INSTALMENT_DIV_DAYS_BIRTH"] = df["MAX_ABS_DAYS_INSTALMENT"] / df["DAYS_BIRTH"]



df["FLAG_OWN_CAR"] = (df["FLAG_OWN_CAR"] == "Y").astype(int)
df["FLAG_OWN_REALTY"] = (df["FLAG_OWN_REALTY"] == "Y").astype(int)
df["AMT_CREDIT_DIV_AMT_INCOME_TOTAL"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_DIV_AMT_GOODS_PRICE"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"]
df["AMT_CREDIT_DIV_SUM_PAYMENT"] = df["AMT_CREDIT"] / df["SUM_PAYMENT"]
df["AMT_GOODS_PRICE_DIV_AMT_INCOME_TOTAL"] = df["AMT_GOODS_PRICE"] / df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_DIV_AMT_ANNUITY"] = df["AMT_CREDIT"] / df["AMT_ANNUITY"]
df["AMT_CREDIT_DIV_AVG_PREV_REQ_AMOUNT"] = df["AMT_CREDIT"] / df["AVG_PREV_REQ_AMOUNT"]
df["AMT_CREDIT_DIV_MAX_PREV_REQ_AMOUNT"] = df["AMT_CREDIT"] / df["MAX_PREV_REQ_AMOUNT"]
df["EXT_SOURCE_PROD"] = df["EXT_SOURCE_1"] * df["EXT_SOURCE_2"] * df["EXT_SOURCE_3"]
df["DAYS_EMPLOYED_DIV_DAYS_BIRTH"] = df["DAYS_EMPLOYED"] / df["DAYS_BIRTH"]
df["AVG_PAYMENT_SIZE_DIV_AMT_INCOME_TOTAL"] = df["AVG_PAYMENT_SIZE"] / df["AMT_INCOME_TOTAL"]
df["AVG_PAYMENT_SIZE_DIV_AMT_CREDIT"] = df["AVG_PAYMENT_SIZE"] / df["AMT_CREDIT"]
df["AVG_PAYMENT_SIZE_DIV_AMT_ANNUITY"] = df["AVG_PAYMENT_SIZE"] / df["AMT_ANNUITY"]
df["DAYS_REGISTRATION_PLUS_DAYS_ID_PUBLISH"] = df["DAYS_REGISTRATION"] + df["DAYS_ID_PUBLISH"]
df["SUM_REFUSED_CONTRACT_DIV_SUM_APPR_CONTRACT"] = df["SUM_REFUSED_CONTRACT"] / df["SUM_APPR_CONTRACT"]
df["MAX_UTILIZATION_DIV_AVG_UTILIZATION"] = df["MAX_UTILIZATION"] / df["AVG_UTILIZATION"]
df["MAX_PREV_REQ_AMOUNT_DIV_AMT_CREDIT"] = df["MAX_PREV_REQ_AMOUNT"] / df["AMT_CREDIT"]
df["AMT_INCOME_TOTAL_DIV_DAYS_BIRTH"] = df["AMT_INCOME_TOTAL"] / df["DAYS_BIRTH"]
df["SUM_DAYS_ID_REG_PHONE"] = df["DAYS_ID_PUBLISH"] + df["DAYS_REGISTRATION"] + df["DAYS_LAST_PHONE_CHANGE"]
df["SUM_REQ_CREDIT"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"] + df["AMT_REQ_CREDIT_BUREAU_QRT"] + df["AMT_REQ_CREDIT_BUREAU_YEAR"]
df["DEF_30_PLUS_60_CNT_SOCIAL_CIRCLE"] = df["DEF_30_CNT_SOCIAL_CIRCLE"] + df["DEF_60_CNT_SOCIAL_CIRCLE"]
df["OWN_CAR_AGE_DIV_DAYS_BIRTH"] = df["OWN_CAR_AGE"] / df["DAYS_BIRTH"]
df["LANDAREA_DIV_TOTALAREA_MODE"] = df["LANDAREA_MODE"] / df["TOTALAREA_MODE"]
df["OWN_CAR_AGE_PLUS_DAYS_BIRTH"] = df["OWN_CAR_AGE"] + df["DAYS_BIRTH"]
df["AMT_ANNUITY_DIV_DAYS_BIRTH"] = df["AMT_ANNUITY"] / df["DAYS_BIRTH"]
df["AMT_ANNUITY_DIV_DAYS_EMPLOYED"] = df["AMT_ANNUITY"] / df["DAYS_EMPLOYED"]
df["AMT_ANNUITY_PROD_DAYS_EMPLOYED"] = df["AMT_ANNUITY"] * df["DAYS_EMPLOYED"]
df["DAYS_REGISTRATION_DIV_DAYS_ID_PUBLISH"] = df["DAYS_REGISTRATION"] / df["DAYS_ID_PUBLISH"]
df["DAYS_REGISTRATION_DIV_DAYS_LAST_PHONE_CHANGE"] = df["DAYS_REGISTRATION"] / df["DAYS_LAST_PHONE_CHANGE"]
df["REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE"] = df["REGION_RATING_CLIENT_W_CITY"] / df["REGION_POPULATION_RELATIVE"]
df["REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE"] = df["REGION_RATING_CLIENT_W_CITY"] * df["REGION_POPULATION_RELATIVE"]
df["SUM_REG_NOT_FLAG"] = df["REG_REGION_NOT_LIVE_REGION"] + df["REG_REGION_NOT_WORK_REGION"] + df["LIVE_REGION_NOT_WORK_REGION"] + df["REG_CITY_NOT_LIVE_CITY"] + df["REG_CITY_NOT_WORK_CITY"] + df["LIVE_CITY_NOT_WORK_CITY"]
df["SUM_AVG_BUILD"] = df["APARTMENTS_AVG"] + df["BASEMENTAREA_AVG"] + df["YEARS_BEGINEXPLUATATION_AVG"] + df["YEARS_BUILD_AVG"] + df["COMMONAREA_AVG"] + df["ELEVATORS_AVG"] + df["ENTRANCES_AVG"] + df["FLOORSMAX_AVG"] + df["FLOORSMIN_AVG"] + df["LANDAREA_AVG"] + df["LIVINGAPARTMENTS_AVG"] + df["LIVINGAREA_AVG"] + df["NONLIVINGAPARTMENTS_AVG"] + df["NONLIVINGAREA_AVG"]
df["SUM_MODE_BUILD"] = df["APARTMENTS_MODE"] + df["BASEMENTAREA_MODE"] + df["YEARS_BEGINEXPLUATATION_MODE"] + df["YEARS_BUILD_MODE"] + df["COMMONAREA_MODE"] + df["ELEVATORS_MODE"] + df["ENTRANCES_MODE"] + df["FLOORSMAX_MODE"] + df["FLOORSMIN_MODE"] + df["LANDAREA_MODE"] + df["LIVINGAPARTMENTS_MODE"] + df["LIVINGAREA_MODE"] + df["NONLIVINGAPARTMENTS_MODE"] + df["NONLIVINGAREA_MODE"]
df["SUM_MEDI_BUILD"] = df["APARTMENTS_MEDI"] + df["BASEMENTAREA_MEDI"] + df["YEARS_BEGINEXPLUATATION_MEDI"] + df["YEARS_BUILD_MEDI"] + df["COMMONAREA_MEDI"] + df["ELEVATORS_MEDI"] + df["ENTRANCES_MEDI"] + df["FLOORSMAX_MEDI"] + df["FLOORSMIN_MEDI"] + df["LANDAREA_MEDI"] + df["LIVINGAPARTMENTS_MEDI"] + df["LIVINGAREA_MEDI"] + df["NONLIVINGAPARTMENTS_MEDI"] + df["NONLIVINGAREA_MEDI"]
df["SUM_DOC_FLAG"] = df["FLAG_DOCUMENT_2"] + df["FLAG_DOCUMENT_3"] + df["FLAG_DOCUMENT_4"] + df["FLAG_DOCUMENT_5"] + df["FLAG_DOCUMENT_6"] + df["FLAG_DOCUMENT_7"] + df["FLAG_DOCUMENT_8"] + df["FLAG_DOCUMENT_9"] + df["FLAG_DOCUMENT_10"] + df["FLAG_DOCUMENT_11"] + df["FLAG_DOCUMENT_12"] + df["FLAG_DOCUMENT_13"] + df["FLAG_DOCUMENT_14"] + df["FLAG_DOCUMENT_15"] + df["FLAG_DOCUMENT_16"] + df["FLAG_DOCUMENT_17"] + df["FLAG_DOCUMENT_18"] + df["FLAG_DOCUMENT_19"] + df["FLAG_DOCUMENT_20"] + df["FLAG_DOCUMENT_21"]
df["CNT_CHILDREN_DIV_DAYS_BIRTH"] = df["CNT_CHILDREN"] / df["DAYS_BIRTH"]
df["CNT_CHILDREN_DIV_REGION_POPULATION_RELATIVE"] = df["CNT_CHILDREN"] / df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_REALTY_PROD_REGION_POPULATION_RELATIVE"] = df["FLAG_OWN_REALTY"] * df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_REALTY_DIV_REGION_POPULATION_RELATIVE"] = df["FLAG_OWN_REALTY"] / df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_CAR_DIV_OWN_CAR_AGE"] = df["FLAG_OWN_CAR"] / df["OWN_CAR_AGE"]
df["EXT_SOURCE_1_DIV_DAYS_BIRTH"] = df["EXT_SOURCE_1"] / df["DAYS_BIRTH"]
df["EXT_SOURCE_1_PROD_DAYS_BIRTH"] = df["EXT_SOURCE_1"] * df["DAYS_BIRTH"]

### Remove infinite values

In [76]:
df.replace([-np.inf, np.inf], np.nan, inplace=True)

### Remove income outliers

In [77]:
df.loc[df["AMT_INCOME_TOTAL"] > 500000, "AMT_INCOME_TOTAL"] = np.nan

### Handle special values for DAYS_EMPLOYED

In [78]:
df.loc[df["DAYS_EMPLOYED"] > 0, "DAYS_EMPLOYED"] = np.nan

In [85]:
df.shape

(307511, 428)

### Encode categorical features

In [80]:
df = pd.get_dummies(df, dummy_na=True)
df.columns = df.columns.str.replace("\s+", "_")

In [81]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone,_brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
0,100002,1,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0,0,0,0,1,0,0,1,0,0
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0,0,0,0,0,0,0,1,0,0
2,100004,0,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0,0,0,0,0,0,1,0,0,1
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0,0,0,0,0,0,1,0,0,1
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0,0,0,0,0,0,1,0,0,1


In [82]:
df.shape

(307511, 426)

### Write preprocessed data to file

In [87]:
df.to_csv(path + "preprocessed_train.csv", index=False, header=True)

# Modeling

In [5]:
df = pd.read_csv(path + "preprocessed_train.csv")

In [88]:
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

## LightGBM

### Select features

In [89]:
clf = lgb.LGBMClassifier(n_estimators=1000, num_leaves=23, subsample=0.5)
clf.fit(df, y)

lgb_cols = df.columns[clf.feature_importances_ > 0]
len(lgb_cols)

382

### Feature importance

In [90]:
var_imp = pd.DataFrame({"Feature": df.columns, "Importance": clf.feature_importances_})[["Feature", "Importance"]].sort_values("Importance", ascending=False)

In [98]:
var_imp.head(100).tail(50)

Unnamed: 0,Feature,Importance
5,AMT_ANNUITY,135
136,MAX_PREV_AMT_ANNUITY_WEIGHTED,135
10,DAYS_REGISTRATION,134
153,MIN_PREV_PROP_APPROVED,133
284,SUM_PAYMENT,131
146,MAX_SYNTH_TARGET,130
312,OWN_CAR_AGE_DIV_DAYS_BIRTH,129
320,REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULAT...,129
195,DAYS_CREDIT_RANGE,127
297,AMT_CREDIT_DIV_AVG_PREV_REQ_AMOUNT,127


Average importance over non-zero importance features

In [92]:
int(np.nanmean(var_imp["Importance"].where(var_imp["Importance"] > 0)))

57

Importance of selected features

In [93]:
features = ["NUM_POS_CASH", 
    "MAX_POS_DPD_DEF",
    "SUM_UNDERPAYMENT", 
    "AVG_LEN_BUREAU_BALANCE",
    "SUM_LEN_BUREAU_BALANCE",
    "PROP_CURRENT",
    "PROP_CLOSED",
    "PROP_CURRENT_WEIGHTED",
    "MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE",
    "MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE",
    "RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE",
    "SUM_SUM_CURRENT_BUREAU_BALANCE",
    "AVG_PROP_CURRENT",
    "MIN_PROP_CURRENT",
    "AVG_PROP_DQ",
    "MAX_PROP_DQ",
    "AVG_PROP_CURRENT_WEIGHTED",
    "MIN_PROP_CURRENT_WEIGHTED",
    "AVG_PROP_DQ_WEIGHTED",
    "MAX_PROP_DQ_WEIGHTED",
    "AVG_PROP_CURRENT_WEIGHTED_AMT",
    "MIN_PROP_CURRENT_WEIGHTED_AMT",
    "AVG_PROP_DQ_WEIGHTED_AMT",
    "MAX_PROP_DQ_WEIGHTED_AMT",
    "MAX_WORST_DQ_BUREAU_BALANCE",
    "AVG_WORST_DQ_BUREAU_BALANCE",
    "MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED",
    "AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED",
    "TOTAL_AMT_CREDIT_SUM_POS_DAYS",
    "SUM_DAYS_CREDIT_ENDDATE_POS_DAYS",
    "MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE",
    "AVG_SYNTH_TARGET_12M",
    "STD_BALANCE_6M",
    "STD_BALANCE_12M",
    "NUM_CREDIT_CARD",
    "DIFF_AVG_BALANCE_6M_12M",
    "AVG_BALANCE_6M",
    "AVG_UTILIZATION_6M",
    "AVG_AMT_DRAWINGS_CURRENT_6M",
    "SUM_UNDERPAYMENT_6M",
    "MAX_PAYMENT_SIZE_6M",
    "MIN_PAYMENT_SIZE_6M",
    "TOTAL_AMT_CREDIT_SUM_DIV_SUM_DAYS_CREDIT_ENDDATE",
    "TOTAL_AMT_CREDIT_SUM_POS_DAYS_DIV_SUM_DAYS_CREDIT_ENDDATE_POS_DAYS"]

var_imp[var_imp["Feature"].isin(features)]

Unnamed: 0,Feature,Importance
425,TOTAL_AMT_CREDIT_SUM_POS_DAYS_DIV_SUM_DAYS_CRE...,196
424,TOTAL_AMT_CREDIT_SUM_DIV_SUM_DAYS_CREDIT_ENDDATE,185
126,AVG_SYNTH_TARGET_12M,164
280,MIN_PAYMENT_SIZE_6M,156
275,MAX_PAYMENT_SIZE_6M,154
237,TOTAL_AMT_CREDIT_SUM_POS_DAYS,106
286,SUM_UNDERPAYMENT,90
178,AVG_PROP_CURRENT_WEIGHTED_AMT,86
175,AVG_LEN_BUREAU_BALANCE,83
230,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,83


Features with zero importance

In [95]:
var_imp.query("Importance == 0")

Unnamed: 0,Feature,Importance
411,HOUSETYPE_MODE_terraced_house,0
13,FLAG_MOBIL,0
419,WALLSMATERIAL_MODE_Wooden,0
409,HOUSETYPE_MODE_block_of_flats,0
421,EMERGENCYSTATE_MODE_No,0
422,EMERGENCYSTATE_MODE_Yes,0
403,WEEKDAY_APPR_PROCESS_START_nan,0
415,WALLSMATERIAL_MODE_Monolithic,0
374,NAME_HOUSING_TYPE_Rented_apartment,0
80,FLAG_DOCUMENT_2,0


### LightGBM

In [96]:
params = {"n_estimators": 20000, 
          "num_leaves": 113, 
          "learning_rate": 0.005, 
          "subsample": 0.5, 
          "colsample_bytree": 0.5, 
          "reg_lambda": 0.9}

lgb_data = lgb.Dataset(data=df[lgb_cols], 
                       label=y)

cv_result = lgb.cv(params=params, 
                   train_set=lgb_data, 
                   nfold=5, 
                   metrics="auc", 
                   early_stopping_rounds=200, 
                   stratified=True, 
                   shuffle=True, 
                   verbose_eval=100, 
                   show_stdv=True, 
                   seed=123)

cv_result = pd.DataFrame(cv_result)

[100]	cv_agg's auc: 0.757383 + 0.00262752
[200]	cv_agg's auc: 0.761646 + 0.00256754
[300]	cv_agg's auc: 0.76526 + 0.00259728
[400]	cv_agg's auc: 0.76808 + 0.00244443
[500]	cv_agg's auc: 0.77055 + 0.00240747
[600]	cv_agg's auc: 0.772851 + 0.0022421
[700]	cv_agg's auc: 0.774789 + 0.00210225
[800]	cv_agg's auc: 0.77638 + 0.00199247
[900]	cv_agg's auc: 0.777787 + 0.00188086
[1000]	cv_agg's auc: 0.778992 + 0.00181389
[1100]	cv_agg's auc: 0.780142 + 0.00167474
[1200]	cv_agg's auc: 0.78106 + 0.00160194
[1300]	cv_agg's auc: 0.781893 + 0.00160379
[1400]	cv_agg's auc: 0.782606 + 0.00158053
[1500]	cv_agg's auc: 0.783205 + 0.00158437
[1600]	cv_agg's auc: 0.783715 + 0.00155639
[1700]	cv_agg's auc: 0.784115 + 0.00157239
[1800]	cv_agg's auc: 0.784507 + 0.00155631
[1900]	cv_agg's auc: 0.784846 + 0.00155919
[2000]	cv_agg's auc: 0.785111 + 0.00157966
[2100]	cv_agg's auc: 0.785391 + 0.00158255
[2200]	cv_agg's auc: 0.785584 + 0.00154804
[2300]	cv_agg's auc: 0.785795 + 0.00151811
[2400]	cv_agg's auc: 0.785

In [97]:
cv_result.tail()

Unnamed: 0,auc-mean,auc-stdv
5193,0.787511,0.001523
5194,0.787513,0.001523
5195,0.787514,0.001524
5196,0.787513,0.001525
5197,0.787519,0.001522


### For submission

In [None]:
lgb_model = lgb.LGBMClassifier(n_estimators=4497, 
                         num_leaves=113, 
                         learning_rate=0.005, 
                         subsample=0.5, 
                         colsample_bytree=0.5, 
                         reg_lambda=0.9)

lgb_model.fit(df[lgb_cols], y)

## Score test data and generate submission

In [None]:
df = pd.read_csv(path + "preprocessed_test.csv")

In [None]:
df["NAME_INCOME_TYPE_Maternity_leave"] = 0

In [None]:
submission = pd.DataFrame({"SK_ID_CURR": df["SK_ID_CURR"], 
                           "TARGET": lgb_model.predict_proba(df[lgb_cols])[:,1]})

submission.head()

In [None]:
submission.to_csv(path + "submission.csv", index=False)