# Home Credit Preprocessing

In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
import pickle
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
import warnings
import gc

%load_ext autotime

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None
gc.enable()
path = "/Users/dsaxton/home_credit_default/"

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 13.4 ms


# Previous application

#### Aggregation function

In [63]:
def previous_agg_func(g):
    mask6 = g["DAYS_DECISION"] >= -180
    mask12 = g["DAYS_DECISION"] >= -360
    mask24 = g["DAYS_DECISION"] >= -720

    d = {
        "AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M": np.nanmean((g["AMT_CREDIT"] / g["AMT_ANNUITY"]).where(mask6)), 
        "MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M": np.nanmin((g["AMT_CREDIT"] / g["AMT_ANNUITY"]).where(mask6)), 
        "MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M": np.nanmax((g["AMT_CREDIT"] / g["AMT_ANNUITY"]).where(mask6)), 
        "AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M": np.nanmean((g["AMT_CREDIT"] / g["AMT_GOODS_PRICE"]).where(mask6)), 
        "MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M": np.nanmax((g["AMT_CREDIT"] / g["AMT_GOODS_PRICE"]).where(mask6)), 
        "AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M": np.nanmean((g["AMT_CREDIT"] + g["AMT_ANNUITY"]).where(mask6)), 
        "MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M": np.nanmin((g["AMT_CREDIT"] + g["AMT_ANNUITY"]).where(mask6)), 
        "COUNT_NAME_CLIENT_TYPE_REPEATER_12M": np.nansum((g["NAME_CLIENT_TYPE"] == "Repeater").where(mask12)), 
        "COUNT_NAME_CLIENT_TYPE_NEW_12M": np.nansum((g["NAME_CLIENT_TYPE"] == "New").where(mask12)), 
        "SUM_NAME_PAYMENT_TYPE_XNA_6M": np.nansum((g["NAME_PAYMENT_TYPE"] == "XNA").where(mask6)), 
        "SUM_NAME_SELLER_INDUSTRY_CSTR_6M": np.nansum((g["NAME_SELLER_INDUSTRY"] == "Construction").where(mask6)), 
        "SUM_NAME_SELLER_INDUSTRY_XNA_6M": np.nansum((g["NAME_SELLER_INDUSTRY"] == "XNA").where(mask6)), 
        "SUM_NAME_GOODS_CATEGORY_XNA_6M": np.nansum((g["NAME_GOODS_CATEGORY"] == "XNA").where(mask6)), 
        "SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M": np.nansum((g["PRODUCT_COMBINATION"] == "POS mobile with interest").where(mask12)),
        "SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M": np.nansum((g["PRODUCT_COMBINATION"] == "POS household with interest").where(mask12)), 
        "SUM_REFUSED_CONTRACT_6M": np.nansum((g["NAME_CONTRACT_STATUS"] == "Refused").where(mask12)), 
        "AVG_RATE_INTEREST_PRIMARY_12M": np.nanmean(g["RATE_INTEREST_PRIMARY"].where(mask12)), 
        "MAX_RATE_INTEREST_PRIMARY_12M": np.nanmax(g["RATE_INTEREST_PRIMARY"].where(mask12)), 
        "MIN_RATE_INTEREST_PRIMARY_12M": np.nanmin(g["RATE_INTEREST_PRIMARY"].where(mask12)), 
        "AVG_RATE_INTEREST_PRIVILEGED_12M": np.nanmean(g["RATE_INTEREST_PRIVILEGED"].where(mask12)), 
        "MAX_UTILIZATION_3M": np.nanmax((g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]).where(mask3)),
        "MIN_PREV_AMT_ANNUITY_12M": np.nanmin(g["AMT_ANNUITY"].where(mask12)), 
        "MIN_PREV_AMT_ANNUITY_24M": np.nanmin(g["AMT_ANNUITY"].where(mask24)), 
        "MIN_PREV_PROP_APPROVED_12M": np.nanmin((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "AVG_SYNTH_TARGET_12M": np.nanmean(g["SYNTHETIC_TARGET"].where(mask12)), 
        "AVG_PREV_PROP_APPROVED_12M": np.nanmean((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "AVG_PREV_PROP_APPROVED_24M": np.nanmean((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask24)), 
        "MAX_PREV_PROP_APPROVED_12M": np.nanmax((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "MAX_PREV_PROP_APPROVED_24M": np.nanmax((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask24)), 
        "COUNT_PREV_APP": len(g), 
        "MIN_PREV_DAYS_TERMINATION": np.nanmin(g["DAYS_TERMINATION"]), 
        "MAX_PREV_DAYS_TERMINATION": np.nanmax(g["DAYS_TERMINATION"]), 
        "AVG_PREV_DAYS_TERMINATION": np.nanmean(g["DAYS_TERMINATION"]), 
        "RANGE_PREV_DAYS_TERMINATION": np.nanmax(g["DAYS_TERMINATION"]) - np.nanmin(g["DAYS_TERMINATION"]),  
        "MIN_PREV_AMT_CREDIT": np.nanmin(g["AMT_CREDIT"]),
        "MAX_PREV_AMT_CREDIT": np.nanmax(g["AMT_CREDIT"]),
        "AVG_PREV_AMT_CREDIT": np.nanmean(g["AMT_CREDIT"]),
        "MIN_PREV_AMT_CREDIT_WEIGHTED": np.nanmin(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])),
        "MAX_PREV_AMT_CREDIT_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])),
        "AVG_PREV_AMT_CREDIT_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])), 
        "MIN_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmin(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "MAX_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmax(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "AVG_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmean(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmin(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),
        "MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),
        "AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),  
        "MIN_PREV_AMT_ANNUITY": np.nanmin(g["AMT_ANNUITY"]), 
        "MAX_PREV_AMT_ANNUITY": np.nanmax(g["AMT_ANNUITY"]), 
        "AVG_PREV_AMT_ANNUITY": np.nanmean(g["AMT_ANNUITY"]), 
        "MIN_PREV_AMT_ANNUITY_WEIGHTED": np.nanmin(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_AMT_ANNUITY_WEIGHTED": np.nanmax(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_AMT_ANNUITY_WEIGHTED": np.nanmean(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "MIN_DAYS_DECISION": np.nanmin(g["DAYS_DECISION"]), 
        "MAX_DAYS_DECISION": np.nanmax(g["DAYS_DECISION"]), 
        "RANGE_DAYS_DECISION": np.nanmax(g["DAYS_DECISION"]) - np.nanmin(g["DAYS_DECISION"]),
        "SUM_DAYS_LAST_DUE_NULL": np.nansum(g["DAYS_LAST_DUE"].isnull()), 
        "AVG_DAYS_LAST_DUE_NULL": np.nanmean(g["DAYS_LAST_DUE"].isnull()), 
        "AVG_PREV_REQ_AMOUNT_WEIGHTED": np.nanmean(g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_REQ_AMOUNT_WEIGHTED": np.nanmax(g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_REQ_AMOUNT": np.nanmean(g["AMT_APPLICATION"]), 
        "MAX_PREV_REQ_AMOUNT": np.nanmax(g["AMT_APPLICATION"]), 
        "AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED": np.nanmean(g["RATE_DOWN_PAYMENT"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_PROP_APPROVED_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_PROP_APPROVED_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_RATE_DOWNPAYMENT": np.nanmean(g["RATE_DOWN_PAYMENT"]), 
        "AVG_PREV_PROP_APPROVED": np.nanmean(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "MAX_PREV_PROP_APPROVED": np.nanmax(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "MIN_PREV_PROP_APPROVED": np.nanmin(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "AVG_PREV_REQ_AMOUNT": np.nanmean(g["AMT_APPLICATION"]), 
        "MAX_PREV_REQ_AMOUNT": np.nanmax(g["AMT_APPLICATION"]), 
        "AVG_PREV_RATE_DOWNPAYMENT": np.nanmean(g["RATE_DOWN_PAYMENT"]), 
        "AVG_PREV_INT_RATE": np.nanmean(g["RATE_INTEREST_PRIMARY"]), 
        "SUM_PREV_URGENT_NEEDS": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Urgent needs"), 
        "SUM_PREV_REPAIRS": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Repairs"), 
        "SUM_PREV_OTHER": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Other"), 
        "SUM_PREV_LIMIT_REJECT": np.nansum(g["CODE_REJECT_REASON"] == "LIMIT"), 
        "SUM_REFUSED_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Refused"), 
        "SUM_CANC_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Canceled"), 
        "SUM_APPR_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Approved"), 
        "SUM_PREV_HC_REJECT": np.nansum(g["CODE_REJECT_REASON"] == "HC"), 
        "SUM_PREV_INSURE_REQ": np.nansum(g["NFLAG_INSURED_ON_APPROVAL"]), 
        "COUNT_PREV_WALK_IN": np.nansum(g["NAME_PRODUCT_TYPE"] == "walk-in"), 
        "COUNT_PREV_HIGH_YIELD": np.nansum(g["NAME_YIELD_GROUP"] == "high"), 
        "COUNT_PREV_LOW_YIELD": np.nansum(g["NAME_YIELD_GROUP"].apply(lambda x: x.startswith("low"))), 
        "AVG_SYNTH_TARGET": np.nanmean(g["SYNTHETIC_TARGET"]), 
        "SUM_SYNTH_TARGET_WEIGHTED": np.nansum(g["SYNTHETIC_TARGET"] / abs(g["DAYS_DECISION"])), 
        "SUM_SYNTH_TARGET": np.nansum(g["SYNTHETIC_TARGET"]), 
        "MAX_SYNTH_TARGET": np.nanmax(g["SYNTHETIC_TARGET"]), 
        "MIN_SYNTH_TARGET": np.nanmin(g["SYNTHETIC_TARGET"]), 
        "RANGE_SYNTH_TARGET": np.nanmax(g["SYNTHETIC_TARGET"]) - np.min(g["SYNTHETIC_TARGET"]), 
        "SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE": np.nansum(g["DAYS_LAST_DUE_1ST_VERSION"] == g["DAYS_LAST_DUE"]), 
        "SUM_DAYS_FIRST_DRAWING_SENTINEL": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"]), 
        "SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"] / abs(g["DAYS_DECISION"])), 
        "MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED": np.nanmax(g["DAYS_FIRST_DRAWING_SENTINEL"] / abs(g["DAYS_DECISION"])),         
        "SUM_DAYS_LAST_DUE_LT_FIRST_VERSION": np.nansum(g["DAYS_LAST_DUE"] < g["DAYS_LAST_DUE_1ST_VERSION"]), 
    }

    return pd.Series(d)

time: 191 ms


#### Process data and write to file

In [64]:
previous_application = pd.read_csv(path + "previous_application.csv")

with open(path + "linear_model.pkl", "rb") as f:
    clf = pickle.load(f)

impute = Imputer(strategy="median")
scale = StandardScaler()

cols = ["AMT_ANNUITY", 
        "AMT_CREDIT", 
        "AMT_GOODS_PRICE", 
        "HOUR_APPR_PROCESS_START", 
        "NAME_CONTRACT_TYPE", 
        "NAME_TYPE_SUITE", 
        "WEEKDAY_APPR_PROCESS_START"]

prev_temp = pd.get_dummies(previous_application[cols])

dummy_cols = ["AMT_CREDIT",
              "AMT_GOODS_PRICE",
              "HOUR_APPR_PROCESS_START",
              "NAME_CONTRACT_TYPE_Cash loans",
              "NAME_CONTRACT_TYPE_Revolving loans",
              "NAME_TYPE_SUITE_Children",
              "NAME_TYPE_SUITE_Family",
              "NAME_TYPE_SUITE_Group of people",
              "NAME_TYPE_SUITE_Other_A",
              "NAME_TYPE_SUITE_Other_B",
              "NAME_TYPE_SUITE_Spouse, partner",
              "NAME_TYPE_SUITE_Unaccompanied",
              "WEEKDAY_APPR_PROCESS_START_FRIDAY",
              "WEEKDAY_APPR_PROCESS_START_MONDAY",
              "WEEKDAY_APPR_PROCESS_START_SATURDAY",
              "WEEKDAY_APPR_PROCESS_START_SUNDAY",
              "WEEKDAY_APPR_PROCESS_START_THURSDAY",
              "WEEKDAY_APPR_PROCESS_START_TUESDAY",
              "WEEKDAY_APPR_PROCESS_START_WEDNESDAY"]

previous_application["SYNTHETIC_TARGET"] = clf.predict_proba(scale.fit_transform(impute.fit_transform(prev_temp[dummy_cols])))[:,1]
previous_application["DAYS_FIRST_DRAWING_SENTINEL"] = (previous_application["DAYS_FIRST_DRAWING"] == 365243).astype(int)
previous_application["DAYS_FIRST_DUE_SENTINEL"] = (previous_application["DAYS_FIRST_DUE"] == 365243).astype(int)
previous_application["DAYS_LAST_DUE_1ST_VERSION_SENTINEL"] = (previous_application["DAYS_LAST_DUE_1ST_VERSION"] == 365243).astype(int)
previous_application["DAYS_LAST_DUE_SENTINEL"] = (previous_application["DAYS_LAST_DUE"] == 365243).astype(int)
previous_application["DAYS_TERMINATION_SENTINEL"] = (previous_application["DAYS_TERMINATION"] == 365243).astype(int)

previous_agg = previous_application.groupby("SK_ID_CURR").apply(previous_agg_func).reset_index()

previous_agg.to_csv(path + "previous_agg.csv", index=False, header=True)
del prev_temp, previous_application, previous_agg
gc.collect()

65

time: 1h 49min 26s


# Bureau Balance

#### Aggregation function

In [36]:
def bureau_balance_agg_func(g):
    mask6 = g["MONTHS_BALANCE"] >= -6
    mask12 = g["MONTHS_BALANCE"] >= -12
    closed = g["STATUS"] == "C"

    d = {
        "WORST_DQ_BUREAU_BALANCE_6M": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x)).where(mask6)), 
        "WORST_DQ_BUREAU_BALANCE_12M": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x)).where(mask12)), 
        "LEN_BUREAU_BALANCE": np.nansum(~closed), 
        "SUM_CLOSED_BUREAU_BALANCE": np.nansum(closed), 
        "SUM_CURRENT_BUREAU_BALANCE": np.nansum(g["STATUS"] == "0"), 
        "SUM_DQ_BUREAU_BALANCE": np.nansum(g["STATUS"].isin(["1", "2", "3", "3", "4", "5"])),
        "WORST_DQ_BUREAU_BALANCE": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x))), 
        "AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nansum(abs(g["MONTHS_BALANCE"]).where(~closed)) / np.nansum(~closed), 
        "MIN_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["MONTHS_BALANCE"].where(~closed)), 
        "MAX_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["MONTHS_BALANCE"].where(~closed)), 
    }

    return pd.Series(d)

time: 25.5 ms


#### Process data and write to file

In [None]:
bureau_balance = pd.read_csv(path + "bureau_balance.csv")

bureau_balance["STATUS"] = bureau_balance["STATUS"].where(lambda x: x != "X").fillna("0")

bureau_balance_agg = bureau_balance.groupby("SK_ID_BUREAU").apply(bureau_balance_agg_func).reset_index()
bureau_balance_agg.to_csv(path + "bureau_balance_agg.csv", index=False, header=True)
del bureau_balance, bureau_balance_agg
gc.collect()

# Bureau

This is dependent on `bureau_balance_agg`

#### Aggregation function

In [89]:
def bureau_agg_func(g):
    mask3 = g["DAYS_CREDIT_UPDATE"] >= -90
    mask6 = g["DAYS_CREDIT_UPDATE"] >= -180
    mask12 = g["DAYS_CREDIT_UPDATE"] >= -360
    mask24 = g["DAYS_CREDIT_UPDATE"] >= -720
    active = g["CREDIT_ACTIVE"] == "Active"
    cc = g["CREDIT_TYPE"] == "Credit card"
    
    d = {
        "SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M": np.nansum((g["AMT_CREDIT_SUM_DEBT"] / g["DAYS_CREDIT_ENDDATE"]).where(active & mask12)),
        "SUM_CC_DEBT_6M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(cc & mask6)), 
        "SUM_CC_DEBT_12M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(cc & mask12)), 
        "MAX_WORST_DQ_BUREAU_BALANCE_6M": np.nanmax(g["WORST_DQ_BUREAU_BALANCE_6M"].where(mask6)), 
        "MAX_WORST_DQ_BUREAU_BALANCE_12M": np.nanmax(g["WORST_DQ_BUREAU_BALANCE_12M"].where(mask12)), 
        "MAX_BUREAU_UTILIZATION_6M": np.nanmax((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM"]).where(mask6)), 
        "MAX_BUREAU_UTILIZATION_12M": np.nanmax((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM"]).where(mask12)), 
        "COUNT_ACTIVE_6M": np.nansum((g["CREDIT_ACTIVE"] == "Active").where(active & mask6)), 
        "COUNT_ACTIVE_12M": np.nansum((g["CREDIT_ACTIVE"] == "Active").where(active & mask12)), 
        "COUNT_ACTIVE_24M": np.nansum((g["CREDIT_ACTIVE"] == "Active").where(active & mask24)), 
        "DAYS_REMAINING_ACTIVE": np.nansum(g["DAYS_CREDIT_ENDDATE"].where(active)), 
        "MAX_CREDIT_DAY_OVERDUE_6M": np.nanmax(g["CREDIT_DAY_OVERDUE"].where(mask6)), 
        "MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M": np.nanmax(g["CREDIT_DAY_OVERDUE"].where(mask6)) - np.nanmax(g["CREDIT_DAY_OVERDUE"].where(mask6 ^ mask12)), 
        "BUREAU_UTILIZATION_DIFF_6M_12M": np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & mask6)) - np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & (mask6 ^ mask12))), 
        "BUREAU_UTILIZATION_DIFF_12M_24M": np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & mask12)) - np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & (mask6 ^ mask24))), 
        "BUREAU_SUM_DEBT_DIFF_6M_12M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & mask6)) - np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & (mask6 ^ mask12))),
        "BUREAU_SUM_DEBT_DIFF_12M_24M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & mask6)) - np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & (mask6 ^ mask12))),         
        "MAX_CNT_CREDIT_PROLONG": np.nanmax(g["CNT_CREDIT_PROLONG"]), 
        "AVG_LEN_BUREAU_BALANCE": np.nanmean(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CURRENT": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CLOSED": np.nansum(g["SUM_CLOSED_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CURRENT_WEIGHTED": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]) / np.nansum(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]) - np.nanmin(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "SUM_SUM_CURRENT_BUREAU_BALANCE": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]), 
        "AVG_PROP_CURRENT": np.nanmean(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ": np.nanmean(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ": np.nanmax(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]),
        "AVG_PROP_CURRENT_WEIGHTED": np.nanmean(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT_WEIGHTED": np.nanmin(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ_WEIGHTED": np.nanmean(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ_WEIGHTED": np.nanmax(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_CURRENT_WEIGHTED_AMT": np.nanmean(g["AMT_CREDIT_SUM"] * g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT_WEIGHTED_AMT": np.nanmin(g["AMT_CREDIT_SUM"] * g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ_WEIGHTED_AMT": np.nanmean(g["AMT_CREDIT_SUM"] * g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ_WEIGHTED_AMT": np.nanmax(g["AMT_CREDIT_SUM"] * g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]),          
        "AVG_WORST_DQ_BUREAU_BALANCE": np.nanmean(g["WORST_DQ_BUREAU_BALANCE"]), 
        "MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED": np.nanmax(g["WORST_DQ_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED": np.nanmean(g["WORST_DQ_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "TOTAL_AMT_CREDIT_SUM_POS_DAYS": np.nansum(g["AMT_CREDIT_SUM"].where(g["DAYS_CREDIT_ENDDATE"] > 0)),
        "SUM_DAYS_CREDIT_ENDDATE_POS_DAYS": np.nansum(g["DAYS_CREDIT_ENDDATE"].where(g["DAYS_CREDIT_ENDDATE"] > 0)), 
        "MAX_LEN_BUREAU_BALANCE": np.nanmax(g["LEN_BUREAU_BALANCE"]), 
        "SUM_LEN_BUREAU_BALANCE": np.nanmax(g["LEN_BUREAU_BALANCE"]), 
        "MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["MIN_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_DAYS_CREDIT_ENDDATE": np.nanmin(g["DAYS_CREDIT_ENDDATE"]), 
        "MAX_DAYS_CREDIT_ENDDATE": np.nanmax(g["DAYS_CREDIT_ENDDATE"]), 
        "SUM_DAYS_CREDIT_ENDDATE": np.nansum(g["DAYS_CREDIT_ENDDATE"]), 
        "SUM_NULL_DAYS_ENDDATE_FACT": np.nansum(g["DAYS_ENDDATE_FACT"].isnull()), 
        "COUNT_BUREAU_RECORDS": len(g), 
        "COUNT_ACTIVE": np.nansum(active), 
        "MAX_CREDIT_DAY_OVERDUE_WEIGHTED": np.nanmax(g["CREDIT_DAY_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_CREDIT_DAY_OVERDUE_WEIGHTED": np.nansum(g["CREDIT_DAY_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "MAX_CREDIT_DAY_OVERDUE": np.nanmax(g["CREDIT_DAY_OVERDUE"]), 
        "SUM_CREDIT_DAY_OVERDUE": np.nansum(g["CREDIT_DAY_OVERDUE"]), 
        "DAYS_SINCE_APPLIED": - np.nanmax(g["DAYS_CREDIT"]), 
        "SUM_INVERSE_DAYS_CREDIT": - np.nansum(1 / g["DAYS_CREDIT"]), 
        "MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED": np.nanmax(g["AMT_CREDIT_MAX_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED": np.nansum(g["AMT_CREDIT_MAX_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "MAX_AMT_CREDIT_MAX_OVERDUE": np.nanmax(g["AMT_CREDIT_MAX_OVERDUE"]), 
        "SUM_AMT_CREDIT_MAX_OVERDUE": np.nansum(g["AMT_CREDIT_MAX_OVERDUE"]), 
        "SUM_CNT_CREDIT_PROLONG": np.nansum(g["CNT_CREDIT_PROLONG"]), 
        "SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED": np.nansum(g["AMT_CREDIT_SUM_DEBT"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_AMT_CREDIT_SUM_DEBT": np.nansum(g["AMT_CREDIT_SUM_DEBT"]),
        "BUREAU_UTILIZATION_AVG": np.nanmean(g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]), 
        "BUREAU_UTILIZATION_MAX": np.nanmax(g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]), 
        "BUREAU_PROP_SUM_OVERDUE_AVG": np.nanmean(g["AMT_CREDIT_SUM_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]), 
        "BUREAU_PROP_MAX_OVERDUE_AVG": np.nanmean(g["AMT_CREDIT_MAX_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]), 
        "MAX_DAYS_CREDIT_UPDATE": np.nanmax(g["DAYS_CREDIT_UPDATE"]), 
        "RANGE_DAYS_CREDIT_UPDATE": np.nanmax(g["DAYS_CREDIT_UPDATE"]) - np.nanmin(g["DAYS_CREDIT_UPDATE"]), 
        "DAYS_CREDIT_RANGE": np.nanmax(g["DAYS_CREDIT"]) - np.nanmin(g["DAYS_CREDIT"]), 
        "TOTAL_AMT_CREDIT_SUM_WEIGHTED": np.nansum(g["AMT_CREDIT_SUM"] / abs(g["DAYS_CREDIT_UPDATE"])),
        "TOTAL_AMT_CREDIT_SUM": np.nansum(g["AMT_CREDIT_SUM"]),
        "COUNT_CREDIT_CARD": np.nansum(g["CREDIT_TYPE"] == "Credit card"), 
        "COUNT_CAR_LOAN": np.nansum(g["CREDIT_TYPE"] == "Car loan"), 
        "COUNT_MORTGAGE": np.nansum(g["CREDIT_TYPE"] == "Mortgage"), 
        "SUM_AMT_ANNUITY": np.nansum(g["AMT_ANNUITY"]), 
    }
    
    return pd.Series(d)

time: 433 ms


#### Process data and write to file

In [90]:
bureau = pd.read_csv(path + "bureau.csv")
bureau_balance_agg = pd.read_csv(path + "bureau_balance_agg.csv")

bureau_joined = pd.merge(bureau, 
                         bureau_balance_agg, 
                         how="left", 
                         on="SK_ID_BUREAU")

bureau_agg = bureau_joined.groupby("SK_ID_CURR").apply(bureau_agg_func).reset_index()
bureau_agg.to_csv(path + "bureau_agg.csv", index=False, header=True)

time: 4h 9min 27s


# Credit card

#### Aggregation function

In [34]:
def credit_card_agg_func(g):
    mask3 = g["MONTHS_BALANCE"] >= -3
    mask6 = g["MONTHS_BALANCE"] >= -6
    mask12 = g["MONTHS_BALANCE"] >= -12
    active = g["NAME_CONTRACT_STATUS"] == "Active"
    overdue = g["SK_DPD"] > 0
    
    d = {
        "MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M": np.nanmax(g["AMT_INST_MIN_REGULARITY"].where(overdue & mask6)), 
        "MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M": np.nanmin(g["AMT_INST_MIN_REGULARITY"].where(overdue & mask12)), 
        "SUM_CNT_DRAWINGS_ATM_CURRENT_6M": np.nansum(g["CNT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "SUM_AMT_DRAWINGS_ATM_CURRENT_6M": np.nansum(g["AMT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "MAX_AMT_DRAWINGS_ATM_CURRENT_6M": np.nanmax(g["AMT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "MAX_CNT_DRAWINGS_ATM_CURRENT_6M": np.nanmax(g["CNT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M": np.nanmax((g["AMT_RECIVABLE"] / g["AMT_RECEIVABLE_PRINCIPAL"]).where(mask6)), 
        "MAX_UTILIZATION_6M": np.nanmax((g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]).where(mask6)),
        "MAX_CREDIT_CARD_SK_DPD_6M": np.nanmax(g["SK_DPD"].where(mask6)), 
        "MAX_CREDIT_CARD_SK_DPD_12M": np.nanmax(g["SK_DPD"].where(mask12)),
        "MAX_AMT_DRAWINGS_CURRENT_6M": np.nanmax(g["AMT_DRAWINGS_CURRENT"].where(mask6)), 
        "MAX_AMT_DRAWINGS_CURRENT_12M": np.nanmax(g["AMT_DRAWINGS_CURRENT"].where(mask12)), 
        "MAX_AMT_INST_MIN_REGULARITY_6M": np.nanmax(g["AMT_INST_MIN_REGULARITY"].where(mask6)), 
        "MAX_AMT_INST_MIN_REGULARITY_12M": np.nanmax(g["AMT_INST_MIN_REGULARITY"].where(mask12)), 
        "MAX_CNT_DRAWINGS_POS_CURRENT_6M": np.nanmax(g["CNT_DRAWINGS_POS_CURRENT"].where(mask6)), 
        "MAX_CNT_DRAWINGS_POS_CURRENT_12M": np.nanmax(g["CNT_DRAWINGS_POS_CURRENT"].where(mask12)), 
        "SUM_CC_PAYMENT_DIFF_12M": np.nansum((g["AMT_PAYMENT_TOTAL_CURRENT"] - g["AMT_INST_MIN_REGULARITY"]).where(mask12)),
        "DIFF_AVG_BALANCE_6M_12M": np.nanmean(g["AMT_BALANCE"].where(mask6)) - np.nanmean(g["AMT_BALANCE"].where(mask6 ^ mask12)),
        "AVG_BALANCE_6M": np.nanmean(g["AMT_BALANCE"].where(mask6)),
        "AVG_UTILIZATION_6M": np.nanmean((g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]).where(mask6)),
        "AVG_BALANCE": np.nanmean(g["AMT_BALANCE"]), 
        "MAX_BALANCE": np.nanmax(g["AMT_BALANCE"]), 
        "SUM_BALANCE": np.nansum(g["AMT_BALANCE"]), 
        "MAX_MONTHS_BALANCE": np.nanmax(abs(g["MONTHS_BALANCE"])), 
        "MIN_MONTHS_BALANCE": np.nanmin(abs(g["MONTHS_BALANCE"])), 
        "RANGE_MONTHS_BALANCE": np.nanmax(g["MONTHS_BALANCE"]) - np.nanmin(g["MONTHS_BALANCE"]), 
        "AVG_UTILIZATION": np.nanmean(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]), 
        "MAX_UTILIZATION": np.nanmax(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]), 
        "AVG_BALANCE_WEIGHTED": np.nanmean(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_BALANCE_WEIGHTED": np.nanmax(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "SUM_BALANCE_WEIGHTED": np.nansum(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "AVG_UTILIZATION_WEIGHTED": np.nanmean(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_UTILIZATION_WEIGHTED": np.nanmax(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_DPD_WEIGHTED": np.nanmax(g["SK_DPD"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_DPD_DEF_WEIGHTED": np.nanmax(g["SK_DPD_DEF"] / abs(g["MONTHS_BALANCE"])), 
        "SUM_CNT_DRAWINGS_CURRENT": np.nansum(g["CNT_DRAWINGS_CURRENT"]), 
        "AVG_CNT_DRAWINGS_CURRENT": np.nanmean(g["CNT_DRAWINGS_CURRENT"]), 
        "MAX_CNT_DRAWINGS_CURRENT": np.nanmax(g["CNT_DRAWINGS_CURRENT"]), 
        "SUM_AMT_DRAWINGS_CURRENT": np.nansum(g["AMT_DRAWINGS_CURRENT"]), 
        "AVG_AMT_DRAWINGS_CURRENT": np.nanmean(g["AMT_DRAWINGS_CURRENT"]), 
        "MAX_AMT_DRAWINGS_CURRENT": np.nanmax(g["AMT_DRAWINGS_CURRENT"]), 
        "MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmin(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
        "AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmean(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
        "MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmax(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
    }
    
    return pd.Series(d)

time: 49 ms


#### Process data and write to file

In [35]:
credit_card = pd.read_csv(path + "credit_card_balance.csv")

credit_card_agg = credit_card.groupby("SK_ID_CURR").apply(credit_card_agg_func).reset_index()
credit_card_agg.to_csv(path + "credit_card_agg.csv", index=False, header=True)

time: 34min 6s


# Installments

#### Aggregation function

In [10]:
def installment_agg_func(g):
    mask6 = g["DAYS_ENTRY_PAYMENT"] >= -180
    mask12 = g["DAYS_ENTRY_PAYMENT"] >= -360
    
    d = {
        "MAX_UNDERPAYMENT_6M": np.nanmax((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask6)), 
        "MAX_UNDERPAYMENT_12M": np.nanmax((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask12)), 
        "SUM_PAYMENT_6M": np.nansum(g["AMT_PAYMENT"].where(mask6)), 
        "SUM_PAYMENT_DIFF_6M_12M": np.nansum(g["AMT_PAYMENT"].where(mask6)) - np.nansum(g["AMT_PAYMENT"].where(mask6 ^ mask12)), 
        "MAX_AMT_INSTALMENT_6M": np.nanmax(g["AMT_INSTALMENT"].where(mask6)), 
        "MIN_AMT_INSTALMENT_6M": np.nanmin(g["AMT_INSTALMENT"].where(mask6)), 
        "MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M": np.nanmax((g["DAYS_ENTRY_PAYMENT"] - g["DAYS_INSTALMENT"])), 
        "MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M": np.nanmin((g["DAYS_ENTRY_PAYMENT"] - g["DAYS_INSTALMENT"])), 
        "SUM_UNDERPAYMENT_12M": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask12)), 
        "SUM_UNDERPAYMENT_6M": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask6)), 
        "MAX_PAYMENT_SIZE_6M": np.nanmax(g["AMT_PAYMENT"].where(mask6)), 
        "MAX_PAYMENT_SIZE_12M": np.nanmax(g["AMT_PAYMENT"].where(mask12)), 
        "MIN_PAYMENT_SIZE_6M": np.nanmin(g["AMT_PAYMENT"].where(mask6)),
        "MAX_ABS_DAYS_INSTALMENT": np.nanmax(abs(g["DAYS_INSTALMENT"])), 
        "COUNT_UNDERPAYMENT": np.nansum(g["AMT_PAYMENT"] / g["AMT_INSTALMENT"] < 0.5), 
        "SUM_UNDERPAYMENT": np.nansum(g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]), 
        "SUM_UNDERPAYMENT_WEIGHTED": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]) / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MAX_UNDERPAYMENT": np.nanmax(g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]), 
        "AVG_PAYMENT_SIZE_WEIGHTED": np.nanmean(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "AVG_PAYMENT_SIZE": np.nanmean(g["AMT_PAYMENT"]), 
        "MAX_PAYMENT_SIZE_WEIGHTED": np.nanmax(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MAX_PAYMENT_SIZE": np.nanmax(g["AMT_PAYMENT"]), 
        "MIN_PAYMENT_SIZE_WEIGHTED": np.nanmin(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MIN_PAYMENT_SIZE": np.nanmin(g["AMT_PAYMENT"]),
        "SUM_PAYMENT_WEIGHTED": np.nansum(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "SUM_PAYMENT": np.nansum(g["AMT_PAYMENT"]),
        "SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT": np.nansum(g["DAYS_ENTRY_PAYMENT"] > g["DAYS_INSTALMENT"]), 
        "MAX_DAYS_ENTRY_PAYMENT": np.nanmax(g["DAYS_ENTRY_PAYMENT"]), 
        "MIN_DAYS_ENTRY_PAYMENT": np.nanmin(g["DAYS_ENTRY_PAYMENT"]), 
        "RANGE_DAYS_ENTRY_PAYMENT": np.nanmax(g["DAYS_ENTRY_PAYMENT"]) - np.nanmin(g["DAYS_ENTRY_PAYMENT"]), 
    }
    
    return pd.Series(d)

#### Process data and write to file

In [12]:
installments = pd.read_csv(path + "installments_payments.csv")

installment_agg = installments.groupby("SK_ID_CURR").apply(installment_agg_func).reset_index()
installment_agg.to_csv(path + "installment_agg.csv", index=False, header=True)

# Point of Sale

#### Aggregation function

In [13]:
def pos_cash_agg_func(g):
    mask3 = g["MONTHS_BALANCE"] >= -3
    mask6 = g["MONTHS_BALANCE"] >= -6
    mask12 = g["MONTHS_BALANCE"] >= -12
    overdue = g["SK_DPD"] > 0
    
    d = {
        "MIN_CNT_INSTALMENT_FUTURE_6M": np.nanmin(g["CNT_INSTALMENT_FUTURE"].where(mask6)), 
        "MAX_CNT_INSTALMENT_FUTURE_6M": np.nanmax(g["CNT_INSTALMENT_FUTURE"].where(mask6)), 
        "MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_12M": np.nanmax((g["CNT_INSTALMENT_FUTURE"] * g["SK_DPD"]).where(mask12)), 
        "MAX_POS_DPD": np.nanmax(g["SK_DPD"]), 
        "MAX_POS_DPD_DEF": np.nanmax(g["SK_DPD_DEF"]), 
        "NUM_POS_CASH": g["SK_ID_PREV"].nunique(), 
    }
    
    return pd.Series(d)

#### Process data and write to file

In [14]:
pos_cash = pd.read_csv(path + "POS_CASH_balance.csv")

pos_cash_agg = pos_cash.groupby("SK_ID_CURR").apply(pos_cash_agg_func).reset_index()
pos_cash_agg.to_csv(path + "pos_cash_agg.csv", index=False, header=True)

# Join all files

In [322]:
train_or_test = "train"

time: 840 µs


In [323]:
application = pd.read_csv(path + "application_" + train_or_test + ".csv")
previous_agg = pd.read_csv(path + "previous_agg.csv")
# bureau_balance_agg should already be joined with bureau_agg
bureau_agg = pd.read_csv(path + "bureau_agg.csv")
credit_card_agg = pd.read_csv(path + "credit_card_agg.csv")
installment_agg = pd.read_csv(path + "installment_agg.csv")
pos_cash_agg = pd.read_csv(path + "pos_cash_agg.csv")

df = pd.merge(application, previous_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, bureau_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, credit_card_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, installment_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, pos_cash_agg, how="left", on="SK_ID_CURR")
del previous_agg, bureau_agg, credit_card_agg, installment_agg, pos_cash_agg
gc.collect()

109

time: 55 s


#### Construct interaction features

In [324]:
df["TOTAL_AMT_CREDIT_SUM_DIV_SUM_DAYS_CREDIT_ENDDATE"] = df["TOTAL_AMT_CREDIT_SUM"] / df["SUM_DAYS_CREDIT_ENDDATE"]
df["TOTAL_AMT_CREDIT_SUM_POS_DAYS_DIV_SUM_DAYS_CREDIT_ENDDATE_POS_DAYS"] = df["TOTAL_AMT_CREDIT_SUM_POS_DAYS"] / df["SUM_DAYS_CREDIT_ENDDATE_POS_DAYS"]
df["MAX_ABS_DAYS_INSTALMENT_DIV_DAYS_BIRTH"] = df["MAX_ABS_DAYS_INSTALMENT"] / df["DAYS_BIRTH"]
df["FLAG_OWN_CAR"] = (df["FLAG_OWN_CAR"] == "Y").astype(int)
df["FLAG_OWN_REALTY"] = (df["FLAG_OWN_REALTY"] == "Y").astype(int)
df["AMT_CREDIT_DIV_AMT_INCOME_TOTAL"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_PLUS_AMT_INCOME_TOTAL"] = df["AMT_CREDIT"] + df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_DIV_AMT_GOODS_PRICE"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"]
df["AMT_CREDIT_DIV_SUM_PAYMENT"] = df["AMT_CREDIT"] / df["SUM_PAYMENT"]
df["AMT_GOODS_PRICE_DIV_AMT_INCOME_TOTAL"] = df["AMT_GOODS_PRICE"] / df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_DIV_AMT_ANNUITY"] = df["AMT_CREDIT"] / df["AMT_ANNUITY"]
df["AMT_CREDIT_DIV_AVG_PREV_REQ_AMOUNT"] = df["AMT_CREDIT"] / df["AVG_PREV_REQ_AMOUNT"]
df["AMT_CREDIT_DIV_MAX_PREV_REQ_AMOUNT"] = df["AMT_CREDIT"] / df["MAX_PREV_REQ_AMOUNT"]
df["EXT_SOURCE_PROD"] = df["EXT_SOURCE_1"] * df["EXT_SOURCE_2"] * df["EXT_SOURCE_3"]
df["DAYS_EMPLOYED_DIV_DAYS_BIRTH"] = df["DAYS_EMPLOYED"] / df["DAYS_BIRTH"]
df["DAYS_EMPLOYED_PLUS_DAYS_REGISTRATION_PLUS_DAYS_LAST_PHONE_CHANGE"] = df["DAYS_EMPLOYED"] + df["DAYS_REGISTRATION"] + df["DAYS_LAST_PHONE_CHANGE"]
df["AVG_PAYMENT_SIZE_DIV_AMT_INCOME_TOTAL"] = df["AVG_PAYMENT_SIZE"] / df["AMT_INCOME_TOTAL"]
df["AVG_PAYMENT_SIZE_DIV_AMT_CREDIT"] = df["AVG_PAYMENT_SIZE"] / df["AMT_CREDIT"]
df["AVG_PAYMENT_SIZE_DIV_AMT_ANNUITY"] = df["AVG_PAYMENT_SIZE"] / df["AMT_ANNUITY"]
df["DAYS_REGISTRATION_PLUS_DAYS_ID_PUBLISH"] = df["DAYS_REGISTRATION"] + df["DAYS_ID_PUBLISH"]
df["SUM_REFUSED_CONTRACT_DIV_SUM_APPR_CONTRACT"] = df["SUM_REFUSED_CONTRACT"] / df["SUM_APPR_CONTRACT"]
df["MAX_UTILIZATION_DIV_AVG_UTILIZATION"] = df["MAX_UTILIZATION"] / df["AVG_UTILIZATION"]
df["MAX_PREV_REQ_AMOUNT_DIV_AMT_CREDIT"] = df["MAX_PREV_REQ_AMOUNT"] / df["AMT_CREDIT"]
df["AMT_INCOME_TOTAL_DIV_DAYS_BIRTH"] = df["AMT_INCOME_TOTAL"] / df["DAYS_BIRTH"]
df["SUM_DAYS_ID_REG_PHONE"] = df["DAYS_ID_PUBLISH"] + df["DAYS_REGISTRATION"] + df["DAYS_LAST_PHONE_CHANGE"]
df["SUM_REQ_CREDIT_YEAR"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"] + df["AMT_REQ_CREDIT_BUREAU_QRT"] + df["AMT_REQ_CREDIT_BUREAU_YEAR"]
df["SUM_REQ_CREDIT_QRT"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"] + df["AMT_REQ_CREDIT_BUREAU_QRT"]
df["SUM_REQ_CREDIT_1M"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"]
df["SUM_REQ_CREDIT_1M_DIV_SUM_REQ_CREDIT_QRT"] = df["SUM_REQ_CREDIT_1M"] / df["SUM_REQ_CREDIT_QRT"]
df["SUM_REQ_CREDIT_QRT_DIV_SUM_REQ_CREDIT_YEAR"] = df["SUM_REQ_CREDIT_QRT"] / df["SUM_REQ_CREDIT_YEAR"]
df["DEF_30_PLUS_60_CNT_SOCIAL_CIRCLE"] = df["DEF_30_CNT_SOCIAL_CIRCLE"] + df["DEF_60_CNT_SOCIAL_CIRCLE"]
df["OWN_CAR_AGE_DIV_DAYS_BIRTH"] = df["OWN_CAR_AGE"] / df["DAYS_BIRTH"]
df["LANDAREA_DIV_TOTALAREA_MODE"] = df["LANDAREA_MODE"] / df["TOTALAREA_MODE"]
df["OWN_CAR_AGE_PLUS_DAYS_BIRTH"] = df["OWN_CAR_AGE"] + df["DAYS_BIRTH"]
df["AMT_ANNUITY_DIV_DAYS_BIRTH"] = df["AMT_ANNUITY"] / df["DAYS_BIRTH"]
df["AMT_ANNUITY_DIV_DAYS_EMPLOYED"] = df["AMT_ANNUITY"] / df["DAYS_EMPLOYED"]
df["AMT_ANNUITY_PROD_DAYS_EMPLOYED"] = df["AMT_ANNUITY"] * df["DAYS_EMPLOYED"]
df["DAYS_REGISTRATION_DIV_DAYS_ID_PUBLISH"] = df["DAYS_REGISTRATION"] / df["DAYS_ID_PUBLISH"]
df["DAYS_REGISTRATION_DIV_DAYS_LAST_PHONE_CHANGE"] = df["DAYS_REGISTRATION"] / df["DAYS_LAST_PHONE_CHANGE"]
df["REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE"] = df["REGION_RATING_CLIENT_W_CITY"] / df["REGION_POPULATION_RELATIVE"]
df["REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE"] = df["REGION_RATING_CLIENT_W_CITY"] * df["REGION_POPULATION_RELATIVE"]
df["SUM_REG_NOT_FLAG"] = df["REG_REGION_NOT_LIVE_REGION"] + df["REG_REGION_NOT_WORK_REGION"] + df["LIVE_REGION_NOT_WORK_REGION"] + df["REG_CITY_NOT_LIVE_CITY"] + df["REG_CITY_NOT_WORK_CITY"] + df["LIVE_CITY_NOT_WORK_CITY"]
df["SUM_AVG_BUILD"] = df["APARTMENTS_AVG"] + df["BASEMENTAREA_AVG"] + df["YEARS_BEGINEXPLUATATION_AVG"] + df["YEARS_BUILD_AVG"] + df["COMMONAREA_AVG"] + df["ELEVATORS_AVG"] + df["ENTRANCES_AVG"] + df["FLOORSMAX_AVG"] + df["FLOORSMIN_AVG"] + df["LANDAREA_AVG"] + df["LIVINGAPARTMENTS_AVG"] + df["LIVINGAREA_AVG"] + df["NONLIVINGAPARTMENTS_AVG"] + df["NONLIVINGAREA_AVG"]
df["SUM_MODE_BUILD"] = df["APARTMENTS_MODE"] + df["BASEMENTAREA_MODE"] + df["YEARS_BEGINEXPLUATATION_MODE"] + df["YEARS_BUILD_MODE"] + df["COMMONAREA_MODE"] + df["ELEVATORS_MODE"] + df["ENTRANCES_MODE"] + df["FLOORSMAX_MODE"] + df["FLOORSMIN_MODE"] + df["LANDAREA_MODE"] + df["LIVINGAPARTMENTS_MODE"] + df["LIVINGAREA_MODE"] + df["NONLIVINGAPARTMENTS_MODE"] + df["NONLIVINGAREA_MODE"]
df["SUM_MEDI_BUILD"] = df["APARTMENTS_MEDI"] + df["BASEMENTAREA_MEDI"] + df["YEARS_BEGINEXPLUATATION_MEDI"] + df["YEARS_BUILD_MEDI"] + df["COMMONAREA_MEDI"] + df["ELEVATORS_MEDI"] + df["ENTRANCES_MEDI"] + df["FLOORSMAX_MEDI"] + df["FLOORSMIN_MEDI"] + df["LANDAREA_MEDI"] + df["LIVINGAPARTMENTS_MEDI"] + df["LIVINGAREA_MEDI"] + df["NONLIVINGAPARTMENTS_MEDI"] + df["NONLIVINGAREA_MEDI"]
df["SUM_DOC_FLAG"] = df["FLAG_DOCUMENT_2"] + df["FLAG_DOCUMENT_3"] + df["FLAG_DOCUMENT_4"] + df["FLAG_DOCUMENT_5"] + df["FLAG_DOCUMENT_6"] + df["FLAG_DOCUMENT_7"] + df["FLAG_DOCUMENT_8"] + df["FLAG_DOCUMENT_9"] + df["FLAG_DOCUMENT_10"] + df["FLAG_DOCUMENT_11"] + df["FLAG_DOCUMENT_12"] + df["FLAG_DOCUMENT_13"] + df["FLAG_DOCUMENT_14"] + df["FLAG_DOCUMENT_15"] + df["FLAG_DOCUMENT_16"] + df["FLAG_DOCUMENT_17"] + df["FLAG_DOCUMENT_18"] + df["FLAG_DOCUMENT_19"] + df["FLAG_DOCUMENT_20"] + df["FLAG_DOCUMENT_21"]
df["CNT_CHILDREN_DIV_DAYS_BIRTH"] = df["CNT_CHILDREN"] / df["DAYS_BIRTH"]
df["CNT_CHILDREN_DIV_REGION_POPULATION_RELATIVE"] = df["CNT_CHILDREN"] / df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_REALTY_PROD_REGION_POPULATION_RELATIVE"] = df["FLAG_OWN_REALTY"] * df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_REALTY_DIV_REGION_POPULATION_RELATIVE"] = df["FLAG_OWN_REALTY"] / df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_CAR_DIV_OWN_CAR_AGE"] = df["FLAG_OWN_CAR"] / df["OWN_CAR_AGE"]
df["EXT_SOURCE_1_DIV_DAYS_BIRTH"] = df["EXT_SOURCE_1"] / df["DAYS_BIRTH"]
df["EXT_SOURCE_1_PROD_DAYS_BIRTH"] = df["EXT_SOURCE_1"] * df["DAYS_BIRTH"]

df["AVG_AGG_SYNTHETIC_TARGET"] = df[["BUREAU_AGG_SYNTHETIC_TARGET", 
                                     "PREVIOUS_AGG_SYNTHETIC_TARGET", 
                                     "CREDIT_CARD_AGG_SYNTHETIC_TARGET", 
                                     "INSTALLMENT_AGG_SYNTHETIC_TARGET"]].apply(np.nanmean, axis=1)

df["SUM_AGG_SYNTHETIC_TARGET"] = df[["BUREAU_AGG_SYNTHETIC_TARGET", 
                                     "PREVIOUS_AGG_SYNTHETIC_TARGET", 
                                     "CREDIT_CARD_AGG_SYNTHETIC_TARGET", 
                                     "INSTALLMENT_AGG_SYNTHETIC_TARGET"]].apply(np.nansum, axis=1)

df["MAX_AGG_SYNTHETIC_TARGET"] = df[["BUREAU_AGG_SYNTHETIC_TARGET", 
                                     "PREVIOUS_AGG_SYNTHETIC_TARGET", 
                                     "CREDIT_CARD_AGG_SYNTHETIC_TARGET", 
                                     "INSTALLMENT_AGG_SYNTHETIC_TARGET"]].apply(np.nanmax, axis=1)

df["MIN_AGG_SYNTHETIC_TARGET"] = df[["BUREAU_AGG_SYNTHETIC_TARGET", 
                                     "PREVIOUS_AGG_SYNTHETIC_TARGET", 
                                     "CREDIT_CARD_AGG_SYNTHETIC_TARGET", 
                                     "INSTALLMENT_AGG_SYNTHETIC_TARGET"]].apply(np.nanmin, axis=1)

time: 1min 19s


#### Remove infinite values

In [325]:
df.replace([-np.inf, np.inf], np.nan, inplace=True)

time: 9.8 s


#### Remove income outliers

In [326]:
df.loc[df["AMT_INCOME_TOTAL"] > 500000, "AMT_INCOME_TOTAL"] = np.nan

time: 16.8 ms


#### Handle special values for DAYS_EMPLOYED

In [327]:
df.loc[df["DAYS_EMPLOYED"] > 0, "DAYS_EMPLOYED"] = np.nan

time: 118 ms


#### Encode categorical features

Order `ORGANIZATION_TYPE` categories and map to integers (`org_type_map` was obtained by checking the default rates by group within the training data)

In [328]:
org_type_map = {"Trade: type 4": 0, "Industry: type 12": 1, "Transport: type 1": 2, "Trade: type 6": 3,
    "Security Ministries": 4, "University": 5, "Police": 6, "Military": 7,
    "Bank": 8, "XNA": 9, "Culture": 10, "Insurance": 11,
    "Religion": 12, "School": 13, "Trade: type 5": 14, "Hotel": 15,
    "Industry: type 10": 16, "Medicine": 17, "Services": 18, "Electricity": 19,
    "Industry: type 9": 20, "Industry: type 5": 21, "Government": 22, "Trade: type 2": 23,
    "Kindergarten": 24, "Emergency": 25, "Industry: type 6": 26, "Industry: type 2": 27,
    "Telecom": 28, "Other": 29, "Transport: type 2": 30, "Legal Services": 31,
    "Housing": 32, "Industry: type 7": 33, "Business Entity Type 1": 34, "Advertising": 35,
    "Postal": 36, "Business Entity Type 2": 37, "Industry: type 11": 38, "Trade: type 1": 39,
    "Mobile": 40, "Transport: type 4": 41, "Business Entity Type 3": 42, "Trade: type 7": 43,
    "Security": 44, "Industry: type 4": 45, "Self-employed": 46, "Trade: type 3": 47,
    "Agriculture": 48, "Realtor": 49, "Industry: type 3": 50, "Industry: type 1": 51,
    "Cleaning": 52, "Construction": 53, "Restaurant": 54, "Industry: type 8": 55,
    "Industry: type 13": 56, "Transport: type 3": 57}

time: 16.2 ms


In [329]:
df["ORGANIZATION_TYPE"] = df["ORGANIZATION_TYPE"].map(org_type_map)

time: 118 ms


Dummy code remaining categorical features

In [330]:
df = pd.get_dummies(df, dummy_na=True)
df.columns = df.columns.str.replace("\s+", "_")

time: 8.52 s


Remove nuisance columns

In [331]:
df["NAME_FAMILY_STATUS_Unknown"] = 0
df["NAME_INCOME_TYPE_Maternity_leave"] = 0
df["CODE_GENDER_XNA"] = 0
df.drop(["NAME_FAMILY_STATUS_Unknown", "NAME_INCOME_TYPE_Maternity_leave", "CODE_GENDER_XNA"], axis=1, inplace=True)
df.shape

(307511, 502)

time: 6.12 s


#### Write preprocessed data to file

In [None]:
df.to_csv(path + train_or_test + ".csv", index=False, header=True)