# Home Credit Preprocessing

In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
import pickle
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
import warnings
import gc

%load_ext autotime

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None
gc.enable()
path = "/Users/dsaxton/home_credit_default/"

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 13.4 ms


# Previous application

#### Aggregation function

In [63]:
def previous_agg_func(g):
    mask6 = g["DAYS_DECISION"] >= -180
    mask12 = g["DAYS_DECISION"] >= -360
    mask24 = g["DAYS_DECISION"] >= -720

    d = {
        "AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M": np.nanmean((g["AMT_CREDIT"] / g["AMT_ANNUITY"]).where(mask6)), 
        "MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M": np.nanmin((g["AMT_CREDIT"] / g["AMT_ANNUITY"]).where(mask6)), 
        "MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M": np.nanmax((g["AMT_CREDIT"] / g["AMT_ANNUITY"]).where(mask6)), 
        "AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M": np.nanmean((g["AMT_CREDIT"] / g["AMT_GOODS_PRICE"]).where(mask6)), 
        "MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M": np.nanmax((g["AMT_CREDIT"] / g["AMT_GOODS_PRICE"]).where(mask6)), 
        "AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M": np.nanmean((g["AMT_CREDIT"] + g["AMT_ANNUITY"]).where(mask6)), 
        "MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M": np.nanmin((g["AMT_CREDIT"] + g["AMT_ANNUITY"]).where(mask6)), 
        
        "COUNT_NAME_CLIENT_TYPE_REPEATER_12M": np.nansum((g["NAME_CLIENT_TYPE"] == "Repeater").where(mask12)), 
        "COUNT_NAME_CLIENT_TYPE_NEW_12M": np.nansum((g["NAME_CLIENT_TYPE"] == "New").where(mask12)), 
        "SUM_NAME_PAYMENT_TYPE_XNA_6M": np.nansum((g["NAME_PAYMENT_TYPE"] == "XNA").where(mask6)), 
        "SUM_NAME_SELLER_INDUSTRY_CSTR_6M": np.nansum((g["NAME_SELLER_INDUSTRY"] == "Construction").where(mask6)), 
        "SUM_NAME_SELLER_INDUSTRY_XNA_6M": np.nansum((g["NAME_SELLER_INDUSTRY"] == "XNA").where(mask6)), 
        "SUM_NAME_GOODS_CATEGORY_XNA_6M": np.nansum((g["NAME_GOODS_CATEGORY"] == "XNA").where(mask6)), 
        "SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M": np.nansum((g["PRODUCT_COMBINATION"] == "POS mobile with interest").where(mask12)),
        "SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M": np.nansum((g["PRODUCT_COMBINATION"] == "POS household with interest").where(mask12)), 
        "SUM_REFUSED_CONTRACT_6M": np.nansum((g["NAME_CONTRACT_STATUS"] == "Refused").where(mask12)), 
        "AVG_RATE_INTEREST_PRIMARY_12M": np.nanmean(g["RATE_INTEREST_PRIMARY"].where(mask12)), 
        "MAX_RATE_INTEREST_PRIMARY_12M": np.nanmax(g["RATE_INTEREST_PRIMARY"].where(mask12)), 
        "MIN_RATE_INTEREST_PRIMARY_12M": np.nanmin(g["RATE_INTEREST_PRIMARY"].where(mask12)), 
        "AVG_RATE_INTEREST_PRIVILEGED_12M": np.nanmean(g["RATE_INTEREST_PRIVILEGED"].where(mask12)), 
        "MAX_UTILIZATION_3M": np.nanmax((g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]).where(mask3)),
        "MIN_PREV_AMT_ANNUITY_12M": np.nanmin(g["AMT_ANNUITY"].where(mask12)), 
        "MIN_PREV_AMT_ANNUITY_24M": np.nanmin(g["AMT_ANNUITY"].where(mask24)), 
        "MIN_PREV_PROP_APPROVED_12M": np.nanmin((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "AVG_SYNTH_TARGET_12M": np.nanmean(g["SYNTHETIC_TARGET"].where(mask12)), 
        "AVG_PREV_PROP_APPROVED_12M": np.nanmean((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "AVG_PREV_PROP_APPROVED_24M": np.nanmean((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask24)), 
        "MAX_PREV_PROP_APPROVED_12M": np.nanmax((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "MAX_PREV_PROP_APPROVED_24M": np.nanmax((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask24)), 
        "COUNT_PREV_APP": len(g), 
        "MIN_PREV_DAYS_TERMINATION": np.nanmin(g["DAYS_TERMINATION"]), 
        "MAX_PREV_DAYS_TERMINATION": np.nanmax(g["DAYS_TERMINATION"]), 
        "AVG_PREV_DAYS_TERMINATION": np.nanmean(g["DAYS_TERMINATION"]), 
        "RANGE_PREV_DAYS_TERMINATION": np.nanmax(g["DAYS_TERMINATION"]) - np.nanmin(g["DAYS_TERMINATION"]),  
        "MIN_PREV_AMT_CREDIT": np.nanmin(g["AMT_CREDIT"]),
        "MAX_PREV_AMT_CREDIT": np.nanmax(g["AMT_CREDIT"]),
        "AVG_PREV_AMT_CREDIT": np.nanmean(g["AMT_CREDIT"]),
        "MIN_PREV_AMT_CREDIT_WEIGHTED": np.nanmin(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])),
        "MAX_PREV_AMT_CREDIT_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])),
        "AVG_PREV_AMT_CREDIT_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])), 
        "MIN_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmin(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "MAX_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmax(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "AVG_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmean(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmin(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),
        "MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),
        "AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),  
        "MIN_PREV_AMT_ANNUITY": np.nanmin(g["AMT_ANNUITY"]), 
        "MAX_PREV_AMT_ANNUITY": np.nanmax(g["AMT_ANNUITY"]), 
        "AVG_PREV_AMT_ANNUITY": np.nanmean(g["AMT_ANNUITY"]), 
        "MIN_PREV_AMT_ANNUITY_WEIGHTED": np.nanmin(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_AMT_ANNUITY_WEIGHTED": np.nanmax(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_AMT_ANNUITY_WEIGHTED": np.nanmean(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "MIN_DAYS_DECISION": np.nanmin(g["DAYS_DECISION"]), 
        "MAX_DAYS_DECISION": np.nanmax(g["DAYS_DECISION"]), 
        "RANGE_DAYS_DECISION": np.nanmax(g["DAYS_DECISION"]) - np.nanmin(g["DAYS_DECISION"]),
        "SUM_DAYS_LAST_DUE_NULL": np.nansum(g["DAYS_LAST_DUE"].isnull()), 
        "AVG_DAYS_LAST_DUE_NULL": np.nanmean(g["DAYS_LAST_DUE"].isnull()), 
        "AVG_PREV_REQ_AMOUNT_WEIGHTED": np.nanmean(g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_REQ_AMOUNT_WEIGHTED": np.nanmax(g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_REQ_AMOUNT": np.nanmean(g["AMT_APPLICATION"]), 
        "MAX_PREV_REQ_AMOUNT": np.nanmax(g["AMT_APPLICATION"]), 
        "AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED": np.nanmean(g["RATE_DOWN_PAYMENT"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_PROP_APPROVED_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_PROP_APPROVED_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_RATE_DOWNPAYMENT": np.nanmean(g["RATE_DOWN_PAYMENT"]), 
        "AVG_PREV_PROP_APPROVED": np.nanmean(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "MAX_PREV_PROP_APPROVED": np.nanmax(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "MIN_PREV_PROP_APPROVED": np.nanmin(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "AVG_PREV_REQ_AMOUNT": np.nanmean(g["AMT_APPLICATION"]), 
        "MAX_PREV_REQ_AMOUNT": np.nanmax(g["AMT_APPLICATION"]), 
        "AVG_PREV_RATE_DOWNPAYMENT": np.nanmean(g["RATE_DOWN_PAYMENT"]), 
        "AVG_PREV_INT_RATE": np.nanmean(g["RATE_INTEREST_PRIMARY"]), 
        "SUM_PREV_URGENT_NEEDS": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Urgent needs"), 
        "SUM_PREV_REPAIRS": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Repairs"), 
        "SUM_PREV_OTHER": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Other"), 
        "SUM_PREV_LIMIT_REJECT": np.nansum(g["CODE_REJECT_REASON"] == "LIMIT"), 
        "SUM_REFUSED_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Refused"), 
        "SUM_CANC_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Canceled"), 
        "SUM_APPR_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Approved"), 
        "SUM_PREV_HC_REJECT": np.nansum(g["CODE_REJECT_REASON"] == "HC"), 
        "SUM_PREV_INSURE_REQ": np.nansum(g["NFLAG_INSURED_ON_APPROVAL"]), 
        "COUNT_PREV_WALK_IN": np.nansum(g["NAME_PRODUCT_TYPE"] == "walk-in"), 
        "COUNT_PREV_HIGH_YIELD": np.nansum(g["NAME_YIELD_GROUP"] == "high"), 
        "COUNT_PREV_LOW_YIELD": np.nansum(g["NAME_YIELD_GROUP"].apply(lambda x: x.startswith("low"))), 
        "AVG_SYNTH_TARGET": np.nanmean(g["SYNTHETIC_TARGET"]), 
        "SUM_SYNTH_TARGET_WEIGHTED": np.nansum(g["SYNTHETIC_TARGET"] / abs(g["DAYS_DECISION"])), 
        "SUM_SYNTH_TARGET": np.nansum(g["SYNTHETIC_TARGET"]), 
        "MAX_SYNTH_TARGET": np.nanmax(g["SYNTHETIC_TARGET"]), 
        "MIN_SYNTH_TARGET": np.nanmin(g["SYNTHETIC_TARGET"]), 
        "RANGE_SYNTH_TARGET": np.nanmax(g["SYNTHETIC_TARGET"]) - np.min(g["SYNTHETIC_TARGET"]), 
        "SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE": np.nansum(g["DAYS_LAST_DUE_1ST_VERSION"] == g["DAYS_LAST_DUE"]), 
        "SUM_DAYS_FIRST_DRAWING_SENTINEL": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"]), 
        "SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"] / abs(g["DAYS_DECISION"])), 
        "MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED": np.nanmax(g["DAYS_FIRST_DRAWING_SENTINEL"] / abs(g["DAYS_DECISION"])),         
        "SUM_DAYS_LAST_DUE_LT_FIRST_VERSION": np.nansum(g["DAYS_LAST_DUE"] < g["DAYS_LAST_DUE_1ST_VERSION"]), 
    }

    return pd.Series(d)

time: 191 ms


#### Process data and write to file

In [64]:
previous_application = pd.read_csv(path + "previous_application.csv")

with open(path + "linear_model.pkl", "rb") as f:
    clf = pickle.load(f)

impute = Imputer(strategy="median")
scale = StandardScaler()

cols = ["AMT_ANNUITY", 
        "AMT_CREDIT", 
        "AMT_GOODS_PRICE", 
        "HOUR_APPR_PROCESS_START", 
        "NAME_CONTRACT_TYPE", 
        "NAME_TYPE_SUITE", 
        "WEEKDAY_APPR_PROCESS_START"]

prev_temp = pd.get_dummies(previous_application[cols])

dummy_cols = ["AMT_CREDIT",
              "AMT_GOODS_PRICE",
              "HOUR_APPR_PROCESS_START",
              "NAME_CONTRACT_TYPE_Cash loans",
              "NAME_CONTRACT_TYPE_Revolving loans",
              "NAME_TYPE_SUITE_Children",
              "NAME_TYPE_SUITE_Family",
              "NAME_TYPE_SUITE_Group of people",
              "NAME_TYPE_SUITE_Other_A",
              "NAME_TYPE_SUITE_Other_B",
              "NAME_TYPE_SUITE_Spouse, partner",
              "NAME_TYPE_SUITE_Unaccompanied",
              "WEEKDAY_APPR_PROCESS_START_FRIDAY",
              "WEEKDAY_APPR_PROCESS_START_MONDAY",
              "WEEKDAY_APPR_PROCESS_START_SATURDAY",
              "WEEKDAY_APPR_PROCESS_START_SUNDAY",
              "WEEKDAY_APPR_PROCESS_START_THURSDAY",
              "WEEKDAY_APPR_PROCESS_START_TUESDAY",
              "WEEKDAY_APPR_PROCESS_START_WEDNESDAY"]

previous_application["SYNTHETIC_TARGET"] = clf.predict_proba(scale.fit_transform(impute.fit_transform(prev_temp[dummy_cols])))[:,1]
previous_application["DAYS_FIRST_DRAWING_SENTINEL"] = (previous_application["DAYS_FIRST_DRAWING"] == 365243).astype(int)
previous_application["DAYS_FIRST_DUE_SENTINEL"] = (previous_application["DAYS_FIRST_DUE"] == 365243).astype(int)
previous_application["DAYS_LAST_DUE_1ST_VERSION_SENTINEL"] = (previous_application["DAYS_LAST_DUE_1ST_VERSION"] == 365243).astype(int)
previous_application["DAYS_LAST_DUE_SENTINEL"] = (previous_application["DAYS_LAST_DUE"] == 365243).astype(int)
previous_application["DAYS_TERMINATION_SENTINEL"] = (previous_application["DAYS_TERMINATION"] == 365243).astype(int)

previous_agg = previous_application.groupby("SK_ID_CURR").apply(previous_agg_func).reset_index()

previous_agg.to_csv(path + "previous_agg.csv", index=False, header=True)
del prev_temp, previous_application, previous_agg
gc.collect()

65

time: 1h 49min 26s


# Bureau Balance

#### Aggregation function

In [36]:
def bureau_balance_agg_func(g):
    mask6 = g["MONTHS_BALANCE"] >= -6
    mask12 = g["MONTHS_BALANCE"] >= -12
    closed = g["STATUS"] == "C"

    d = {
        "WORST_DQ_BUREAU_BALANCE_6M": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x)).where(mask6)), 
        "WORST_DQ_BUREAU_BALANCE_12M": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x)).where(mask12)), 
        "LEN_BUREAU_BALANCE": np.nansum(~closed), 
        "SUM_CLOSED_BUREAU_BALANCE": np.nansum(closed), 
        "SUM_CURRENT_BUREAU_BALANCE": np.nansum(g["STATUS"] == "0"), 
        "SUM_DQ_BUREAU_BALANCE": np.nansum(g["STATUS"].isin(["1", "2", "3", "3", "4", "5"])),
        "WORST_DQ_BUREAU_BALANCE": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x))), 
        "AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nansum(abs(g["MONTHS_BALANCE"]).where(~closed)) / np.nansum(~closed), 
        "MIN_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["MONTHS_BALANCE"].where(~closed)), 
        "MAX_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["MONTHS_BALANCE"].where(~closed)), 
    }

    return pd.Series(d)

time: 25.5 ms


#### Process data and write to file

In [None]:
bureau_balance = pd.read_csv(path + "bureau_balance.csv")

bureau_balance["STATUS"] = bureau_balance["STATUS"].where(lambda x: x != "X").fillna("0")

bureau_balance_agg = bureau_balance.groupby("SK_ID_BUREAU").apply(bureau_balance_agg_func).reset_index()
bureau_balance_agg.to_csv(path + "bureau_balance_agg.csv", index=False, header=True)
del bureau_balance, bureau_balance_agg
gc.collect()

# Bureau

This is dependent on `bureau_balance_agg`

#### Aggregation function

In [89]:
def bureau_agg_func(g):
    mask3 = g["DAYS_CREDIT_UPDATE"] >= -90
    mask6 = g["DAYS_CREDIT_UPDATE"] >= -180
    mask12 = g["DAYS_CREDIT_UPDATE"] >= -360
    mask24 = g["DAYS_CREDIT_UPDATE"] >= -720
    active = g["CREDIT_ACTIVE"] == "Active"
    cc = g["CREDIT_TYPE"] == "Credit card"
    
    d = {
        "SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M": np.nansum((g["AMT_CREDIT_SUM_DEBT"] / g["DAYS_CREDIT_ENDDATE"]).where(active & mask12)),
        "SUM_CC_DEBT_6M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(cc & mask6)), 
        "SUM_CC_DEBT_12M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(cc & mask12)), 
        "MAX_WORST_DQ_BUREAU_BALANCE_6M": np.nanmax(g["WORST_DQ_BUREAU_BALANCE_6M"].where(mask6)), 
        "MAX_WORST_DQ_BUREAU_BALANCE_12M": np.nanmax(g["WORST_DQ_BUREAU_BALANCE_12M"].where(mask12)), 
        "MAX_BUREAU_UTILIZATION_6M": np.nanmax((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM"]).where(mask6)), 
        "MAX_BUREAU_UTILIZATION_12M": np.nanmax((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM"]).where(mask12)), 
        "COUNT_ACTIVE_6M": np.nansum((g["CREDIT_ACTIVE"] == "Active").where(active & mask6)), 
        "COUNT_ACTIVE_12M": np.nansum((g["CREDIT_ACTIVE"] == "Active").where(active & mask12)), 
        "COUNT_ACTIVE_24M": np.nansum((g["CREDIT_ACTIVE"] == "Active").where(active & mask24)), 
        "DAYS_REMAINING_ACTIVE": np.nansum(g["DAYS_CREDIT_ENDDATE"].where(active)), 
        "MAX_CREDIT_DAY_OVERDUE_6M": np.nanmax(g["CREDIT_DAY_OVERDUE"].where(mask6)), 
        "MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M": np.nanmax(g["CREDIT_DAY_OVERDUE"].where(mask6)) - np.nanmax(g["CREDIT_DAY_OVERDUE"].where(mask6 ^ mask12)), 
        "BUREAU_UTILIZATION_DIFF_6M_12M": np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & mask6)) - np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & (mask6 ^ mask12))), 
        "BUREAU_UTILIZATION_DIFF_12M_24M": np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & mask12)) - np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & (mask6 ^ mask24))), 
        "BUREAU_SUM_DEBT_DIFF_6M_12M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & mask6)) - np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & (mask6 ^ mask12))),
        "BUREAU_SUM_DEBT_DIFF_12M_24M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & mask6)) - np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & (mask6 ^ mask12))),         
        "MAX_CNT_CREDIT_PROLONG": np.nanmax(g["CNT_CREDIT_PROLONG"]), 
        "AVG_LEN_BUREAU_BALANCE": np.nanmean(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CURRENT": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CLOSED": np.nansum(g["SUM_CLOSED_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CURRENT_WEIGHTED": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]) / np.nansum(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]) - np.nanmin(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "SUM_SUM_CURRENT_BUREAU_BALANCE": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]), 
        "AVG_PROP_CURRENT": np.nanmean(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ": np.nanmean(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ": np.nanmax(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]),
        "AVG_PROP_CURRENT_WEIGHTED": np.nanmean(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT_WEIGHTED": np.nanmin(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ_WEIGHTED": np.nanmean(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ_WEIGHTED": np.nanmax(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_CURRENT_WEIGHTED_AMT": np.nanmean(g["AMT_CREDIT_SUM"] * g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT_WEIGHTED_AMT": np.nanmin(g["AMT_CREDIT_SUM"] * g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ_WEIGHTED_AMT": np.nanmean(g["AMT_CREDIT_SUM"] * g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ_WEIGHTED_AMT": np.nanmax(g["AMT_CREDIT_SUM"] * g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]),          
        "AVG_WORST_DQ_BUREAU_BALANCE": np.nanmean(g["WORST_DQ_BUREAU_BALANCE"]), 
        "MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED": np.nanmax(g["WORST_DQ_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED": np.nanmean(g["WORST_DQ_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "TOTAL_AMT_CREDIT_SUM_POS_DAYS": np.nansum(g["AMT_CREDIT_SUM"].where(g["DAYS_CREDIT_ENDDATE"] > 0)),
        "SUM_DAYS_CREDIT_ENDDATE_POS_DAYS": np.nansum(g["DAYS_CREDIT_ENDDATE"].where(g["DAYS_CREDIT_ENDDATE"] > 0)), 
        "MAX_LEN_BUREAU_BALANCE": np.nanmax(g["LEN_BUREAU_BALANCE"]), 
        "SUM_LEN_BUREAU_BALANCE": np.nanmax(g["LEN_BUREAU_BALANCE"]), 
        "MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["MIN_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_DAYS_CREDIT_ENDDATE": np.nanmin(g["DAYS_CREDIT_ENDDATE"]), 
        "MAX_DAYS_CREDIT_ENDDATE": np.nanmax(g["DAYS_CREDIT_ENDDATE"]), 
        "SUM_DAYS_CREDIT_ENDDATE": np.nansum(g["DAYS_CREDIT_ENDDATE"]), 
        "SUM_NULL_DAYS_ENDDATE_FACT": np.nansum(g["DAYS_ENDDATE_FACT"].isnull()), 
        "COUNT_BUREAU_RECORDS": len(g), 
        "COUNT_ACTIVE": np.nansum(active), 
        "MAX_CREDIT_DAY_OVERDUE_WEIGHTED": np.nanmax(g["CREDIT_DAY_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_CREDIT_DAY_OVERDUE_WEIGHTED": np.nansum(g["CREDIT_DAY_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "MAX_CREDIT_DAY_OVERDUE": np.nanmax(g["CREDIT_DAY_OVERDUE"]), 
        "SUM_CREDIT_DAY_OVERDUE": np.nansum(g["CREDIT_DAY_OVERDUE"]), 
        "DAYS_SINCE_APPLIED": - np.nanmax(g["DAYS_CREDIT"]), 
        "SUM_INVERSE_DAYS_CREDIT": - np.nansum(1 / g["DAYS_CREDIT"]), 
        "MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED": np.nanmax(g["AMT_CREDIT_MAX_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED": np.nansum(g["AMT_CREDIT_MAX_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "MAX_AMT_CREDIT_MAX_OVERDUE": np.nanmax(g["AMT_CREDIT_MAX_OVERDUE"]), 
        "SUM_AMT_CREDIT_MAX_OVERDUE": np.nansum(g["AMT_CREDIT_MAX_OVERDUE"]), 
        "SUM_CNT_CREDIT_PROLONG": np.nansum(g["CNT_CREDIT_PROLONG"]), 
        "SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED": np.nansum(g["AMT_CREDIT_SUM_DEBT"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_AMT_CREDIT_SUM_DEBT": np.nansum(g["AMT_CREDIT_SUM_DEBT"]),
        "BUREAU_UTILIZATION_AVG": np.nanmean(g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]), 
        "BUREAU_UTILIZATION_MAX": np.nanmax(g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]), 
        "BUREAU_PROP_SUM_OVERDUE_AVG": np.nanmean(g["AMT_CREDIT_SUM_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]), 
        "BUREAU_PROP_MAX_OVERDUE_AVG": np.nanmean(g["AMT_CREDIT_MAX_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]), 
        "MAX_DAYS_CREDIT_UPDATE": np.nanmax(g["DAYS_CREDIT_UPDATE"]), 
        "RANGE_DAYS_CREDIT_UPDATE": np.nanmax(g["DAYS_CREDIT_UPDATE"]) - np.nanmin(g["DAYS_CREDIT_UPDATE"]), 
        "DAYS_CREDIT_RANGE": np.nanmax(g["DAYS_CREDIT"]) - np.nanmin(g["DAYS_CREDIT"]), 
        "TOTAL_AMT_CREDIT_SUM_WEIGHTED": np.nansum(g["AMT_CREDIT_SUM"] / abs(g["DAYS_CREDIT_UPDATE"])),
        "TOTAL_AMT_CREDIT_SUM": np.nansum(g["AMT_CREDIT_SUM"]),
        "COUNT_CREDIT_CARD": np.nansum(g["CREDIT_TYPE"] == "Credit card"), 
        "COUNT_CAR_LOAN": np.nansum(g["CREDIT_TYPE"] == "Car loan"), 
        "COUNT_MORTGAGE": np.nansum(g["CREDIT_TYPE"] == "Mortgage"), 
        "SUM_AMT_ANNUITY": np.nansum(g["AMT_ANNUITY"]), 
    }
    
    return pd.Series(d)

time: 433 ms


#### Process data and write to file

In [90]:
bureau = pd.read_csv(path + "bureau.csv")
bureau_balance_agg = pd.read_csv(path + "bureau_balance_agg.csv")

bureau_joined = pd.merge(bureau, 
                         bureau_balance_agg, 
                         how="left", 
                         on="SK_ID_BUREAU")

bureau_agg = bureau_joined.groupby("SK_ID_CURR").apply(bureau_agg_func).reset_index()
bureau_agg.to_csv(path + "bureau_agg.csv", index=False, header=True)

time: 4h 9min 27s


# Credit card

#### Aggregation function

In [34]:
def credit_card_agg_func(g):
    mask3 = g["MONTHS_BALANCE"] >= -3
    mask6 = g["MONTHS_BALANCE"] >= -6
    mask12 = g["MONTHS_BALANCE"] >= -12
    active = g["NAME_CONTRACT_STATUS"] == "Active"
    overdue = g["SK_DPD"] > 0
    
    d = {
        "MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M": np.nanmax(g["AMT_INST_MIN_REGULARITY"].where(overdue & mask6)), 
        "MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M": np.nanmin(g["AMT_INST_MIN_REGULARITY"].where(overdue & mask12)), 
        "SUM_CNT_DRAWINGS_ATM_CURRENT_6M": np.nansum(g["CNT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "SUM_AMT_DRAWINGS_ATM_CURRENT_6M": np.nansum(g["AMT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "MAX_AMT_DRAWINGS_ATM_CURRENT_6M": np.nanmax(g["AMT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "MAX_CNT_DRAWINGS_ATM_CURRENT_6M": np.nanmax(g["CNT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M": np.nanmax((g["AMT_RECIVABLE"] / g["AMT_RECEIVABLE_PRINCIPAL"]).where(mask6)), 
        "MAX_UTILIZATION_6M": np.nanmax((g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]).where(mask6)),
        "MAX_CREDIT_CARD_SK_DPD_6M": np.nanmax(g["SK_DPD"].where(mask6)), 
        "MAX_CREDIT_CARD_SK_DPD_12M": np.nanmax(g["SK_DPD"].where(mask12)),
        "MAX_AMT_DRAWINGS_CURRENT_6M": np.nanmax(g["AMT_DRAWINGS_CURRENT"].where(mask6)), 
        "MAX_AMT_DRAWINGS_CURRENT_12M": np.nanmax(g["AMT_DRAWINGS_CURRENT"].where(mask12)), 
        "MAX_AMT_INST_MIN_REGULARITY_6M": np.nanmax(g["AMT_INST_MIN_REGULARITY"].where(mask6)), 
        "MAX_AMT_INST_MIN_REGULARITY_12M": np.nanmax(g["AMT_INST_MIN_REGULARITY"].where(mask12)), 
        "MAX_CNT_DRAWINGS_POS_CURRENT_6M": np.nanmax(g["CNT_DRAWINGS_POS_CURRENT"].where(mask6)), 
        "MAX_CNT_DRAWINGS_POS_CURRENT_12M": np.nanmax(g["CNT_DRAWINGS_POS_CURRENT"].where(mask12)), 
        "SUM_CC_PAYMENT_DIFF_12M": np.nansum((g["AMT_PAYMENT_TOTAL_CURRENT"] - g["AMT_INST_MIN_REGULARITY"]).where(mask12)),
        "DIFF_AVG_BALANCE_6M_12M": np.nanmean(g["AMT_BALANCE"].where(mask6)) - np.nanmean(g["AMT_BALANCE"].where(mask6 ^ mask12)),
        "AVG_BALANCE_6M": np.nanmean(g["AMT_BALANCE"].where(mask6)),
        "AVG_UTILIZATION_6M": np.nanmean((g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]).where(mask6)),
        "AVG_BALANCE": np.nanmean(g["AMT_BALANCE"]), 
        "MAX_BALANCE": np.nanmax(g["AMT_BALANCE"]), 
        "SUM_BALANCE": np.nansum(g["AMT_BALANCE"]), 
        "MAX_MONTHS_BALANCE": np.nanmax(abs(g["MONTHS_BALANCE"])), 
        "MIN_MONTHS_BALANCE": np.nanmin(abs(g["MONTHS_BALANCE"])), 
        "RANGE_MONTHS_BALANCE": np.nanmax(g["MONTHS_BALANCE"]) - np.nanmin(g["MONTHS_BALANCE"]), 
        "AVG_UTILIZATION": np.nanmean(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]), 
        "MAX_UTILIZATION": np.nanmax(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]), 
        "AVG_BALANCE_WEIGHTED": np.nanmean(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_BALANCE_WEIGHTED": np.nanmax(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "SUM_BALANCE_WEIGHTED": np.nansum(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "AVG_UTILIZATION_WEIGHTED": np.nanmean(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_UTILIZATION_WEIGHTED": np.nanmax(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_DPD_WEIGHTED": np.nanmax(g["SK_DPD"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_DPD_DEF_WEIGHTED": np.nanmax(g["SK_DPD_DEF"] / abs(g["MONTHS_BALANCE"])), 
        "SUM_CNT_DRAWINGS_CURRENT": np.nansum(g["CNT_DRAWINGS_CURRENT"]), 
        "AVG_CNT_DRAWINGS_CURRENT": np.nanmean(g["CNT_DRAWINGS_CURRENT"]), 
        "MAX_CNT_DRAWINGS_CURRENT": np.nanmax(g["CNT_DRAWINGS_CURRENT"]), 
        "SUM_AMT_DRAWINGS_CURRENT": np.nansum(g["AMT_DRAWINGS_CURRENT"]), 
        "AVG_AMT_DRAWINGS_CURRENT": np.nanmean(g["AMT_DRAWINGS_CURRENT"]), 
        "MAX_AMT_DRAWINGS_CURRENT": np.nanmax(g["AMT_DRAWINGS_CURRENT"]), 
        "MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmin(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
        "AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmean(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
        "MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmax(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
    }
    
    return pd.Series(d)

time: 49 ms


#### Process data and write to file

In [35]:
credit_card = pd.read_csv(path + "credit_card_balance.csv")

credit_card_agg = credit_card.groupby("SK_ID_CURR").apply(credit_card_agg_func).reset_index()
credit_card_agg.to_csv(path + "credit_card_agg.csv", index=False, header=True)

time: 34min 6s


# Installments

#### Aggregation function

In [10]:
def installment_agg_func(g):
    mask6 = g["DAYS_ENTRY_PAYMENT"] >= -180
    mask12 = g["DAYS_ENTRY_PAYMENT"] >= -360
    
    d = {
        "MAX_UNDERPAYMENT_6M": np.nanmax((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask6)), 
        "MAX_UNDERPAYMENT_12M": np.nanmax((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask12)), 
        "SUM_PAYMENT_6M": np.nansum(g["AMT_PAYMENT"].where(mask6)), 
        "SUM_PAYMENT_DIFF_6M_12M": np.nansum(g["AMT_PAYMENT"].where(mask6)) - np.nansum(g["AMT_PAYMENT"].where(mask6 ^ mask12)), 
        "MAX_AMT_INSTALMENT_6M": np.nanmax(g["AMT_INSTALMENT"].where(mask6)), 
        "MIN_AMT_INSTALMENT_6M": np.nanmin(g["AMT_INSTALMENT"].where(mask6)), 
        "MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M": np.nanmax((g["DAYS_ENTRY_PAYMENT"] - g["DAYS_INSTALMENT"])), 
        "MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M": np.nanmin((g["DAYS_ENTRY_PAYMENT"] - g["DAYS_INSTALMENT"])), 
        "SUM_UNDERPAYMENT_12M": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask12)), 
        "SUM_UNDERPAYMENT_6M": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask6)), 
        "MAX_PAYMENT_SIZE_6M": np.nanmax(g["AMT_PAYMENT"].where(mask6)), 
        "MAX_PAYMENT_SIZE_12M": np.nanmax(g["AMT_PAYMENT"].where(mask12)), 
        "MIN_PAYMENT_SIZE_6M": np.nanmin(g["AMT_PAYMENT"].where(mask6)),
        "MAX_ABS_DAYS_INSTALMENT": np.nanmax(abs(g["DAYS_INSTALMENT"])), 
        "COUNT_UNDERPAYMENT": np.nansum(g["AMT_PAYMENT"] / g["AMT_INSTALMENT"] < 0.5), 
        "SUM_UNDERPAYMENT": np.nansum(g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]), 
        "SUM_UNDERPAYMENT_WEIGHTED": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]) / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MAX_UNDERPAYMENT": np.nanmax(g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]), 
        "AVG_PAYMENT_SIZE_WEIGHTED": np.nanmean(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "AVG_PAYMENT_SIZE": np.nanmean(g["AMT_PAYMENT"]), 
        "MAX_PAYMENT_SIZE_WEIGHTED": np.nanmax(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MAX_PAYMENT_SIZE": np.nanmax(g["AMT_PAYMENT"]), 
        "MIN_PAYMENT_SIZE_WEIGHTED": np.nanmin(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MIN_PAYMENT_SIZE": np.nanmin(g["AMT_PAYMENT"]),
        "SUM_PAYMENT_WEIGHTED": np.nansum(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "SUM_PAYMENT": np.nansum(g["AMT_PAYMENT"]),
        "SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT": np.nansum(g["DAYS_ENTRY_PAYMENT"] > g["DAYS_INSTALMENT"]), 
        "MAX_DAYS_ENTRY_PAYMENT": np.nanmax(g["DAYS_ENTRY_PAYMENT"]), 
        "MIN_DAYS_ENTRY_PAYMENT": np.nanmin(g["DAYS_ENTRY_PAYMENT"]), 
        "RANGE_DAYS_ENTRY_PAYMENT": np.nanmax(g["DAYS_ENTRY_PAYMENT"]) - np.nanmin(g["DAYS_ENTRY_PAYMENT"]), 
    }
    
    return pd.Series(d)

#### Process data and write to file

In [12]:
installments = pd.read_csv(path + "installments_payments.csv")

installment_agg = installments.groupby("SK_ID_CURR").apply(installment_agg_func).reset_index()
installment_agg.to_csv(path + "installment_agg.csv", index=False, header=True)

# Point of Sale

#### Aggregation function

In [13]:
def pos_cash_agg_func(g):
    mask3 = g["MONTHS_BALANCE"] >= -3
    mask6 = g["MONTHS_BALANCE"] >= -6
    mask12 = g["MONTHS_BALANCE"] >= -12
    overdue = g["SK_DPD"] > 0
    
    d = {
        "MAX_POS_DPD": np.nanmax(g["SK_DPD"]), 
        "MAX_POS_DPD_DEF": np.nanmax(g["SK_DPD_DEF"]), 
        "NUM_POS_CASH": g["SK_ID_PREV"].nunique(), 
    }
    
    return pd.Series(d)

#### Process data and write to file

In [14]:
pos_cash = pd.read_csv(path + "POS_CASH_balance.csv")

pos_cash_agg = pos_cash.groupby("SK_ID_CURR").apply(pos_cash_agg_func).reset_index()
pos_cash_agg.to_csv(path + "pos_cash_agg.csv", index=False, header=True)

# Join all files

In [125]:
train_or_test = "train"

time: 809 µs


In [126]:
application = pd.read_csv(path + "application_" + train_or_test + ".csv")
previous_agg = pd.read_csv(path + "previous_agg.csv")
# bureau_balance_agg should already be joined with bureau_agg
bureau_agg = pd.read_csv(path + "bureau_agg.csv")
credit_card_agg = pd.read_csv(path + "credit_card_agg.csv")
installment_agg = pd.read_csv(path + "installment_agg.csv")
pos_cash_agg = pd.read_csv(path + "pos_cash_agg.csv")

df = pd.merge(application, previous_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, bureau_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, credit_card_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, installment_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, pos_cash_agg, how="left", on="SK_ID_CURR")
del previous_agg, bureau_agg, credit_card_agg, installment_agg, pos_cash_agg
gc.collect()

95

time: 1min 7s


#### Construct additional features

In [127]:
df["TOTAL_AMT_CREDIT_SUM_DIV_SUM_DAYS_CREDIT_ENDDATE"] = df["TOTAL_AMT_CREDIT_SUM"] / df["SUM_DAYS_CREDIT_ENDDATE"]
df["TOTAL_AMT_CREDIT_SUM_POS_DAYS_DIV_SUM_DAYS_CREDIT_ENDDATE_POS_DAYS"] = df["TOTAL_AMT_CREDIT_SUM_POS_DAYS"] / df["SUM_DAYS_CREDIT_ENDDATE_POS_DAYS"]
df["MAX_ABS_DAYS_INSTALMENT_DIV_DAYS_BIRTH"] = df["MAX_ABS_DAYS_INSTALMENT"] / df["DAYS_BIRTH"]
df["FLAG_OWN_CAR"] = (df["FLAG_OWN_CAR"] == "Y").astype(int)
df["FLAG_OWN_REALTY"] = (df["FLAG_OWN_REALTY"] == "Y").astype(int)
df["AMT_CREDIT_DIV_AMT_INCOME_TOTAL"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_PLUS_AMT_INCOME_TOTAL"] = df["AMT_CREDIT"] + df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_DIV_AMT_GOODS_PRICE"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"]
df["AMT_CREDIT_DIV_SUM_PAYMENT"] = df["AMT_CREDIT"] / df["SUM_PAYMENT"]
df["AMT_GOODS_PRICE_DIV_AMT_INCOME_TOTAL"] = df["AMT_GOODS_PRICE"] / df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_DIV_AMT_ANNUITY"] = df["AMT_CREDIT"] / df["AMT_ANNUITY"]
df["AMT_CREDIT_DIV_AVG_PREV_REQ_AMOUNT"] = df["AMT_CREDIT"] / df["AVG_PREV_REQ_AMOUNT"]
df["AMT_CREDIT_DIV_MAX_PREV_REQ_AMOUNT"] = df["AMT_CREDIT"] / df["MAX_PREV_REQ_AMOUNT"]
df["EXT_SOURCE_PROD"] = df["EXT_SOURCE_1"] * df["EXT_SOURCE_2"] * df["EXT_SOURCE_3"]
df["DAYS_EMPLOYED_DIV_DAYS_BIRTH"] = df["DAYS_EMPLOYED"] / df["DAYS_BIRTH"]
df["DAYS_EMPLOYED_PLUS_DAYS_REGISTRATION_PLUS_DAYS_LAST_PHONE_CHANGE"] = df["DAYS_EMPLOYED"] + df["DAYS_REGISTRATION"] + df["DAYS_LAST_PHONE_CHANGE"]
df["AVG_PAYMENT_SIZE_DIV_AMT_INCOME_TOTAL"] = df["AVG_PAYMENT_SIZE"] / df["AMT_INCOME_TOTAL"]
df["AVG_PAYMENT_SIZE_DIV_AMT_CREDIT"] = df["AVG_PAYMENT_SIZE"] / df["AMT_CREDIT"]
df["AVG_PAYMENT_SIZE_DIV_AMT_ANNUITY"] = df["AVG_PAYMENT_SIZE"] / df["AMT_ANNUITY"]
df["DAYS_REGISTRATION_PLUS_DAYS_ID_PUBLISH"] = df["DAYS_REGISTRATION"] + df["DAYS_ID_PUBLISH"]
df["SUM_REFUSED_CONTRACT_DIV_SUM_APPR_CONTRACT"] = df["SUM_REFUSED_CONTRACT"] / df["SUM_APPR_CONTRACT"]
df["MAX_UTILIZATION_DIV_AVG_UTILIZATION"] = df["MAX_UTILIZATION"] / df["AVG_UTILIZATION"]
df["MAX_PREV_REQ_AMOUNT_DIV_AMT_CREDIT"] = df["MAX_PREV_REQ_AMOUNT"] / df["AMT_CREDIT"]
df["AMT_INCOME_TOTAL_DIV_DAYS_BIRTH"] = df["AMT_INCOME_TOTAL"] / df["DAYS_BIRTH"]
df["SUM_DAYS_ID_REG_PHONE"] = df["DAYS_ID_PUBLISH"] + df["DAYS_REGISTRATION"] + df["DAYS_LAST_PHONE_CHANGE"]
df["SUM_REQ_CREDIT_YEAR"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"] + df["AMT_REQ_CREDIT_BUREAU_QRT"] + df["AMT_REQ_CREDIT_BUREAU_YEAR"]
df["SUM_REQ_CREDIT_QRT"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"] + df["AMT_REQ_CREDIT_BUREAU_QRT"]
df["SUM_REQ_CREDIT_1M"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"]
df["SUM_REQ_CREDIT_1M_DIV_SUM_REQ_CREDIT_QRT"] = df["SUM_REQ_CREDIT_1M"] / df["SUM_REQ_CREDIT_QRT"]
df["SUM_REQ_CREDIT_QRT_DIV_SUM_REQ_CREDIT_YEAR"] = df["SUM_REQ_CREDIT_QRT"] / df["SUM_REQ_CREDIT_YEAR"]
df["DEF_30_PLUS_60_CNT_SOCIAL_CIRCLE"] = df["DEF_30_CNT_SOCIAL_CIRCLE"] + df["DEF_60_CNT_SOCIAL_CIRCLE"]
df["OWN_CAR_AGE_DIV_DAYS_BIRTH"] = df["OWN_CAR_AGE"] / df["DAYS_BIRTH"]
df["LANDAREA_DIV_TOTALAREA_MODE"] = df["LANDAREA_MODE"] / df["TOTALAREA_MODE"]
df["OWN_CAR_AGE_PLUS_DAYS_BIRTH"] = df["OWN_CAR_AGE"] + df["DAYS_BIRTH"]
df["AMT_ANNUITY_DIV_DAYS_BIRTH"] = df["AMT_ANNUITY"] / df["DAYS_BIRTH"]
df["AMT_ANNUITY_DIV_DAYS_EMPLOYED"] = df["AMT_ANNUITY"] / df["DAYS_EMPLOYED"]
df["AMT_ANNUITY_PROD_DAYS_EMPLOYED"] = df["AMT_ANNUITY"] * df["DAYS_EMPLOYED"]
df["DAYS_REGISTRATION_DIV_DAYS_ID_PUBLISH"] = df["DAYS_REGISTRATION"] / df["DAYS_ID_PUBLISH"]
df["DAYS_REGISTRATION_DIV_DAYS_LAST_PHONE_CHANGE"] = df["DAYS_REGISTRATION"] / df["DAYS_LAST_PHONE_CHANGE"]
df["REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE"] = df["REGION_RATING_CLIENT_W_CITY"] / df["REGION_POPULATION_RELATIVE"]
df["REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE"] = df["REGION_RATING_CLIENT_W_CITY"] * df["REGION_POPULATION_RELATIVE"]
df["SUM_REG_NOT_FLAG"] = df["REG_REGION_NOT_LIVE_REGION"] + df["REG_REGION_NOT_WORK_REGION"] + df["LIVE_REGION_NOT_WORK_REGION"] + df["REG_CITY_NOT_LIVE_CITY"] + df["REG_CITY_NOT_WORK_CITY"] + df["LIVE_CITY_NOT_WORK_CITY"]
df["SUM_AVG_BUILD"] = df["APARTMENTS_AVG"] + df["BASEMENTAREA_AVG"] + df["YEARS_BEGINEXPLUATATION_AVG"] + df["YEARS_BUILD_AVG"] + df["COMMONAREA_AVG"] + df["ELEVATORS_AVG"] + df["ENTRANCES_AVG"] + df["FLOORSMAX_AVG"] + df["FLOORSMIN_AVG"] + df["LANDAREA_AVG"] + df["LIVINGAPARTMENTS_AVG"] + df["LIVINGAREA_AVG"] + df["NONLIVINGAPARTMENTS_AVG"] + df["NONLIVINGAREA_AVG"]
df["SUM_MODE_BUILD"] = df["APARTMENTS_MODE"] + df["BASEMENTAREA_MODE"] + df["YEARS_BEGINEXPLUATATION_MODE"] + df["YEARS_BUILD_MODE"] + df["COMMONAREA_MODE"] + df["ELEVATORS_MODE"] + df["ENTRANCES_MODE"] + df["FLOORSMAX_MODE"] + df["FLOORSMIN_MODE"] + df["LANDAREA_MODE"] + df["LIVINGAPARTMENTS_MODE"] + df["LIVINGAREA_MODE"] + df["NONLIVINGAPARTMENTS_MODE"] + df["NONLIVINGAREA_MODE"]
df["SUM_MEDI_BUILD"] = df["APARTMENTS_MEDI"] + df["BASEMENTAREA_MEDI"] + df["YEARS_BEGINEXPLUATATION_MEDI"] + df["YEARS_BUILD_MEDI"] + df["COMMONAREA_MEDI"] + df["ELEVATORS_MEDI"] + df["ENTRANCES_MEDI"] + df["FLOORSMAX_MEDI"] + df["FLOORSMIN_MEDI"] + df["LANDAREA_MEDI"] + df["LIVINGAPARTMENTS_MEDI"] + df["LIVINGAREA_MEDI"] + df["NONLIVINGAPARTMENTS_MEDI"] + df["NONLIVINGAREA_MEDI"]
df["SUM_DOC_FLAG"] = df["FLAG_DOCUMENT_2"] + df["FLAG_DOCUMENT_3"] + df["FLAG_DOCUMENT_4"] + df["FLAG_DOCUMENT_5"] + df["FLAG_DOCUMENT_6"] + df["FLAG_DOCUMENT_7"] + df["FLAG_DOCUMENT_8"] + df["FLAG_DOCUMENT_9"] + df["FLAG_DOCUMENT_10"] + df["FLAG_DOCUMENT_11"] + df["FLAG_DOCUMENT_12"] + df["FLAG_DOCUMENT_13"] + df["FLAG_DOCUMENT_14"] + df["FLAG_DOCUMENT_15"] + df["FLAG_DOCUMENT_16"] + df["FLAG_DOCUMENT_17"] + df["FLAG_DOCUMENT_18"] + df["FLAG_DOCUMENT_19"] + df["FLAG_DOCUMENT_20"] + df["FLAG_DOCUMENT_21"]
df["CNT_CHILDREN_DIV_DAYS_BIRTH"] = df["CNT_CHILDREN"] / df["DAYS_BIRTH"]
df["CNT_CHILDREN_DIV_REGION_POPULATION_RELATIVE"] = df["CNT_CHILDREN"] / df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_REALTY_PROD_REGION_POPULATION_RELATIVE"] = df["FLAG_OWN_REALTY"] * df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_REALTY_DIV_REGION_POPULATION_RELATIVE"] = df["FLAG_OWN_REALTY"] / df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_CAR_DIV_OWN_CAR_AGE"] = df["FLAG_OWN_CAR"] / df["OWN_CAR_AGE"]
df["EXT_SOURCE_1_DIV_DAYS_BIRTH"] = df["EXT_SOURCE_1"] / df["DAYS_BIRTH"]
df["EXT_SOURCE_1_PROD_DAYS_BIRTH"] = df["EXT_SOURCE_1"] * df["DAYS_BIRTH"]

time: 588 ms


#### Remove infinite values

In [128]:
df.replace([-np.inf, np.inf], np.nan, inplace=True)

time: 10.1 s


#### Remove income outliers

In [129]:
df.loc[df["AMT_INCOME_TOTAL"] > 500000, "AMT_INCOME_TOTAL"] = np.nan

time: 10.5 ms


#### Handle special values for DAYS_EMPLOYED

In [130]:
df.loc[df["DAYS_EMPLOYED"] > 0, "DAYS_EMPLOYED"] = np.nan

time: 134 ms


#### Encode categorical features

Order `ORGANIZATION_TYPE` categories and map to integers (`org_type_map` was obtained by checking the default rates by group within the training data)

In [131]:
org_type_map = {"Trade: type 4": 0, "Industry: type 12": 1, "Transport: type 1": 2, "Trade: type 6": 3,
    "Security Ministries": 4, "University": 5, "Police": 6, "Military": 7,
    "Bank": 8, "XNA": 9, "Culture": 10, "Insurance": 11,
    "Religion": 12, "School": 13, "Trade: type 5": 14, "Hotel": 15,
    "Industry: type 10": 16, "Medicine": 17, "Services": 18, "Electricity": 19,
    "Industry: type 9": 20, "Industry: type 5": 21, "Government": 22, "Trade: type 2": 23,
    "Kindergarten": 24, "Emergency": 25, "Industry: type 6": 26, "Industry: type 2": 27,
    "Telecom": 28, "Other": 29, "Transport: type 2": 30, "Legal Services": 31,
    "Housing": 32, "Industry: type 7": 33, "Business Entity Type 1": 34, "Advertising": 35,
    "Postal": 36, "Business Entity Type 2": 37, "Industry: type 11": 38, "Trade: type 1": 39,
    "Mobile": 40, "Transport: type 4": 41, "Business Entity Type 3": 42, "Trade: type 7": 43,
    "Security": 44, "Industry: type 4": 45, "Self-employed": 46, "Trade: type 3": 47,
    "Agriculture": 48, "Realtor": 49, "Industry: type 3": 50, "Industry: type 1": 51,
    "Cleaning": 52, "Construction": 53, "Restaurant": 54, "Industry: type 8": 55,
    "Industry: type 13": 56, "Transport: type 3": 57}

time: 16.5 ms


In [132]:
df["ORGANIZATION_TYPE"] = df["ORGANIZATION_TYPE"].map(org_type_map)

time: 153 ms


Dummy code remaining categorical features

In [133]:
df = pd.get_dummies(df, dummy_na=True)
df.columns = df.columns.str.replace("\s+", "_")

time: 6.12 s


In [134]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,MIN_PREV_AMT_ANNUITY_12M,MIN_PREV_AMT_ANNUITY_24M,MIN_PREV_PROP_APPROVED_12M,AVG_SYNTH_TARGET_12M,AVG_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_24M,MAX_PREV_PROP_APPROVED_12M,MAX_PREV_PROP_APPROVED_24M,COUNT_PREV_APP,MIN_PREV_DAYS_TERMINATION,MAX_PREV_DAYS_TERMINATION,AVG_PREV_DAYS_TERMINATION,RANGE_PREV_DAYS_TERMINATION,MIN_PREV_AMT_CREDIT,MAX_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT,MIN_PREV_AMT_CREDIT_WEIGHTED,MAX_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,MIN_PREV_AMT_CREDIT_DIV_ANNUITY,MAX_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MIN_PREV_AMT_ANNUITY,MAX_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY,MIN_PREV_AMT_ANNUITY_WEIGHTED,MAX_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_ANNUITY_WEIGHTED,MIN_DAYS_DECISION,MAX_DAYS_DECISION,RANGE_DAYS_DECISION,SUM_DAYS_LAST_DUE_NULL,AVG_DAYS_LAST_DUE_NULL,AVG_PREV_REQ_AMOUNT_WEIGHTED,MAX_PREV_REQ_AMOUNT_WEIGHTED,AVG_PREV_REQ_AMOUNT,MAX_PREV_REQ_AMOUNT,AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED,AVG_PREV_PROP_APPROVED_WEIGHTED,MAX_PREV_PROP_APPROVED_WEIGHTED,AVG_PREV_RATE_DOWNPAYMENT,AVG_PREV_PROP_APPROVED,MAX_PREV_PROP_APPROVED,MIN_PREV_PROP_APPROVED,AVG_PREV_INT_RATE,SUM_PREV_URGENT_NEEDS,SUM_PREV_REPAIRS,SUM_PREV_OTHER,SUM_PREV_LIMIT_REJECT,SUM_REFUSED_CONTRACT,SUM_CANC_CONTRACT,SUM_APPR_CONTRACT,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,COUNT_PREV_WALK_IN,COUNT_PREV_HIGH_YIELD,COUNT_PREV_LOW_YIELD,AVG_SYNTH_TARGET,SUM_SYNTH_TARGET_WEIGHTED,SUM_SYNTH_TARGET,MAX_SYNTH_TARGET,MIN_SYNTH_TARGET,RANGE_SYNTH_TARGET,SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE,SUM_DAYS_FIRST_DRAWING_SENTINEL,SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,SUM_DAYS_LAST_DUE_LT_FIRST_VERSION,MIN_RATE_INTEREST_PRIMARY_12M,AVG_RATE_INTEREST_PRIVILEGED_12M,SUM_REFUSED_CONTRACT_6M,SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M,SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M,SUM_NAME_GOODS_CATEGORY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_CSTR_6M,SUM_NAME_PAYMENT_TYPE_XNA_6M,COUNT_NAME_CLIENT_TYPE_REPEATER_12M,COUNT_NAME_CLIENT_TYPE_NEW_12M,AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M,SUM_CC_DEBT_6M,SUM_CC_DEBT_12M,MAX_WORST_DQ_BUREAU_BALANCE_6M,MAX_WORST_DQ_BUREAU_BALANCE_12M,MAX_BUREAU_UTILIZATION_6M,MAX_BUREAU_UTILIZATION_12M,COUNT_ACTIVE_6M,COUNT_ACTIVE_12M,COUNT_ACTIVE_24M,DAYS_REMAINING_ACTIVE,MAX_CREDIT_DAY_OVERDUE_6M,MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_12M_24M,BUREAU_SUM_DEBT_DIFF_6M_12M,BUREAU_SUM_DEBT_DIFF_12M_24M,MAX_CNT_CREDIT_PROLONG,AVG_LEN_BUREAU_BALANCE,PROP_CURRENT,PROP_CLOSED,PROP_CURRENT_WEIGHTED,MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE,MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE,SUM_SUM_CURRENT_BUREAU_BALANCE,AVG_PROP_CURRENT,AVG_PROP_DQ,MAX_PROP_DQ,AVG_PROP_CURRENT_WEIGHTED,MIN_PROP_CURRENT_WEIGHTED,AVG_PROP_DQ_WEIGHTED,MAX_PROP_DQ_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,MIN_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ_WEIGHTED_AMT,MAX_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,TOTAL_AMT_CREDIT_SUM_POS_DAYS,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_LEN_BUREAU_BALANCE,SUM_LEN_BUREAU_BALANCE,MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE,MIN_DAYS_CREDIT_ENDDATE,MAX_DAYS_CREDIT_ENDDATE,SUM_DAYS_CREDIT_ENDDATE,SUM_NULL_DAYS_ENDDATE_FACT,COUNT_BUREAU_RECORDS,COUNT_ACTIVE,MAX_CREDIT_DAY_OVERDUE_WEIGHTED,SUM_CREDIT_DAY_OVERDUE_WEIGHTED,MAX_CREDIT_DAY_OVERDUE,SUM_CREDIT_DAY_OVERDUE,DAYS_SINCE_APPLIED,SUM_INVERSE_DAYS_CREDIT,MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,MAX_AMT_CREDIT_MAX_OVERDUE,SUM_AMT_CREDIT_MAX_OVERDUE,SUM_CNT_CREDIT_PROLONG,SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED,SUM_AMT_CREDIT_SUM_DEBT,BUREAU_UTILIZATION_AVG,BUREAU_UTILIZATION_MAX,BUREAU_PROP_SUM_OVERDUE_AVG,BUREAU_PROP_MAX_OVERDUE_AVG,MAX_DAYS_CREDIT_UPDATE,RANGE_DAYS_CREDIT_UPDATE,DAYS_CREDIT_RANGE,TOTAL_AMT_CREDIT_SUM_WEIGHTED,TOTAL_AMT_CREDIT_SUM,COUNT_CREDIT_CARD,COUNT_CAR_LOAN,COUNT_MORTGAGE,SUM_AMT_ANNUITY,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M,SUM_UNDERPAYMENT_12M,SUM_UNDERPAYMENT_6M,MAX_PAYMENT_SIZE_6M,MAX_PAYMENT_SIZE_12M,MIN_PAYMENT_SIZE_6M,MAX_ABS_DAYS_INSTALMENT,COUNT_UNDERPAYMENT,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_WEIGHTED,MAX_UNDERPAYMENT,AVG_PAYMENT_SIZE_WEIGHTED,AVG_PAYMENT_SIZE,MAX_PAYMENT_SIZE_WEIGHTED,MAX_PAYMENT_SIZE,MIN_PAYMENT_SIZE_WEIGHTED,MIN_PAYMENT_SIZE,SUM_PAYMENT_WEIGHTED,SUM_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,MAX_DAYS_ENTRY_PAYMENT,MIN_DAYS_ENTRY_PAYMENT,RANGE_DAYS_ENTRY_PAYMENT,MAX_UNDERPAYMENT_6M,MAX_UNDERPAYMENT_12M,SUM_PAYMENT_6M,SUM_PAYMENT_DIFF_6M_12M,MAX_AMT_INSTALMENT_6M,MIN_AMT_INSTALMENT_6M,MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MAX_POS_DPD,MAX_POS_DPD_DEF,NUM_POS_CASH,TOTAL_AMT_CREDIT_SUM_DIV_SUM_DAYS_CREDIT_ENDDATE,TOTAL_AMT_CREDIT_SUM_POS_DAYS_DIV_SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_ABS_DAYS_INSTALMENT_DIV_DAYS_BIRTH,AMT_CREDIT_DIV_AMT_INCOME_TOTAL,AMT_CREDIT_PLUS_AMT_INCOME_TOTAL,AMT_CREDIT_DIV_AMT_GOODS_PRICE,AMT_CREDIT_DIV_SUM_PAYMENT,AMT_GOODS_PRICE_DIV_AMT_INCOME_TOTAL,AMT_CREDIT_DIV_AMT_ANNUITY,AMT_CREDIT_DIV_AVG_PREV_REQ_AMOUNT,AMT_CREDIT_DIV_MAX_PREV_REQ_AMOUNT,EXT_SOURCE_PROD,DAYS_EMPLOYED_DIV_DAYS_BIRTH,DAYS_EMPLOYED_PLUS_DAYS_REGISTRATION_PLUS_DAYS_LAST_PHONE_CHANGE,AVG_PAYMENT_SIZE_DIV_AMT_INCOME_TOTAL,AVG_PAYMENT_SIZE_DIV_AMT_CREDIT,AVG_PAYMENT_SIZE_DIV_AMT_ANNUITY,DAYS_REGISTRATION_PLUS_DAYS_ID_PUBLISH,SUM_REFUSED_CONTRACT_DIV_SUM_APPR_CONTRACT,MAX_UTILIZATION_DIV_AVG_UTILIZATION,MAX_PREV_REQ_AMOUNT_DIV_AMT_CREDIT,AMT_INCOME_TOTAL_DIV_DAYS_BIRTH,SUM_DAYS_ID_REG_PHONE,SUM_REQ_CREDIT_YEAR,SUM_REQ_CREDIT_QRT,SUM_REQ_CREDIT_1M,SUM_REQ_CREDIT_1M_DIV_SUM_REQ_CREDIT_QRT,SUM_REQ_CREDIT_QRT_DIV_SUM_REQ_CREDIT_YEAR,DEF_30_PLUS_60_CNT_SOCIAL_CIRCLE,OWN_CAR_AGE_DIV_DAYS_BIRTH,LANDAREA_DIV_TOTALAREA_MODE,OWN_CAR_AGE_PLUS_DAYS_BIRTH,AMT_ANNUITY_DIV_DAYS_BIRTH,AMT_ANNUITY_DIV_DAYS_EMPLOYED,AMT_ANNUITY_PROD_DAYS_EMPLOYED,DAYS_REGISTRATION_DIV_DAYS_ID_PUBLISH,DAYS_REGISTRATION_DIV_DAYS_LAST_PHONE_CHANGE,REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE,SUM_REG_NOT_FLAG,SUM_AVG_BUILD,SUM_MODE_BUILD,SUM_MEDI_BUILD,SUM_DOC_FLAG,CNT_CHILDREN_DIV_DAYS_BIRTH,CNT_CHILDREN_DIV_REGION_POPULATION_RELATIVE,FLAG_OWN_REALTY_PROD_REGION_POPULATION_RELATIVE,FLAG_OWN_REALTY_DIV_REGION_POPULATION_RELATIVE,FLAG_OWN_CAR_DIV_OWN_CAR_AGE,EXT_SOURCE_1_DIV_DAYS_BIRTH,EXT_SOURCE_1_PROD_DAYS_BIRTH,NAME_CONTRACT_TYPE_Cash_loans,NAME_CONTRACT_TYPE_Revolving_loans,NAME_CONTRACT_TYPE_nan,CODE_GENDER_F,CODE_GENDER_M,CODE_GENDER_XNA,CODE_GENDER_nan,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group_of_people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse,_partner",NAME_TYPE_SUITE_Unaccompanied,NAME_TYPE_SUITE_nan,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial_associate,NAME_INCOME_TYPE_Maternity_leave,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State_servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_INCOME_TYPE_nan,NAME_EDUCATION_TYPE_Academic_degree,NAME_EDUCATION_TYPE_Higher_education,NAME_EDUCATION_TYPE_Incomplete_higher,NAME_EDUCATION_TYPE_Lower_secondary,NAME_EDUCATION_TYPE_Secondary_/_secondary_special,NAME_EDUCATION_TYPE_nan,NAME_FAMILY_STATUS_Civil_marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single_/_not_married,NAME_FAMILY_STATUS_Unknown,NAME_FAMILY_STATUS_Widow,NAME_FAMILY_STATUS_nan,NAME_HOUSING_TYPE_Co-op_apartment,NAME_HOUSING_TYPE_House_/_apartment,NAME_HOUSING_TYPE_Municipal_apartment,NAME_HOUSING_TYPE_Office_apartment,NAME_HOUSING_TYPE_Rented_apartment,NAME_HOUSING_TYPE_With_parents,NAME_HOUSING_TYPE_nan,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning_staff,OCCUPATION_TYPE_Cooking_staff,OCCUPATION_TYPE_Core_staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR_staff,OCCUPATION_TYPE_High_skill_tech_staff,OCCUPATION_TYPE_IT_staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill_Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine_staff,OCCUPATION_TYPE_Private_service_staff,OCCUPATION_TYPE_Realty_agents,OCCUPATION_TYPE_Sales_staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security_staff,OCCUPATION_TYPE_Waiters/barmen_staff,OCCUPATION_TYPE_nan,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,WEEKDAY_APPR_PROCESS_START_nan,FONDKAPREMONT_MODE_not_specified,FONDKAPREMONT_MODE_org_spec_account,FONDKAPREMONT_MODE_reg_oper_account,FONDKAPREMONT_MODE_reg_oper_spec_account,FONDKAPREMONT_MODE_nan,HOUSETYPE_MODE_block_of_flats,HOUSETYPE_MODE_specific_housing,HOUSETYPE_MODE_terraced_house,HOUSETYPE_MODE_nan,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone,_brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
0,100002,1,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637.0,-3648.0,-2120,,1,1,0,1,1,0,1.0,2,2,10,0,0,0,0,0,0,42,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,0.0149,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,,9251.775,,,,1.0,,1.0,1.0,-17.0,-17.0,-17.0,0.0,179055.0,179055.0,179055.0,295.470297,295.470297,295.470297,19.353584,19.353584,19.353584,0.031937,0.031937,0.031937,9251.775,9251.775,9251.775,15.266955,15.266955,15.266955,-606.0,-606.0,0.0,0.0,0.0,295.470297,295.470297,179055.0,179055.0,0.0,0.00165,0.00165,0.0,1.0,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.071974,0.000119,0.071974,0.071974,0.071974,0.0,0.0,1.0,0.00165,0.00165,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,315.103846,0.0,0.0,0.0,0.0,0.54618,0.54618,2.0,2.0,2.0,780.0,0.0,,,,245781.0,245781.0,0.0,10.875,0.689655,0.264368,0.003698,40.5,1.5,39.0,60.0,0.716964,0.283036,0.5,0.109328,0.014109,0.010476,0.025641,4863.768166,0.0,1617.905476,7012.987013,0.75,0.051282,0.027542,638235.0,927.0,20.0,20.0,-47.0,-1072.0,780.0,-2094.0,2.0,8.0,2.0,0.0,0.0,0.0,0.0,103.0,0.017755,148.3425,153.695563,5043.645,8405.145,0.0,35111.571429,245781.0,,,0.0,,-7.0,1178.0,1334.0,69432.89321,865055.565,4.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,53093.745,53093.745,9251.775,565.0,0.0,0.0,0.0,0.0,95.448632,11559.247105,1083.545816,53093.745,15.761116,9251.775,1813.524009,219625.695,0.0,-49.0,-587.0,538.0,0.0,0.0,90100.845,34590.195,53093.745,9251.775,-12.0,-31.0,0.0,0.0,1.0,-413.11154,688.495146,-0.059719,2.007889,609097.5,1.158397,1.85132,1.733333,16.461104,2.270797,2.270797,0.003043,0.067329,-5419.0,0.057083,0.028429,0.467976,-5768.0,0.0,,0.440374,-21.403657,-6902.0,1.0,0.0,0.0,,0.0,4.0,,2.530201,,-2.610771,-38.776295,-15734218.5,1.720755,3.216931,0.037602,0,2.0207,2.041,2.0274,1,-0.0,0.0,0.018801,53.18866,,-9e-06,-785.612748,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188.0,-1186.0,-291,,1,1,0,1,1,0,2.0,1,1,11,0,0,0,0,0,0,13,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,0.0714,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,3.0,-1976.0,-527.0,-1047.333333,1449.0,68053.5,1035882.0,484191.0,29.070269,1388.581769,612.90394,5.399568,10.531859,8.677472,0.004315,0.014118,0.008318,6737.31,98356.995,56553.99,2.877962,131.845838,70.901357,-2341.0,-746.0,1595.0,0.0,0.0,547.812073,1206.434316,435436.5,900000.0,2.1e-05,0.001071,0.001543,0.05003,1.057664,1.15098,0.989013,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,0.078878,0.000225,0.236634,0.090332,0.070374,0.019958,2.0,3.0,0.002975,0.00134,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,0.0,0.0,,,0.0,0.0,1.0,1.0,1.0,1216.0,0.0,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,810000.0,1216.0,,,,-2434.0,1216.0,-2178.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,606.0,0.003938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-43.0,2088.0,1980.0,19188.078259,1017400.5,2.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,2310.0,0.0,0.0,0.0,0.0,100.798053,64754.586,1030.947353,560835.36,2.899015,6662.97,2519.951327,1618864.65,0.0,-544.0,-2324.0,1780.0,,,0.0,0.0,,,-1.0,-14.0,0.0,0.0,3.0,-467.126033,666.118421,-0.137787,4.79075,1563502.5,1.145199,0.799018,4.183333,36.234085,2.970588,1.437225,,0.070862,-3202.0,0.239832,0.050061,1.81393,-1477.0,0.0,,0.695785,-16.104981,-2305.0,0.0,0.0,0.0,,,0.0,,0.179272,,-2.129347,-30.049242,-42409818.0,4.075601,1.432367,0.003541,0,2.8888,2.8723,2.8954,1,-0.0,0.0,0.0,0.0,,-1.9e-05,-5218.396475,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0
2,100004,0,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225.0,-4260.0,-2531,26.0,1,1,1,1,1,0,1.0,2,2,9,0,0,0,0,0,0,22,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,1.0,-714.0,-714.0,-714.0,0.0,20106.0,20106.0,20106.0,24.669939,24.669939,24.669939,3.753045,3.753045,3.753045,0.004605,0.004605,0.004605,5357.25,5357.25,5357.25,6.573313,6.573313,6.573313,-815.0,-815.0,0.0,0.0,0.0,29.793865,29.793865,24282.0,24282.0,0.00026,0.001016,0.001016,0.212008,0.828021,0.828021,0.828021,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.119115,0.000146,0.119115,0.119115,0.119115,0.0,0.0,1.0,0.001227,0.001227,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,0.0,0.0,,,,-595.0,-382.0,-977.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,408.0,0.003205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-382.0,300.0,918.0,386.044202,189037.8,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,784.0,0.0,0.0,0.0,0.0,9.434878,7096.155,14.544656,10573.965,6.738679,5357.25,28.304633,21288.465,0.0,-727.0,-795.0,68.0,,,0.0,0.0,,,-3.0,-11.0,0.0,0.0,1.0,-193.488025,,-0.041163,2.0,202500.0,1.0,6.341462,2.0,20.0,5.559674,5.559674,,0.011814,-5300.0,0.105128,0.052564,1.051282,-6791.0,0.0,,0.179867,-3.544051,-7606.0,0.0,0.0,0.0,,,0.0,-0.001365,,-19020.0,-0.354405,-30.0,-1518750.0,1.683129,5.226994,0.020064,0,,,,0,-0.0,0.0,0.010032,99.681021,0.038462,,,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039.0,-9833.0,-2437,,1,1,0,1,0,0,2.0,2,2,17,0,0,0,0,0,0,42,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,13500.0,2482.92,0.799989,0.063868,1.029197,1.012684,1.316797,1.316797,9.0,-416.0,365243.0,182481.75,365659.0,0.0,906615.0,291695.5,0.0,5008.922652,1358.887335,9.230206,27.839644,17.767287,0.015809,0.15381,0.081751,2482.92,39954.51,23651.175,4.024182,180.641436,96.293912,-617.0,-181.0,436.0,5.0,0.555556,1242.561634,3803.867403,272203.26,688500.0,0.000439,0.004129,0.007275,0.163412,1.012684,1.316797,0.799989,,0.0,0.0,0.0,1.0,1.0,3.0,5.0,0.0,0.0,0.0,2.0,2.0,0.065491,0.002612,0.589419,0.116006,0.041129,0.074877,1.0,4.0,0.015886,0.005525,2.0,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,6.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,,0.0,0.0,691786.89,691786.89,29027.52,545.0,0.0,0.0,0.0,0.0,543.096731,62947.088438,3975.786724,691786.89,4.318122,2482.92,8689.547693,1007153.415,0.0,-12.0,-575.0,563.0,0.0,0.0,865952.01,749841.93,691786.89,29027.52,-1.0,-77.0,0.0,0.0,3.0,,,-0.028677,2.316167,447682.5,1.052803,0.310462,2.2,10.532818,1.14871,0.45415,,0.159905,-13489.0,0.466275,0.201313,2.120394,-12270.0,0.2,,2.201914,-7.103394,-12887.0,,,,,,0.0,,,,-1.562036,-9.768509,-90217273.5,4.034879,15.936791,0.016038,0,,,,1,-0.0,0.0,0.008019,124.703828,,,,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038.0,-4311.0,-3458,,1,1,0,1,0,0,1.0,2,2,11,0,0,0,0,1,1,12,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,16037.64,,,,1.108236,,1.108236,6.0,-2041.0,365243.0,72143.8,367284.0,14616.0,284400.0,166638.75,6.201103,733.391711,248.03877,7.968206,21.858453,12.644075,0.003381,0.045729,0.016725,1834.29,22678.785,12278.805,0.778231,42.88139,16.715844,-2357.0,-374.0,1983.0,1.0,0.166667,222.881532,661.764706,150530.25,247500.0,7.5e-05,0.001244,0.002963,0.159516,1.046356,1.264,0.85093,,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,3.0,1.0,3.0,0.0,0.09508,0.000625,0.570482,0.112414,0.0779,0.034514,3.0,5.0,0.005724,0.002674,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,0.0,0.0,,,,-783.0,-783.0,-783.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1149.0,0.00087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-783.0,0.0,0.0,186.781609,146250.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,16037.64,16037.64,16037.64,2326.0,3.0,29857.365,25.402727,22655.655,49.833434,12214.060227,1145.545714,22678.785,0.000125,0.18,3289.00667,806127.975,16.0,-14.0,-2318.0,2304.0,0.0,0.0,96225.84,0.0,16037.64,16037.64,12.0,-31.0,0.0,0.0,5.0,-186.781609,,-0.116697,4.222222,634500.0,1.0,0.636375,4.222222,23.461618,3.407953,2.072727,,0.152418,-8455.0,0.100527,0.023809,0.5586,-7769.0,0.0,,0.482456,-6.095725,-8875.0,0.0,0.0,0.0,,,0.0,,,,-1.097005,-7.197334,-66427389.0,1.246674,3.89783,0.057326,2,,,,1,-0.0,0.0,0.028663,34.888183,,,,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1


time: 491 ms


In [135]:
df.shape

(307511, 494)

time: 12.9 ms


#### Write preprocessed data to file

In [136]:
df.to_csv(path + train_or_test + ".csv", index=False, header=True)

time: 4min 35s
