# Home Credit Preprocessing

In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
import pickle
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
import warnings
import gc

%load_ext autotime

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None
gc.enable()
path = "/Users/dsaxton/home_credit_default/"

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 13.4 ms


# Previous application

#### Aggregation function

In [63]:
def previous_agg_func(g):
    mask6 = g["DAYS_DECISION"] >= -180
    mask12 = g["DAYS_DECISION"] >= -360
    mask24 = g["DAYS_DECISION"] >= -720

    d = {
        "AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M": np.nanmean((g["AMT_CREDIT"] / g["AMT_ANNUITY"]).where(mask6)), 
        "MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M": np.nanmin((g["AMT_CREDIT"] / g["AMT_ANNUITY"]).where(mask6)), 
        "MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M": np.nanmax((g["AMT_CREDIT"] / g["AMT_ANNUITY"]).where(mask6)), 
        "AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M": np.nanmean((g["AMT_CREDIT"] / g["AMT_GOODS_PRICE"]).where(mask6)), 
        "MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M": np.nanmax((g["AMT_CREDIT"] / g["AMT_GOODS_PRICE"]).where(mask6)), 
        "AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M": np.nanmean((g["AMT_CREDIT"] + g["AMT_ANNUITY"]).where(mask6)), 
        "MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M": np.nanmin((g["AMT_CREDIT"] + g["AMT_ANNUITY"]).where(mask6)), 
        
        "COUNT_NAME_CLIENT_TYPE_REPEATER_12M": np.nansum((g["NAME_CLIENT_TYPE"] == "Repeater").where(mask12)), 
        "COUNT_NAME_CLIENT_TYPE_NEW_12M": np.nansum((g["NAME_CLIENT_TYPE"] == "New").where(mask12)), 
        "SUM_NAME_PAYMENT_TYPE_XNA_6M": np.nansum((g["NAME_PAYMENT_TYPE"] == "XNA").where(mask6)), 
        "SUM_NAME_SELLER_INDUSTRY_CSTR_6M": np.nansum((g["NAME_SELLER_INDUSTRY"] == "Construction").where(mask6)), 
        "SUM_NAME_SELLER_INDUSTRY_XNA_6M": np.nansum((g["NAME_SELLER_INDUSTRY"] == "XNA").where(mask6)), 
        "SUM_NAME_GOODS_CATEGORY_XNA_6M": np.nansum((g["NAME_GOODS_CATEGORY"] == "XNA").where(mask6)), 
        "SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M": np.nansum((g["PRODUCT_COMBINATION"] == "POS mobile with interest").where(mask12)),
        "SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M": np.nansum((g["PRODUCT_COMBINATION"] == "POS household with interest").where(mask12)), 
        "SUM_REFUSED_CONTRACT_6M": np.nansum((g["NAME_CONTRACT_STATUS"] == "Refused").where(mask12)), 
        "AVG_RATE_INTEREST_PRIMARY_12M": np.nanmean(g["RATE_INTEREST_PRIMARY"].where(mask12)), 
        "MAX_RATE_INTEREST_PRIMARY_12M": np.nanmax(g["RATE_INTEREST_PRIMARY"].where(mask12)), 
        "MIN_RATE_INTEREST_PRIMARY_12M": np.nanmin(g["RATE_INTEREST_PRIMARY"].where(mask12)), 
        "AVG_RATE_INTEREST_PRIVILEGED_12M": np.nanmean(g["RATE_INTEREST_PRIVILEGED"].where(mask12)), 
        "MAX_UTILIZATION_3M": np.nanmax((g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]).where(mask3)),
        "MIN_PREV_AMT_ANNUITY_12M": np.nanmin(g["AMT_ANNUITY"].where(mask12)), 
        "MIN_PREV_AMT_ANNUITY_24M": np.nanmin(g["AMT_ANNUITY"].where(mask24)), 
        "MIN_PREV_PROP_APPROVED_12M": np.nanmin((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "AVG_SYNTH_TARGET_12M": np.nanmean(g["SYNTHETIC_TARGET"].where(mask12)), 
        "AVG_PREV_PROP_APPROVED_12M": np.nanmean((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "AVG_PREV_PROP_APPROVED_24M": np.nanmean((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask24)), 
        "MAX_PREV_PROP_APPROVED_12M": np.nanmax((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask12)), 
        "MAX_PREV_PROP_APPROVED_24M": np.nanmax((g["AMT_CREDIT"] / g["AMT_APPLICATION"]).where(mask24)), 
        "COUNT_PREV_APP": len(g), 
        "MIN_PREV_DAYS_TERMINATION": np.nanmin(g["DAYS_TERMINATION"]), 
        "MAX_PREV_DAYS_TERMINATION": np.nanmax(g["DAYS_TERMINATION"]), 
        "AVG_PREV_DAYS_TERMINATION": np.nanmean(g["DAYS_TERMINATION"]), 
        "RANGE_PREV_DAYS_TERMINATION": np.nanmax(g["DAYS_TERMINATION"]) - np.nanmin(g["DAYS_TERMINATION"]),  
        "MIN_PREV_AMT_CREDIT": np.nanmin(g["AMT_CREDIT"]),
        "MAX_PREV_AMT_CREDIT": np.nanmax(g["AMT_CREDIT"]),
        "AVG_PREV_AMT_CREDIT": np.nanmean(g["AMT_CREDIT"]),
        "MIN_PREV_AMT_CREDIT_WEIGHTED": np.nanmin(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])),
        "MAX_PREV_AMT_CREDIT_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])),
        "AVG_PREV_AMT_CREDIT_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / abs(g["DAYS_DECISION"])), 
        "MIN_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmin(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "MAX_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmax(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "AVG_PREV_AMT_CREDIT_DIV_ANNUITY": np.nanmean(g["AMT_CREDIT"] / g["AMT_ANNUITY"]),
        "MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmin(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),
        "MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),
        "AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])),  
        "MIN_PREV_AMT_ANNUITY": np.nanmin(g["AMT_ANNUITY"]), 
        "MAX_PREV_AMT_ANNUITY": np.nanmax(g["AMT_ANNUITY"]), 
        "AVG_PREV_AMT_ANNUITY": np.nanmean(g["AMT_ANNUITY"]), 
        "MIN_PREV_AMT_ANNUITY_WEIGHTED": np.nanmin(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_AMT_ANNUITY_WEIGHTED": np.nanmax(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_AMT_ANNUITY_WEIGHTED": np.nanmean(g["AMT_ANNUITY"] / abs(g["DAYS_DECISION"])), 
        "MIN_DAYS_DECISION": np.nanmin(g["DAYS_DECISION"]), 
        "MAX_DAYS_DECISION": np.nanmax(g["DAYS_DECISION"]), 
        "RANGE_DAYS_DECISION": np.nanmax(g["DAYS_DECISION"]) - np.nanmin(g["DAYS_DECISION"]),
        "SUM_DAYS_LAST_DUE_NULL": np.nansum(g["DAYS_LAST_DUE"].isnull()), 
        "AVG_DAYS_LAST_DUE_NULL": np.nanmean(g["DAYS_LAST_DUE"].isnull()), 
        "AVG_PREV_REQ_AMOUNT_WEIGHTED": np.nanmean(g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_REQ_AMOUNT_WEIGHTED": np.nanmax(g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_REQ_AMOUNT": np.nanmean(g["AMT_APPLICATION"]), 
        "MAX_PREV_REQ_AMOUNT": np.nanmax(g["AMT_APPLICATION"]), 
        "AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED": np.nanmean(g["RATE_DOWN_PAYMENT"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_PROP_APPROVED_WEIGHTED": np.nanmean(g["AMT_CREDIT"] / g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "MAX_PREV_PROP_APPROVED_WEIGHTED": np.nanmax(g["AMT_CREDIT"] / g["AMT_APPLICATION"] / abs(g["DAYS_DECISION"])), 
        "AVG_PREV_RATE_DOWNPAYMENT": np.nanmean(g["RATE_DOWN_PAYMENT"]), 
        "AVG_PREV_PROP_APPROVED": np.nanmean(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "MAX_PREV_PROP_APPROVED": np.nanmax(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "MIN_PREV_PROP_APPROVED": np.nanmin(g["AMT_CREDIT"] / g["AMT_APPLICATION"]), 
        "AVG_PREV_REQ_AMOUNT": np.nanmean(g["AMT_APPLICATION"]), 
        "MAX_PREV_REQ_AMOUNT": np.nanmax(g["AMT_APPLICATION"]), 
        "AVG_PREV_RATE_DOWNPAYMENT": np.nanmean(g["RATE_DOWN_PAYMENT"]), 
        "AVG_PREV_INT_RATE": np.nanmean(g["RATE_INTEREST_PRIMARY"]), 
        "SUM_PREV_URGENT_NEEDS": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Urgent needs"), 
        "SUM_PREV_REPAIRS": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Repairs"), 
        "SUM_PREV_OTHER": np.nansum(g["NAME_CASH_LOAN_PURPOSE"] == "Other"), 
        "SUM_PREV_LIMIT_REJECT": np.nansum(g["CODE_REJECT_REASON"] == "LIMIT"), 
        "SUM_REFUSED_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Refused"), 
        "SUM_CANC_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Canceled"), 
        "SUM_APPR_CONTRACT": np.nansum(g["NAME_CONTRACT_STATUS"] == "Approved"), 
        "SUM_PREV_HC_REJECT": np.nansum(g["CODE_REJECT_REASON"] == "HC"), 
        "SUM_PREV_INSURE_REQ": np.nansum(g["NFLAG_INSURED_ON_APPROVAL"]), 
        "COUNT_PREV_WALK_IN": np.nansum(g["NAME_PRODUCT_TYPE"] == "walk-in"), 
        "COUNT_PREV_HIGH_YIELD": np.nansum(g["NAME_YIELD_GROUP"] == "high"), 
        "COUNT_PREV_LOW_YIELD": np.nansum(g["NAME_YIELD_GROUP"].apply(lambda x: x.startswith("low"))), 
        "AVG_SYNTH_TARGET": np.nanmean(g["SYNTHETIC_TARGET"]), 
        "SUM_SYNTH_TARGET_WEIGHTED": np.nansum(g["SYNTHETIC_TARGET"] / abs(g["DAYS_DECISION"])), 
        "SUM_SYNTH_TARGET": np.nansum(g["SYNTHETIC_TARGET"]), 
        "MAX_SYNTH_TARGET": np.nanmax(g["SYNTHETIC_TARGET"]), 
        "MIN_SYNTH_TARGET": np.nanmin(g["SYNTHETIC_TARGET"]), 
        "RANGE_SYNTH_TARGET": np.nanmax(g["SYNTHETIC_TARGET"]) - np.min(g["SYNTHETIC_TARGET"]), 
        "SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE": np.nansum(g["DAYS_LAST_DUE_1ST_VERSION"] == g["DAYS_LAST_DUE"]), 
        "SUM_DAYS_FIRST_DRAWING_SENTINEL": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"]), 
        "SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED": np.nansum(g["DAYS_FIRST_DRAWING_SENTINEL"] / abs(g["DAYS_DECISION"])), 
        "MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED": np.nanmax(g["DAYS_FIRST_DRAWING_SENTINEL"] / abs(g["DAYS_DECISION"])),         
        "SUM_DAYS_LAST_DUE_LT_FIRST_VERSION": np.nansum(g["DAYS_LAST_DUE"] < g["DAYS_LAST_DUE_1ST_VERSION"])}

    return pd.Series(d)

time: 191 ms


#### Process data and write to file

In [64]:
previous_application = pd.read_csv(path + "previous_application.csv")

with open(path + "linear_model.pkl", "rb") as f:
    clf = pickle.load(f)

impute = Imputer(strategy="median")
scale = StandardScaler()

cols = ["AMT_ANNUITY", 
        "AMT_CREDIT", 
        "AMT_GOODS_PRICE", 
        "HOUR_APPR_PROCESS_START", 
        "NAME_CONTRACT_TYPE", 
        "NAME_TYPE_SUITE", 
        "WEEKDAY_APPR_PROCESS_START"]

prev_temp = pd.get_dummies(previous_application[cols])

dummy_cols = ["AMT_CREDIT",
              "AMT_GOODS_PRICE",
              "HOUR_APPR_PROCESS_START",
              "NAME_CONTRACT_TYPE_Cash loans",
              "NAME_CONTRACT_TYPE_Revolving loans",
              "NAME_TYPE_SUITE_Children",
              "NAME_TYPE_SUITE_Family",
              "NAME_TYPE_SUITE_Group of people",
              "NAME_TYPE_SUITE_Other_A",
              "NAME_TYPE_SUITE_Other_B",
              "NAME_TYPE_SUITE_Spouse, partner",
              "NAME_TYPE_SUITE_Unaccompanied",
              "WEEKDAY_APPR_PROCESS_START_FRIDAY",
              "WEEKDAY_APPR_PROCESS_START_MONDAY",
              "WEEKDAY_APPR_PROCESS_START_SATURDAY",
              "WEEKDAY_APPR_PROCESS_START_SUNDAY",
              "WEEKDAY_APPR_PROCESS_START_THURSDAY",
              "WEEKDAY_APPR_PROCESS_START_TUESDAY",
              "WEEKDAY_APPR_PROCESS_START_WEDNESDAY"]

previous_application["SYNTHETIC_TARGET"] = clf.predict_proba(scale.fit_transform(impute.fit_transform(prev_temp[dummy_cols])))[:,1]
previous_application["DAYS_FIRST_DRAWING_SENTINEL"] = (previous_application["DAYS_FIRST_DRAWING"] == 365243).astype(int)
previous_application["DAYS_FIRST_DUE_SENTINEL"] = (previous_application["DAYS_FIRST_DUE"] == 365243).astype(int)
previous_application["DAYS_LAST_DUE_1ST_VERSION_SENTINEL"] = (previous_application["DAYS_LAST_DUE_1ST_VERSION"] == 365243).astype(int)
previous_application["DAYS_LAST_DUE_SENTINEL"] = (previous_application["DAYS_LAST_DUE"] == 365243).astype(int)
previous_application["DAYS_TERMINATION_SENTINEL"] = (previous_application["DAYS_TERMINATION"] == 365243).astype(int)

previous_agg = previous_application.groupby("SK_ID_CURR").apply(previous_agg_func).reset_index()

previous_agg.to_csv(path + "previous_agg.csv", index=False, header=True)
del prev_temp, previous_application, previous_agg
gc.collect()

65

time: 1h 49min 26s


# Bureau Balance

#### Aggregation function

In [36]:
def bureau_balance_agg_func(g):
    mask6 = g["MONTHS_BALANCE"] >= -6
    mask12 = g["MONTHS_BALANCE"] >= -12
    closed = g["STATUS"] == "C"

    d = {"WORST_DQ_BUREAU_BALANCE_6M": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x)).where(mask6)), 
        "WORST_DQ_BUREAU_BALANCE_12M": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x)).where(mask12)), 
        "LEN_BUREAU_BALANCE": np.nansum(~closed), 
        "SUM_CLOSED_BUREAU_BALANCE": np.nansum(closed), 
        "SUM_CURRENT_BUREAU_BALANCE": np.nansum(g["STATUS"] == "0"), 
        "SUM_DQ_BUREAU_BALANCE": np.nansum(g["STATUS"].isin(["1", "2", "3", "3", "4", "5"])),
        "WORST_DQ_BUREAU_BALANCE": np.nanmax(g["STATUS"].apply(lambda x: 0 if x == "C" else int(x))), 
        "AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nansum(abs(g["MONTHS_BALANCE"]).where(~closed)) / np.nansum(~closed), 
        "MIN_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["MONTHS_BALANCE"].where(~closed)), 
        "MAX_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["MONTHS_BALANCE"].where(~closed))}

    return pd.Series(d)

time: 25.5 ms


#### Process data and write to file

In [None]:
bureau_balance = pd.read_csv(path + "bureau_balance.csv")

bureau_balance["STATUS"] = bureau_balance["STATUS"].where(lambda x: x != "X").fillna("0")

bureau_balance_agg = bureau_balance.groupby("SK_ID_BUREAU").apply(bureau_balance_agg_func).reset_index()
bureau_balance_agg.to_csv(path + "bureau_balance_agg.csv", index=False, header=True)
del bureau_balance, bureau_balance_agg
gc.collect()

# Bureau

This is dependent on `bureau_balance_agg`

#### Aggregation function

In [89]:
def bureau_agg_func(g):
    mask3 = g["DAYS_CREDIT_UPDATE"] >= -90
    mask6 = g["DAYS_CREDIT_UPDATE"] >= -180
    mask12 = g["DAYS_CREDIT_UPDATE"] >= -360
    mask24 = g["DAYS_CREDIT_UPDATE"] >= -720
    active = g["CREDIT_ACTIVE"] == "Active"
    cc = g["CREDIT_TYPE"] == "Credit card"
    
    d = {"SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M": np.nansum((g["AMT_CREDIT_SUM_DEBT"] / g["DAYS_CREDIT_ENDDATE"]).where(active & mask12)),
        "SUM_CC_DEBT_6M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(cc & mask6)), 
        "SUM_CC_DEBT_12M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(cc & mask12)), 
        "MAX_WORST_DQ_BUREAU_BALANCE_6M": np.nanmax(g["WORST_DQ_BUREAU_BALANCE_6M"].where(mask6)), 
        "MAX_WORST_DQ_BUREAU_BALANCE_12M": np.nanmax(g["WORST_DQ_BUREAU_BALANCE_12M"].where(mask12)), 
        "MAX_BUREAU_UTILIZATION_6M": np.nanmax((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM"]).where(mask6)), 
        "MAX_BUREAU_UTILIZATION_12M": np.nanmax((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM"]).where(mask12)), 
        "COUNT_ACTIVE_6M": np.nansum((g["CREDIT_ACTIVE"] == "Active").where(active & mask6)), 
        "COUNT_ACTIVE_12M": np.nansum((g["CREDIT_ACTIVE"] == "Active").where(active & mask12)), 
        "COUNT_ACTIVE_24M": np.nansum((g["CREDIT_ACTIVE"] == "Active").where(active & mask24)), 
        "DAYS_REMAINING_ACTIVE": np.nansum(g["DAYS_CREDIT_ENDDATE"].where(active)), 
        "MAX_CREDIT_DAY_OVERDUE_6M": np.nanmax(g["CREDIT_DAY_OVERDUE"].where(mask6)), 
        "MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M": np.nanmax(g["CREDIT_DAY_OVERDUE"].where(mask6)) - np.nanmax(g["CREDIT_DAY_OVERDUE"].where(mask6 ^ mask12)), 
        "BUREAU_UTILIZATION_DIFF_6M_12M": np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & mask6)) - np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & (mask6 ^ mask12))), 
        "BUREAU_UTILIZATION_DIFF_12M_24M": np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & mask12)) - np.nanmean((g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]).where(active & (mask6 ^ mask24))), 
        "BUREAU_SUM_DEBT_DIFF_6M_12M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & mask6)) - np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & (mask6 ^ mask12))),
        "BUREAU_SUM_DEBT_DIFF_12M_24M": np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & mask6)) - np.nansum(g["AMT_CREDIT_SUM_DEBT"].where(active & (mask6 ^ mask12))),         
        "MAX_CNT_CREDIT_PROLONG": np.nanmax(g["CNT_CREDIT_PROLONG"]), 
        "AVG_LEN_BUREAU_BALANCE": np.nanmean(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CURRENT": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CLOSED": np.nansum(g["SUM_CLOSED_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]), 
        "PROP_CURRENT_WEIGHTED": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]) / np.nansum(g["LEN_BUREAU_BALANCE"]) / np.nansum(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmax(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]) - np.nanmin(g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "SUM_SUM_CURRENT_BUREAU_BALANCE": np.nansum(g["SUM_CURRENT_BUREAU_BALANCE"]), 
        "AVG_PROP_CURRENT": np.nanmean(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ": np.nanmean(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ": np.nanmax(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"]),
        "AVG_PROP_CURRENT_WEIGHTED": np.nanmean(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT_WEIGHTED": np.nanmin(g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ_WEIGHTED": np.nanmean(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ_WEIGHTED": np.nanmax(g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_CURRENT_WEIGHTED_AMT": np.nanmean(g["AMT_CREDIT_SUM"] * g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_PROP_CURRENT_WEIGHTED_AMT": np.nanmin(g["AMT_CREDIT_SUM"] * g["SUM_CURRENT_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_PROP_DQ_WEIGHTED_AMT": np.nanmean(g["AMT_CREDIT_SUM"] * g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MAX_PROP_DQ_WEIGHTED_AMT": np.nanmax(g["AMT_CREDIT_SUM"] * g["SUM_DQ_BUREAU_BALANCE"] / g["LEN_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]),          
        "AVG_WORST_DQ_BUREAU_BALANCE": np.nanmean(g["WORST_DQ_BUREAU_BALANCE"]), 
        "MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED": np.nanmax(g["WORST_DQ_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED": np.nanmean(g["WORST_DQ_BUREAU_BALANCE"] / g["AVG_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "TOTAL_AMT_CREDIT_SUM_POS_DAYS": np.nansum(g["AMT_CREDIT_SUM"].where(g["DAYS_CREDIT_ENDDATE"] > 0)),
        "SUM_DAYS_CREDIT_ENDDATE_POS_DAYS": np.nansum(g["DAYS_CREDIT_ENDDATE"].where(g["DAYS_CREDIT_ENDDATE"] > 0)), 
        "MAX_LEN_BUREAU_BALANCE": np.nanmax(g["LEN_BUREAU_BALANCE"]), 
        "SUM_LEN_BUREAU_BALANCE": np.nanmax(g["LEN_BUREAU_BALANCE"]), 
        "MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE": np.nanmin(g["MIN_MONTHS_BALANCE_BUREAU_BALANCE"]), 
        "MIN_DAYS_CREDIT_ENDDATE": np.nanmin(g["DAYS_CREDIT_ENDDATE"]), 
        "MAX_DAYS_CREDIT_ENDDATE": np.nanmax(g["DAYS_CREDIT_ENDDATE"]), 
        "SUM_DAYS_CREDIT_ENDDATE": np.nansum(g["DAYS_CREDIT_ENDDATE"]), 
        "SUM_NULL_DAYS_ENDDATE_FACT": np.nansum(g["DAYS_ENDDATE_FACT"].isnull()), 
        "COUNT_BUREAU_RECORDS": len(g), 
        "COUNT_ACTIVE": np.nansum(active), 
        "MAX_CREDIT_DAY_OVERDUE_WEIGHTED": np.nanmax(g["CREDIT_DAY_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_CREDIT_DAY_OVERDUE_WEIGHTED": np.nansum(g["CREDIT_DAY_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "MAX_CREDIT_DAY_OVERDUE": np.nanmax(g["CREDIT_DAY_OVERDUE"]), 
        "SUM_CREDIT_DAY_OVERDUE": np.nansum(g["CREDIT_DAY_OVERDUE"]), 
        "DAYS_SINCE_APPLIED": - np.nanmax(g["DAYS_CREDIT"]), 
        "SUM_INVERSE_DAYS_CREDIT": - np.nansum(1 / g["DAYS_CREDIT"]), 
        "MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED": np.nanmax(g["AMT_CREDIT_MAX_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED": np.nansum(g["AMT_CREDIT_MAX_OVERDUE"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "MAX_AMT_CREDIT_MAX_OVERDUE": np.nanmax(g["AMT_CREDIT_MAX_OVERDUE"]), 
        "SUM_AMT_CREDIT_MAX_OVERDUE": np.nansum(g["AMT_CREDIT_MAX_OVERDUE"]), 
        "SUM_CNT_CREDIT_PROLONG": np.nansum(g["CNT_CREDIT_PROLONG"]), 
        "SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED": np.nansum(g["AMT_CREDIT_SUM_DEBT"] / abs(g["DAYS_CREDIT_UPDATE"])), 
        "SUM_AMT_CREDIT_SUM_DEBT": np.nansum(g["AMT_CREDIT_SUM_DEBT"]),
        "BUREAU_UTILIZATION_AVG": np.nanmean(g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]), 
        "BUREAU_UTILIZATION_MAX": np.nanmax(g["AMT_CREDIT_SUM_DEBT"] / g["AMT_CREDIT_SUM_LIMIT"]), 
        "BUREAU_PROP_SUM_OVERDUE_AVG": np.nanmean(g["AMT_CREDIT_SUM_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]), 
        "BUREAU_PROP_MAX_OVERDUE_AVG": np.nanmean(g["AMT_CREDIT_MAX_OVERDUE"] / g["AMT_CREDIT_SUM_DEBT"]), 
        "MAX_DAYS_CREDIT_UPDATE": np.nanmax(g["DAYS_CREDIT_UPDATE"]), 
        "RANGE_DAYS_CREDIT_UPDATE": np.nanmax(g["DAYS_CREDIT_UPDATE"]) - np.nanmin(g["DAYS_CREDIT_UPDATE"]), 
        "DAYS_CREDIT_RANGE": np.nanmax(g["DAYS_CREDIT"]) - np.nanmin(g["DAYS_CREDIT"]), 
        "TOTAL_AMT_CREDIT_SUM_WEIGHTED": np.nansum(g["AMT_CREDIT_SUM"] / abs(g["DAYS_CREDIT_UPDATE"])),
        "TOTAL_AMT_CREDIT_SUM": np.nansum(g["AMT_CREDIT_SUM"]),
        "COUNT_CREDIT_CARD": np.nansum(g["CREDIT_TYPE"] == "Credit card"), 
        "COUNT_CAR_LOAN": np.nansum(g["CREDIT_TYPE"] == "Car loan"), 
        "COUNT_MORTGAGE": np.nansum(g["CREDIT_TYPE"] == "Mortgage"), 
        "SUM_AMT_ANNUITY": np.nansum(g["AMT_ANNUITY"])}
    
    return pd.Series(d)

time: 433 ms


#### Process data and write to file

In [90]:
bureau = pd.read_csv(path + "bureau.csv")
bureau_balance_agg = pd.read_csv(path + "bureau_balance_agg.csv")

bureau_joined = pd.merge(bureau, 
                         bureau_balance_agg, 
                         how="left", 
                         on="SK_ID_BUREAU")

bureau_agg = bureau_joined.groupby("SK_ID_CURR").apply(bureau_agg_func).reset_index()
bureau_agg.to_csv(path + "bureau_agg.csv", index=False, header=True)

time: 4h 9min 27s


# Credit card

#### Aggregation function

In [34]:
def credit_card_agg_func(g):
    mask3 = g["MONTHS_BALANCE"] >= -3
    mask6 = g["MONTHS_BALANCE"] >= -6
    mask12 = g["MONTHS_BALANCE"] >= -12
    active = g["NAME_CONTRACT_STATUS"] == "Active"
    overdue = g["SK_DPD"] > 0
    
    d = {
        "MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M": np.nanmax(g["AMT_INST_MIN_REGULARITY"].where(overdue & mask6)), 
        "MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M": np.nanmin(g["AMT_INST_MIN_REGULARITY"].where(overdue & mask12)), 
        
        "SUM_CNT_DRAWINGS_ATM_CURRENT_6M": np.nansum(g["CNT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "SUM_AMT_DRAWINGS_ATM_CURRENT_6M": np.nansum(g["AMT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "MAX_AMT_DRAWINGS_ATM_CURRENT_6M": np.nanmax(g["AMT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "MAX_CNT_DRAWINGS_ATM_CURRENT_6M": np.nanmax(g["CNT_DRAWINGS_ATM_CURRENT"].where(mask6)), 
        "MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M": np.nanmax((g["AMT_RECIVABLE"] / g["AMT_RECEIVABLE_PRINCIPAL"]).where(mask6)), 
        "MAX_UTILIZATION_6M": np.nanmax((g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]).where(mask6)),
        "MAX_CREDIT_CARD_SK_DPD_6M": np.nanmax(g["SK_DPD"].where(mask6)), 
        "MAX_CREDIT_CARD_SK_DPD_12M": np.nanmax(g["SK_DPD"].where(mask12)),
        "MAX_AMT_DRAWINGS_CURRENT_6M": np.nanmax(g["AMT_DRAWINGS_CURRENT"].where(mask6)), 
        "MAX_AMT_DRAWINGS_CURRENT_12M": np.nanmax(g["AMT_DRAWINGS_CURRENT"].where(mask12)), 
        "MAX_AMT_INST_MIN_REGULARITY_6M": np.nanmax(g["AMT_INST_MIN_REGULARITY"].where(mask6)), 
        "MAX_AMT_INST_MIN_REGULARITY_12M": np.nanmax(g["AMT_INST_MIN_REGULARITY"].where(mask12)), 
        "MAX_CNT_DRAWINGS_POS_CURRENT_6M": np.nanmax(g["CNT_DRAWINGS_POS_CURRENT"].where(mask6)), 
        "MAX_CNT_DRAWINGS_POS_CURRENT_12M": np.nanmax(g["CNT_DRAWINGS_POS_CURRENT"].where(mask12)), 
        "SUM_CC_PAYMENT_DIFF_12M": np.nansum((g["AMT_PAYMENT_TOTAL_CURRENT"] - g["AMT_INST_MIN_REGULARITY"]).where(mask12)),
        "DIFF_AVG_BALANCE_6M_12M": np.nanmean(g["AMT_BALANCE"].where(mask6)) - np.nanmean(g["AMT_BALANCE"].where(mask6 ^ mask12)),
        "AVG_BALANCE_6M": np.nanmean(g["AMT_BALANCE"].where(mask6)),
        "AVG_UTILIZATION_6M": np.nanmean((g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]).where(mask6)),
        "AVG_BALANCE": np.nanmean(g["AMT_BALANCE"]), 
        "MAX_BALANCE": np.nanmax(g["AMT_BALANCE"]), 
        "SUM_BALANCE": np.nansum(g["AMT_BALANCE"]), 
        "MAX_MONTHS_BALANCE": np.nanmax(abs(g["MONTHS_BALANCE"])), 
        "MIN_MONTHS_BALANCE": np.nanmin(abs(g["MONTHS_BALANCE"])), 
        "RANGE_MONTHS_BALANCE": np.nanmax(g["MONTHS_BALANCE"]) - np.nanmin(g["MONTHS_BALANCE"]), 
        "AVG_UTILIZATION": np.nanmean(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]), 
        "MAX_UTILIZATION": np.nanmax(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"]), 
        "AVG_BALANCE_WEIGHTED": np.nanmean(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_BALANCE_WEIGHTED": np.nanmax(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "SUM_BALANCE_WEIGHTED": np.nansum(g["AMT_BALANCE"] / abs(g["MONTHS_BALANCE"])), 
        "AVG_UTILIZATION_WEIGHTED": np.nanmean(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_UTILIZATION_WEIGHTED": np.nanmax(g["AMT_BALANCE"] / g["AMT_CREDIT_LIMIT_ACTUAL"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_DPD_WEIGHTED": np.nanmax(g["SK_DPD"] / abs(g["MONTHS_BALANCE"])), 
        "MAX_DPD_DEF_WEIGHTED": np.nanmax(g["SK_DPD_DEF"] / abs(g["MONTHS_BALANCE"])), 
        "SUM_CNT_DRAWINGS_CURRENT": np.nansum(g["CNT_DRAWINGS_CURRENT"]), 
        "AVG_CNT_DRAWINGS_CURRENT": np.nanmean(g["CNT_DRAWINGS_CURRENT"]), 
        "MAX_CNT_DRAWINGS_CURRENT": np.nanmax(g["CNT_DRAWINGS_CURRENT"]), 
        "SUM_AMT_DRAWINGS_CURRENT": np.nansum(g["AMT_DRAWINGS_CURRENT"]), 
        "AVG_AMT_DRAWINGS_CURRENT": np.nanmean(g["AMT_DRAWINGS_CURRENT"]), 
        "MAX_AMT_DRAWINGS_CURRENT": np.nanmax(g["AMT_DRAWINGS_CURRENT"]), 
        "MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmin(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
        "AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmean(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"]), 
        "MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY": np.nanmax(g["AMT_PAYMENT_CURRENT"] / g["AMT_INST_MIN_REGULARITY"])}
    
    return pd.Series(d)

time: 49 ms


#### Process data and write to file

In [35]:
credit_card = pd.read_csv(path + "credit_card_balance.csv")

credit_card_agg = credit_card.groupby("SK_ID_CURR").apply(credit_card_agg_func).reset_index()
credit_card_agg.to_csv(path + "credit_card_agg.csv", index=False, header=True)

time: 34min 6s


# Installments

#### Aggregation function

In [10]:
def installment_agg_func(g):
    mask6 = g["DAYS_ENTRY_PAYMENT"] >= -180
    mask12 = g["DAYS_ENTRY_PAYMENT"] >= -360
    
    d = {"MAX_UNDERPAYMENT_6M": np.nanmax((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask6)), 
        "MAX_UNDERPAYMENT_12M": np.nanmax((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask12)), 
        "SUM_PAYMENT_6M": np.nansum(g["AMT_PAYMENT"].where(mask6)), 
        "SUM_PAYMENT_DIFF_6M_12M": np.nansum(g["AMT_PAYMENT"].where(mask6)) - np.nansum(g["AMT_PAYMENT"].where(mask6 ^ mask12)), 
        "MAX_AMT_INSTALMENT_6M": np.nanmax(g["AMT_INSTALMENT"].where(mask6)), 
        "MIN_AMT_INSTALMENT_6M": np.nanmin(g["AMT_INSTALMENT"].where(mask6)), 
        "MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M": np.nanmax((g["DAYS_ENTRY_PAYMENT"] - g["DAYS_INSTALMENT"])), 
        "MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M": np.nanmin((g["DAYS_ENTRY_PAYMENT"] - g["DAYS_INSTALMENT"])), 
        "SUM_UNDERPAYMENT_12M": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask12)), 
        "SUM_UNDERPAYMENT_6M": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]).where(mask6)), 
        "MAX_PAYMENT_SIZE_6M": np.nanmax(g["AMT_PAYMENT"].where(mask6)), 
        "MAX_PAYMENT_SIZE_12M": np.nanmax(g["AMT_PAYMENT"].where(mask12)), 
        "MIN_PAYMENT_SIZE_6M": np.nanmin(g["AMT_PAYMENT"].where(mask6)),
        "MAX_ABS_DAYS_INSTALMENT": np.nanmax(abs(g["DAYS_INSTALMENT"])), 
        "COUNT_UNDERPAYMENT": np.nansum(g["AMT_PAYMENT"] / g["AMT_INSTALMENT"] < 0.5), 
        "SUM_UNDERPAYMENT": np.nansum(g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]), 
        "SUM_UNDERPAYMENT_WEIGHTED": np.nansum((g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]) / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MAX_UNDERPAYMENT": np.nanmax(g["AMT_INSTALMENT"] - g["AMT_PAYMENT"]), 
        "AVG_PAYMENT_SIZE_WEIGHTED": np.nanmean(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "AVG_PAYMENT_SIZE": np.nanmean(g["AMT_PAYMENT"]), 
        "MAX_PAYMENT_SIZE_WEIGHTED": np.nanmax(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MAX_PAYMENT_SIZE": np.nanmax(g["AMT_PAYMENT"]), 
        "MIN_PAYMENT_SIZE_WEIGHTED": np.nanmin(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "MIN_PAYMENT_SIZE": np.nanmin(g["AMT_PAYMENT"]),
        "SUM_PAYMENT_WEIGHTED": np.nansum(g["AMT_PAYMENT"] / abs(g["DAYS_ENTRY_PAYMENT"])), 
        "SUM_PAYMENT": np.nansum(g["AMT_PAYMENT"]),
        "SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT": np.nansum(g["DAYS_ENTRY_PAYMENT"] > g["DAYS_INSTALMENT"]), 
        "MAX_DAYS_ENTRY_PAYMENT": np.nanmax(g["DAYS_ENTRY_PAYMENT"]), 
        "MIN_DAYS_ENTRY_PAYMENT": np.nanmin(g["DAYS_ENTRY_PAYMENT"]), 
        "RANGE_DAYS_ENTRY_PAYMENT": np.nanmax(g["DAYS_ENTRY_PAYMENT"]) - np.nanmin(g["DAYS_ENTRY_PAYMENT"])}
    
    return pd.Series(d)

#### Process data and write to file

In [12]:
installments = pd.read_csv(path + "installments_payments.csv")

installment_agg = installments.groupby("SK_ID_CURR").apply(installment_agg_func).reset_index()
installment_agg.to_csv(path + "installment_agg.csv", index=False, header=True)

# Point of Sale

#### Aggregation function

In [13]:
def pos_cash_agg_func(g):
    d = {"MAX_POS_DPD": np.nanmax(g["SK_DPD"]), 
        "MAX_POS_DPD_DEF": np.nanmax(g["SK_DPD_DEF"]), 
        "NUM_POS_CASH": g["SK_ID_PREV"].nunique()}
    
    return pd.Series(d)

#### Process data and write to file

In [14]:
pos_cash = pd.read_csv(path + "POS_CASH_balance.csv")

pos_cash_agg = pos_cash.groupby("SK_ID_CURR").apply(pos_cash_agg_func).reset_index()
pos_cash_agg.to_csv(path + "pos_cash_agg.csv", index=False, header=True)

# Join all files

In [101]:
train_or_test = "test"

time: 1.18 ms


In [102]:
application = pd.read_csv(path + "application_" + train_or_test + ".csv")
previous_agg = pd.read_csv(path + "previous_agg.csv")
# bureau_balance_agg should already be joined with bureau_agg
bureau_agg = pd.read_csv(path + "bureau_agg.csv")
credit_card_agg = pd.read_csv(path + "credit_card_agg.csv")
installment_agg = pd.read_csv(path + "installment_agg.csv")
pos_cash_agg = pd.read_csv(path + "pos_cash_agg.csv")

df = pd.merge(application, previous_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, bureau_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, credit_card_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, installment_agg, how="left", on="SK_ID_CURR")
df = pd.merge(df, pos_cash_agg, how="left", on="SK_ID_CURR")
del previous_agg, bureau_agg, credit_card_agg, installment_agg, pos_cash_agg
gc.collect()

165

time: 1min


#### Construct additional features

In [103]:
df["TOTAL_AMT_CREDIT_SUM_DIV_SUM_DAYS_CREDIT_ENDDATE"] = df["TOTAL_AMT_CREDIT_SUM"] / df["SUM_DAYS_CREDIT_ENDDATE"]
df["TOTAL_AMT_CREDIT_SUM_POS_DAYS_DIV_SUM_DAYS_CREDIT_ENDDATE_POS_DAYS"] = df["TOTAL_AMT_CREDIT_SUM_POS_DAYS"] / df["SUM_DAYS_CREDIT_ENDDATE_POS_DAYS"]
df["MAX_ABS_DAYS_INSTALMENT_DIV_DAYS_BIRTH"] = df["MAX_ABS_DAYS_INSTALMENT"] / df["DAYS_BIRTH"]
df["FLAG_OWN_CAR"] = (df["FLAG_OWN_CAR"] == "Y").astype(int)
df["FLAG_OWN_REALTY"] = (df["FLAG_OWN_REALTY"] == "Y").astype(int)
df["AMT_CREDIT_DIV_AMT_INCOME_TOTAL"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_PLUS_AMT_INCOME_TOTAL"] = df["AMT_CREDIT"] + df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_DIV_AMT_GOODS_PRICE"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"]
df["AMT_CREDIT_DIV_SUM_PAYMENT"] = df["AMT_CREDIT"] / df["SUM_PAYMENT"]
df["AMT_GOODS_PRICE_DIV_AMT_INCOME_TOTAL"] = df["AMT_GOODS_PRICE"] / df["AMT_INCOME_TOTAL"]
df["AMT_CREDIT_DIV_AMT_ANNUITY"] = df["AMT_CREDIT"] / df["AMT_ANNUITY"]
df["AMT_CREDIT_DIV_AVG_PREV_REQ_AMOUNT"] = df["AMT_CREDIT"] / df["AVG_PREV_REQ_AMOUNT"]
df["AMT_CREDIT_DIV_MAX_PREV_REQ_AMOUNT"] = df["AMT_CREDIT"] / df["MAX_PREV_REQ_AMOUNT"]
df["EXT_SOURCE_PROD"] = df["EXT_SOURCE_1"] * df["EXT_SOURCE_2"] * df["EXT_SOURCE_3"]
df["DAYS_EMPLOYED_DIV_DAYS_BIRTH"] = df["DAYS_EMPLOYED"] / df["DAYS_BIRTH"]
df["DAYS_EMPLOYED_PLUS_DAYS_REGISTRATION_PLUS_DAYS_LAST_PHONE_CHANGE"] = df["DAYS_EMPLOYED"] + df["DAYS_REGISTRATION"] + df["DAYS_LAST_PHONE_CHANGE"]
df["AVG_PAYMENT_SIZE_DIV_AMT_INCOME_TOTAL"] = df["AVG_PAYMENT_SIZE"] / df["AMT_INCOME_TOTAL"]
df["AVG_PAYMENT_SIZE_DIV_AMT_CREDIT"] = df["AVG_PAYMENT_SIZE"] / df["AMT_CREDIT"]
df["AVG_PAYMENT_SIZE_DIV_AMT_ANNUITY"] = df["AVG_PAYMENT_SIZE"] / df["AMT_ANNUITY"]
df["DAYS_REGISTRATION_PLUS_DAYS_ID_PUBLISH"] = df["DAYS_REGISTRATION"] + df["DAYS_ID_PUBLISH"]
df["SUM_REFUSED_CONTRACT_DIV_SUM_APPR_CONTRACT"] = df["SUM_REFUSED_CONTRACT"] / df["SUM_APPR_CONTRACT"]
df["MAX_UTILIZATION_DIV_AVG_UTILIZATION"] = df["MAX_UTILIZATION"] / df["AVG_UTILIZATION"]
df["MAX_PREV_REQ_AMOUNT_DIV_AMT_CREDIT"] = df["MAX_PREV_REQ_AMOUNT"] / df["AMT_CREDIT"]
df["AMT_INCOME_TOTAL_DIV_DAYS_BIRTH"] = df["AMT_INCOME_TOTAL"] / df["DAYS_BIRTH"]
df["SUM_DAYS_ID_REG_PHONE"] = df["DAYS_ID_PUBLISH"] + df["DAYS_REGISTRATION"] + df["DAYS_LAST_PHONE_CHANGE"]

df["SUM_REQ_CREDIT_YEAR"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"] + df["AMT_REQ_CREDIT_BUREAU_QRT"] + df["AMT_REQ_CREDIT_BUREAU_YEAR"]
df["SUM_REQ_CREDIT_QRT"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"] + df["AMT_REQ_CREDIT_BUREAU_QRT"]
df["SUM_REQ_CREDIT_1M"] = df["AMT_REQ_CREDIT_BUREAU_HOUR"] + df["AMT_REQ_CREDIT_BUREAU_DAY"] + df["AMT_REQ_CREDIT_BUREAU_WEEK"] + df["AMT_REQ_CREDIT_BUREAU_MON"]

df["SUM_REQ_CREDIT_1M_DIV_SUM_REQ_CREDIT_QRT"] = df["SUM_REQ_CREDIT_1M"] / df["SUM_REQ_CREDIT_QRT"]
df["SUM_REQ_CREDIT_QRT_DIV_SUM_REQ_CREDIT_YEAR"] = df["SUM_REQ_CREDIT_QRT"] / df["SUM_REQ_CREDIT_YEAR"]

df["DEF_30_PLUS_60_CNT_SOCIAL_CIRCLE"] = df["DEF_30_CNT_SOCIAL_CIRCLE"] + df["DEF_60_CNT_SOCIAL_CIRCLE"]
df["OWN_CAR_AGE_DIV_DAYS_BIRTH"] = df["OWN_CAR_AGE"] / df["DAYS_BIRTH"]
df["LANDAREA_DIV_TOTALAREA_MODE"] = df["LANDAREA_MODE"] / df["TOTALAREA_MODE"]
df["OWN_CAR_AGE_PLUS_DAYS_BIRTH"] = df["OWN_CAR_AGE"] + df["DAYS_BIRTH"]
df["AMT_ANNUITY_DIV_DAYS_BIRTH"] = df["AMT_ANNUITY"] / df["DAYS_BIRTH"]
df["AMT_ANNUITY_DIV_DAYS_EMPLOYED"] = df["AMT_ANNUITY"] / df["DAYS_EMPLOYED"]
df["AMT_ANNUITY_PROD_DAYS_EMPLOYED"] = df["AMT_ANNUITY"] * df["DAYS_EMPLOYED"]
df["DAYS_REGISTRATION_DIV_DAYS_ID_PUBLISH"] = df["DAYS_REGISTRATION"] / df["DAYS_ID_PUBLISH"]
df["DAYS_REGISTRATION_DIV_DAYS_LAST_PHONE_CHANGE"] = df["DAYS_REGISTRATION"] / df["DAYS_LAST_PHONE_CHANGE"]
df["REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE"] = df["REGION_RATING_CLIENT_W_CITY"] / df["REGION_POPULATION_RELATIVE"]
df["REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE"] = df["REGION_RATING_CLIENT_W_CITY"] * df["REGION_POPULATION_RELATIVE"]
df["SUM_REG_NOT_FLAG"] = df["REG_REGION_NOT_LIVE_REGION"] + df["REG_REGION_NOT_WORK_REGION"] + df["LIVE_REGION_NOT_WORK_REGION"] + df["REG_CITY_NOT_LIVE_CITY"] + df["REG_CITY_NOT_WORK_CITY"] + df["LIVE_CITY_NOT_WORK_CITY"]
df["SUM_AVG_BUILD"] = df["APARTMENTS_AVG"] + df["BASEMENTAREA_AVG"] + df["YEARS_BEGINEXPLUATATION_AVG"] + df["YEARS_BUILD_AVG"] + df["COMMONAREA_AVG"] + df["ELEVATORS_AVG"] + df["ENTRANCES_AVG"] + df["FLOORSMAX_AVG"] + df["FLOORSMIN_AVG"] + df["LANDAREA_AVG"] + df["LIVINGAPARTMENTS_AVG"] + df["LIVINGAREA_AVG"] + df["NONLIVINGAPARTMENTS_AVG"] + df["NONLIVINGAREA_AVG"]
df["SUM_MODE_BUILD"] = df["APARTMENTS_MODE"] + df["BASEMENTAREA_MODE"] + df["YEARS_BEGINEXPLUATATION_MODE"] + df["YEARS_BUILD_MODE"] + df["COMMONAREA_MODE"] + df["ELEVATORS_MODE"] + df["ENTRANCES_MODE"] + df["FLOORSMAX_MODE"] + df["FLOORSMIN_MODE"] + df["LANDAREA_MODE"] + df["LIVINGAPARTMENTS_MODE"] + df["LIVINGAREA_MODE"] + df["NONLIVINGAPARTMENTS_MODE"] + df["NONLIVINGAREA_MODE"]
df["SUM_MEDI_BUILD"] = df["APARTMENTS_MEDI"] + df["BASEMENTAREA_MEDI"] + df["YEARS_BEGINEXPLUATATION_MEDI"] + df["YEARS_BUILD_MEDI"] + df["COMMONAREA_MEDI"] + df["ELEVATORS_MEDI"] + df["ENTRANCES_MEDI"] + df["FLOORSMAX_MEDI"] + df["FLOORSMIN_MEDI"] + df["LANDAREA_MEDI"] + df["LIVINGAPARTMENTS_MEDI"] + df["LIVINGAREA_MEDI"] + df["NONLIVINGAPARTMENTS_MEDI"] + df["NONLIVINGAREA_MEDI"]
df["SUM_DOC_FLAG"] = df["FLAG_DOCUMENT_2"] + df["FLAG_DOCUMENT_3"] + df["FLAG_DOCUMENT_4"] + df["FLAG_DOCUMENT_5"] + df["FLAG_DOCUMENT_6"] + df["FLAG_DOCUMENT_7"] + df["FLAG_DOCUMENT_8"] + df["FLAG_DOCUMENT_9"] + df["FLAG_DOCUMENT_10"] + df["FLAG_DOCUMENT_11"] + df["FLAG_DOCUMENT_12"] + df["FLAG_DOCUMENT_13"] + df["FLAG_DOCUMENT_14"] + df["FLAG_DOCUMENT_15"] + df["FLAG_DOCUMENT_16"] + df["FLAG_DOCUMENT_17"] + df["FLAG_DOCUMENT_18"] + df["FLAG_DOCUMENT_19"] + df["FLAG_DOCUMENT_20"] + df["FLAG_DOCUMENT_21"]
df["CNT_CHILDREN_DIV_DAYS_BIRTH"] = df["CNT_CHILDREN"] / df["DAYS_BIRTH"]
df["CNT_CHILDREN_DIV_REGION_POPULATION_RELATIVE"] = df["CNT_CHILDREN"] / df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_REALTY_PROD_REGION_POPULATION_RELATIVE"] = df["FLAG_OWN_REALTY"] * df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_REALTY_DIV_REGION_POPULATION_RELATIVE"] = df["FLAG_OWN_REALTY"] / df["REGION_POPULATION_RELATIVE"]
df["FLAG_OWN_CAR_DIV_OWN_CAR_AGE"] = df["FLAG_OWN_CAR"] / df["OWN_CAR_AGE"]
df["EXT_SOURCE_1_DIV_DAYS_BIRTH"] = df["EXT_SOURCE_1"] / df["DAYS_BIRTH"]
df["EXT_SOURCE_1_PROD_DAYS_BIRTH"] = df["EXT_SOURCE_1"] * df["DAYS_BIRTH"]

time: 223 ms


#### Remove infinite values

In [104]:
df.replace([-np.inf, np.inf], np.nan, inplace=True)

time: 1.7 s


#### Remove income outliers

In [105]:
df.loc[df["AMT_INCOME_TOTAL"] > 500000, "AMT_INCOME_TOTAL"] = np.nan

time: 25 ms


#### Handle special values for DAYS_EMPLOYED

In [106]:
df.loc[df["DAYS_EMPLOYED"] > 0, "DAYS_EMPLOYED"] = np.nan

time: 30.1 ms


#### Encode categorical features

Order `ORGANIZATION_TYPE` categories and map to integers (`org_type_map` was obtained by inspecting the default rates by group within the training data)

In [107]:
org_type_map = {"Trade: type 4": 0, "Industry: type 12": 1, "Transport: type 1": 2, "Trade: type 6": 3,
    "Security Ministries": 4, "University": 5, "Police": 6, "Military": 7,
    "Bank": 8, "XNA": 9, "Culture": 10, "Insurance": 11,
    "Religion": 12, "School": 13, "Trade: type 5": 14, "Hotel": 15,
    "Industry: type 10": 16, "Medicine": 17, "Services": 18, "Electricity": 19,
    "Industry: type 9": 20, "Industry: type 5": 21, "Government": 22, "Trade: type 2": 23,
    "Kindergarten": 24, "Emergency": 25, "Industry: type 6": 26, "Industry: type 2": 27,
    "Telecom": 28, "Other": 29, "Transport: type 2": 30, "Legal Services": 31,
    "Housing": 32, "Industry: type 7": 33, "Business Entity Type 1": 34, "Advertising": 35,
    "Postal": 36, "Business Entity Type 2": 37, "Industry: type 11": 38, "Trade: type 1": 39,
    "Mobile": 40, "Transport: type 4": 41, "Business Entity Type 3": 42, "Trade: type 7": 43,
    "Security": 44, "Industry: type 4": 45, "Self-employed": 46, "Trade: type 3": 47,
    "Agriculture": 48, "Realtor": 49, "Industry: type 3": 50, "Industry: type 1": 51,
    "Cleaning": 52, "Construction": 53, "Restaurant": 54, "Industry: type 8": 55,
    "Industry: type 13": 56, "Transport: type 3": 57}

time: 28.7 ms


In [108]:
df["ORGANIZATION_TYPE"] = df["ORGANIZATION_TYPE"].map(org_type_map)

time: 21.7 ms


Dummy code remaining categorical features

In [109]:
df = pd.get_dummies(df, dummy_na=True)
df.columns = df.columns.str.replace("\s+", "_")

time: 1.29 s


In [110]:
df.head()

Unnamed: 0,SK_ID_CURR,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,MIN_PREV_AMT_ANNUITY_12M,MIN_PREV_AMT_ANNUITY_24M,MIN_PREV_PROP_APPROVED_12M,AVG_SYNTH_TARGET_12M,AVG_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_24M,MAX_PREV_PROP_APPROVED_12M,MAX_PREV_PROP_APPROVED_24M,COUNT_PREV_APP,MIN_PREV_DAYS_TERMINATION,MAX_PREV_DAYS_TERMINATION,AVG_PREV_DAYS_TERMINATION,RANGE_PREV_DAYS_TERMINATION,MIN_PREV_AMT_CREDIT,MAX_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT,MIN_PREV_AMT_CREDIT_WEIGHTED,MAX_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,MIN_PREV_AMT_CREDIT_DIV_ANNUITY,MAX_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MIN_PREV_AMT_ANNUITY,MAX_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY,MIN_PREV_AMT_ANNUITY_WEIGHTED,MAX_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_ANNUITY_WEIGHTED,MIN_DAYS_DECISION,MAX_DAYS_DECISION,RANGE_DAYS_DECISION,SUM_DAYS_LAST_DUE_NULL,AVG_DAYS_LAST_DUE_NULL,AVG_PREV_REQ_AMOUNT_WEIGHTED,MAX_PREV_REQ_AMOUNT_WEIGHTED,AVG_PREV_REQ_AMOUNT,MAX_PREV_REQ_AMOUNT,AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED,AVG_PREV_PROP_APPROVED_WEIGHTED,MAX_PREV_PROP_APPROVED_WEIGHTED,AVG_PREV_RATE_DOWNPAYMENT,AVG_PREV_PROP_APPROVED,MAX_PREV_PROP_APPROVED,MIN_PREV_PROP_APPROVED,AVG_PREV_INT_RATE,SUM_PREV_URGENT_NEEDS,SUM_PREV_REPAIRS,SUM_PREV_OTHER,SUM_PREV_LIMIT_REJECT,SUM_REFUSED_CONTRACT,SUM_CANC_CONTRACT,SUM_APPR_CONTRACT,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,COUNT_PREV_WALK_IN,COUNT_PREV_HIGH_YIELD,COUNT_PREV_LOW_YIELD,AVG_SYNTH_TARGET,SUM_SYNTH_TARGET_WEIGHTED,SUM_SYNTH_TARGET,MAX_SYNTH_TARGET,MIN_SYNTH_TARGET,RANGE_SYNTH_TARGET,SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE,SUM_DAYS_FIRST_DRAWING_SENTINEL,SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,SUM_DAYS_LAST_DUE_LT_FIRST_VERSION,MIN_RATE_INTEREST_PRIMARY_12M,AVG_RATE_INTEREST_PRIVILEGED_12M,SUM_REFUSED_CONTRACT_6M,SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M,SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M,SUM_NAME_GOODS_CATEGORY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_CSTR_6M,SUM_NAME_PAYMENT_TYPE_XNA_6M,COUNT_NAME_CLIENT_TYPE_REPEATER_12M,COUNT_NAME_CLIENT_TYPE_NEW_12M,AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M,SUM_CC_DEBT_6M,SUM_CC_DEBT_12M,MAX_WORST_DQ_BUREAU_BALANCE_6M,MAX_WORST_DQ_BUREAU_BALANCE_12M,MAX_BUREAU_UTILIZATION_6M,MAX_BUREAU_UTILIZATION_12M,COUNT_ACTIVE_6M,COUNT_ACTIVE_12M,COUNT_ACTIVE_24M,DAYS_REMAINING_ACTIVE,MAX_CREDIT_DAY_OVERDUE_6M,MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_12M_24M,BUREAU_SUM_DEBT_DIFF_6M_12M,BUREAU_SUM_DEBT_DIFF_12M_24M,MAX_CNT_CREDIT_PROLONG,AVG_LEN_BUREAU_BALANCE,PROP_CURRENT,PROP_CLOSED,PROP_CURRENT_WEIGHTED,MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE,MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE,SUM_SUM_CURRENT_BUREAU_BALANCE,AVG_PROP_CURRENT,AVG_PROP_DQ,MAX_PROP_DQ,AVG_PROP_CURRENT_WEIGHTED,MIN_PROP_CURRENT_WEIGHTED,AVG_PROP_DQ_WEIGHTED,MAX_PROP_DQ_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,MIN_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ_WEIGHTED_AMT,MAX_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,TOTAL_AMT_CREDIT_SUM_POS_DAYS,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_LEN_BUREAU_BALANCE,SUM_LEN_BUREAU_BALANCE,MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE,MIN_DAYS_CREDIT_ENDDATE,MAX_DAYS_CREDIT_ENDDATE,SUM_DAYS_CREDIT_ENDDATE,SUM_NULL_DAYS_ENDDATE_FACT,COUNT_BUREAU_RECORDS,COUNT_ACTIVE,MAX_CREDIT_DAY_OVERDUE_WEIGHTED,SUM_CREDIT_DAY_OVERDUE_WEIGHTED,MAX_CREDIT_DAY_OVERDUE,SUM_CREDIT_DAY_OVERDUE,DAYS_SINCE_APPLIED,SUM_INVERSE_DAYS_CREDIT,MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,MAX_AMT_CREDIT_MAX_OVERDUE,SUM_AMT_CREDIT_MAX_OVERDUE,SUM_CNT_CREDIT_PROLONG,SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED,SUM_AMT_CREDIT_SUM_DEBT,BUREAU_UTILIZATION_AVG,BUREAU_UTILIZATION_MAX,BUREAU_PROP_SUM_OVERDUE_AVG,BUREAU_PROP_MAX_OVERDUE_AVG,MAX_DAYS_CREDIT_UPDATE,RANGE_DAYS_CREDIT_UPDATE,DAYS_CREDIT_RANGE,TOTAL_AMT_CREDIT_SUM_WEIGHTED,TOTAL_AMT_CREDIT_SUM,COUNT_CREDIT_CARD,COUNT_CAR_LOAN,COUNT_MORTGAGE,SUM_AMT_ANNUITY,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,SUM_UNDERPAYMENT_12M,SUM_UNDERPAYMENT_6M,MAX_PAYMENT_SIZE_6M,MAX_PAYMENT_SIZE_12M,MIN_PAYMENT_SIZE_6M,MAX_ABS_DAYS_INSTALMENT,COUNT_UNDERPAYMENT,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_WEIGHTED,MAX_UNDERPAYMENT,AVG_PAYMENT_SIZE_WEIGHTED,AVG_PAYMENT_SIZE,MAX_PAYMENT_SIZE_WEIGHTED,MAX_PAYMENT_SIZE,MIN_PAYMENT_SIZE_WEIGHTED,MIN_PAYMENT_SIZE,SUM_PAYMENT_WEIGHTED,SUM_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,MAX_DAYS_ENTRY_PAYMENT,MIN_DAYS_ENTRY_PAYMENT,RANGE_DAYS_ENTRY_PAYMENT,MAX_UNDERPAYMENT_6M,MAX_UNDERPAYMENT_12M,SUM_PAYMENT_6M,SUM_PAYMENT_DIFF_6M_12M,MAX_AMT_INSTALMENT_6M,MIN_AMT_INSTALMENT_6M,MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MAX_POS_DPD,MAX_POS_DPD_DEF,NUM_POS_CASH,TOTAL_AMT_CREDIT_SUM_DIV_SUM_DAYS_CREDIT_ENDDATE,TOTAL_AMT_CREDIT_SUM_POS_DAYS_DIV_SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_ABS_DAYS_INSTALMENT_DIV_DAYS_BIRTH,AMT_CREDIT_DIV_AMT_INCOME_TOTAL,AMT_CREDIT_PLUS_AMT_INCOME_TOTAL,AMT_CREDIT_DIV_AMT_GOODS_PRICE,AMT_CREDIT_DIV_SUM_PAYMENT,AMT_GOODS_PRICE_DIV_AMT_INCOME_TOTAL,AMT_CREDIT_DIV_AMT_ANNUITY,AMT_CREDIT_DIV_AVG_PREV_REQ_AMOUNT,AMT_CREDIT_DIV_MAX_PREV_REQ_AMOUNT,EXT_SOURCE_PROD,DAYS_EMPLOYED_DIV_DAYS_BIRTH,DAYS_EMPLOYED_PLUS_DAYS_REGISTRATION_PLUS_DAYS_LAST_PHONE_CHANGE,AVG_PAYMENT_SIZE_DIV_AMT_INCOME_TOTAL,AVG_PAYMENT_SIZE_DIV_AMT_CREDIT,AVG_PAYMENT_SIZE_DIV_AMT_ANNUITY,DAYS_REGISTRATION_PLUS_DAYS_ID_PUBLISH,SUM_REFUSED_CONTRACT_DIV_SUM_APPR_CONTRACT,MAX_UTILIZATION_DIV_AVG_UTILIZATION,MAX_PREV_REQ_AMOUNT_DIV_AMT_CREDIT,AMT_INCOME_TOTAL_DIV_DAYS_BIRTH,SUM_DAYS_ID_REG_PHONE,SUM_REQ_CREDIT_YEAR,SUM_REQ_CREDIT_QRT,SUM_REQ_CREDIT_1M,SUM_REQ_CREDIT_1M_DIV_SUM_REQ_CREDIT_QRT,SUM_REQ_CREDIT_QRT_DIV_SUM_REQ_CREDIT_YEAR,DEF_30_PLUS_60_CNT_SOCIAL_CIRCLE,OWN_CAR_AGE_DIV_DAYS_BIRTH,LANDAREA_DIV_TOTALAREA_MODE,OWN_CAR_AGE_PLUS_DAYS_BIRTH,AMT_ANNUITY_DIV_DAYS_BIRTH,AMT_ANNUITY_DIV_DAYS_EMPLOYED,AMT_ANNUITY_PROD_DAYS_EMPLOYED,DAYS_REGISTRATION_DIV_DAYS_ID_PUBLISH,DAYS_REGISTRATION_DIV_DAYS_LAST_PHONE_CHANGE,REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULATION_RELATIVE,SUM_REG_NOT_FLAG,SUM_AVG_BUILD,SUM_MODE_BUILD,SUM_MEDI_BUILD,SUM_DOC_FLAG,CNT_CHILDREN_DIV_DAYS_BIRTH,CNT_CHILDREN_DIV_REGION_POPULATION_RELATIVE,FLAG_OWN_REALTY_PROD_REGION_POPULATION_RELATIVE,FLAG_OWN_REALTY_DIV_REGION_POPULATION_RELATIVE,FLAG_OWN_CAR_DIV_OWN_CAR_AGE,EXT_SOURCE_1_DIV_DAYS_BIRTH,EXT_SOURCE_1_PROD_DAYS_BIRTH,NAME_CONTRACT_TYPE_Cash_loans,NAME_CONTRACT_TYPE_Revolving_loans,NAME_CONTRACT_TYPE_nan,CODE_GENDER_F,CODE_GENDER_M,CODE_GENDER_nan,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group_of_people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse,_partner",NAME_TYPE_SUITE_Unaccompanied,NAME_TYPE_SUITE_nan,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial_associate,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State_servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_INCOME_TYPE_nan,NAME_EDUCATION_TYPE_Academic_degree,NAME_EDUCATION_TYPE_Higher_education,NAME_EDUCATION_TYPE_Incomplete_higher,NAME_EDUCATION_TYPE_Lower_secondary,NAME_EDUCATION_TYPE_Secondary_/_secondary_special,NAME_EDUCATION_TYPE_nan,NAME_FAMILY_STATUS_Civil_marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single_/_not_married,NAME_FAMILY_STATUS_Widow,NAME_FAMILY_STATUS_nan,NAME_HOUSING_TYPE_Co-op_apartment,NAME_HOUSING_TYPE_House_/_apartment,NAME_HOUSING_TYPE_Municipal_apartment,NAME_HOUSING_TYPE_Office_apartment,NAME_HOUSING_TYPE_Rented_apartment,NAME_HOUSING_TYPE_With_parents,NAME_HOUSING_TYPE_nan,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning_staff,OCCUPATION_TYPE_Cooking_staff,OCCUPATION_TYPE_Core_staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR_staff,OCCUPATION_TYPE_High_skill_tech_staff,OCCUPATION_TYPE_IT_staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill_Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine_staff,OCCUPATION_TYPE_Private_service_staff,OCCUPATION_TYPE_Realty_agents,OCCUPATION_TYPE_Sales_staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security_staff,OCCUPATION_TYPE_Waiters/barmen_staff,OCCUPATION_TYPE_nan,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,WEEKDAY_APPR_PROCESS_START_nan,FONDKAPREMONT_MODE_not_specified,FONDKAPREMONT_MODE_org_spec_account,FONDKAPREMONT_MODE_reg_oper_account,FONDKAPREMONT_MODE_reg_oper_spec_account,FONDKAPREMONT_MODE_nan,HOUSETYPE_MODE_block_of_flats,HOUSETYPE_MODE_specific_housing,HOUSETYPE_MODE_terraced_house,HOUSETYPE_MODE_nan,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone,_brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
0,100001,0,1,0,135000.0,568800.0,20560.5,450000.0,0.01885,-19241,-2329.0,-5170.0,-812,,1,1,0,1,0,1,2.0,2,2,18,0,0,0,0,0,0,24,0.752614,0.789654,0.15952,0.066,0.059,0.9732,,,,0.1379,0.125,,,,0.0505,,,0.0672,0.0612,0.9732,,,,0.1379,0.125,,,,0.0526,,,0.0666,0.059,0.9732,,,,0.1379,0.125,,,,0.0514,,,0.0392,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,1.0,-1612.0,-1612.0,-1612.0,0.0,23787.0,23787.0,23787.0,13.67069,13.67069,13.67069,6.020501,6.020501,6.020501,0.00346,0.00346,0.00346,3951.0,3951.0,3951.0,2.27069,2.27069,2.27069,-1740.0,-1740.0,0.0,0.0,0.0,14.273276,14.273276,24835.5,24835.5,6e-05,0.00055,0.00055,0.104326,0.957782,0.957782,0.957782,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.101729,5.8e-05,0.101729,0.101729,0.101729,0.0,0.0,1.0,0.000575,0.000575,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,603.706712,0.0,0.0,1.0,1.0,0.987405,0.987405,3.0,3.0,3.0,3091.0,0.0,,,,596686.5,596686.5,0.0,8.857143,0.983871,1.774194,0.007155,47.5,0.5,47.0,61.0,0.992481,0.007519,0.052632,0.349547,0.021053,0.000835,0.005848,120775.784672,1800.0,282.105263,1974.736842,0.142857,0.111111,0.015873,884025.0,3091.0,19.0,19.0,-51.0,-1329.0,1778.0,577.0,3.0,7.0,3.0,0.0,0.0,0.0,0.0,49.0,0.029363,,0.0,,0.0,0.0,53216.5875,596686.5,,,0.0,,-6.0,149.0,1523.0,100412.66129,1453365.0,0.0,0.0,0.0,24817.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,2916.0,0.0,0.0,0.0,0.0,3.116986,5885.132143,10.686671,17397.9,1.365586,3951.0,21.8189,41195.925,1.0,-1628.0,-2916.0,1288.0,,,0.0,0.0,,,11.0,-36.0,7.0,7.0,2.0,2518.830156,285.999676,-0.151551,4.213333,703800.0,1.264,13.807191,3.333333,27.664697,22.9027,22.9027,0.094803,0.121044,-9239.0,0.043594,0.010347,0.286235,-5982.0,0.0,,0.043663,-7.016267,-7722.0,0.0,0.0,0.0,,,0.0,,,,-1.068578,-8.828038,-47885404.5,6.366995,2.971264,0.0377,0,,,,1,-0.0,0.0,0.01885,53.050398,,-3.9e-05,-14481.055414,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0
1,100005,0,1,0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064,-4469.0,-9118.0,-1623,,1,1,0,1,0,0,2.0,2,2,9,0,0,0,0,0,0,46,0.56499,0.291656,0.432962,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,,,,0.060611,,,,,2.0,-460.0,-460.0,-460.0,0.0,0.0,40153.5,20076.75,0.0,53.042933,26.521466,8.342371,8.342371,8.342371,0.01102,0.01102,0.01102,4813.2,4813.2,4813.2,6.358256,6.358256,6.358256,-757.0,-315.0,442.0,1.0,0.5,29.469947,58.939894,22308.75,44617.5,0.000144,0.001189,0.001189,0.108964,0.89995,0.89995,0.89995,,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.071063,0.0003,0.142127,0.081516,0.060611,0.020905,0.0,1.0,0.001321,0.001321,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,617.739835,0.0,0.0,0.0,0.0,0.954794,0.954794,2.0,2.0,2.0,1446.0,0.0,,,,568408.5,568408.5,0.0,5.333333,1.0,0.3125,0.086957,8.5,1.0,7.5,16.0,1.0,0.0,0.0,0.539216,0.117647,0.0,0.0,107036.117647,6882.352941,0.0,0.0,0.0,0.0,0.0,598626.0,1446.0,8.0,8.0,-12.0,-128.0,1324.0,1318.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,62.0,0.026109,0.0,0.0,0.0,0.0,0.0,50188.368035,568408.5,,,0.0,0.0,-11.0,110.0,311.0,53154.691016,657126.0,1.0,0.0,0.0,4261.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,706.0,0.0,0.0,0.0,0.0,11.09417,6240.205,37.566479,17656.245,6.539674,4813.2,99.847528,56161.845,1.0,-470.0,-736.0,266.0,,,0.0,0.0,,,1.0,-37.0,0.0,0.0,1.0,498.578149,413.987552,-0.039083,2.250182,321768.0,1.2376,3.966536,1.818182,12.82487,9.985678,4.992839,0.071345,0.247398,-13587.0,0.063032,0.028012,0.359252,-10741.0,0.0,,0.200287,-5.480514,-10741.0,3.0,0.0,0.0,,0.0,0.0,,,,-0.961581,-3.886776,-77626530.0,5.617991,,0.071584,0,,,,1,-0.0,0.0,0.035792,27.939204,,-3.1e-05,-10205.983005,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
2,100013,1,1,0,202500.0,663264.0,69777.0,630000.0,0.019101,-20038,-4458.0,-2175.0,-3503,5.0,1,1,0,1,0,0,2.0,2,2,14,0,0,0,0,0,0,57,,0.699787,0.610991,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0,4742.415,4742.415,1.079158,0.073146,1.079158,1.079158,1.079158,1.079158,4.0,-1702.0,-85.0,-710.333333,1617.0,0.0,512370.0,146134.125,0.0,598.563084,177.826452,5.109738,22.128804,11.523312,0.003668,0.025851,0.016079,4742.415,23153.985,11478.195,3.270728,27.049048,15.897086,-1999.0,-222.0,1777.0,1.0,0.25,158.370361,525.700935,130871.25,450000.0,3.4e-05,0.001918,0.003953,0.067217,1.052363,1.1386,0.93933,,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,1.0,0.0,1.0,0.0,0.079875,0.000716,0.319498,0.102549,0.054493,0.048055,1.0,3.0,0.005331,0.003663,2.0,,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,31.75,0.944882,0.811024,0.005744,54.5,19.5,35.0,120.0,0.934538,0.065462,0.136364,0.027611,0.015847,0.001329,0.002502,11763.832141,1358.464615,935.629196,1895.27027,0.75,0.027778,0.016119,0.0,0.0,40.0,40.0,-68.0,-1707.0,-567.0,-4272.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1210.0,0.002409,14.471514,14.471514,19305.0,19305.0,0.0,0.0,0.0,,,,,-4.0,1330.0,860.0,9516.034492,2072280.06,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18159.919219,161420.22,1743352.245,96.0,1.0,95.0,0.115301,1.02489,230.066978,1944.407308,22086.429911,0.001461,0.012345,0.014493,0.014493,23.0,0.239583,7.0,571500.0,5953.125,157500.0,0.0,,,0.0,0.0,0.0,0.0,,0.0,0.0,9484.83,9484.83,4741.245,357347.745,23.58,2705.0,10.0,179437.725,381.708698,23147.82,14.445926,9740.235774,1029.820591,357347.745,0.01181,6.165,2239.118537,1509736.545,11.0,-14.0,-2705.0,2691.0,4718.835,4718.835,15871.995,-357348.915,4742.415,274.32,21.0,-38.0,18.0,0.0,3.0,-485.084284,,-0.134994,3.275378,865764.0,1.0528,0.439324,3.111111,9.505482,5.068065,1.47392,,0.222477,-7489.0,0.0481,0.014685,0.139591,-5678.0,0.0,8.888818,0.678463,-10.105799,-6534.0,5.0,1.0,0.0,0.0,0.2,0.0,-0.00025,,-20033.0,-3.482234,-15.652086,-311065866.0,0.620896,2.540888,0.038202,0,,,,1,-0.0,0.0,0.019101,52.35328,0.2,,,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
3,100028,0,1,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,-13976,-1866.0,-2000.0,-4208,,1,1,0,1,1,0,4.0,2,2,11,0,0,0,0,0,0,42,0.525734,0.509677,0.612704,0.3052,0.1974,0.997,0.9592,0.1165,0.32,0.2759,0.375,0.0417,0.2042,0.2404,0.3673,0.0386,0.08,0.3109,0.2049,0.997,0.9608,0.1176,0.3222,0.2759,0.375,0.0417,0.2089,0.2626,0.3827,0.0389,0.0847,0.3081,0.1974,0.997,0.9597,0.1173,0.32,0.2759,0.375,0.0417,0.2078,0.2446,0.3739,0.0388,0.0817,0.37,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,,,,,,,,,5.0,-1081.0,365243.0,121182.666667,366324.0,0.0,225000.0,92920.5,0.0,151.006711,69.509847,8.507039,21.692944,16.733328,0.010674,0.013423,0.012038,6028.02,11250.0,8091.585,3.339623,8.778839,6.556266,-1805.0,-531.0,1274.0,2.0,0.4,40.92331,82.75596,49207.5,130765.5,7.2e-05,,,0.057708,,,0.902436,,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,1.0,0.082053,0.00041,0.410263,0.097559,0.060387,0.037173,2.0,2.0,0.001809,0.001255,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,161.513904,178591.5,178591.5,0.0,0.0,0.838975,0.838975,4.0,4.0,4.0,31491.0,0.0,,,,178591.5,178591.5,0.0,24.75,1.0,0.885522,0.002466,65.5,4.0,61.5,297.0,1.0,0.0,0.0,0.064056,0.015267,0.0,0.0,9514.816939,0.0,0.0,0.0,0.0,0.0,0.0,735750.0,32596.0,60.0,60.0,-69.0,-1862.0,30885.0,23877.0,5.0,12.0,5.0,0.0,0.0,0.0,0.0,269.0,0.013641,0.0,0.0,0.0,0.0,0.0,2458.279889,186304.5,,,0.0,0.0,-20.0,1544.0,1836.0,14667.35831,1520875.08,5.0,0.0,0.0,21084.075,0.0,0.0,22823.55,22823.55,11250.0,11250.0,9.0,12.0,9029.385,5495.775,13711.305,0.060939,8085.058163,37335.915,396167.85,49.0,1.0,48.0,0.035934,0.165937,1461.966014,37335.915,71636.334672,0.006498,0.165937,0.0,0.0,117.0,2.387755,12.0,301663.62,6156.400408,22823.55,0.565555,,,0.0,0.0,0.0,0.0,1.019568,0.165937,0.165937,16360.11,0.0,11250.0,11250.0,1.17,1773.0,10.0,70348.23,131.908966,8505.0,11.108996,4356.731549,387.931034,38988.54,0.001572,1.17,1255.316553,492310.665,12.0,-29.0,-1785.0,1756.0,0.0,8505.0,22503.51,-23583.33,11250.0,1.17,7.0,-19.0,0.0,0.0,2.0,63.696238,22.571788,-0.12686,5.0,1890000.0,1.0,3.199199,5.0,32.130726,32.007316,12.044461,0.164177,0.133515,-5671.0,0.013831,0.002766,0.088879,-6208.0,0.0,4.617891,0.083026,-22.538638,-8013.0,3.0,0.0,0.0,,0.0,0.0,,0.564595,,-3.507334,-26.269293,-91468521.0,0.475285,1.108033,0.052784,0,4.5184,4.5838,4.5389,1,-0.000143,75.78054,0.026392,37.89027,,-3.8e-05,-7347.658072,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
4,100038,1,0,1,180000.0,625500.0,32067.0,625500.0,0.010032,-13040,-2191.0,-4000.0,-4262,16.0,1,1,1,1,0,0,3.0,2,2,5,0,0,0,0,1,1,42,0.202145,0.425687,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,24463.71,24463.71,1.118802,0.10268,1.118802,1.118802,1.118802,1.118802,2.0,-449.0,-449.0,-449.0,0.0,92605.5,508495.5,300550.5,112.795981,4581.040541,2346.918261,8.342387,20.785707,14.564047,0.010161,0.187259,0.09871,11100.6,24463.71,17782.155,13.520828,220.393784,116.957306,-821.0,-111.0,710.0,1.0,0.5,2096.599977,4094.594595,267727.5,454500.0,0.000107,0.005736,0.010079,0.087554,1.131358,1.143913,1.118802,,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.096641,0.001035,0.193281,0.10268,0.090602,0.012078,1.0,1.0,0.001218,0.001218,0.0,,,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,20.785707,20.785707,20.785707,1.118802,1.118802,532959.21,532959.21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,787.0,0.0,0.0,0.0,0.0,18.007955,11100.3375,23.81427,11100.6,13.841147,11097.45,216.095461,133204.05,0.0,-466.0,-802.0,336.0,,,0.0,0.0,,,-9.0,-18.0,0.0,0.0,1.0,,,-0.060353,3.475,805500.0,1.0,4.695803,3.475,19.506034,2.336331,1.376238,,0.168021,-7012.0,0.061669,0.017746,0.346161,-8262.0,0.0,,0.726619,-13.803681,-9083.0,,,,,,0.0,-0.001227,,-13024.0,-2.459126,-14.635783,-70258797.0,0.938527,4.872107,0.020064,2,,,,1,-7.7e-05,99.681021,0.0,0.0,0.0625,-1.6e-05,-2635.970697,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1


time: 767 ms


In [111]:
df.shape

(48744, 488)

time: 2.83 ms


#### Write preprocessed data to file

In [112]:
df.to_csv(path + train_or_test + ".csv", index=False, header=True)

time: 53.7 s
