# Home Credit EDA and Feature Construction

In [201]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective
from IPython.display import display
import warnings
import pickle
import gc

%load_ext autotime
# %unload_ext autotime

# Notebook-wide settings: show every DataFrame column, silence all warnings,
# keep automatic garbage collection on and fix NumPy's global seed.
pd.set_option("display.max_columns", None)
warnings.simplefilter("ignore")
gc.enable()
np.random.seed(123)

# Directory prefix (with trailing slash) that every read_csv / open call below is joined to.
path = "/Users/dsaxton/home_credit_default/"

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 13 ms


#### Full data

In [74]:
# Read the complete preprocessed training table (all columns) into `df`.
df = pd.read_csv(path + "preprocessed_train.csv")

time: 48.3 s


#### Application table

In [None]:
# Which application split to load: "train" or "test".
train_or_test = "train"
# The file name is application_<split>.csv. The previous "test.csv" suffix
# produced "application_traintest.csv", which does not match either split.
application = pd.read_csv(path + "application_" + train_or_test + ".csv")

#### Previous application and behavioral tables

In [None]:
# Raw source tables: one DataFrame per CSV under `path`.
bureau_balance = pd.read_csv(path + "bureau_balance.csv")
bureau = pd.read_csv(path + "bureau.csv")
credit_card = pd.read_csv(path + "credit_card_balance.csv")
installments = pd.read_csv(path + "installments_payments.csv")
pos_cash = pd.read_csv(path + "POS_CASH_balance.csv")
previous_application = pd.read_csv(path + "previous_application.csv")

#### Aggregate tables

In [None]:
# Previously saved aggregate tables, read back from CSV.
bureau_agg = pd.read_csv(path + "bureau_agg.csv")
credit_card_agg = pd.read_csv(path + "credit_card_agg.csv")

# Feature construction sandbox

#### Quick model function

In [160]:
def quick_model(X, y, n_estimators, num_leaves, usecols, dropcols=None, folds=5):
    """Fit a LightGBM classifier and summarise it with importances and CV AUC.

    Parameters
    ----------
    X : DataFrame
        Must contain every column selected from ``usecols``.
    y : array-like
        Target labels aligned with the rows of ``X``.
    n_estimators, num_leaves : int
        Passed straight to ``lgb.LGBMClassifier``.
    usecols : list
        Candidate feature column names.
    dropcols : collection, optional
        Names to exclude from ``usecols``. ``None`` (the default) excludes
        nothing.
    folds : int or CV splitter
        Forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    dict
        ``var_imp``: DataFrame of ``Feature`` / ``Importance`` sorted by
        descending importance, taken from the fit on all rows of ``X``.
        ``auc_mean``: mean ROC AUC over the fold scores, rounded to 4 places.
        ``auc_std``: standard error of that mean (std of the fold scores
        divided by the square root of their count), rounded to 4 places.
    """
    # None instead of a shared mutable default list.
    excluded = () if dropcols is None else dropcols
    take = [c for c in usecols if c not in excluded]

    # Slice the feature frame once and reuse it for the fit and the CV.
    features = X[take]

    clf = lgb.LGBMClassifier(n_estimators=n_estimators, num_leaves=num_leaves)
    clf.fit(features, y)
    var_imp = pd.DataFrame(
        {"Feature": features.columns, "Importance": clf.feature_importances_}
    )[["Feature", "Importance"]].sort_values("Importance", ascending=False)

    cv_result = cross_val_score(estimator=clf, X=features, y=y, scoring="roc_auc", cv=folds)
    auc_mean = round(cv_result.mean(), 4)
    # Use the number of scores actually returned so that `folds` may also be a
    # splitter object; for an integer `folds` this equals sqrt(folds).
    auc_std = round(cv_result.std() / np.sqrt(len(cv_result)), 4)

    return {"var_imp": var_imp, "auc_mean": auc_mean, "auc_std": auc_std}

time: 4 ms


# Bureau

#### Load data

Pull in performance data

In [286]:
# Read only the join key (SK_ID_CURR) and the label (TARGET) from the training table.
frame = pd.read_csv(path + "preprocessed_train.csv", usecols=["SK_ID_CURR", "TARGET"])

time: 8.2 s


Load raw bureau data for calculating new aggregates

In [287]:
# Raw bureau rows; grouped by SK_ID_CURR further down to build new aggregates.
bureau = pd.read_csv(path + "bureau.csv")

time: 7.68 s


Load bureau_balance_agg and join with raw bureau data

In [288]:
# Per-SK_ID_BUREAU balance aggregates, restricted to ids present in `bureau`,
# then attached to every bureau row (rows without a match keep NaN).
bureau_balance_agg = pd.read_csv(path + "bureau_balance_agg.csv")
bureau_balance_agg = bureau_balance_agg.loc[
    bureau_balance_agg["SK_ID_BUREAU"].isin(bureau["SK_ID_BUREAU"])
]
bureau_joined = bureau.merge(bureau_balance_agg, how="left", on="SK_ID_BUREAU")

time: 2.55 s


#### Aggregation function

In [289]:
def bureau_agg_func(g):
    """Per-group aggregation scaffold for the joined bureau table.

    ``g`` is the sub-frame of one group. Row masks are prepared for
    DAYS_CREDIT_UPDATE >= -180 / -360 / -720, CREDIT_ACTIVE == "Active" and
    CREDIT_TYPE == "Credit card". No aggregate is defined at the moment, so
    the returned Series is empty.
    """
    days_update = g["DAYS_CREDIT_UPDATE"]
    mask6 = days_update.ge(-180)
    mask12 = days_update.ge(-360)
    mask24 = days_update.ge(-720)
    active = g["CREDIT_ACTIVE"].eq("Active")
    cc = g["CREDIT_TYPE"].eq("Credit card")

    # Feature name -> value; new aggregates are added to this mapping.
    d = dict()

    return pd.Series(d)

time: 2.72 ms


#### Process data and join

Calculate new aggregates

In [268]:
# One row of new aggregates per SK_ID_CURR; the key comes back as a column.
bureau_agg_new = (
    bureau_joined
    .groupby("SK_ID_CURR")
    .apply(bureau_agg_func)
    .reset_index()
)
# Names of the freshly computed aggregate columns (everything but the key).
new_cols = [col for col in bureau_agg_new.columns if col != "SK_ID_CURR"]

time: 29.6 s


In [269]:
new_cols

['SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M']

time: 5.1 ms


Load old aggregates and join with new

In [270]:
# Saved bureau aggregates, extended with the new columns via a left join on the key.
bureau_agg = pd.read_csv(path + "bureau_agg.csv")
bureau_agg = bureau_agg.merge(bureau_agg_new, how="left", on="SK_ID_CURR")

time: 9 s


Merge with performance data

In [271]:
# Keep every row of `frame`; rows without bureau aggregates get NaN features.
df = frame.merge(bureau_agg, how="left", on="SK_ID_CURR")

time: 1.31 s


#### Fit quick model with new aggregates

In [272]:
# Every column except the label and the key is used as a feature.
result = quick_model(
    X=df,
    y=df["TARGET"],
    n_estimators=500,
    num_leaves=17,
    usecols=df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist(),
    dropcols=[],
    folds=3,
)

time: 22.3 s


CV AUC mean

In [273]:
result["auc_mean"]

0.5796

time: 2.93 ms


#### Fit quick model without specified aggregates

In [275]:
# NOTE(review): dropcols is empty, so this call uses exactly the same features
# as the "with new aggregates" fit above. List the aggregates to exclude in
# dropcols for this comparison to measure anything.
result = quick_model(X=df, 
                     y=df["TARGET"], 
                     usecols=df.drop(["TARGET", "SK_ID_CURR"], axis=1).columns.tolist(), 
                     dropcols=[], 
                     n_estimators=500, 
                     num_leaves=17, 
                     folds=3)

time: 22.9 s


CV AUC mean

In [276]:
result["auc_mean"]

0.5822

time: 5.85 ms


#### Write new aggregates to file

# Installments

#### Load data

Pull in performance data

In [297]:
# Read only the join key (SK_ID_CURR) and the label (TARGET) from the training table.
frame = pd.read_csv(path + "preprocessed_train.csv", usecols=["SK_ID_CURR", "TARGET"])

time: 8.1 s


Load raw installments data for calculating new aggregates

In [298]:
# Raw installment payment rows; grouped by SK_ID_CURR further down.
installments = pd.read_csv(path + "installments_payments.csv")

time: 28.3 s


#### Aggregation function

In [324]:
def installment_agg_func(g):
    """Per-group aggregation scaffold for the installments table.

    ``g`` is the sub-frame of one group. Row masks are prepared for
    DAYS_ENTRY_PAYMENT >= -180 and >= -360. No aggregate is defined at the
    moment, so the returned Series is empty.
    """
    entry_day = g["DAYS_ENTRY_PAYMENT"]
    mask6 = entry_day.ge(-180)
    mask12 = entry_day.ge(-360)

    # Feature name -> value; new aggregates are added to this mapping.
    d = dict()

    return pd.Series(d)

time: 5.7 ms


#### Process data and join

Calculate new aggregates

In [325]:
# One row of new aggregates per SK_ID_CURR; the key comes back as a column.
installment_agg_new = (
    installments
    .groupby("SK_ID_CURR")
    .apply(installment_agg_func)
    .reset_index()
)
# Names of the freshly computed aggregate columns (everything but the key).
new_cols = [col for col in installment_agg_new.columns if col != "SK_ID_CURR"]

time: 38min 43s


In [326]:
new_cols

['SUM_PAYMENT_6M',
 'SUM_PAYMENT_DIFF_6M_12M',
 'MAX_AMT_INSTALMENT_6M',
 'MIN_AMT_INSTALMENT_6M',
 'MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M',
 'MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M']

time: 6.08 ms


Load old aggregates and join with new

In [327]:
# Saved installment aggregates, extended with the new columns via a left join on the key.
installment_agg = pd.read_csv(path + "installment_agg.csv")
installment_agg = installment_agg.merge(installment_agg_new, how="left", on="SK_ID_CURR")

time: 3.29 s


Merge with performance data

In [328]:
# Keep every row of `frame`; rows without installment aggregates get NaN features.
df = frame.merge(installment_agg, how="left", on="SK_ID_CURR")

time: 628 ms


#### Fit quick model with new aggregates

In [329]:
# Every column except the label and the key is used as a feature.
result = quick_model(
    X=df,
    y=df["TARGET"],
    n_estimators=500,
    num_leaves=17,
    usecols=df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist(),
    dropcols=[],
    folds=3,
)

time: 49.2 s


CV AUC mean

In [330]:
result["auc_mean"]

0.6385

time: 3.65 ms


#### Fit quick model without specified aggregates

In [331]:
# NOTE(review): dropcols is empty, so this call uses exactly the same features
# as the "with new aggregates" fit above. List the aggregates to exclude in
# dropcols for this comparison to measure anything.
result = quick_model(X=df, 
                     y=df["TARGET"], 
                     usecols=df.drop(["TARGET", "SK_ID_CURR"], axis=1).columns.tolist(), 
                     dropcols=[], 
                     n_estimators=500, 
                     num_leaves=17, 
                     folds=3)

time: 55.2 s


CV AUC mean

In [332]:
result["auc_mean"]

0.6414

time: 4.3 ms


#### Write new aggregates to file

Definitions should also be added to preprocessing script

# Credit card

#### Load data

Pull in performance data

In [336]:
# Read only the join key (SK_ID_CURR) and the label (TARGET) from the training table.
frame = pd.read_csv(path + "preprocessed_train.csv", usecols=["SK_ID_CURR", "TARGET"])

time: 8.39 s


Load raw credit card data for calculating new aggregates

In [337]:
# Raw credit card balance rows; grouped by SK_ID_CURR further down.
credit_card = pd.read_csv(path + "credit_card_balance.csv")

time: 24 s


#### Aggregation function

In [341]:
def credit_card_agg_func(g):
    """Per-group aggregation scaffold for the credit card balance table.

    ``g`` is the sub-frame of one group. Row masks are prepared for
    MONTHS_BALANCE >= -6 and >= -12 and for NAME_CONTRACT_STATUS == "Active".
    No aggregate is defined at the moment, so the returned Series is empty.
    """
    months_balance = g["MONTHS_BALANCE"]
    mask6 = months_balance.ge(-6)
    mask12 = months_balance.ge(-12)
    active = g["NAME_CONTRACT_STATUS"].eq("Active")

    # Feature name -> value; new aggregates are added to this mapping.
    d = dict()

    return pd.Series(d)

time: 4.52 ms


#### Process data and join

Calculate new aggregates

In [342]:
# One row of new aggregates per SK_ID_CURR; the key comes back as a column.
credit_card_agg_new = (
    credit_card
    .groupby("SK_ID_CURR")
    .apply(credit_card_agg_func)
    .reset_index()
)
# Names of the freshly computed aggregate columns (everything but the key).
new_cols = [col for col in credit_card_agg_new.columns if col != "SK_ID_CURR"]

time: 12min 37s


In [343]:
new_cols

['SUM_CNT_DRAWINGS_ATM_CURRENT_6M',
 'SUM_AMT_DRAWINGS_ATM_CURRENT_6M',
 'MAX_AMT_DRAWINGS_ATM_CURRENT_6M',
 'MAX_CNT_DRAWINGS_ATM_CURRENT_6M',
 'MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M',
 'MAX_CNT_INSTALMENT_MATURE_CUM_6M',
 'MAX_UTILIZATION_6M']

time: 3.64 ms


Load old aggregates and join with new

In [344]:
# Saved credit card aggregates, extended with the new columns via a left join on the key.
credit_card_agg = pd.read_csv(path + "credit_card_agg.csv")
credit_card_agg = credit_card_agg.merge(credit_card_agg_new, how="left", on="SK_ID_CURR")

time: 1.96 s


Merge with performance data

In [345]:
# Keep every row of `frame`; rows without credit card aggregates get NaN features.
df = frame.merge(credit_card_agg, how="left", on="SK_ID_CURR")

time: 340 ms


#### Fit quick model with new aggregates

In [358]:
# Every column except the label and the key is used as a feature.
result = quick_model(
    X=df,
    y=df["TARGET"],
    n_estimators=500,
    num_leaves=17,
    usecols=df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist(),
    dropcols=[],
    folds=3,
)

time: 59.9 s


CV AUC mean

In [359]:
result["auc_mean"]

0.557

time: 4.19 ms


#### Fit quick model without specified aggregates

In [368]:
# NOTE(review): dropcols is empty, so this call uses exactly the same features
# as the "with new aggregates" fit above. List the aggregates to exclude in
# dropcols for this comparison to measure anything.
result = quick_model(X=df, 
                     y=df["TARGET"], 
                     usecols=df.drop(["TARGET", "SK_ID_CURR"], axis=1).columns.tolist(), 
                     dropcols=[], 
                     n_estimators=500, 
                     num_leaves=17, 
                     folds=3)

time: 1min


CV AUC mean

In [369]:
result["auc_mean"]

0.5562

time: 4.78 ms


Drop unimportant aggregates

In [370]:
# Columns that the next cell removes from credit_card_agg.
to_drop = ["MAX_CNT_INSTALMENT_MATURE_CUM_6M"]

time: 7.12 ms


In [371]:
# Reassign instead of dropping in place so the cell can be re-run: with
# errors="ignore" a column that was already removed is skipped instead of
# raising KeyError on the second execution.
credit_card_agg = credit_card_agg.drop(columns=to_drop, errors="ignore")
credit_card_agg.shape

(103558, 43)

time: 38.4 ms


#### Write new aggregates to file

Definitions should also be added to preprocessing script

# Previous application

#### Load data

Pull in performance data

In [336]:
# Read only the join key (SK_ID_CURR) and the label (TARGET) from the training table.
frame = pd.read_csv(path + "preprocessed_train.csv", usecols=["SK_ID_CURR", "TARGET"])

time: 8.39 s


Load raw previous application data for calculating new aggregates

In [337]:
# Raw previous application rows; grouped by SK_ID_CURR further down.
previous_application = pd.read_csv(path + "previous_application.csv")

time: 24 s


#### Calculate synthetic target and sentinel features

In [None]:
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a linear_model.pkl that was produced by this project.
with open(path + "linear_model.pkl", "rb") as f:
    clf = pickle.load(f)

impute = Imputer(strategy="median")
scale = StandardScaler()

# Raw previous_application columns that are one-hot encoded below.
cols = [
    "AMT_ANNUITY",
    "AMT_CREDIT",
    "AMT_GOODS_PRICE",
    "HOUR_APPR_PROCESS_START",
    "NAME_CONTRACT_TYPE",
    "NAME_TYPE_SUITE",
    "WEEKDAY_APPR_PROCESS_START",
]

prev_temp = pd.get_dummies(previous_application[cols])

# Columns, in order, that are fed to the pickled model.
# NOTE(review): AMT_ANNUITY is in `cols` but not in this list — confirm that
# matches the features the model was trained on.
dummy_cols = [
    "AMT_CREDIT",
    "AMT_GOODS_PRICE",
    "HOUR_APPR_PROCESS_START",
    "NAME_CONTRACT_TYPE_Cash loans",
    "NAME_CONTRACT_TYPE_Revolving loans",
    "NAME_TYPE_SUITE_Children",
    "NAME_TYPE_SUITE_Family",
    "NAME_TYPE_SUITE_Group of people",
    "NAME_TYPE_SUITE_Other_A",
    "NAME_TYPE_SUITE_Other_B",
    "NAME_TYPE_SUITE_Spouse, partner",
    "NAME_TYPE_SUITE_Unaccompanied",
    "WEEKDAY_APPR_PROCESS_START_FRIDAY",
    "WEEKDAY_APPR_PROCESS_START_MONDAY",
    "WEEKDAY_APPR_PROCESS_START_SATURDAY",
    "WEEKDAY_APPR_PROCESS_START_SUNDAY",
    "WEEKDAY_APPR_PROCESS_START_THURSDAY",
    "WEEKDAY_APPR_PROCESS_START_TUESDAY",
    "WEEKDAY_APPR_PROCESS_START_WEDNESDAY",
]

# Positive-class probability from the pickled model on median-imputed,
# standardised features.
# NOTE(review): the imputer and scaler are fit on previous_application here,
# not reused from whatever was fit when the model was trained — confirm.
previous_application["SYNTHETIC_TARGET"] = clf.predict_proba(
    scale.fit_transform(impute.fit_transform(prev_temp[dummy_cols]))
)[:, 1]

# 0/1 flag per DAYS_* column marking rows whose value equals 365243
# (presumably a placeholder in the raw data — TODO confirm).
for days_col in [
    "DAYS_FIRST_DRAWING",
    "DAYS_FIRST_DUE",
    "DAYS_LAST_DUE_1ST_VERSION",
    "DAYS_LAST_DUE",
    "DAYS_TERMINATION",
]:
    previous_application[days_col + "_SENTINEL"] = (previous_application[days_col] == 365243).astype(int)

#### Aggregation function

In [341]:
def previous_agg_func(g):
    """Per-group aggregation scaffold for the previous application table.

    ``g`` is the sub-frame of one group. Row masks are prepared for
    DAYS_DECISION >= -360 and >= -720. No aggregate is defined at the moment,
    so the returned Series is empty.
    """
    decision_day = g["DAYS_DECISION"]
    mask12 = decision_day.ge(-360)
    mask24 = decision_day.ge(-720)

    # Feature name -> value; new aggregates are added to this mapping.
    d = dict()

    return pd.Series(d)

time: 4.52 ms


#### Process data and join

Calculate new aggregates

In [342]:
# One row of new aggregates per SK_ID_CURR; the key comes back as a column.
previous_agg_new = (
    previous_application
    .groupby("SK_ID_CURR")
    .apply(previous_agg_func)
    .reset_index()
)
# Names of the freshly computed aggregate columns (everything but the key).
new_cols = [col for col in previous_agg_new.columns if col != "SK_ID_CURR"]

time: 12min 37s


In [343]:
new_cols

['SUM_CNT_DRAWINGS_ATM_CURRENT_6M',
 'SUM_AMT_DRAWINGS_ATM_CURRENT_6M',
 'MAX_AMT_DRAWINGS_ATM_CURRENT_6M',
 'MAX_CNT_DRAWINGS_ATM_CURRENT_6M',
 'MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M',
 'MAX_CNT_INSTALMENT_MATURE_CUM_6M',
 'MAX_UTILIZATION_6M']

time: 3.64 ms


Load old aggregates and join with new

In [344]:
# Saved previous application aggregates, extended with the new columns via a left join on the key.
previous_agg = pd.read_csv(path + "previous_agg.csv")
previous_agg = previous_agg.merge(previous_agg_new, how="left", on="SK_ID_CURR")

time: 1.96 s


Merge with performance data

In [345]:
# Keep every row of `frame`; rows without previous application aggregates get NaN features.
df = frame.merge(previous_agg, how="left", on="SK_ID_CURR")

time: 340 ms


#### Fit quick model with new aggregates

In [358]:
# Every column except the label and the key is used as a feature.
result = quick_model(
    X=df,
    y=df["TARGET"],
    n_estimators=500,
    num_leaves=17,
    usecols=df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist(),
    dropcols=[],
    folds=3,
)

time: 59.9 s


CV AUC mean

In [359]:
result["auc_mean"]

0.557

time: 4.19 ms


#### Fit quick model without specified aggregates

In [368]:
# NOTE(review): dropcols is empty, so this call uses exactly the same features
# as the "with new aggregates" fit above. List the aggregates to exclude in
# dropcols for this comparison to measure anything.
result = quick_model(X=df, 
                     y=df["TARGET"], 
                     usecols=df.drop(["TARGET", "SK_ID_CURR"], axis=1).columns.tolist(), 
                     dropcols=[], 
                     n_estimators=500, 
                     num_leaves=17, 
                     folds=3)

time: 1min


CV AUC mean

In [369]:
result["auc_mean"]

0.5562

time: 4.78 ms


Drop unimportant aggregates

In [370]:
# Columns that the next cell removes from previous_agg (currently none).
to_drop = []

time: 7.12 ms


In [371]:
# Reassign instead of dropping in place so the cell can be re-run: with
# errors="ignore" a column that was already removed is skipped instead of
# raising KeyError on the second execution.
previous_agg = previous_agg.drop(columns=to_drop, errors="ignore")
previous_agg.shape

(103558, 43)

time: 38.4 ms


#### Write new aggregates to file

Definitions should also be added to preprocessing script