# Home Credit Feature Construction

In [59]:
import gc
import os
import pickle
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy.stats import iqr, randint, uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective

%load_ext autotime
# %unload_ext autotime

# Notebook-wide settings.
pd.options.display.max_columns = None  # never truncate columns when displaying wide frames
warnings.filterwarnings("ignore")  # NOTE: silences every warning, deprecations included
gc.enable()
np.random.seed(123)  # seed NumPy's global RNG

# Data directory. Defaults to the original location; set HOME_CREDIT_DATA_DIR
# to run the notebook elsewhere. os.path.join(..., "") guarantees the trailing
# separator that the `path + "file.csv"` concatenations in later cells rely on.
path = os.path.join(
    os.environ.get("HOME_CREDIT_DATA_DIR", "/Users/dsaxton/home_credit_default/"),
    "",
)

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 26.9 ms


#### Quick model function

In [139]:
def quick_model(X, y, n_estimators, num_leaves, usecols, dropcols=None, folds=5):
    """Fit a LightGBM classifier and report feature importances and CV AUC.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns named in ``usecols``.
    y : array-like
        Target passed to ``fit`` and ``cross_val_score``.
    n_estimators, num_leaves : int
        Forwarded to ``lgb.LGBMClassifier``.
    usecols : list
        Candidate feature columns.
    dropcols : list, optional
        Columns to exclude from ``usecols``. ``None`` (the default) excludes
        nothing; a fresh list is used instead of a shared mutable default.
    folds : int
        Passed as ``cv`` to ``cross_val_score``.

    Returns
    -------
    dict
        ``var_imp``  - DataFrame of (Feature, Importance) from a fit on all
                       rows, sorted by descending importance;
        ``auc_mean`` - mean ROC AUC over the folds;
        ``auc_std``  - standard deviation of the fold AUCs divided by
                       ``sqrt(folds)``, i.e. a standard error of the mean.
    """
    if dropcols is None:
        dropcols = []
    take = [c for c in usecols if c not in dropcols]
    features = X[take]  # slice once and reuse for the fit, importances and CV

    clf = lgb.LGBMClassifier(n_estimators=n_estimators, num_leaves=num_leaves)
    clf.fit(features, y)
    var_imp = pd.DataFrame(
        {"Feature": features.columns, "Importance": clf.feature_importances_},
        columns=["Feature", "Importance"],
    ).sort_values("Importance", ascending=False)

    cv_result = cross_val_score(estimator=clf, X=features, y=y, scoring="roc_auc", cv=folds)
    auc_mean = cv_result.mean()
    auc_std = cv_result.std() / np.sqrt(folds)

    return {"var_imp": var_imp, "auc_mean": auc_mean, "auc_std": auc_std}

time: 8.31 ms


#### Full data

In [61]:
# Full training table, all columns.
df = pd.read_csv(f"{path}train.csv")

time: 45.2 s


#### Application table

In [None]:
# Which application table to load: "train" or "test".
train_or_test = "train"
# File name is application_<train|test>.csv. The previous expression appended
# "test.csv" after the selector, producing "application_traintest.csv".
application = pd.read_csv(path + "application_" + train_or_test + ".csv")

#### Previous application and behavioral tables

In [None]:
# Raw behavioural / history tables, one frame per source file.
bureau_balance = pd.read_csv(f"{path}bureau_balance.csv")
bureau = pd.read_csv(f"{path}bureau.csv")
credit_card = pd.read_csv(f"{path}credit_card_balance.csv")
installments = pd.read_csv(f"{path}installments_payments.csv")
pos_cash = pd.read_csv(f"{path}POS_CASH_balance.csv")
previous_application = pd.read_csv(f"{path}previous_application.csv")

#### Aggregate tables

In [None]:
# Previously computed aggregate tables.
bureau_agg = pd.read_csv(f"{path}bureau_agg.csv")
credit_card_agg = pd.read_csv(f"{path}credit_card_agg.csv")

# Feature construction sand box

# Bureau

#### Load data

Pull in performance data

In [286]:
# Only the applicant key and the label are needed to score new aggregates.
frame = pd.read_csv(
    f"{path}train.csv",
    usecols=["SK_ID_CURR", "TARGET"],
)

time: 8.2 s


Load raw bureau data for calculating new aggregates

In [287]:
# Raw bureau records (input to bureau_agg_func via bureau_joined).
bureau = pd.read_csv(f"{path}bureau.csv")

time: 7.68 s


Load bureau_balance_agg and join with raw bureau data

In [288]:
# Keep only balance aggregates whose SK_ID_BUREAU occurs in the raw bureau table,
# then attach them to every bureau row (left join keeps bureau rows without a match).
bureau_balance_agg = (
    pd.read_csv(f"{path}bureau_balance_agg.csv")
    .loc[lambda agg: agg["SK_ID_BUREAU"].isin(bureau["SK_ID_BUREAU"])]
)
bureau_joined = bureau.merge(bureau_balance_agg, how="left", on="SK_ID_BUREAU")

time: 2.55 s


#### Aggregation function

In [289]:
def bureau_agg_func(g):
    """Build candidate bureau aggregates for one group of rows.

    Sandbox template: the masks below are prepared for use in feature
    definitions, but the feature dict ``d`` is empty in this saved version,
    so the function currently returns an empty Series.

    Parameters
    ----------
    g : pandas.DataFrame
        One group's rows (this notebook groups ``bureau_joined`` by
        ``SK_ID_CURR``). Must contain ``DAYS_CREDIT_UPDATE``,
        ``CREDIT_ACTIVE`` and ``CREDIT_TYPE``.

    Returns
    -------
    pandas.Series
        One entry per key of ``d``.
    """
    # Recency windows on DAYS_CREDIT_UPDATE (values at or above -90 / -180 / -360 / -720).
    # Presumably day offsets relative to the application date - TODO confirm.
    update_days = g["DAYS_CREDIT_UPDATE"]
    mask3 = update_days.ge(-90)
    mask6 = update_days.ge(-180)
    mask12 = update_days.ge(-360)
    mask24 = update_days.ge(-720)

    # Row filters on credit status and credit type.
    active = g["CREDIT_ACTIVE"].eq("Active")
    cc = g["CREDIT_TYPE"].eq("Credit card")

    # Feature name -> value; add candidate aggregates here.
    d = {}

    return pd.Series(d)

time: 2.72 ms


#### Process data and join

Calculate new aggregates

In [268]:
# One row of new aggregates per applicant; bureau_agg_func is called once per group.
bureau_agg_new = (
    bureau_joined
    .groupby("SK_ID_CURR")
    .apply(bureau_agg_func)
    .reset_index()
)
# Everything except the key is a newly built feature.
new_cols = [name for name in bureau_agg_new.columns if name != "SK_ID_CURR"]

time: 29.6 s


In [269]:
new_cols

['SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M']

time: 5.1 ms


Load old aggregates and join with new

In [270]:
# Existing bureau aggregates, extended with the new candidates (left join on applicant).
bureau_agg = (
    pd.read_csv(f"{path}bureau_agg.csv")
    .merge(bureau_agg_new, how="left", on="SK_ID_CURR")
)

time: 9 s


Merge with performance data

In [271]:
# Attach the aggregates to the labelled applicants; applicants without bureau rows get NaN.
df = frame.merge(bureau_agg, on="SK_ID_CURR", how="left")

time: 1.31 s


#### Fit quick model with new aggregates

In [272]:
# Score every aggregate column (all columns except the key and the label).
result = quick_model(
    df,
    df["TARGET"],
    n_estimators=500,
    num_leaves=17,
    usecols=df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist(),
    dropcols=[],
    folds=3,
)

time: 22.3 s


CV AUC mean

In [273]:
result["auc_mean"]

0.5796

time: 2.93 ms


#### Fit quick model without specified aggregates

In [275]:
# NOTE: dropcols is empty in this saved version, so this call is configured
# exactly like the previous quick_model cell; list the aggregates to exclude
# in dropcols to get the "without" comparison.
result = quick_model(
    df,
    df["TARGET"],
    n_estimators=500,
    num_leaves=17,
    usecols=df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist(),
    dropcols=[],
    folds=3,
)

time: 22.9 s


CV AUC mean

In [276]:
result["auc_mean"]

0.5822

time: 5.85 ms


#### Write new aggregates to file

# Installments

#### Load data

Pull in performance data

In [297]:
# Only the applicant key and the label are needed to score new aggregates.
frame = pd.read_csv(
    f"{path}train.csv",
    usecols=["SK_ID_CURR", "TARGET"],
)

time: 8.1 s


Load raw installments data for calculating new aggregates

In [298]:
# Raw installment payment records (input to installment_agg_func).
installments = pd.read_csv(f"{path}installments_payments.csv")

time: 28.3 s


#### Aggregation function

In [324]:
def installment_agg_func(g):
    """Build candidate installment aggregates for one group of rows.

    Sandbox template: the recency masks are prepared for use in feature
    definitions, but the feature dict ``d`` is empty in this saved version,
    so the function currently returns an empty Series.

    Parameters
    ----------
    g : pandas.DataFrame
        One group's rows (this notebook groups ``installments`` by
        ``SK_ID_CURR``). Must contain ``DAYS_ENTRY_PAYMENT``.

    Returns
    -------
    pandas.Series
        One entry per key of ``d``.
    """
    # Recency windows on DAYS_ENTRY_PAYMENT (values at or above -180 / -360).
    payment_days = g["DAYS_ENTRY_PAYMENT"]
    mask6 = payment_days.ge(-180)
    mask12 = payment_days.ge(-360)

    # Feature name -> value; add candidate aggregates here.
    d = {}

    return pd.Series(d)

time: 5.7 ms


#### Process data and join

Calculate new aggregates

In [325]:
# One row of new aggregates per applicant. installment_agg_func runs once per
# group, which is slow: the recorded run of this cell took about 38 minutes.
installment_agg_new = (
    installments
    .groupby("SK_ID_CURR")
    .apply(installment_agg_func)
    .reset_index()
)
# Everything except the key is a newly built feature.
new_cols = [name for name in installment_agg_new.columns if name != "SK_ID_CURR"]

time: 38min 43s


In [326]:
new_cols

['SUM_PAYMENT_6M',
 'SUM_PAYMENT_DIFF_6M_12M',
 'MAX_AMT_INSTALMENT_6M',
 'MIN_AMT_INSTALMENT_6M',
 'MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M',
 'MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M']

time: 6.08 ms


Load old aggregates and join with new

In [327]:
# Existing installment aggregates, extended with the new candidates (left join on applicant).
installment_agg = (
    pd.read_csv(f"{path}installment_agg.csv")
    .merge(installment_agg_new, how="left", on="SK_ID_CURR")
)

time: 3.29 s


Merge with performance data

In [328]:
# Attach the aggregates to the labelled applicants; applicants without installment rows get NaN.
df = frame.merge(installment_agg, on="SK_ID_CURR", how="left")

time: 628 ms


#### Fit quick model with new aggregates

In [329]:
# Score every aggregate column (all columns except the key and the label).
result = quick_model(
    df,
    df["TARGET"],
    n_estimators=500,
    num_leaves=17,
    usecols=df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist(),
    dropcols=[],
    folds=3,
)

time: 49.2 s


CV AUC mean

In [330]:
result["auc_mean"]

0.6385

time: 3.65 ms


#### Fit quick model without specified aggregates

In [331]:
# NOTE: dropcols is empty in this saved version, so this call is configured
# exactly like the previous quick_model cell; list the aggregates to exclude
# in dropcols to get the "without" comparison.
result = quick_model(
    df,
    df["TARGET"],
    n_estimators=500,
    num_leaves=17,
    usecols=df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist(),
    dropcols=[],
    folds=3,
)

time: 55.2 s


CV AUC mean

In [332]:
result["auc_mean"]

0.6414

time: 4.3 ms


#### Write new aggregates to file

Definitions should also be added to preprocessing script

# Credit card

#### Load data

Pull in performance data

In [166]:
# Only the applicant key and the label are needed to score new aggregates.
frame = pd.read_csv(
    f"{path}train.csv",
    usecols=["SK_ID_CURR", "TARGET"],
)

time: 9.24 s


Load raw credit card balance data for calculating new aggregates

In [167]:
# Raw monthly credit card balance records (input to credit_card_agg_func).
credit_card = pd.read_csv(f"{path}credit_card_balance.csv")

time: 35.1 s


In [168]:
credit_card.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


time: 94.3 ms


#### Aggregation function

In [169]:
def credit_card_agg_func(g):
    """Build candidate credit card aggregates for one group of rows.

    Sandbox template: the masks below are prepared for use in feature
    definitions, but the feature dict ``d`` is empty in this saved version,
    so the function currently returns an empty Series.

    Parameters
    ----------
    g : pandas.DataFrame
        One group's rows (this notebook groups ``credit_card`` by
        ``SK_ID_CURR``). Must contain ``MONTHS_BALANCE``,
        ``NAME_CONTRACT_STATUS`` and ``SK_DPD``.

    Returns
    -------
    pandas.Series
        One entry per key of ``d``.
    """
    # Recency windows on MONTHS_BALANCE (values at or above -3 / -6 / -12).
    months = g["MONTHS_BALANCE"]
    mask3 = months.ge(-3)
    mask6 = months.ge(-6)
    mask12 = months.ge(-12)

    # Row filters: active contracts, and rows with a positive SK_DPD.
    active = g["NAME_CONTRACT_STATUS"].eq("Active")
    overdue = g["SK_DPD"].gt(0)

    # Feature name -> value; add candidate aggregates here.
    d = {}

    return pd.Series(d)

time: 29.7 ms


#### Process data and join

Calculate new aggregates

In [170]:
# One row of new aggregates per applicant. credit_card_agg_func runs once per
# group, which is slow: the recorded run of this cell took about 14 minutes.
credit_card_agg_new = (
    credit_card
    .groupby("SK_ID_CURR")
    .apply(credit_card_agg_func)
    .reset_index()
)
# Everything except the key is a newly built feature.
new_cols = [name for name in credit_card_agg_new.columns if name != "SK_ID_CURR"]

time: 14min 1s


In [171]:
new_cols

['COUNT_OVERDUE_6M',
 'COUNT_OVERDUE_12M',
 'MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M',
 'MIN_CREDIT_CARD_INST_AMT_PAST_DUE_6M',
 'MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M']

time: 4.7 ms


Load old aggregates and join with new

In [172]:
# Existing credit card aggregates, extended with the new candidates (left join on applicant).
credit_card_agg = (
    pd.read_csv(f"{path}credit_card_agg.csv")
    .merge(credit_card_agg_new, how="left", on="SK_ID_CURR")
)

time: 2.24 s


Merge with performance data

In [173]:
# Attach the aggregates to the labelled applicants; applicants without credit card rows get NaN.
df = frame.merge(credit_card_agg, on="SK_ID_CURR", how="left")
# Candidate feature columns: everything except the label and the key.
all_cols = df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist()
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,COUNT_OVERDUE_6M,COUNT_OVERDUE_12M,MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M
0,100002,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,100003,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,100004,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,100006,0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,6.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,0.0,0.0,,,
4,100007,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


time: 754 ms


In [174]:
df.shape

(307511, 50)

time: 6 ms


#### Fit quick model with all aggregates

In [175]:
# Baseline: CV AUC with every aggregate (old and new) included.
result = quick_model(
    df,
    df["TARGET"],
    n_estimators=100,
    num_leaves=13,
    usecols=all_cols,
    dropcols=[],
    folds=3,
)

time: 16.8 s


CV AUC mean

In [176]:
round(result["auc_mean"], 4)

0.5657

time: 4.33 ms


#### Iterate over each new aggregate and compute leave-one-out AUC

In [177]:
# Reference score: CV AUC with every aggregate included.
baseline = result["auc_mean"]
to_drop = []

# Leave-one-out: refit without each new aggregate; if the score does not fall
# below the baseline, the aggregate is marked for removal.
# NOTE(review): the comparison uses the mean AUC only; quick_model also returns
# auc_std, which is not consulted here.
for candidate in new_cols:
    loo_result = quick_model(
        df,
        df["TARGET"],
        n_estimators=100,
        num_leaves=13,
        usecols=all_cols,
        dropcols=[candidate],
        folds=3,
    )
    auc = loo_result["auc_mean"]
    if auc >= baseline:
        to_drop.append(candidate)
    print(f"{candidate}: {round(auc, 4)}")

COUNT_OVERDUE_6M: 0.5661
COUNT_OVERDUE_12M: 0.5658
MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M: 0.5657
MIN_CREDIT_CARD_INST_AMT_PAST_DUE_6M: 0.5657
MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M: 0.565
time: 1min 39s


Aggregates to be dropped

In [178]:
to_drop

['COUNT_OVERDUE_6M',
 'COUNT_OVERDUE_12M',
 'MIN_CREDIT_CARD_INST_AMT_PAST_DUE_6M']

time: 3.63 ms


Aggregates to be kept

In [179]:
[c for c in new_cols if c not in to_drop]

['MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M',
 'MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M']

time: 5 ms


Drop unimportant aggregates

In [180]:
# Rebind instead of dropping in place so the cell can be re-run: with
# errors="ignore", columns already removed by an earlier run are skipped
# rather than raising KeyError.
credit_card_agg = credit_card_agg.drop(columns=to_drop, errors="ignore")
credit_card_agg.shape

(103558, 46)

time: 29.4 ms


In [181]:
credit_card_agg.head()

Unnamed: 0,SK_ID_CURR,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M
0,100006,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,6.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,
1,100011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54482.111149,189000.0,4031676.225,75.0,2.0,73.0,0.302678,1.05,891.528045,2520.0,65973.075311,0.004953,0.014,0.0,0.0,4.0,0.054054,4.0,180000.0,2432.432432,180000.0,1.0,inf,inf,0.0,0.0,0.0,0.0,,0.0,0.0,,
2,100013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18159.919219,161420.22,1743352.245,96.0,1.0,95.0,0.115301,1.02489,230.066978,1944.407308,22086.429911,0.001461,0.012345,0.014493,0.014493,23.0,0.239583,7.0,571500.0,5953.125,157500.0,0.0,inf,inf,0.0,0.0,0.0,0.0,,0.0,0.0,,
3,100021,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,2.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,
4,100023,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,4.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,,,


time: 88 ms


#### Write new aggregates to file

Definitions should also be added to preprocessing script

In [182]:
# NOTE: this overwrites the same file that the "Load old aggregates" cell reads,
# so after writing, that file already contains the kept new columns.
credit_card_agg.to_csv(
    f"{path}credit_card_agg.csv",
    header=True,
    index=False,
)

time: 7.76 s


# Previous application

#### Load data

Pull in performance data

In [85]:
# Only the applicant key and the label are needed to score new aggregates.
frame = pd.read_csv(
    f"{path}train.csv",
    usecols=["SK_ID_CURR", "TARGET"],
)

time: 9.18 s


Load raw previous application data for calculating new aggregates

In [24]:
# Raw previous application records (input to previous_agg_func).
previous_application = pd.read_csv(f"{path}previous_application.csv")

time: 18.1 s


#### Calculate sentinel features

In [25]:
# For each DAYS_* column, add a 0/1 "<column>_SENTINEL" flag marking rows whose
# value equals 365243 (presumably a placeholder for a missing date - TODO confirm).
for days_col in (
    "DAYS_FIRST_DRAWING",
    "DAYS_FIRST_DUE",
    "DAYS_LAST_DUE_1ST_VERSION",
    "DAYS_LAST_DUE",
    "DAYS_TERMINATION",
):
    previous_application[days_col + "_SENTINEL"] = (
        previous_application[days_col].eq(365243).astype(int)
    )

time: 12.1 s


#### Aggregation function

In [152]:
def previous_agg_func(g):
    """Build candidate previous-application aggregates for one group of rows.

    Sandbox template: the recency masks are prepared for use in feature
    definitions, but the feature dict ``d`` is empty in this saved version,
    so the function currently returns an empty Series.

    Parameters
    ----------
    g : pandas.DataFrame
        One group's rows (this notebook groups ``previous_application`` by
        ``SK_ID_CURR``). Must contain ``DAYS_DECISION``.

    Returns
    -------
    pandas.Series
        One entry per key of ``d``.
    """
    # Recency windows on DAYS_DECISION (values at or above -180 / -360 / -720).
    decision_days = g["DAYS_DECISION"]
    mask6 = decision_days.ge(-180)
    mask12 = decision_days.ge(-360)
    mask24 = decision_days.ge(-720)

    # Feature name -> value; add candidate aggregates here.
    d = {}

    return pd.Series(d)

time: 26.4 ms


#### Process data and join

Calculate new aggregates

In [153]:
# One row of new aggregates per applicant. previous_agg_func runs once per
# group, which is slow: the recorded run of this cell took about 1 h 25 min.
previous_agg_new = (
    previous_application
    .groupby("SK_ID_CURR")
    .apply(previous_agg_func)
    .reset_index()
)
# Everything except the key is a newly built feature.
new_cols = [name for name in previous_agg_new.columns if name != "SK_ID_CURR"]

time: 1h 25min 2s


In [154]:
new_cols

['AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M',
 'MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M',
 'MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M',
 'AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M',
 'MIN_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M',
 'MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M',
 'AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M',
 'MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M',
 'MAX_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M']

time: 3.24 ms


Load old aggregates and join with new

In [155]:
# Existing previous-application aggregates, extended with the new candidates (left join on applicant).
previous_agg = (
    pd.read_csv(f"{path}previous_agg.csv")
    .merge(previous_agg_new, how="left", on="SK_ID_CURR")
)

time: 28.9 s


Merge with performance data

In [156]:
# Attach the aggregates to the labelled applicants; applicants without previous applications get NaN.
df = frame.merge(previous_agg, on="SK_ID_CURR", how="left")
# Candidate feature columns: everything except the label and the key.
all_cols = df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist()
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,MIN_PREV_AMT_ANNUITY_12M,MIN_PREV_AMT_ANNUITY_24M,MIN_PREV_PROP_APPROVED_12M,AVG_SYNTH_TARGET_12M,AVG_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_24M,MAX_PREV_PROP_APPROVED_12M,MAX_PREV_PROP_APPROVED_24M,COUNT_PREV_APP,MIN_PREV_DAYS_TERMINATION,MAX_PREV_DAYS_TERMINATION,AVG_PREV_DAYS_TERMINATION,RANGE_PREV_DAYS_TERMINATION,MIN_PREV_AMT_CREDIT,MAX_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT,MIN_PREV_AMT_CREDIT_WEIGHTED,MAX_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,MIN_PREV_AMT_CREDIT_DIV_ANNUITY,MAX_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MIN_PREV_AMT_ANNUITY,MAX_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY,MIN_PREV_AMT_ANNUITY_WEIGHTED,MAX_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_ANNUITY_WEIGHTED,MIN_DAYS_DECISION,MAX_DAYS_DECISION,RANGE_DAYS_DECISION,SUM_DAYS_LAST_DUE_NULL,AVG_DAYS_LAST_DUE_NULL,AVG_PREV_REQ_AMOUNT_WEIGHTED,MAX_PREV_REQ_AMOUNT_WEIGHTED,AVG_PREV_REQ_AMOUNT,MAX_PREV_REQ_AMOUNT,AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED,AVG_PREV_PROP_APPROVED_WEIGHTED,MAX_PREV_PROP_APPROVED_WEIGHTED,AVG_PREV_RATE_DOWNPAYMENT,AVG_PREV_PROP_APPROVED,MAX_PREV_PROP_APPROVED,MIN_PREV_PROP_APPROVED,AVG_PREV_INT_RATE,SUM_PREV_URGENT_NEEDS,SUM_PREV_REPAIRS,SUM_PREV_OTHER,SUM_PREV_LIMIT_REJECT,SUM_REFUSED_CONTRACT,SUM_CANC_CONTRACT,SUM_APPR_CONTRACT,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,COUNT_PREV_WALK_IN,COUNT_PREV_HIGH_YIELD,COUNT_PREV_LOW_YIELD,AVG_SYNTH_TARGET,SUM_SYNTH_TARGET_WEIGHTED,SUM_SYNTH_TARGET,MAX_SYNTH_TARGET,MIN_SYNTH_TARGET,RANGE_SYNTH_TARGET,SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE,SUM_DAYS_FIRST_DRAWING_SENTINEL,SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,SUM_DAYS_LAST_DUE_LT_FIRST_VERSION,MIN_RATE_INTEREST_PRIMARY_12M,AVG_RATE_INTEREST_PRIVILEGED_12M,SUM_REFUSED_CONTRACT_6M,SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M,SUM_PRODUCT_COMBINATION_POS_MOBILE_I
NTEREST_12M,SUM_NAME_GOODS_CATEGORY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_CSTR_6M,SUM_NAME_PAYMENT_TYPE_XNA_6M,COUNT_NAME_CLIENT_TYPE_REPEATER_12M,COUNT_NAME_CLIENT_TYPE_NEW_12M,AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M
0,100002,1,,9251.775,,,,1.0,,1.0,1.0,-17.0,-17.0,-17.0,0.0,179055.0,179055.0,179055.0,295.470297,295.470297,295.470297,19.353584,19.353584,19.353584,0.031937,0.031937,0.031937,9251.775,9251.775,9251.775,15.266955,15.266955,15.266955,-606.0,-606.0,0.0,0.0,0.0,295.470297,295.470297,179055.0,179055.0,0.0,0.00165,0.00165,0.0,1.0,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.071974,0.000119,0.071974,0.071974,0.071974,0.0,0.0,1.0,0.00165,0.00165,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,
1,100003,0,,,,,,,,,3.0,-1976.0,-527.0,-1047.333333,1449.0,68053.5,1035882.0,484191.0,29.070269,1388.581769,612.90394,5.399568,10.531859,8.677472,0.004315,0.014118,0.008318,6737.31,98356.995,56553.99,2.877962,131.845838,70.901357,-2341.0,-746.0,1595.0,0.0,0.0,547.812073,1206.434316,435436.5,900000.0,2.1e-05,0.001071,0.001543,0.05003,1.057664,1.15098,0.989013,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,0.078878,0.000225,0.236634,0.090332,0.070374,0.019958,2.0,3.0,0.002975,0.00134,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,
2,100004,0,,,,,,,,,1.0,-714.0,-714.0,-714.0,0.0,20106.0,20106.0,20106.0,24.669939,24.669939,24.669939,3.753045,3.753045,3.753045,0.004605,0.004605,0.004605,5357.25,5357.25,5357.25,6.573313,6.573313,6.573313,-815.0,-815.0,0.0,0.0,0.0,29.793865,29.793865,24282.0,24282.0,0.00026,0.001016,0.001016,0.212008,0.828021,0.828021,0.828021,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.119115,0.000146,0.119115,0.119115,0.119115,0.0,0.0,1.0,0.001227,0.001227,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,
3,100006,0,13500.0,2482.92,0.799989,0.063868,1.029197,1.012684,1.316797,1.316797,9.0,-416.0,365243.0,182481.75,365659.0,0.0,906615.0,291695.5,0.0,5008.922652,1358.887335,9.230206,27.839644,17.767287,0.015809,0.15381,0.081751,2482.92,39954.51,23651.175,4.024182,180.641436,96.293912,-617.0,-181.0,436.0,5.0,0.555556,1242.561634,3803.867403,272203.26,688500.0,0.000439,0.004129,0.007275,0.163412,1.012684,1.316797,0.799989,,0.0,0.0,0.0,1.0,1.0,3.0,5.0,0.0,0.0,0.0,2.0,2.0,0.065491,0.002612,0.589419,0.116006,0.041129,0.074877,1.0,4.0,0.015886,0.005525,2.0,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,,,,,,,,,
4,100007,0,,16037.64,,,,1.108236,,1.108236,6.0,-2041.0,365243.0,72143.8,367284.0,14616.0,284400.0,166638.75,6.201103,733.391711,248.03877,7.968206,21.858453,12.644075,0.003381,0.045729,0.016725,1834.29,22678.785,12278.805,0.778231,42.88139,16.715844,-2357.0,-374.0,1983.0,1.0,0.166667,222.881532,661.764706,150530.25,247500.0,7.5e-05,0.001244,0.002963,0.159516,1.046356,1.264,0.85093,,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,3.0,1.0,3.0,0.0,0.09508,0.000625,0.570482,0.112414,0.0779,0.034514,3.0,5.0,0.005724,0.002674,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,


time: 4.9 s


In [157]:
df.shape

(307511, 93)

time: 21.8 ms


#### Fit quick model with all aggregates

In [158]:
# Baseline: CV AUC with every aggregate (old and new) included.
result = quick_model(
    df,
    df["TARGET"],
    n_estimators=100,
    num_leaves=13,
    usecols=all_cols,
    dropcols=[],
    folds=3,
)

time: 1min 6s


CV AUC mean

In [159]:
round(result["auc_mean"], 4)

0.6557

time: 11 ms


#### Iterate over each new aggregate and compute leave-one-out AUC

In [160]:
# Reference score: CV AUC with every aggregate included.
baseline = result["auc_mean"]
to_drop = []

# Leave-one-out: refit without each new aggregate; if the score does not fall
# below the baseline, the aggregate is marked for removal.
# NOTE(review): the comparison uses the mean AUC only; quick_model also returns
# auc_std, which is not consulted here.
for candidate in new_cols:
    loo_result = quick_model(
        df,
        df["TARGET"],
        n_estimators=100,
        num_leaves=13,
        usecols=all_cols,
        dropcols=[candidate],
        folds=3,
    )
    auc = loo_result["auc_mean"]
    if auc >= baseline:
        to_drop.append(candidate)
    print(f"{candidate}: {round(auc, 4)}")

AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M: 0.6557
MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M: 0.6551
MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M: 0.6552
AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M: 0.6556
MIN_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M: 0.6558
MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M: 0.6555
AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M: 0.6557
MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M: 0.6556
MAX_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M: 0.656
time: 5min 29s


Aggregates to be dropped

In [161]:
to_drop

['MIN_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M',
 'MAX_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M']

time: 2.45 ms


Aggregates to be kept

In [162]:
[c for c in new_cols if c not in to_drop]

['AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M',
 'MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M',
 'MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M',
 'AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M',
 'MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M',
 'AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M',
 'MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M']

time: 3.18 ms


Drop unimportant aggregates

In [163]:
# Rebind instead of dropping in place so the cell can be re-run: with
# errors="ignore", columns already removed by an earlier run are skipped
# rather than raising KeyError.
previous_agg = previous_agg.drop(columns=to_drop, errors="ignore")
previous_agg.shape

(338857, 90)

time: 536 ms


In [164]:
previous_agg.head()

Unnamed: 0,SK_ID_CURR,MIN_PREV_AMT_ANNUITY_12M,MIN_PREV_AMT_ANNUITY_24M,MIN_PREV_PROP_APPROVED_12M,AVG_SYNTH_TARGET_12M,AVG_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_24M,MAX_PREV_PROP_APPROVED_12M,MAX_PREV_PROP_APPROVED_24M,COUNT_PREV_APP,MIN_PREV_DAYS_TERMINATION,MAX_PREV_DAYS_TERMINATION,AVG_PREV_DAYS_TERMINATION,RANGE_PREV_DAYS_TERMINATION,MIN_PREV_AMT_CREDIT,MAX_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT,MIN_PREV_AMT_CREDIT_WEIGHTED,MAX_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,MIN_PREV_AMT_CREDIT_DIV_ANNUITY,MAX_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MIN_PREV_AMT_ANNUITY,MAX_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY,MIN_PREV_AMT_ANNUITY_WEIGHTED,MAX_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_ANNUITY_WEIGHTED,MIN_DAYS_DECISION,MAX_DAYS_DECISION,RANGE_DAYS_DECISION,SUM_DAYS_LAST_DUE_NULL,AVG_DAYS_LAST_DUE_NULL,AVG_PREV_REQ_AMOUNT_WEIGHTED,MAX_PREV_REQ_AMOUNT_WEIGHTED,AVG_PREV_REQ_AMOUNT,MAX_PREV_REQ_AMOUNT,AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED,AVG_PREV_PROP_APPROVED_WEIGHTED,MAX_PREV_PROP_APPROVED_WEIGHTED,AVG_PREV_RATE_DOWNPAYMENT,AVG_PREV_PROP_APPROVED,MAX_PREV_PROP_APPROVED,MIN_PREV_PROP_APPROVED,AVG_PREV_INT_RATE,SUM_PREV_URGENT_NEEDS,SUM_PREV_REPAIRS,SUM_PREV_OTHER,SUM_PREV_LIMIT_REJECT,SUM_REFUSED_CONTRACT,SUM_CANC_CONTRACT,SUM_APPR_CONTRACT,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,COUNT_PREV_WALK_IN,COUNT_PREV_HIGH_YIELD,COUNT_PREV_LOW_YIELD,AVG_SYNTH_TARGET,SUM_SYNTH_TARGET_WEIGHTED,SUM_SYNTH_TARGET,MAX_SYNTH_TARGET,MIN_SYNTH_TARGET,RANGE_SYNTH_TARGET,SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE,SUM_DAYS_FIRST_DRAWING_SENTINEL,SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,SUM_DAYS_LAST_DUE_LT_FIRST_VERSION,MIN_RATE_INTEREST_PRIMARY_12M,AVG_RATE_INTEREST_PRIVILEGED_12M,SUM_REFUSED_CONTRACT_6M,SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M,SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST
_12M,SUM_NAME_GOODS_CATEGORY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_CSTR_6M,SUM_NAME_PAYMENT_TYPE_XNA_6M,COUNT_NAME_CLIENT_TYPE_REPEATER_12M,COUNT_NAME_CLIENT_TYPE_NEW_12M,AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M
0,100001,,,,,,,,,1.0,-1612.0,-1612.0,-1612.0,0.0,23787.0,23787.0,23787.0,13.67069,13.67069,13.67069,6.020501,6.020501,6.020501,0.00346,0.00346,0.00346,3951.0,3951.0,3951.0,2.27069,2.27069,2.27069,-1740.0,-1740.0,0.0,0.0,0.0,14.273276,14.273276,24835.5,24835.5,6e-05,0.00055,0.00055,0.104326,0.957782,0.957782,0.957782,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.101729,5.8e-05,0.101729,0.101729,0.101729,0.0,0.0,1.0,0.000575,0.000575,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,
1,100002,,9251.775,,,,1.0,,1.0,1.0,-17.0,-17.0,-17.0,0.0,179055.0,179055.0,179055.0,295.470297,295.470297,295.470297,19.353584,19.353584,19.353584,0.031937,0.031937,0.031937,9251.775,9251.775,9251.775,15.266955,15.266955,15.266955,-606.0,-606.0,0.0,0.0,0.0,295.470297,295.470297,179055.0,179055.0,0.0,0.00165,0.00165,0.0,1.0,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.071974,0.000119,0.071974,0.071974,0.071974,0.0,0.0,1.0,0.00165,0.00165,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,
2,100003,,,,,,,,,3.0,-1976.0,-527.0,-1047.333333,1449.0,68053.5,1035882.0,484191.0,29.070269,1388.581769,612.90394,5.399568,10.531859,8.677472,0.004315,0.014118,0.008318,6737.31,98356.995,56553.99,2.877962,131.845838,70.901357,-2341.0,-746.0,1595.0,0.0,0.0,547.812073,1206.434316,435436.5,900000.0,2.1e-05,0.001071,0.001543,0.05003,1.057664,1.15098,0.989013,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,0.078878,0.000225,0.236634,0.090332,0.070374,0.019958,2.0,3.0,0.002975,0.00134,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,
3,100004,,,,,,,,,1.0,-714.0,-714.0,-714.0,0.0,20106.0,20106.0,20106.0,24.669939,24.669939,24.669939,3.753045,3.753045,3.753045,0.004605,0.004605,0.004605,5357.25,5357.25,5357.25,6.573313,6.573313,6.573313,-815.0,-815.0,0.0,0.0,0.0,29.793865,29.793865,24282.0,24282.0,0.00026,0.001016,0.001016,0.212008,0.828021,0.828021,0.828021,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.119115,0.000146,0.119115,0.119115,0.119115,0.0,0.0,1.0,0.001227,0.001227,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,
4,100005,,,,0.060611,,,,,2.0,-460.0,-460.0,-460.0,0.0,0.0,40153.5,20076.75,0.0,53.042933,26.521466,8.342371,8.342371,8.342371,0.01102,0.01102,0.01102,4813.2,4813.2,4813.2,6.358256,6.358256,6.358256,-757.0,-315.0,442.0,1.0,0.5,29.469947,58.939894,22308.75,44617.5,0.000144,0.001189,0.001189,0.108964,0.89995,0.89995,0.89995,,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.071063,0.0003,0.142127,0.081516,0.060611,0.020905,0.0,1.0,0.001321,0.001321,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,


time: 188 ms


#### Write new aggregates to file

Definitions should also be added to preprocessing script

previous_agg.to_csv(path + "previous_agg.csv", index=False, header=True)

# Point of Sale

#### Load data

Pull in performance data

In [183]:
# Only the applicant key and the label are needed to score new aggregates.
frame = pd.read_csv(
    f"{path}train.csv",
    usecols=["SK_ID_CURR", "TARGET"],
)

time: 9.13 s


Load raw POS cash balance data for calculating new aggregates

In [184]:
# Raw monthly POS / cash loan balance records (input to pos_cash_agg_func).
pos_cash = pd.read_csv(f"{path}POS_CASH_balance.csv")

time: 15.8 s


In [187]:
# Peek at rows with MONTHS_BALANCE >= -6 that have a positive SK_DPD.
(
    pos_cash
    .query("MONTHS_BALANCE >= -6 and SK_DPD > 0")
    .head()
)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
1312,2270633,274147,-3,24.0,17.0,Active,16,0
7490,1121698,417906,-5,12.0,3.0,Active,8,8
9220,1467136,125771,-6,12.0,2.0,Active,2,2
9470,2675685,224293,-6,36.0,19.0,Active,4,4
10169,1754696,128260,-4,12.0,2.0,Active,3,3


time: 196 ms


#### Aggregation function

In [188]:
def pos_cash_agg_func(g):
    """Compute recent-window POS cash aggregates for one group of rows.

    Intended for ``pos_cash.groupby("SK_ID_CURR").apply(pos_cash_agg_func)``.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of one group. Must contain the columns ``MONTHS_BALANCE``,
        ``SK_DPD``, ``SK_DPD_DEF`` and ``CNT_INSTALMENT_FUTURE``.

    Returns
    -------
    pandas.Series
        One value per aggregate, indexed by aggregate name. ``MAX_``/``MIN_``
        aggregates are NaN when the group has no rows inside the window;
        ``COUNT_`` aggregates are 0 in that case.
    """
    # Window masks: MONTHS_BALANCE >= -k keeps rows whose balance month is
    # at most k months back.
    mask3 = g["MONTHS_BALANCE"] >= -3
    mask6 = g["MONTHS_BALANCE"] >= -6
    mask12 = g["MONTHS_BALANCE"] >= -12
    overdue = g["SK_DPD"] > 0

    future = g["CNT_INSTALMENT_FUTURE"]
    future_x_dpd = future * g["SK_DPD"]
    future_x_dpd_def = future * g["SK_DPD_DEF"]

    # NOTE(review): the dict was empty in the saved notebook, so the function
    # returned an empty Series even though nine aggregates appear in the
    # recorded outputs. The definitions below are reconstructed from those
    # recorded column names -- confirm against the preprocessing script.
    d = {
        "MAX_POS_DPD_6M": g.loc[mask6, "SK_DPD"].max(),
        "MAX_POS_DPD_DEF_6M": g.loc[mask6, "SK_DPD_DEF"].max(),
        "COUNT_POS_OVERDUE_6M": (mask6 & overdue).sum(),
        "COUNT_POS_OVERDUE_3M": (mask3 & overdue).sum(),
        "MIN_CNT_INSTALMENT_FUTURE_6M": future[mask6].min(),
        "MAX_CNT_INSTALMENT_FUTURE_6M": future[mask6].max(),
        "MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_6M": future_x_dpd[mask6].max(),
        "MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_DEF_6M": future_x_dpd_def[mask6].max(),
        "MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_12M": future_x_dpd[mask12].max(),
    }

    return pd.Series(d)

time: 20 ms


#### Process data and join

Calculate new aggregates

In [189]:
# Apply the aggregation function once per SK_ID_CURR and turn the group key back
# into a regular column. The recorded run of this cell took about 54 minutes.
pos_cash_agg_new = (
    pos_cash
    .groupby("SK_ID_CURR")
    .apply(pos_cash_agg_func)
    .reset_index()
)
# Every column except the join key is a newly computed aggregate.
new_cols = [col for col in pos_cash_agg_new.columns if col != "SK_ID_CURR"]

time: 54min 21s


In [190]:
# Names of the freshly computed aggregate columns (everything except the SK_ID_CURR key).
new_cols

['MAX_POS_DPD_6M',
 'MAX_POS_DPD_DEF_6M',
 'COUNT_POS_OVERDUE_6M',
 'COUNT_POS_OVERDUE_3M',
 'MIN_CNT_INSTALMENT_FUTURE_6M',
 'MAX_CNT_INSTALMENT_FUTURE_6M',
 'MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_6M',
 'MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_DEF_6M',
 'MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_12M']

time: 4.52 ms


Load old aggregates and join with new

In [191]:
# Read the previously saved aggregates and left-join the new ones onto them by
# SK_ID_CURR, so every row of the saved file is kept.
pos_cash_agg = (
    pd.read_csv(path + "pos_cash_agg.csv")
    .merge(pos_cash_agg_new, how="left", on="SK_ID_CURR")
)

time: 454 ms


Merge with performance data

In [192]:
# Attach the aggregates to the (SK_ID_CURR, TARGET) frame; the left join keeps
# every row of `frame`.
df = frame.merge(pos_cash_agg, how="left", on="SK_ID_CURR")
# Candidate model inputs: every column except the label and the key.
all_cols = df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist()
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,MAX_POS_DPD,MAX_POS_DPD_DEF,NUM_POS_CASH,MAX_POS_DPD_6M,MAX_POS_DPD_DEF_6M,COUNT_POS_OVERDUE_6M,COUNT_POS_OVERDUE_3M,MIN_CNT_INSTALMENT_FUTURE_6M,MAX_CNT_INSTALMENT_FUTURE_6M,MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_6M,MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_DEF_6M,MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_12M
0,100002,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.0,11.0,0.0,0.0,0.0
1,100003,0,0.0,0.0,3.0,,,0.0,0.0,,,,,
2,100004,0,0.0,0.0,1.0,,,0.0,0.0,,,,,
3,100006,0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0,0.0
4,100007,0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,13.0,18.0,0.0,0.0,0.0


time: 337 ms


In [193]:
# Shape check after the left join onto `frame`.
df.shape

(307511, 14)

time: 3.75 ms


#### Fit quick model with all aggregates

In [194]:
# Baseline fit: all old and new POS cash aggregates, 3-fold CV.
result = quick_model(
    X=df,
    y=df["TARGET"],
    n_estimators=100,
    num_leaves=13,
    usecols=all_cols,
    dropcols=[],
    folds=3,
)

time: 10.4 s


CV AUC mean

In [195]:
# Mean ROC AUC across the CV folds, rounded to 4 decimals for display.
round(result["auc_mean"], 4)

0.5841

time: 4.76 ms


#### Iterate over each new aggregate and compute leave-one-out AUC

In [196]:
# Leave-one-out check: refit without each new aggregate in turn and flag those
# whose absence leaves the CV AUC at or above the all-aggregates baseline.
baseline = result["auc_mean"]
to_drop = []

for c in new_cols:
    auc = quick_model(
        X=df,
        y=df["TARGET"],
        n_estimators=100,
        num_leaves=13,
        usecols=all_cols,
        dropcols=[c],
        folds=3,
    )["auc_mean"]
    if auc >= baseline:
        to_drop.append(c)
    print(": ".join([c, str(round(auc, 4))]))

MAX_POS_DPD_6M: 0.5843
MAX_POS_DPD_DEF_6M: 0.5842
COUNT_POS_OVERDUE_6M: 0.5843
COUNT_POS_OVERDUE_3M: 0.5843
MIN_CNT_INSTALMENT_FUTURE_6M: 0.583
MAX_CNT_INSTALMENT_FUTURE_6M: 0.5789
MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_6M: 0.5843
MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_DEF_6M: 0.5843
MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_12M: 0.5818
time: 1min 27s


Aggregates to be dropped

In [197]:
# Aggregates flagged by the leave-one-out loop (CV AUC without them was >= baseline).
to_drop

['MAX_POS_DPD_6M',
 'MAX_POS_DPD_DEF_6M',
 'COUNT_POS_OVERDUE_6M',
 'COUNT_POS_OVERDUE_3M',
 'MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_6M',
 'MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_DEF_6M']

time: 5.84 ms


Aggregates to be kept

In [198]:
# New aggregates that were not flagged for removal, in their original order.
[c for c in new_cols if c not in to_drop]

['MIN_CNT_INSTALMENT_FUTURE_6M',
 'MAX_CNT_INSTALMENT_FUTURE_6M',
 'MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_12M']

time: 6.66 ms


Drop unimportant aggregates

In [206]:
# Remove the new aggregates that the leave-one-out check flagged as not helping.
# The names are spelled out (they match the `to_drop` output above) because
# `to_drop` is re-bound by the later feature-selection section.
# Rebinding instead of `inplace=True`, together with errors="ignore", keeps this
# cell safe to re-run: columns that are already gone are skipped rather than
# raising KeyError.
pos_cash_agg = pos_cash_agg.drop(['MAX_POS_DPD_6M',
 'MAX_POS_DPD_DEF_6M',
 'COUNT_POS_OVERDUE_6M',
 'COUNT_POS_OVERDUE_3M',
 'MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_6M',
 'MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_DEF_6M'], axis=1, errors="ignore")
pos_cash_agg.shape

(337252, 7)

time: 159 ms


#### Write new aggregates to file

Definitions should also be added to the preprocessing script.

pos_cash_agg.to_csv(path + "pos_cash_agg.csv", index=False, header=True)

# AUC-based feature selection

In [199]:
# Reload the complete training table; this re-binds `df` and `all_cols` from the
# POS cash section above.
df = pd.read_csv(path + "train.csv")
# Candidate features: every column except the label and the key.
all_cols = df.columns.drop(["TARGET", "SK_ID_CURR"]).tolist()

time: 53.6 s


In [200]:
# Shape of the full training table (rows, columns including TARGET and SK_ID_CURR).
df.shape

(307511, 494)

time: 3.72 ms


#### Fit quick model with all features

In [201]:
# Baseline fit on every feature of the full training table, 3-fold CV.
result = quick_model(
    X=df,
    y=df["TARGET"],
    n_estimators=100,
    num_leaves=13,
    usecols=all_cols,
    dropcols=[],
    folds=3,
)

time: 2min 30s


CV AUC mean

In [202]:
# Mean ROC AUC across the CV folds, rounded to 4 decimals for display.
round(result["auc_mean"], 4)

0.7817

time: 18.7 ms


#### Iterate over each feature and drop if AUC contribution is not positive

In [None]:
# Greedy backward elimination: walk the features once, refitting without the
# current feature and everything already discarded. A feature is discarded when
# the CV AUC without it is at least the running baseline, and that AUC then
# becomes the new baseline.
baseline = result["auc_mean"]
to_drop = []

for c in all_cols:
    auc = quick_model(
        X=df,
        y=df["TARGET"],
        n_estimators=100,
        num_leaves=13,
        usecols=all_cols,
        dropcols=[*to_drop, c],
        folds=3,
    )["auc_mean"]
    if auc >= baseline:
        baseline = auc
        to_drop.append(c)
        print(c + " dropped...")

In [212]:
# Write the training frame without the columns collected in `to_drop` to train_pruned.csv.
df.drop(to_drop, axis=1).to_csv(path + "train_pruned.csv", index=False, header=True)

time: 2min 10s
