# Home Credit Supervised Feature Transformation

In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective
from IPython.display import display
import warnings
import pickle
import gc

%load_ext autotime

pd.options.display.max_columns = None
warnings.filterwarnings("ignore")
gc.enable()
np.random.seed(123)

path = "/Users/dsaxton/home_credit_default/"

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 4.87 ms


In [3]:
impute = Imputer(strategy="median")
scale = StandardScaler()

time: 1.65 ms


# Bureau Aggregate Synthetic Target

In [102]:
frame = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"])
bureau_agg = pd.read_csv(path + "bureau_agg.csv")

df = pd.merge(frame, bureau_agg, how="left", on="SK_ID_CURR")
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 21.8 s


In [103]:
df = pd.DataFrame(scale.fit_transform(impute.fit_transform(df.replace([-np.inf, np.inf], np.nan))), columns=df.columns)

time: 12.2 s


In [104]:
linear_model = LogisticRegressionCV(Cs=[np.exp(i) for i in range(-10, 10)], 
                                   penalty="l2", 
                                   fit_intercept=True, 
                                   scoring="roc_auc", 
                                   cv=5)

linear_model.fit(df, y)

LogisticRegressionCV(Cs=[4.5399929762484854e-05, 0.00012340980408667956, 0.00033546262790251185, 0.0009118819655545162, 0.0024787521766663585, 0.006737946999085467, 0.01831563888873418, 0.049787068367863944, 0.1353352832366127, 0.36787944117144233, 1.0, 2.718281828459045, 7.38905609893065, 20.085536923187668, 54.598150033144236, 148.4131591025766, 403.4287934927351, 1096.6331584284585, 2980.9579870417283, 8103.083927575384],
           class_weight=None, cv=5, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=100, multi_class='ovr',
           n_jobs=1, penalty='l2', random_state=None, refit=True,
           scoring='roc_auc', solver='lbfgs', tol=0.0001, verbose=0)

time: 3min 11s


#### Get out of fold predictions

In [105]:
kfold = StratifiedKFold(n_splits=5)
scores = pd.DataFrame({"SK_ID_CURR": [], "BUREAU_AGG_SYNTHETIC_TARGET": []})

time: 6.76 ms


In [106]:
for train_idx, test_idx in kfold.split(df, y):
    clf = LogisticRegression(C=linear_model.C_[0], penalty="l2")
    print("Fitting...")
    clf.fit(df.iloc[train_idx], y[train_idx])
    print("Scoring...")
    fold_scores = pd.DataFrame({"SK_ID_CURR": sk_id_curr[test_idx], "BUREAU_AGG_SYNTHETIC_TARGET": clf.predict_proba(df.iloc[test_idx])[:,1]})
    scores = pd.concat([scores, fold_scores], axis=0)

Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
time: 7min 44s


#### Append to bureau_agg

In [107]:
bureau_agg = pd.merge(bureau_agg, scores, how="left", on="SK_ID_CURR")

time: 441 ms


#### Fill in test set cases

Fit model on full training data and predict TARGET for the test cases

In [108]:
clf = LogisticRegression(C=linear_model.C_[0], penalty="l2")
clf.fit(df, y)

LogisticRegression(C=54.598150033144236, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

time: 2min 15s


In [109]:
null_id = bureau_agg["BUREAU_AGG_SYNTHETIC_TARGET"].isnull()
temp_frame = bureau_agg[null_id].drop(["SK_ID_CURR", "BUREAU_AGG_SYNTHETIC_TARGET"], axis=1).replace([-np.inf, np.inf], np.nan)
arr = scale.fit_transform(impute.fit_transform(temp_frame))
bureau_agg.loc[null_id, "BUREAU_AGG_SYNTHETIC_TARGET"] = clf.predict_proba(pd.DataFrame(arr, columns=temp_frame.columns))[:,1]

time: 2.45 s


In [110]:
bureau_agg.shape

(305811, 79)

time: 2.91 ms


In [122]:
bureau_agg.head()

Unnamed: 0,SK_ID_CURR,SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M,SUM_CC_DEBT_6M,SUM_CC_DEBT_12M,MAX_WORST_DQ_BUREAU_BALANCE_6M,MAX_WORST_DQ_BUREAU_BALANCE_12M,MAX_BUREAU_UTILIZATION_6M,MAX_BUREAU_UTILIZATION_12M,COUNT_ACTIVE_6M,COUNT_ACTIVE_12M,COUNT_ACTIVE_24M,DAYS_REMAINING_ACTIVE,MAX_CREDIT_DAY_OVERDUE_6M,MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_12M_24M,BUREAU_SUM_DEBT_DIFF_6M_12M,BUREAU_SUM_DEBT_DIFF_12M_24M,MAX_CNT_CREDIT_PROLONG,AVG_LEN_BUREAU_BALANCE,PROP_CURRENT,PROP_CLOSED,PROP_CURRENT_WEIGHTED,MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE,MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE,SUM_SUM_CURRENT_BUREAU_BALANCE,AVG_PROP_CURRENT,AVG_PROP_DQ,MAX_PROP_DQ,AVG_PROP_CURRENT_WEIGHTED,MIN_PROP_CURRENT_WEIGHTED,AVG_PROP_DQ_WEIGHTED,MAX_PROP_DQ_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,MIN_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ_WEIGHTED_AMT,MAX_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,TOTAL_AMT_CREDIT_SUM_POS_DAYS,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_LEN_BUREAU_BALANCE,SUM_LEN_BUREAU_BALANCE,MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE,MIN_DAYS_CREDIT_ENDDATE,MAX_DAYS_CREDIT_ENDDATE,SUM_DAYS_CREDIT_ENDDATE,SUM_NULL_DAYS_ENDDATE_FACT,COUNT_BUREAU_RECORDS,COUNT_ACTIVE,MAX_CREDIT_DAY_OVERDUE_WEIGHTED,SUM_CREDIT_DAY_OVERDUE_WEIGHTED,MAX_CREDIT_DAY_OVERDUE,SUM_CREDIT_DAY_OVERDUE,DAYS_SINCE_APPLIED,SUM_INVERSE_DAYS_CREDIT,MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,MAX_AMT_CREDIT_MAX_OVERDUE,SUM_AMT_CREDIT_MAX_OVERDUE,SUM_CNT_CREDIT_PROLONG,SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED,SUM_AMT_CREDIT_SUM_DEBT,BUREAU_UTILIZATION_AVG,BUREAU_UTILIZATION_MAX,BUREAU_PROP_SUM_OVERDUE_AVG,BUREAU_PROP_MAX_OVERDUE_AVG,MAX_DAYS_CREDIT_UPDATE,RANGE_DAYS_CREDIT_UPDATE,DAYS_CREDIT_RANGE,TOTAL_AMT_CREDIT_SUM_WEIGHTED,TOTAL_AMT_CREDIT_SUM,COUNT_CREDIT_CARD,COUNT_CAR_LOAN,COUNT_MORTGAGE,SUM_AMT_ANNUITY,BUREAU_AGG_SYNTHETIC_TARGET
0,100001,603.706712,0.0,0.0,1.0,1.0,0.987405,0.987405,3.0,3.0,3.0,3091.0,0.0,,,,596686.5,596686.5,0.0,8.857143,0.983871,1.774194,0.007155,47.5,0.5,47.0,61.0,0.992481,0.007519,0.052632,0.349547,0.021053,0.000835,0.005848,120775.784672,1800.0,282.105263,1974.736842,0.142857,0.111111,0.015873,884025.0,3091.0,19.0,19.0,-51.0,-1329.0,1778.0,577.0,3.0,7.0,3.0,0.0,0.0,0.0,0.0,49.0,0.029363,,0.0,,0.0,0.0,53216.5875,596686.5,inf,inf,0.0,,-6.0,149.0,1523.0,100412.66129,1453365.0,0.0,0.0,0.0,24817.5,0.102038
1,100002,315.103846,0.0,0.0,0.0,0.0,0.54618,0.54618,2.0,2.0,2.0,780.0,0.0,,,,245781.0,245781.0,0.0,10.875,0.689655,0.264368,0.003698,40.5,1.5,39.0,60.0,0.716964,0.283036,0.5,0.109328,0.014109,0.010476,0.025641,4863.768166,0.0,1617.905476,7012.987013,0.75,0.051282,0.027542,638235.0,927.0,20.0,20.0,-47.0,-1072.0,780.0,-2094.0,2.0,8.0,2.0,0.0,0.0,0.0,0.0,103.0,0.017755,148.3425,153.695563,5043.645,8405.145,0.0,35111.571429,245781.0,inf,inf,0.0,inf,-7.0,1178.0,1334.0,69432.89321,865055.565,4.0,0.0,0.0,0.0,0.110174
2,100003,0.0,0.0,0.0,,,0.0,0.0,1.0,1.0,1.0,1216.0,0.0,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,810000.0,1216.0,,,,-2434.0,1216.0,-2178.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,606.0,0.003938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-43.0,2088.0,1980.0,19188.078259,1017400.5,2.0,0.0,0.0,0.0,0.051376
3,100004,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,0.0,0.0,,,,-595.0,-382.0,-977.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,408.0,0.003205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-382.0,300.0,918.0,386.044202,189037.8,0.0,0.0,0.0,0.0,0.06984
4,100005,617.739835,0.0,0.0,0.0,0.0,0.954794,0.954794,2.0,2.0,2.0,1446.0,0.0,,,,568408.5,568408.5,0.0,5.333333,1.0,0.3125,0.086957,8.5,1.0,7.5,16.0,1.0,0.0,0.0,0.539216,0.117647,0.0,0.0,107036.117647,6882.352941,0.0,0.0,0.0,0.0,0.0,598626.0,1446.0,8.0,8.0,-12.0,-128.0,1324.0,1318.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,62.0,0.026109,0.0,0.0,0.0,0.0,0.0,50188.368035,568408.5,inf,inf,0.0,0.0,-11.0,110.0,311.0,53154.691016,657126.0,1.0,0.0,0.0,4261.5,0.130488


time: 132 ms


bureau_agg.to_csv(path + "bureau_agg.csv", index=False, header=True)

# Previous Application Aggregate Synthetic Target

In [111]:
frame = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"])
previous_agg = pd.read_csv(path + "previous_agg.csv")

df = pd.merge(frame, previous_agg, how="left", on="SK_ID_CURR")
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 24.8 s


In [112]:
df = pd.DataFrame(scale.fit_transform(impute.fit_transform(df.replace([-np.inf, np.inf], np.nan))), columns=df.columns)

time: 13.5 s


In [113]:
linear_model = LogisticRegressionCV(Cs=[np.exp(i) for i in range(-10, 10)], 
                                   penalty="l2", 
                                   fit_intercept=True, 
                                   scoring="roc_auc", 
                                   cv=5)

linear_model.fit(df, y)

LogisticRegressionCV(Cs=[4.5399929762484854e-05, 0.00012340980408667956, 0.00033546262790251185, 0.0009118819655545162, 0.0024787521766663585, 0.006737946999085467, 0.01831563888873418, 0.049787068367863944, 0.1353352832366127, 0.36787944117144233, 1.0, 2.718281828459045, 7.38905609893065, 20.085536923187668, 54.598150033144236, 148.4131591025766, 403.4287934927351, 1096.6331584284585, 2980.9579870417283, 8103.083927575384],
           class_weight=None, cv=5, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=100, multi_class='ovr',
           n_jobs=1, penalty='l2', random_state=None, refit=True,
           scoring='roc_auc', solver='lbfgs', tol=0.0001, verbose=0)

time: 4min 35s


#### Get out of fold predictions

In [114]:
kfold = StratifiedKFold(n_splits=5)
scores = pd.DataFrame({"SK_ID_CURR": [], "PREVIOUS_AGG_SYNTHETIC_TARGET": []})

time: 8.89 ms


In [115]:
for train_idx, test_idx in kfold.split(df, y):
    clf = LogisticRegression(C=linear_model.C_[0], penalty="l2")
    print("Fitting...")
    clf.fit(df.iloc[train_idx], y[train_idx])
    print("Scoring...")
    fold_scores = pd.DataFrame({"SK_ID_CURR": sk_id_curr[test_idx], "PREVIOUS_AGG_SYNTHETIC_TARGET": clf.predict_proba(df.iloc[test_idx])[:,1]})
    scores = pd.concat([scores, fold_scores], axis=0)

Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
time: 4min 38s


#### Append to previous_agg

In [116]:
previous_agg = pd.merge(previous_agg, scores, how="left", on="SK_ID_CURR")

time: 515 ms


#### Fill in test set cases

Fit model on full training data and predict TARGET for the test cases

In [117]:
clf = LogisticRegression(C=linear_model.C_[0], penalty="l2")
clf.fit(df, y)

LogisticRegression(C=0.36787944117144233, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

time: 1min 17s


In [118]:
null_id = previous_agg["PREVIOUS_AGG_SYNTHETIC_TARGET"].isnull()
temp_frame = previous_agg[null_id].drop(["SK_ID_CURR", "PREVIOUS_AGG_SYNTHETIC_TARGET"], axis=1).replace([-np.inf, np.inf], np.nan)
arr = scale.fit_transform(impute.fit_transform(temp_frame))
previous_agg.loc[null_id, "PREVIOUS_AGG_SYNTHETIC_TARGET"] = clf.predict_proba(pd.DataFrame(arr, columns=temp_frame.columns))[:,1]

time: 2.49 s


In [119]:
previous_agg.shape

(338857, 91)

time: 4.97 ms


In [121]:
previous_agg.head()

Unnamed: 0,SK_ID_CURR,MIN_PREV_AMT_ANNUITY_12M,MIN_PREV_AMT_ANNUITY_24M,MIN_PREV_PROP_APPROVED_12M,AVG_SYNTH_TARGET_12M,AVG_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_24M,MAX_PREV_PROP_APPROVED_12M,MAX_PREV_PROP_APPROVED_24M,COUNT_PREV_APP,MIN_PREV_DAYS_TERMINATION,MAX_PREV_DAYS_TERMINATION,AVG_PREV_DAYS_TERMINATION,RANGE_PREV_DAYS_TERMINATION,MIN_PREV_AMT_CREDIT,MAX_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT,MIN_PREV_AMT_CREDIT_WEIGHTED,MAX_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,MIN_PREV_AMT_CREDIT_DIV_ANNUITY,MAX_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MIN_PREV_AMT_ANNUITY,MAX_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY,MIN_PREV_AMT_ANNUITY_WEIGHTED,MAX_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_ANNUITY_WEIGHTED,MIN_DAYS_DECISION,MAX_DAYS_DECISION,RANGE_DAYS_DECISION,SUM_DAYS_LAST_DUE_NULL,AVG_DAYS_LAST_DUE_NULL,AVG_PREV_REQ_AMOUNT_WEIGHTED,MAX_PREV_REQ_AMOUNT_WEIGHTED,AVG_PREV_REQ_AMOUNT,MAX_PREV_REQ_AMOUNT,AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED,AVG_PREV_PROP_APPROVED_WEIGHTED,MAX_PREV_PROP_APPROVED_WEIGHTED,AVG_PREV_RATE_DOWNPAYMENT,AVG_PREV_PROP_APPROVED,MAX_PREV_PROP_APPROVED,MIN_PREV_PROP_APPROVED,AVG_PREV_INT_RATE,SUM_PREV_URGENT_NEEDS,SUM_PREV_REPAIRS,SUM_PREV_OTHER,SUM_PREV_LIMIT_REJECT,SUM_REFUSED_CONTRACT,SUM_CANC_CONTRACT,SUM_APPR_CONTRACT,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,COUNT_PREV_WALK_IN,COUNT_PREV_HIGH_YIELD,COUNT_PREV_LOW_YIELD,AVG_SYNTH_TARGET,SUM_SYNTH_TARGET_WEIGHTED,SUM_SYNTH_TARGET,MAX_SYNTH_TARGET,MIN_SYNTH_TARGET,RANGE_SYNTH_TARGET,SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE,SUM_DAYS_FIRST_DRAWING_SENTINEL,SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,SUM_DAYS_LAST_DUE_LT_FIRST_VERSION,MIN_RATE_INTEREST_PRIMARY_12M,AVG_RATE_INTEREST_PRIVILEGED_12M,SUM_REFUSED_CONTRACT_6M,SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M,SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M,SUM_NAME_GOODS_CATEGORY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_CSTR_6M,SUM_NAME_PAYMENT_TYPE_XNA_6M,COUNT_NAME_CLIENT_TYPE_REPEATER_12M,COUNT_NAME_CLIENT_TYPE_NEW_12M,AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,PREVIOUS_AGG_SYNTHETIC_TARGET
0,100001,,,,,,,,,1.0,-1612.0,-1612.0,-1612.0,0.0,23787.0,23787.0,23787.0,13.67069,13.67069,13.67069,6.020501,6.020501,6.020501,0.00346,0.00346,0.00346,3951.0,3951.0,3951.0,2.27069,2.27069,2.27069,-1740.0,-1740.0,0.0,0.0,0.0,14.273276,14.273276,24835.5,24835.5,6e-05,0.00055,0.00055,0.104326,0.957782,0.957782,0.957782,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.101729,5.8e-05,0.101729,0.101729,0.101729,0.0,0.0,1.0,0.000575,0.000575,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.100105
1,100002,,9251.775,,,,1.0,,1.0,1.0,-17.0,-17.0,-17.0,0.0,179055.0,179055.0,179055.0,295.470297,295.470297,295.470297,19.353584,19.353584,19.353584,0.031937,0.031937,0.031937,9251.775,9251.775,9251.775,15.266955,15.266955,15.266955,-606.0,-606.0,0.0,0.0,0.0,295.470297,295.470297,179055.0,179055.0,0.0,0.00165,0.00165,0.0,1.0,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.071974,0.000119,0.071974,0.071974,0.071974,0.0,0.0,1.0,0.00165,0.00165,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.070127
2,100003,,,,,,,,,3.0,-1976.0,-527.0,-1047.333333,1449.0,68053.5,1035882.0,484191.0,29.070269,1388.581769,612.90394,5.399568,10.531859,8.677472,0.004315,0.014118,0.008318,6737.31,98356.995,56553.99,2.877962,131.845838,70.901357,-2341.0,-746.0,1595.0,0.0,0.0,547.812073,1206.434316,435436.5,900000.0,2.1e-05,0.001071,0.001543,0.05003,1.057664,1.15098,0.989013,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,0.078878,0.000225,0.236634,0.090332,0.070374,0.019958,2.0,3.0,0.002975,0.00134,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.022026
3,100004,,,,,,,,,1.0,-714.0,-714.0,-714.0,0.0,20106.0,20106.0,20106.0,24.669939,24.669939,24.669939,3.753045,3.753045,3.753045,0.004605,0.004605,0.004605,5357.25,5357.25,5357.25,6.573313,6.573313,6.573313,-815.0,-815.0,0.0,0.0,0.0,29.793865,29.793865,24282.0,24282.0,0.00026,0.001016,0.001016,0.212008,0.828021,0.828021,0.828021,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.119115,0.000146,0.119115,0.119115,0.119115,0.0,0.0,1.0,0.001227,0.001227,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.09567
4,100005,,,,0.060611,,,,,2.0,-460.0,-460.0,-460.0,0.0,0.0,40153.5,20076.75,0.0,53.042933,26.521466,8.342371,8.342371,8.342371,0.01102,0.01102,0.01102,4813.2,4813.2,4813.2,6.358256,6.358256,6.358256,-757.0,-315.0,442.0,1.0,0.5,29.469947,58.939894,22308.75,44617.5,0.000144,0.001189,0.001189,0.108964,0.89995,0.89995,0.89995,,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.071063,0.0003,0.142127,0.081516,0.060611,0.020905,0.0,1.0,0.001321,0.001321,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,0.115603


time: 159 ms


previous_agg.to_csv(path + "previous_agg.csv", index=False, header=True)

# Credit Card Aggregate Synthetic Target

In [92]:
frame = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"])
credit_card_agg = pd.read_csv(path + "credit_card_agg.csv")

df = pd.merge(frame, credit_card_agg, how="left", on="SK_ID_CURR")
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 6.22 s


In [93]:
df = pd.DataFrame(scale.fit_transform(impute.fit_transform(df.replace([-np.inf, np.inf], np.nan))), columns=df.columns)

time: 2.77 s


In [94]:
linear_model = LogisticRegressionCV(Cs=[np.exp(i) for i in range(-10, 10)], 
                                   penalty="l2", 
                                   fit_intercept=True, 
                                   scoring="roc_auc", 
                                   cv=5)

linear_model.fit(df, y)

LogisticRegressionCV(Cs=[4.5399929762484854e-05, 0.00012340980408667956, 0.00033546262790251185, 0.0009118819655545162, 0.0024787521766663585, 0.006737946999085467, 0.01831563888873418, 0.049787068367863944, 0.1353352832366127, 0.36787944117144233, 1.0, 2.718281828459045, 7.38905609893065, 20.085536923187668, 54.598150033144236, 148.4131591025766, 403.4287934927351, 1096.6331584284585, 2980.9579870417283, 8103.083927575384],
           class_weight=None, cv=5, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=100, multi_class='ovr',
           n_jobs=1, penalty='l2', random_state=None, refit=True,
           scoring='roc_auc', solver='lbfgs', tol=0.0001, verbose=0)

time: 2min 19s


#### Get out of fold predictions

In [95]:
kfold = StratifiedKFold(n_splits=5)
scores = pd.DataFrame({"SK_ID_CURR": [], "CREDIT_CARD_AGG_SYNTHETIC_TARGET": []})

time: 4.33 ms


In [96]:
for train_idx, test_idx in kfold.split(df, y):
    clf = LogisticRegression(C=linear_model.C_[0], penalty="l2")
    print("Fitting...")
    clf.fit(df.iloc[train_idx], y[train_idx])
    print("Scoring...")
    fold_scores = pd.DataFrame({"SK_ID_CURR": sk_id_curr[test_idx], "CREDIT_CARD_AGG_SYNTHETIC_TARGET": clf.predict_proba(df.iloc[test_idx])[:,1]})
    scores = pd.concat([scores, fold_scores], axis=0)

Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
time: 46 s


#### Append to previous_agg

In [97]:
credit_card_agg = pd.merge(credit_card_agg, scores, how="left", on="SK_ID_CURR")

time: 295 ms


#### Fill in test set cases

Fit model on full training data and predict TARGET for the test cases

In [98]:
clf = LogisticRegression(C=linear_model.C_[0], penalty="l2")
clf.fit(df, y)

LogisticRegression(C=0.006737946999085467, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

time: 12.6 s


In [99]:
null_id = credit_card_agg["CREDIT_CARD_AGG_SYNTHETIC_TARGET"].isnull()
temp_frame = credit_card_agg[null_id].drop(["SK_ID_CURR", "CREDIT_CARD_AGG_SYNTHETIC_TARGET"], axis=1).replace([-np.inf, np.inf], np.nan)
arr = scale.fit_transform(impute.fit_transform(temp_frame))
credit_card_agg.loc[null_id, "CREDIT_CARD_AGG_SYNTHETIC_TARGET"] = clf.predict_proba(pd.DataFrame(arr, columns=temp_frame.columns))[:,1]

time: 262 ms


In [101]:
credit_card_agg.shape

(103558, 47)

time: 2.88 ms


In [120]:
credit_card_agg.head()

Unnamed: 0,SK_ID_CURR,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M,CREDIT_CARD_AGG_SYNTHETIC_TARGET
0,100006,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,6.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,,0.055377
1,100011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54482.111149,189000.0,4031676.225,75.0,2.0,73.0,0.302678,1.05,891.528045,2520.0,65973.075311,0.004953,0.014,0.0,0.0,4.0,0.054054,4.0,180000.0,2432.432432,180000.0,1.0,inf,inf,0.0,0.0,0.0,0.0,,0.0,0.0,,,0.068132
2,100013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18159.919219,161420.22,1743352.245,96.0,1.0,95.0,0.115301,1.02489,230.066978,1944.407308,22086.429911,0.001461,0.012345,0.014493,0.014493,23.0,0.239583,7.0,571500.0,5953.125,157500.0,0.0,inf,inf,0.0,0.0,0.0,0.0,,0.0,0.0,,,0.067048
3,100021,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,2.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,,0.056155
4,100023,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,4.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,,,,0.065454


time: 178 ms


credit_card_agg.to_csv(path + "credit_card_agg.csv", index=False, header=True)

# Installments Aggregate Synthetic Target

In [123]:
frame = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"])
installment_agg = pd.read_csv(path + "installment_agg.csv")

df = pd.merge(frame, installment_agg, how="left", on="SK_ID_CURR")
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 15.2 s


In [124]:
df = pd.DataFrame(scale.fit_transform(impute.fit_transform(df.replace([-np.inf, np.inf], np.nan))), columns=df.columns)

time: 3.75 s


In [125]:
linear_model = LogisticRegressionCV(Cs=[np.exp(i) for i in range(-10, 10)], 
                                   penalty="l2", 
                                   fit_intercept=True, 
                                   scoring="roc_auc", 
                                   cv=5)

linear_model.fit(df, y)

LogisticRegressionCV(Cs=[4.5399929762484854e-05, 0.00012340980408667956, 0.00033546262790251185, 0.0009118819655545162, 0.0024787521766663585, 0.006737946999085467, 0.01831563888873418, 0.049787068367863944, 0.1353352832366127, 0.36787944117144233, 1.0, 2.718281828459045, 7.38905609893065, 20.085536923187668, 54.598150033144236, 148.4131591025766, 403.4287934927351, 1096.6331584284585, 2980.9579870417283, 8103.083927575384],
           class_weight=None, cv=5, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=100, multi_class='ovr',
           n_jobs=1, penalty='l2', random_state=None, refit=True,
           scoring='roc_auc', solver='lbfgs', tol=0.0001, verbose=0)

time: 2min 11s


#### Get out of fold predictions

In [126]:
kfold = StratifiedKFold(n_splits=5)
scores = pd.DataFrame({"SK_ID_CURR": [], "INSTALLMENT_AGG_SYNTHETIC_TARGET": []})

time: 5.4 ms


In [127]:
for train_idx, test_idx in kfold.split(df, y):
    clf = LogisticRegression(C=linear_model.C_[0], penalty="l2")
    print("Fitting...")
    clf.fit(df.iloc[train_idx], y[train_idx])
    print("Scoring...")
    fold_scores = pd.DataFrame({"SK_ID_CURR": sk_id_curr[test_idx], "INSTALLMENT_AGG_SYNTHETIC_TARGET": clf.predict_proba(df.iloc[test_idx])[:,1]})
    scores = pd.concat([scores, fold_scores], axis=0)

Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
time: 54.4 s


#### Append to previous_agg

In [128]:
installment_agg = pd.merge(installment_agg, scores, how="left", on="SK_ID_CURR")

time: 563 ms


#### Fill in test set cases

Fit model on full training data and predict TARGET for the test cases

In [129]:
clf = LogisticRegression(C=linear_model.C_[0], penalty="l2")
clf.fit(df, y)

LogisticRegression(C=0.36787944117144233, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

time: 10.2 s


In [130]:
null_id = installment_agg["INSTALLMENT_AGG_SYNTHETIC_TARGET"].isnull()
temp_frame = installment_agg[null_id].drop(["SK_ID_CURR", "INSTALLMENT_AGG_SYNTHETIC_TARGET"], axis=1).replace([-np.inf, np.inf], np.nan)
arr = scale.fit_transform(impute.fit_transform(temp_frame))
installment_agg.loc[null_id, "INSTALLMENT_AGG_SYNTHETIC_TARGET"] = clf.predict_proba(pd.DataFrame(arr, columns=temp_frame.columns))[:,1]

time: 851 ms


In [131]:
installment_agg.shape

(339587, 32)

time: 3.08 ms


In [132]:
installment_agg.head()

Unnamed: 0,SK_ID_CURR,SUM_UNDERPAYMENT_12M,SUM_UNDERPAYMENT_6M,MAX_PAYMENT_SIZE_6M,MAX_PAYMENT_SIZE_12M,MIN_PAYMENT_SIZE_6M,MAX_ABS_DAYS_INSTALMENT,COUNT_UNDERPAYMENT,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_WEIGHTED,MAX_UNDERPAYMENT,AVG_PAYMENT_SIZE_WEIGHTED,AVG_PAYMENT_SIZE,MAX_PAYMENT_SIZE_WEIGHTED,MAX_PAYMENT_SIZE,MIN_PAYMENT_SIZE_WEIGHTED,MIN_PAYMENT_SIZE,SUM_PAYMENT_WEIGHTED,SUM_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,MAX_DAYS_ENTRY_PAYMENT,MIN_DAYS_ENTRY_PAYMENT,RANGE_DAYS_ENTRY_PAYMENT,MAX_UNDERPAYMENT_6M,MAX_UNDERPAYMENT_12M,SUM_PAYMENT_6M,SUM_PAYMENT_DIFF_6M_12M,MAX_AMT_INSTALMENT_6M,MIN_AMT_INSTALMENT_6M,MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,INSTALLMENT_AGG_SYNTHETIC_TARGET
0,100001,0.0,0.0,,,,2916.0,0.0,0.0,0.0,0.0,3.116986,5885.132143,10.686671,17397.9,1.365586,3951.0,21.8189,41195.925,1.0,-1628.0,-2916.0,1288.0,,,0.0,0.0,,,11.0,-36.0,0.06575
1,100002,0.0,0.0,53093.745,53093.745,9251.775,565.0,0.0,0.0,0.0,0.0,95.448632,11559.247105,1083.545816,53093.745,15.761116,9251.775,1813.524009,219625.695,0.0,-49.0,-587.0,538.0,0.0,0.0,90100.845,34590.195,53093.745,9251.775,-12.0,-31.0,0.087013
2,100003,0.0,0.0,,,,2310.0,0.0,0.0,0.0,0.0,100.798053,64754.586,1030.947353,560835.36,2.899015,6662.97,2519.951327,1618864.65,0.0,-544.0,-2324.0,1780.0,,,0.0,0.0,,,-1.0,-14.0,0.050903
3,100004,0.0,0.0,,,,784.0,0.0,0.0,0.0,0.0,9.434878,7096.155,14.544656,10573.965,6.738679,5357.25,28.304633,21288.465,0.0,-727.0,-795.0,68.0,,,0.0,0.0,,,-3.0,-11.0,0.111008
4,100005,0.0,0.0,,,,706.0,0.0,0.0,0.0,0.0,11.09417,6240.205,37.566479,17656.245,6.539674,4813.2,99.847528,56161.845,1.0,-470.0,-736.0,266.0,,,0.0,0.0,,,1.0,-37.0,0.114271


time: 40.5 ms


installment_agg.to_csv(path + "installment_agg.csv", index=False, header=True)