# Home Credit Feature Transformation

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA, FastICA
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective
from IPython.display import display
import warnings
import pickle
import gc

%load_ext autotime

pd.options.display.max_columns = None
warnings.filterwarnings("ignore")
gc.enable()
np.random.seed(123)

path = "/Users/dsaxton/home_credit_default/"

impute = Imputer(strategy="median")
quant = QuantileTransformer(output_distribution="normal")
scale = StandardScaler()

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 4.65 ms


# Ridge Regression

## Bureau Aggregate Synthetic Target

In [2]:
frame = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"])
bureau_agg = pd.read_csv(path + "bureau_agg.csv", usecols=lambda c: ("AGG_SYNTH" not in c) and ("AGG_COMP" not in c))

df = pd.merge(frame, bureau_agg, how="left", on="SK_ID_CURR")
del frame
gc.collect()
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 21.1 s


In [3]:
df.head()

Unnamed: 0,SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M,SUM_CC_DEBT_6M,SUM_CC_DEBT_12M,MAX_WORST_DQ_BUREAU_BALANCE_6M,MAX_WORST_DQ_BUREAU_BALANCE_12M,MAX_BUREAU_UTILIZATION_6M,MAX_BUREAU_UTILIZATION_12M,COUNT_ACTIVE_6M,COUNT_ACTIVE_12M,COUNT_ACTIVE_24M,DAYS_REMAINING_ACTIVE,MAX_CREDIT_DAY_OVERDUE_6M,MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_12M_24M,BUREAU_SUM_DEBT_DIFF_6M_12M,BUREAU_SUM_DEBT_DIFF_12M_24M,MAX_CNT_CREDIT_PROLONG,AVG_LEN_BUREAU_BALANCE,PROP_CURRENT,PROP_CLOSED,PROP_CURRENT_WEIGHTED,MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE,MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE,SUM_SUM_CURRENT_BUREAU_BALANCE,AVG_PROP_CURRENT,AVG_PROP_DQ,MAX_PROP_DQ,AVG_PROP_CURRENT_WEIGHTED,MIN_PROP_CURRENT_WEIGHTED,AVG_PROP_DQ_WEIGHTED,MAX_PROP_DQ_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,MIN_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ_WEIGHTED_AMT,MAX_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,TOTAL_AMT_CREDIT_SUM_POS_DAYS,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_LEN_BUREAU_BALANCE,SUM_LEN_BUREAU_BALANCE,MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE,MIN_DAYS_CREDIT_ENDDATE,MAX_DAYS_CREDIT_ENDDATE,SUM_DAYS_CREDIT_ENDDATE,SUM_NULL_DAYS_ENDDATE_FACT,COUNT_BUREAU_RECORDS,COUNT_ACTIVE,MAX_CREDIT_DAY_OVERDUE_WEIGHTED,SUM_CREDIT_DAY_OVERDUE_WEIGHTED,MAX_CREDIT_DAY_OVERDUE,SUM_CREDIT_DAY_OVERDUE,DAYS_SINCE_APPLIED,SUM_INVERSE_DAYS_CREDIT,MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,MAX_AMT_CREDIT_MAX_OVERDUE,SUM_AMT_CREDIT_MAX_OVERDUE,SUM_CNT_CREDIT_PROLONG,SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED,SUM_AMT_CREDIT_SUM_DEBT,BUREAU_UTILIZATION_AVG,BUREAU_UTILIZATION_MAX,BUREAU_PROP_SUM_OVERDUE_AVG,BUREAU_PROP_MAX_OVERDUE_AVG,MAX_DAYS_CREDIT_UPDATE,RANGE_DAYS_CREDIT_UPDATE,DAYS_CREDIT_RANGE,TOTAL_AMT_CREDIT_SUM_WEIGHTED,TOTAL_AMT_CREDIT_SUM,COUNT_CREDIT_CARD,COUNT_CAR_LOAN,COUNT_MORTGAGE,SUM_AMT_ANNUITY
0,315.103846,0.0,0.0,0.0,0.0,0.54618,0.54618,2.0,2.0,2.0,780.0,0.0,,,,245781.0,245781.0,0.0,10.875,0.689655,0.264368,0.003698,40.5,1.5,39.0,60.0,0.716964,0.283036,0.5,0.109328,0.014109,0.010476,0.025641,4863.768166,0.0,1617.905476,7012.987013,0.75,0.051282,0.027542,638235.0,927.0,20.0,20.0,-47.0,-1072.0,780.0,-2094.0,2.0,8.0,2.0,0.0,0.0,0.0,0.0,103.0,0.017755,148.3425,153.695563,5043.645,8405.145,0.0,35111.571429,245781.0,inf,inf,0.0,inf,-7.0,1178.0,1334.0,69432.89321,865055.565,4.0,0.0,0.0,0.0
1,0.0,0.0,0.0,,,0.0,0.0,1.0,1.0,1.0,1216.0,0.0,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,810000.0,1216.0,,,,-2434.0,1216.0,-2178.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,606.0,0.003938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-43.0,2088.0,1980.0,19188.078259,1017400.5,2.0,0.0,0.0,0.0
2,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,0.0,0.0,,,,-595.0,-382.0,-977.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,408.0,0.003205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-382.0,300.0,918.0,386.044202,189037.8,0.0,0.0,0.0,0.0
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,0.0,0.0,,,,-783.0,-783.0,-783.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1149.0,0.00087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-783.0,0.0,0.0,186.781609,146250.0,0.0,0.0,0.0,0.0


time: 91.6 ms


In [4]:
df = pd.DataFrame(scale.fit_transform(impute.fit_transform(df.replace([-np.inf, np.inf], np.nan))), columns=df.columns)
df.head()

Unnamed: 0,SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M,SUM_CC_DEBT_6M,SUM_CC_DEBT_12M,MAX_WORST_DQ_BUREAU_BALANCE_6M,MAX_WORST_DQ_BUREAU_BALANCE_12M,MAX_BUREAU_UTILIZATION_6M,MAX_BUREAU_UTILIZATION_12M,COUNT_ACTIVE_6M,COUNT_ACTIVE_12M,COUNT_ACTIVE_24M,DAYS_REMAINING_ACTIVE,MAX_CREDIT_DAY_OVERDUE_6M,MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_12M_24M,BUREAU_SUM_DEBT_DIFF_6M_12M,BUREAU_SUM_DEBT_DIFF_12M_24M,MAX_CNT_CREDIT_PROLONG,AVG_LEN_BUREAU_BALANCE,PROP_CURRENT,PROP_CLOSED,PROP_CURRENT_WEIGHTED,MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE,MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE,SUM_SUM_CURRENT_BUREAU_BALANCE,AVG_PROP_CURRENT,AVG_PROP_DQ,MAX_PROP_DQ,AVG_PROP_CURRENT_WEIGHTED,MIN_PROP_CURRENT_WEIGHTED,AVG_PROP_DQ_WEIGHTED,MAX_PROP_DQ_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,MIN_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ_WEIGHTED_AMT,MAX_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,TOTAL_AMT_CREDIT_SUM_POS_DAYS,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_LEN_BUREAU_BALANCE,SUM_LEN_BUREAU_BALANCE,MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE,MIN_DAYS_CREDIT_ENDDATE,MAX_DAYS_CREDIT_ENDDATE,SUM_DAYS_CREDIT_ENDDATE,SUM_NULL_DAYS_ENDDATE_FACT,COUNT_BUREAU_RECORDS,COUNT_ACTIVE,MAX_CREDIT_DAY_OVERDUE_WEIGHTED,SUM_CREDIT_DAY_OVERDUE_WEIGHTED,MAX_CREDIT_DAY_OVERDUE,SUM_CREDIT_DAY_OVERDUE,DAYS_SINCE_APPLIED,SUM_INVERSE_DAYS_CREDIT,MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,MAX_AMT_CREDIT_MAX_OVERDUE,SUM_AMT_CREDIT_MAX_OVERDUE,SUM_CNT_CREDIT_PROLONG,SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED,SUM_AMT_CREDIT_SUM_DEBT,BUREAU_UTILIZATION_AVG,BUREAU_UTILIZATION_MAX,BUREAU_PROP_SUM_OVERDUE_AVG,BUREAU_PROP_MAX_OVERDUE_AVG,MAX_DAYS_CREDIT_UPDATE,RANGE_DAYS_CREDIT_UPDATE,DAYS_CREDIT_RANGE,TOTAL_AMT_CREDIT_SUM_WEIGHTED,TOTAL_AMT_CREDIT_SUM,COUNT_CREDIT_CARD,COUNT_CAR_LOAN,COUNT_MORTGAGE,SUM_AMT_ANNUITY
0,-0.017451,-0.36644,-0.368052,-0.149244,-0.179554,-0.010895,-0.010504,0.255989,0.223639,0.021481,-0.321577,-0.030216,-0.006791,0.001623,-0.001839,-0.21453,-0.21453,-0.153187,-0.292629,-6.091449,-0.433453,-0.172159,-0.578847,-0.625365,0.070782,0.598159,-5.934009,5.934009,5.104959,0.147871,-0.187341,1.167564,0.933341,-0.23016,-0.059154,0.228243,0.241201,2.426528,0.463551,0.808018,-0.206464,-0.378366,-0.085887,-0.085887,0.644639,-0.002689,-0.358515,-0.363513,-0.034576,0.651059,-0.029349,-0.034083,-0.033543,-0.049551,-0.049444,-0.715526,0.192498,0.054073,0.048104,-0.002021,0.006226,-0.151294,-0.045053,-0.215155,-0.008199,-0.006899,-0.005464,0.00598,0.357777,0.150541,0.073218,-0.027026,-0.248757,1.962993,-0.237459,-0.210096,-0.052514
1,-0.054782,-0.36644,-0.368052,-0.149244,-0.179554,-0.017304,-0.016913,-0.427915,-0.445819,-0.604823,-0.278515,-0.030216,-0.006791,0.001623,-0.001839,-0.397412,-0.397412,-0.153187,-0.136725,0.188252,-0.1471,-0.113857,-0.004981,-0.19817,-0.025517,-0.405341,0.190887,-0.190887,-0.243705,-0.155099,-0.119283,-0.101983,-0.11499,-0.091846,-0.04564,-0.059392,-0.057311,-0.209887,-0.176998,-0.154519,-0.133278,-0.353425,-0.16972,-0.16972,-0.010943,-0.56201,-0.30846,-0.370704,-0.635596,-0.327136,-0.633562,-0.034083,-0.033543,-0.049551,-0.049444,0.28645,-0.397029,-0.087456,-0.092504,-0.019744,-0.023189,-0.151294,-0.23545,-0.376695,-0.008581,-0.007198,-0.005464,0.00598,0.247248,1.022249,0.82989,-0.22562,-0.2088,0.529455,-0.237459,-0.210096,-0.052514
2,-0.054782,-0.36644,-0.368052,-0.149244,-0.179554,-0.007339,-0.007151,-1.111819,-1.115277,-1.231127,-0.398615,-0.030216,-0.006791,0.001623,-0.001839,-0.397412,-0.397412,-0.153187,-0.136725,0.188252,-0.1471,-0.113857,-0.004981,-0.19817,-0.025517,-0.405341,0.190887,-0.190887,-0.243705,-0.155099,-0.119283,-0.101983,-0.11499,-0.091846,-0.04564,-0.059392,-0.057311,-0.209887,-0.176998,-0.154519,-0.478407,-0.458367,-0.16972,-0.16972,-0.010943,0.193196,-0.491919,-0.267889,-1.236616,-0.816233,-1.237774,-0.034083,-0.033543,-0.049551,-0.049444,-0.107966,-0.428317,-0.087456,-0.092504,-0.019744,-0.023189,-0.151294,-0.23545,-0.376695,-0.008199,-0.006899,-0.005464,0.00598,-0.793568,-0.690515,-0.414052,-0.299935,-0.426062,-0.904082,-0.237459,-0.210096,-0.052514
3,-0.028147,-0.36644,-0.368052,-0.149244,-0.179554,-0.007339,-0.007151,-0.427915,-0.445819,0.021481,-0.317725,-0.030216,-0.006791,0.001623,-0.001839,-0.288425,-0.288425,-0.153187,-0.136725,0.188252,-0.1471,-0.113857,-0.004981,-0.19817,-0.025517,-0.405341,0.190887,-0.190887,-0.243705,-0.155099,-0.119283,-0.101983,-0.11499,-0.091846,-0.04564,-0.059392,-0.057311,-0.209887,-0.176998,-0.154519,-0.291682,-0.360243,-0.16972,-0.16972,-0.010943,-0.083179,-0.343705,-0.216953,-0.034576,-0.327136,-0.029349,-0.034083,-0.033543,-0.049551,-0.049444,-0.323102,-0.220406,-0.087456,-0.092504,-0.019744,-0.023189,-0.151294,-0.195289,-0.265607,-0.008199,-0.006899,-0.005464,0.00598,0.320934,-0.11193,-0.011117,-0.204876,-0.223408,-0.187314,-0.237459,-0.210096,-0.052514
4,-0.054782,-0.36644,-0.368052,-0.149244,-0.179554,-0.007339,-0.007151,-1.111819,-1.115277,-1.231127,-0.398615,-0.030216,-0.006791,0.001623,-0.001839,-0.397412,-0.397412,-0.153187,-0.136725,0.188252,-0.1471,-0.113857,-0.004981,-0.19817,-0.025517,-0.405341,0.190887,-0.190887,-0.243705,-0.155099,-0.119283,-0.101983,-0.11499,-0.091846,-0.04564,-0.059392,-0.057311,-0.209887,-0.176998,-0.154519,-0.478407,-0.458367,-0.16972,-0.16972,-0.010943,0.115992,-0.537956,-0.251282,-1.236616,-1.060782,-1.237774,-0.034083,-0.033543,-0.049551,-0.049444,1.368105,-0.527937,-0.087456,-0.092504,-0.019744,-0.023189,-0.151294,-0.23545,-0.376695,-0.008199,-0.006899,-0.005464,0.00598,-2.02474,-0.977891,-1.489323,-0.300722,-0.437285,-0.904082,-0.237459,-0.210096,-0.052514


time: 11 s


#### Get out of fold predictions

In [5]:
kfold = StratifiedKFold(n_splits=3)
scores = pd.DataFrame({"SK_ID_CURR": [], "BUREAU_AGG_SYNTHETIC_TARGET": []})

time: 2.74 ms


In [6]:
for train_indx, test_indx in kfold.split(df.values, y.values):
    print("Determining lambda and fitting...")
    clf = LogisticRegressionCV(Cs=10, 
                               penalty="l2", 
                               fit_intercept=False, 
                               scoring="roc_auc", 
                               cv=kfold)
    clf.fit(df.iloc[train_indx], y.iloc[train_indx])
    print("Scoring...")
    fold_scores = pd.DataFrame({"SK_ID_CURR": sk_id_curr.iloc[test_indx], "BUREAU_AGG_SYNTHETIC_TARGET": clf.predict_proba(df.iloc[test_indx])[:,1]})
    scores = pd.concat([scores, fold_scores], axis=0)
    
scores.head()

Determining lambda and fitting...
Scoring...
Determining lambda and fitting...
Scoring...
Determining lambda and fitting...
Scoring...


Unnamed: 0,SK_ID_CURR,BUREAU_AGG_SYNTHETIC_TARGET
0,100002.0,0.526688
1,100003.0,0.459155
2,100004.0,0.484266
3,100006.0,0.501675
4,100007.0,0.493031


time: 4min 2s


#### Append

In [7]:
bureau_agg = bureau_agg.merge(scores, how="left", on="SK_ID_CURR")
bureau_agg.head()

Unnamed: 0,SK_ID_CURR,SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M,SUM_CC_DEBT_6M,SUM_CC_DEBT_12M,MAX_WORST_DQ_BUREAU_BALANCE_6M,MAX_WORST_DQ_BUREAU_BALANCE_12M,MAX_BUREAU_UTILIZATION_6M,MAX_BUREAU_UTILIZATION_12M,COUNT_ACTIVE_6M,COUNT_ACTIVE_12M,COUNT_ACTIVE_24M,DAYS_REMAINING_ACTIVE,MAX_CREDIT_DAY_OVERDUE_6M,MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_12M_24M,BUREAU_SUM_DEBT_DIFF_6M_12M,BUREAU_SUM_DEBT_DIFF_12M_24M,MAX_CNT_CREDIT_PROLONG,AVG_LEN_BUREAU_BALANCE,PROP_CURRENT,PROP_CLOSED,PROP_CURRENT_WEIGHTED,MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE,MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE,SUM_SUM_CURRENT_BUREAU_BALANCE,AVG_PROP_CURRENT,AVG_PROP_DQ,MAX_PROP_DQ,AVG_PROP_CURRENT_WEIGHTED,MIN_PROP_CURRENT_WEIGHTED,AVG_PROP_DQ_WEIGHTED,MAX_PROP_DQ_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,MIN_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ_WEIGHTED_AMT,MAX_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,TOTAL_AMT_CREDIT_SUM_POS_DAYS,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_LEN_BUREAU_BALANCE,SUM_LEN_BUREAU_BALANCE,MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE,MIN_DAYS_CREDIT_ENDDATE,MAX_DAYS_CREDIT_ENDDATE,SUM_DAYS_CREDIT_ENDDATE,SUM_NULL_DAYS_ENDDATE_FACT,COUNT_BUREAU_RECORDS,COUNT_ACTIVE,MAX_CREDIT_DAY_OVERDUE_WEIGHTED,SUM_CREDIT_DAY_OVERDUE_WEIGHTED,MAX_CREDIT_DAY_OVERDUE,SUM_CREDIT_DAY_OVERDUE,DAYS_SINCE_APPLIED,SUM_INVERSE_DAYS_CREDIT,MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,MAX_AMT_CREDIT_MAX_OVERDUE,SUM_AMT_CREDIT_MAX_OVERDUE,SUM_CNT_CREDIT_PROLONG,SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED,SUM_AMT_CREDIT_SUM_DEBT,BUREAU_UTILIZATION_AVG,BUREAU_UTILIZATION_MAX,BUREAU_PROP_SUM_OVERDUE_AVG,BUREAU_PROP_MAX_OVERDUE_AVG,MAX_DAYS_CREDIT_UPDATE,RANGE_DAYS_CREDIT_UPDATE,DAYS_CREDIT_RANGE,TOTAL_AMT_CREDIT_SUM_WEIGHTED,TOTAL_AMT_CREDIT_SUM,COUNT_CREDIT_CARD,COUNT_CAR_LOAN,COUNT_MORTGAGE,SUM_AMT_ANNUITY,BUREAU_AGG_SYNTHETIC_TARGET
0,100001,603.706712,0.0,0.0,1.0,1.0,0.987405,0.987405,3.0,3.0,3.0,3091.0,0.0,,,,596686.5,596686.5,0.0,8.857143,0.983871,1.774194,0.007155,47.5,0.5,47.0,61.0,0.992481,0.007519,0.052632,0.349547,0.021053,0.000835,0.005848,120775.784672,1800.0,282.105263,1974.736842,0.142857,0.111111,0.015873,884025.0,3091.0,19.0,19.0,-51.0,-1329.0,1778.0,577.0,3.0,7.0,3.0,0.0,0.0,0.0,0.0,49.0,0.029363,,0.0,,0.0,0.0,53216.5875,596686.5,inf,inf,0.0,,-6.0,149.0,1523.0,100412.66129,1453365.0,0.0,0.0,0.0,24817.5,
1,100002,315.103846,0.0,0.0,0.0,0.0,0.54618,0.54618,2.0,2.0,2.0,780.0,0.0,,,,245781.0,245781.0,0.0,10.875,0.689655,0.264368,0.003698,40.5,1.5,39.0,60.0,0.716964,0.283036,0.5,0.109328,0.014109,0.010476,0.025641,4863.768166,0.0,1617.905476,7012.987013,0.75,0.051282,0.027542,638235.0,927.0,20.0,20.0,-47.0,-1072.0,780.0,-2094.0,2.0,8.0,2.0,0.0,0.0,0.0,0.0,103.0,0.017755,148.3425,153.695563,5043.645,8405.145,0.0,35111.571429,245781.0,inf,inf,0.0,inf,-7.0,1178.0,1334.0,69432.89321,865055.565,4.0,0.0,0.0,0.0,0.526688
2,100003,0.0,0.0,0.0,,,0.0,0.0,1.0,1.0,1.0,1216.0,0.0,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,810000.0,1216.0,,,,-2434.0,1216.0,-2178.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,606.0,0.003938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-43.0,2088.0,1980.0,19188.078259,1017400.5,2.0,0.0,0.0,0.0,0.459155
3,100004,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,0.0,0.0,,,,-595.0,-382.0,-977.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,408.0,0.003205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-382.0,300.0,918.0,386.044202,189037.8,0.0,0.0,0.0,0.0,0.484266
4,100005,617.739835,0.0,0.0,0.0,0.0,0.954794,0.954794,2.0,2.0,2.0,1446.0,0.0,,,,568408.5,568408.5,0.0,5.333333,1.0,0.3125,0.086957,8.5,1.0,7.5,16.0,1.0,0.0,0.0,0.539216,0.117647,0.0,0.0,107036.117647,6882.352941,0.0,0.0,0.0,0.0,0.0,598626.0,1446.0,8.0,8.0,-12.0,-128.0,1324.0,1318.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,62.0,0.026109,0.0,0.0,0.0,0.0,0.0,50188.368035,568408.5,inf,inf,0.0,0.0,-11.0,110.0,311.0,53154.691016,657126.0,1.0,0.0,0.0,4261.5,


time: 570 ms


#### Fill in test set cases

Fit model on full training data and predict TARGET for the test cases

In [8]:
clf = LogisticRegressionCV(Cs=10, 
                           penalty="l2", 
                           fit_intercept=False, 
                           scoring="roc_auc", 
                           cv=kfold)
clf.fit(df, y)

LogisticRegressionCV(Cs=10, class_weight=None,
           cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
           dual=False, fit_intercept=False, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='roc_auc',
           solver='lbfgs', tol=0.0001, verbose=0)

time: 2min 11s


In [9]:
null_id = bureau_agg["BUREAU_AGG_SYNTHETIC_TARGET"].isnull()
temp_frame = bureau_agg[null_id].drop(["SK_ID_CURR", "BUREAU_AGG_SYNTHETIC_TARGET"], axis=1).replace([-np.inf, np.inf], np.nan)
arr = scale.fit_transform(impute.fit_transform(temp_frame))
bureau_agg.loc[null_id, "BUREAU_AGG_SYNTHETIC_TARGET"] = clf.predict_proba(pd.DataFrame(arr, columns=temp_frame.columns))[:,1]

time: 1.63 s


#### Reappend principal components if necessary

In [10]:
prin_comp = pd.read_csv(path + "bureau_agg.csv", usecols=lambda c: (c == "SK_ID_CURR") or ("AGG_COMP" in c))
bureau_agg = bureau_agg.merge(prin_comp, how="left", on="SK_ID_CURR")
del prin_comp
bureau_agg.head()

Unnamed: 0,SK_ID_CURR,SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M,SUM_CC_DEBT_6M,SUM_CC_DEBT_12M,MAX_WORST_DQ_BUREAU_BALANCE_6M,MAX_WORST_DQ_BUREAU_BALANCE_12M,MAX_BUREAU_UTILIZATION_6M,MAX_BUREAU_UTILIZATION_12M,COUNT_ACTIVE_6M,COUNT_ACTIVE_12M,COUNT_ACTIVE_24M,DAYS_REMAINING_ACTIVE,MAX_CREDIT_DAY_OVERDUE_6M,MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_12M_24M,BUREAU_SUM_DEBT_DIFF_6M_12M,BUREAU_SUM_DEBT_DIFF_12M_24M,MAX_CNT_CREDIT_PROLONG,AVG_LEN_BUREAU_BALANCE,PROP_CURRENT,PROP_CLOSED,PROP_CURRENT_WEIGHTED,MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE,MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE,SUM_SUM_CURRENT_BUREAU_BALANCE,AVG_PROP_CURRENT,AVG_PROP_DQ,MAX_PROP_DQ,AVG_PROP_CURRENT_WEIGHTED,MIN_PROP_CURRENT_WEIGHTED,AVG_PROP_DQ_WEIGHTED,MAX_PROP_DQ_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,MIN_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ_WEIGHTED_AMT,MAX_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,TOTAL_AMT_CREDIT_SUM_POS_DAYS,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_LEN_BUREAU_BALANCE,SUM_LEN_BUREAU_BALANCE,MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE,MIN_DAYS_CREDIT_ENDDATE,MAX_DAYS_CREDIT_ENDDATE,SUM_DAYS_CREDIT_ENDDATE,SUM_NULL_DAYS_ENDDATE_FACT,COUNT_BUREAU_RECORDS,COUNT_ACTIVE,MAX_CREDIT_DAY_OVERDUE_WEIGHTED,SUM_CREDIT_DAY_OVERDUE_WEIGHTED,MAX_CREDIT_DAY_OVERDUE,SUM_CREDIT_DAY_OVERDUE,DAYS_SINCE_APPLIED,SUM_INVERSE_DAYS_CREDIT,MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,MAX_AMT_CREDIT_MAX_OVERDUE,SUM_AMT_CREDIT_MAX_OVERDUE,SUM_CNT_CREDIT_PROLONG,SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED,SUM_AMT_CREDIT_SUM_DEBT,BUREAU_UTILIZATION_AVG,BUREAU_UTILIZATION_MAX,BUREAU_PROP_SUM_OVERDUE_AVG,BUREAU_PROP_MAX_OVERDUE_AVG,MAX_DAYS_CREDIT_UPDATE,RANGE_DAYS_CREDIT_UPDATE,DAYS_CREDIT_RANGE,TOTAL_AMT_CREDIT_SUM_WEIGHTED,TOTAL_AMT_CREDIT_SUM,COUNT_CREDIT_CARD,COUNT_CAR_LOAN,COUNT_MORTGAGE,SUM_AMT_ANNUITY,BUREAU_AGG_SYNTHETIC_TARGET,BUREAU_AGG_COMP1,BUREAU_AGG_COMP2,BUREAU_AGG_COMP3,BUREAU_AGG_COMP4,BUREAU_AGG_COMP5,BUREAU_AGG_COMP6,BUREAU_AGG_COMP7
0,100001,603.706712,0.0,0.0,1.0,1.0,0.987405,0.987405,3.0,3.0,3.0,3091.0,0.0,,,,596686.5,596686.5,0.0,8.857143,0.983871,1.774194,0.007155,47.5,0.5,47.0,61.0,0.992481,0.007519,0.052632,0.349547,0.021053,0.000835,0.005848,120775.784672,1800.0,282.105263,1974.736842,0.142857,0.111111,0.015873,884025.0,3091.0,19.0,19.0,-51.0,-1329.0,1778.0,577.0,3.0,7.0,3.0,0.0,0.0,0.0,0.0,49.0,0.029363,,0.0,,0.0,0.0,53216.5875,596686.5,inf,inf,0.0,,-6.0,149.0,1523.0,100412.66129,1453365.0,0.0,0.0,0.0,24817.5,0.525046,1.10057,1.523734,1.496514,-0.660367,-1.992638,0.017996,0.257508
1,100002,315.103846,0.0,0.0,0.0,0.0,0.54618,0.54618,2.0,2.0,2.0,780.0,0.0,,,,245781.0,245781.0,0.0,10.875,0.689655,0.264368,0.003698,40.5,1.5,39.0,60.0,0.716964,0.283036,0.5,0.109328,0.014109,0.010476,0.025641,4863.768166,0.0,1617.905476,7012.987013,0.75,0.051282,0.027542,638235.0,927.0,20.0,20.0,-47.0,-1072.0,780.0,-2094.0,2.0,8.0,2.0,0.0,0.0,0.0,0.0,103.0,0.017755,148.3425,153.695563,5043.645,8405.145,0.0,35111.571429,245781.0,inf,inf,0.0,inf,-7.0,1178.0,1334.0,69432.89321,865055.565,4.0,0.0,0.0,0.0,0.526688,1.038894,7.601345,0.534316,-0.817806,-0.592501,-0.290651,0.111304
2,100003,0.0,0.0,0.0,,,0.0,0.0,1.0,1.0,1.0,1216.0,0.0,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,810000.0,1216.0,,,,-2434.0,1216.0,-2178.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,606.0,0.003938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-43.0,2088.0,1980.0,19188.078259,1017400.5,2.0,0.0,0.0,0.0,0.459155,-1.386349,-0.52014,-0.831323,-0.242488,-0.712057,0.070022,0.154299
3,100004,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,0.0,0.0,,,,-595.0,-382.0,-977.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,408.0,0.003205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-382.0,300.0,918.0,386.044202,189037.8,0.0,0.0,0.0,0.0,0.484266,-3.20874,-0.155941,-0.524505,0.269582,0.094367,-0.046783,0.032632
4,100005,617.739835,0.0,0.0,0.0,0.0,0.954794,0.954794,2.0,2.0,2.0,1446.0,0.0,,,,568408.5,568408.5,0.0,5.333333,1.0,0.3125,0.086957,8.5,1.0,7.5,16.0,1.0,0.0,0.0,0.539216,0.117647,0.0,0.0,107036.117647,6882.352941,0.0,0.0,0.0,0.0,0.0,598626.0,1446.0,8.0,8.0,-12.0,-128.0,1324.0,1318.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,62.0,0.026109,0.0,0.0,0.0,0.0,0.0,50188.368035,568408.5,inf,inf,0.0,0.0,-11.0,110.0,311.0,53154.691016,657126.0,1.0,0.0,0.0,4261.5,0.546448,-1.948244,-0.867605,4.826918,-0.613374,-1.4752,0.188919,0.200762


time: 3.83 s


In [11]:
bureau_agg.shape

(305811, 86)

time: 4.96 ms


### AUC

In [12]:
temp = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"]).merge(bureau_agg[["SK_ID_CURR", "BUREAU_AGG_SYNTHETIC_TARGET"]], how="left", on="SK_ID_CURR").dropna()
print(round(roc_auc_score(temp["TARGET"], temp["BUREAU_AGG_SYNTHETIC_TARGET"]), 4))
del temp
gc.collect()

0.64


49

time: 13 s


In [13]:
bureau_agg.to_csv(path + "bureau_agg.csv", index=False, header=True)

time: 53.8 s


## Previous Application Aggregate Synthetic Target

In [65]:
frame = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"])
previous_agg = pd.read_csv(path + "previous_agg.csv", usecols=lambda c: ("AGG_SYNTH" not in c) and ("AGG_COMP" not in c))

df = pd.merge(frame, previous_agg, how="left", on="SK_ID_CURR")
del frame
gc.collect()
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 13.4 s


In [66]:
df.head()

Unnamed: 0,MIN_PREV_AMT_ANNUITY_12M,MIN_PREV_AMT_ANNUITY_24M,MIN_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_24M,MAX_PREV_PROP_APPROVED_12M,MAX_PREV_PROP_APPROVED_24M,COUNT_PREV_APP,MIN_PREV_DAYS_TERMINATION,MAX_PREV_DAYS_TERMINATION,AVG_PREV_DAYS_TERMINATION,RANGE_PREV_DAYS_TERMINATION,MIN_PREV_AMT_CREDIT,MAX_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT,MIN_PREV_AMT_CREDIT_WEIGHTED,MAX_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,MIN_PREV_AMT_CREDIT_DIV_ANNUITY,MAX_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MIN_PREV_AMT_ANNUITY,MAX_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY,MIN_PREV_AMT_ANNUITY_WEIGHTED,MAX_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_ANNUITY_WEIGHTED,MIN_DAYS_DECISION,MAX_DAYS_DECISION,RANGE_DAYS_DECISION,SUM_DAYS_LAST_DUE_NULL,AVG_DAYS_LAST_DUE_NULL,AVG_PREV_REQ_AMOUNT_WEIGHTED,MAX_PREV_REQ_AMOUNT_WEIGHTED,AVG_PREV_REQ_AMOUNT,MAX_PREV_REQ_AMOUNT,AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED,AVG_PREV_PROP_APPROVED_WEIGHTED,MAX_PREV_PROP_APPROVED_WEIGHTED,AVG_PREV_RATE_DOWNPAYMENT,AVG_PREV_PROP_APPROVED,MAX_PREV_PROP_APPROVED,MIN_PREV_PROP_APPROVED,AVG_PREV_INT_RATE,SUM_PREV_URGENT_NEEDS,SUM_PREV_REPAIRS,SUM_PREV_OTHER,SUM_PREV_LIMIT_REJECT,SUM_REFUSED_CONTRACT,SUM_CANC_CONTRACT,SUM_APPR_CONTRACT,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,COUNT_PREV_WALK_IN,COUNT_PREV_HIGH_YIELD,COUNT_PREV_LOW_YIELD,SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE,SUM_DAYS_FIRST_DRAWING_SENTINEL,SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,SUM_DAYS_LAST_DUE_LT_FIRST_VERSION,MIN_RATE_INTEREST_PRIMARY_12M,AVG_RATE_INTEREST_PRIVILEGED_12M,SUM_REFUSED_CONTRACT_6M,SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M,SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M,SUM_NAME_GOODS_CATEGORY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_CSTR_6M,SUM_NAME_PAYMENT_TYPE_XNA_6M,COUNT_NAME_CLIENT_TYPE_REPEATER_12M,COUNT_NAME_CLIENT_TYPE_NEW_12M,AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M
0,,9251.775,,,1.0,,1.0,1.0,-17.0,-17.0,-17.0,0.0,179055.0,179055.0,179055.0,295.470297,295.470297,295.470297,19.353584,19.353584,19.353584,0.031937,0.031937,0.031937,9251.775,9251.775,9251.775,15.266955,15.266955,15.266955,-606.0,-606.0,0.0,0.0,0.0,295.470297,295.470297,179055.0,179055.0,0.0,0.00165,0.00165,0.0,1.0,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.00165,0.00165,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,
1,,,,,,,,3.0,-1976.0,-527.0,-1047.333333,1449.0,68053.5,1035882.0,484191.0,29.070269,1388.581769,612.90394,5.399568,10.531859,8.677472,0.004315,0.014118,0.008318,6737.31,98356.995,56553.99,2.877962,131.845838,70.901357,-2341.0,-746.0,1595.0,0.0,0.0,547.812073,1206.434316,435436.5,900000.0,2.1e-05,0.001071,0.001543,0.05003,1.057664,1.15098,0.989013,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,2.0,3.0,0.002975,0.00134,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,
2,,,,,,,,1.0,-714.0,-714.0,-714.0,0.0,20106.0,20106.0,20106.0,24.669939,24.669939,24.669939,3.753045,3.753045,3.753045,0.004605,0.004605,0.004605,5357.25,5357.25,5357.25,6.573313,6.573313,6.573313,-815.0,-815.0,0.0,0.0,0.0,29.793865,29.793865,24282.0,24282.0,0.00026,0.001016,0.001016,0.212008,0.828021,0.828021,0.828021,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.001227,0.001227,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,
3,13500.0,2482.92,0.799989,1.029197,1.012684,1.316797,1.316797,9.0,-416.0,365243.0,182481.75,365659.0,0.0,906615.0,291695.5,0.0,5008.922652,1358.887335,9.230206,27.839644,17.767287,0.015809,0.15381,0.081751,2482.92,39954.51,23651.175,4.024182,180.641436,96.293912,-617.0,-181.0,436.0,5.0,0.555556,1242.561634,3803.867403,272203.26,688500.0,0.000439,0.004129,0.007275,0.163412,1.012684,1.316797,0.799989,,0.0,0.0,0.0,1.0,1.0,3.0,5.0,0.0,0.0,0.0,2.0,2.0,1.0,4.0,0.015886,0.005525,2.0,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,,,,,,,
4,,16037.64,,,1.108236,,1.108236,6.0,-2041.0,365243.0,72143.8,367284.0,14616.0,284400.0,166638.75,6.201103,733.391711,248.03877,7.968206,21.858453,12.644075,0.003381,0.045729,0.016725,1834.29,22678.785,12278.805,0.778231,42.88139,16.715844,-2357.0,-374.0,1983.0,1.0,0.166667,222.881532,661.764706,150530.25,247500.0,7.5e-05,0.001244,0.002963,0.159516,1.046356,1.264,0.85093,,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,3.0,1.0,3.0,0.0,3.0,5.0,0.005724,0.002674,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,


time: 68.8 ms


In [67]:
df = pd.DataFrame(scale.fit_transform(impute.fit_transform(df.replace([-np.inf, np.inf], np.nan))), columns=df.columns)
df.head()

Unnamed: 0,MIN_PREV_AMT_ANNUITY_12M,MIN_PREV_AMT_ANNUITY_24M,MIN_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_24M,MAX_PREV_PROP_APPROVED_12M,MAX_PREV_PROP_APPROVED_24M,COUNT_PREV_APP,MIN_PREV_DAYS_TERMINATION,MAX_PREV_DAYS_TERMINATION,AVG_PREV_DAYS_TERMINATION,RANGE_PREV_DAYS_TERMINATION,MIN_PREV_AMT_CREDIT,MAX_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT,MIN_PREV_AMT_CREDIT_WEIGHTED,MAX_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,MIN_PREV_AMT_CREDIT_DIV_ANNUITY,MAX_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MIN_PREV_AMT_ANNUITY,MAX_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY,MIN_PREV_AMT_ANNUITY_WEIGHTED,MAX_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_ANNUITY_WEIGHTED,MIN_DAYS_DECISION,MAX_DAYS_DECISION,RANGE_DAYS_DECISION,SUM_DAYS_LAST_DUE_NULL,AVG_DAYS_LAST_DUE_NULL,AVG_PREV_REQ_AMOUNT_WEIGHTED,MAX_PREV_REQ_AMOUNT_WEIGHTED,AVG_PREV_REQ_AMOUNT,MAX_PREV_REQ_AMOUNT,AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED,AVG_PREV_PROP_APPROVED_WEIGHTED,MAX_PREV_PROP_APPROVED_WEIGHTED,AVG_PREV_RATE_DOWNPAYMENT,AVG_PREV_PROP_APPROVED,MAX_PREV_PROP_APPROVED,MIN_PREV_PROP_APPROVED,AVG_PREV_INT_RATE,SUM_PREV_URGENT_NEEDS,SUM_PREV_REPAIRS,SUM_PREV_OTHER,SUM_PREV_LIMIT_REJECT,SUM_REFUSED_CONTRACT,SUM_CANC_CONTRACT,SUM_APPR_CONTRACT,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,COUNT_PREV_WALK_IN,COUNT_PREV_HIGH_YIELD,COUNT_PREV_LOW_YIELD,SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE,SUM_DAYS_FIRST_DRAWING_SENTINEL,SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,SUM_DAYS_LAST_DUE_LT_FIRST_VERSION,MIN_RATE_INTEREST_PRIMARY_12M,AVG_RATE_INTEREST_PRIVILEGED_12M,SUM_REFUSED_CONTRACT_6M,SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M,SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M,SUM_NAME_GOODS_CATEGORY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_CSTR_6M,SUM_NAME_PAYMENT_TYPE_XNA_6M,COUNT_NAME_CLIENT_TYPE_REPEATER_12M,COUNT_NAME_CLIENT_TYPE_NEW_12M,AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M
0,-0.200503,-0.280145,-0.088223,-0.037291,-0.490707,-0.042666,-0.605926,-0.943576,-0.269426,-1.092069,-0.793686,-0.843603,1.397768,-0.502523,0.072989,0.181463,-0.131681,-0.096781,2.797175,0.143591,1.413703,0.220685,-0.154585,-0.081738,0.206274,-0.757469,-0.527428,0.016193,-0.166969,-0.14672,1.107695,-0.255939,-1.202719,-0.645423,-1.025872,-0.092151,-0.131084,0.18471,-0.451843,-0.172878,-0.159318,-0.196333,-0.94826,-0.16202,-0.59716,0.523521,-0.000927,-0.115808,-0.174476,-0.149492,-0.247045,-0.449611,-0.513724,-0.987931,-0.353334,-0.758034,-0.389068,-0.767641,-0.136381,-1.017007,-0.888998,-0.269736,-0.174889,-0.108617,-0.003902,0.015476,-0.299862,-0.279748,-0.257643,-0.367522,-0.358671,-0.068583,-0.389234,-0.562503,-0.224113,-0.134388,-0.187858,-0.074248,-0.057687,-0.031403,-0.174179,-0.171899
1,-0.200503,-0.261967,-0.088223,-0.037291,0.016602,-0.042666,-0.032036,-0.448425,-0.289847,-1.094869,-0.803285,-0.835563,0.219641,1.413659,1.960317,-0.044303,-0.079282,-0.032474,-0.638947,-0.80446,-0.752372,-0.087796,-0.183878,-0.190546,-0.138355,4.011806,4.325895,-0.078317,-0.0253,0.04532,-0.954705,-0.52153,0.623775,-0.645423,-1.025872,-0.036182,-0.082892,1.899396,1.296445,-0.144258,-0.21471,-0.200199,-0.356741,0.447405,0.327368,0.443085,-0.000927,-0.115808,-0.174476,-0.149492,-0.247045,-0.449611,-0.513724,-0.020339,-0.353334,0.804858,-0.389068,-0.767641,-0.136381,0.504613,0.153448,-0.198209,-0.193144,-0.108617,-0.003902,0.015476,-0.299862,-0.279748,-0.257643,-0.367522,-0.358671,-0.068583,-0.389234,-0.562503,-0.224113,-0.134388,-0.187858,-0.074248,-0.057687,-0.031403,-0.174179,-0.171899
2,-0.200503,-0.261967,-0.088223,-0.037291,0.016602,-0.042666,-0.032036,-0.943576,-0.276692,-1.095896,-0.80018,-0.843603,-0.289255,-0.857992,-0.910143,-0.048032,-0.144662,-0.151642,-1.044397,-1.532965,-1.751488,-0.084556,-0.199516,-0.207651,-0.327505,-0.96592,-0.927016,-0.050127,-0.177534,-0.176729,0.859256,-0.652429,-1.202719,-0.645423,-1.025872,-0.151078,-0.145138,-0.850416,-0.827167,0.175486,-0.21997,-0.219182,1.558353,-1.97958,-1.650276,-0.735555,-0.000927,-0.115808,-0.174476,-0.149492,-0.247045,-0.449611,-0.513724,-0.987931,-0.353334,-0.758034,-0.389068,-0.767641,-0.808324,-1.017007,-0.888998,-0.292576,-0.199834,-0.108617,-0.003902,0.015476,-0.299862,-0.279748,-0.257643,-0.367522,-0.358671,-0.068583,-0.389234,-0.562503,-0.224113,-0.134388,-0.187858,-0.074248,-0.057687,-0.031403,-0.174179,-0.171899
3,0.140593,-0.869836,-2.405221,-0.168226,-0.365712,2.257922,1.459821,1.037028,-0.273585,0.913158,0.906647,1.185336,-0.502652,1.12457,0.769693,-0.068939,0.09426,0.118653,0.304333,1.055569,1.091859,0.040573,0.045766,0.147751,-0.721457,0.885866,0.949986,-0.069573,0.033998,0.13297,1.09462,0.550322,-0.703439,1.072794,1.094097,0.117912,0.054517,0.807688,0.783559,0.4144,0.077758,0.00633,0.983798,-0.027969,1.342752,-0.940777,-0.000927,-0.115808,-0.174476,-0.149492,1.343164,0.11382,1.313,0.947253,-0.353334,-0.758034,-0.389068,0.696217,0.535562,-0.256197,0.67467,0.498623,0.053515,0.655183,-0.003902,0.015476,0.634041,2.574449,-0.257643,-0.367522,-0.358671,-0.068583,-0.389234,2.271847,-0.224113,-0.134388,-0.187858,-0.074248,-0.057687,-0.031403,-0.174179,-0.171899
4,-0.200503,0.311028,-0.088223,-0.037291,0.575907,-0.042666,0.099853,0.294302,-0.290525,0.913158,-0.121367,1.194353,-0.347524,-0.266933,-0.003808,-0.063684,-0.110689,-0.10639,-0.00643,0.412784,0.052412,-0.098229,-0.13191,-0.151818,-0.810357,-0.038801,-0.216848,-0.094335,-0.133411,-0.141719,-0.973724,0.184185,1.068089,-0.30178,-0.389882,-0.108251,-0.111706,-0.006064,-0.285864,-0.072768,-0.198204,-0.149026,0.937738,0.327892,1.019447,-0.567834,-0.000927,-0.115808,-0.174476,-0.149492,-0.247045,-0.449611,-0.513724,1.431049,-0.353334,1.586305,0.543789,1.428146,-0.808324,1.265423,1.195893,-0.04986,-0.114549,-0.108617,-0.003902,0.015476,-0.299862,-0.279748,-0.257643,-0.367522,-0.358671,-0.068583,-0.389234,-0.562503,-0.224113,-0.134388,-0.187858,-0.074248,-0.057687,-0.031403,-0.174179,-0.171899


time: 7.35 s


In [68]:
df.shape

(307511, 82)

time: 6.6 ms


#### Get out of fold predictions

In [69]:
kfold = StratifiedKFold(n_splits=3)
scores = pd.DataFrame({"SK_ID_CURR": [], "PREVIOUS_AGG_SYNTHETIC_TARGET": []})

time: 3.79 ms


In [70]:
for train_indx, test_indx in kfold.split(df.values, y.values):
    print("Determining lambda and fitting...")
    clf = LogisticRegressionCV(Cs=10, 
                               penalty="l2", 
                               fit_intercept=False, 
                               scoring="roc_auc", 
                               cv=kfold)
    clf.fit(df.iloc[train_indx], y.iloc[train_indx])
    print("Scoring...")
    fold_scores = pd.DataFrame({"SK_ID_CURR": sk_id_curr.iloc[test_indx], "PREVIOUS_AGG_SYNTHETIC_TARGET": clf.predict_proba(df.iloc[test_indx])[:,1]})
    scores = pd.concat([scores, fold_scores], axis=0)
    
scores.head()

Determining lambda and fitting...
Scoring...
Determining lambda and fitting...
Scoring...
Determining lambda and fitting...
Scoring...


Unnamed: 0,SK_ID_CURR,PREVIOUS_AGG_SYNTHETIC_TARGET
0,100002.0,0.487067
1,100003.0,0.42019
2,100004.0,0.506853
3,100006.0,0.505468
4,100007.0,0.492233


time: 1min 21s


#### Append

In [71]:
previous_agg = previous_agg.merge(scores, how="left", on="SK_ID_CURR")
previous_agg.head()

Unnamed: 0,SK_ID_CURR,MIN_PREV_AMT_ANNUITY_12M,MIN_PREV_AMT_ANNUITY_24M,MIN_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_24M,MAX_PREV_PROP_APPROVED_12M,MAX_PREV_PROP_APPROVED_24M,COUNT_PREV_APP,MIN_PREV_DAYS_TERMINATION,MAX_PREV_DAYS_TERMINATION,AVG_PREV_DAYS_TERMINATION,RANGE_PREV_DAYS_TERMINATION,MIN_PREV_AMT_CREDIT,MAX_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT,MIN_PREV_AMT_CREDIT_WEIGHTED,MAX_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,MIN_PREV_AMT_CREDIT_DIV_ANNUITY,MAX_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MIN_PREV_AMT_ANNUITY,MAX_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY,MIN_PREV_AMT_ANNUITY_WEIGHTED,MAX_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_ANNUITY_WEIGHTED,MIN_DAYS_DECISION,MAX_DAYS_DECISION,RANGE_DAYS_DECISION,SUM_DAYS_LAST_DUE_NULL,AVG_DAYS_LAST_DUE_NULL,AVG_PREV_REQ_AMOUNT_WEIGHTED,MAX_PREV_REQ_AMOUNT_WEIGHTED,AVG_PREV_REQ_AMOUNT,MAX_PREV_REQ_AMOUNT,AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED,AVG_PREV_PROP_APPROVED_WEIGHTED,MAX_PREV_PROP_APPROVED_WEIGHTED,AVG_PREV_RATE_DOWNPAYMENT,AVG_PREV_PROP_APPROVED,MAX_PREV_PROP_APPROVED,MIN_PREV_PROP_APPROVED,AVG_PREV_INT_RATE,SUM_PREV_URGENT_NEEDS,SUM_PREV_REPAIRS,SUM_PREV_OTHER,SUM_PREV_LIMIT_REJECT,SUM_REFUSED_CONTRACT,SUM_CANC_CONTRACT,SUM_APPR_CONTRACT,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,COUNT_PREV_WALK_IN,COUNT_PREV_HIGH_YIELD,COUNT_PREV_LOW_YIELD,SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE,SUM_DAYS_FIRST_DRAWING_SENTINEL,SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,SUM_DAYS_LAST_DUE_LT_FIRST_VERSION,MIN_RATE_INTEREST_PRIMARY_12M,AVG_RATE_INTEREST_PRIVILEGED_12M,SUM_REFUSED_CONTRACT_6M,SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M,SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M,SUM_NAME_GOODS_CATEGORY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_CSTR_6M,SUM_NAME_PAYMENT_TYPE_XNA_6M,COUNT_NAME_CLIENT_TYPE_REPEATER_12M,COUNT_NAME_CLIENT_TYPE_NEW_12M,AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,PREVIOUS_AGG_SYNTHETIC_TARGET
0,100001,,,,,,,,1.0,-1612.0,-1612.0,-1612.0,0.0,23787.0,23787.0,23787.0,13.67069,13.67069,13.67069,6.020501,6.020501,6.020501,0.00346,0.00346,0.00346,3951.0,3951.0,3951.0,2.27069,2.27069,2.27069,-1740.0,-1740.0,0.0,0.0,0.0,14.273276,14.273276,24835.5,24835.5,6e-05,0.00055,0.00055,0.104326,0.957782,0.957782,0.957782,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.000575,0.000575,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,
1,100002,,9251.775,,,1.0,,1.0,1.0,-17.0,-17.0,-17.0,0.0,179055.0,179055.0,179055.0,295.470297,295.470297,295.470297,19.353584,19.353584,19.353584,0.031937,0.031937,0.031937,9251.775,9251.775,9251.775,15.266955,15.266955,15.266955,-606.0,-606.0,0.0,0.0,0.0,295.470297,295.470297,179055.0,179055.0,0.0,0.00165,0.00165,0.0,1.0,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.00165,0.00165,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.487067
2,100003,,,,,,,,3.0,-1976.0,-527.0,-1047.333333,1449.0,68053.5,1035882.0,484191.0,29.070269,1388.581769,612.90394,5.399568,10.531859,8.677472,0.004315,0.014118,0.008318,6737.31,98356.995,56553.99,2.877962,131.845838,70.901357,-2341.0,-746.0,1595.0,0.0,0.0,547.812073,1206.434316,435436.5,900000.0,2.1e-05,0.001071,0.001543,0.05003,1.057664,1.15098,0.989013,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,2.0,3.0,0.002975,0.00134,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.42019
3,100004,,,,,,,,1.0,-714.0,-714.0,-714.0,0.0,20106.0,20106.0,20106.0,24.669939,24.669939,24.669939,3.753045,3.753045,3.753045,0.004605,0.004605,0.004605,5357.25,5357.25,5357.25,6.573313,6.573313,6.573313,-815.0,-815.0,0.0,0.0,0.0,29.793865,29.793865,24282.0,24282.0,0.00026,0.001016,0.001016,0.212008,0.828021,0.828021,0.828021,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.001227,0.001227,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.506853
4,100005,,,,,,,,2.0,-460.0,-460.0,-460.0,0.0,0.0,40153.5,20076.75,0.0,53.042933,26.521466,8.342371,8.342371,8.342371,0.01102,0.01102,0.01102,4813.2,4813.2,4813.2,6.358256,6.358256,6.358256,-757.0,-315.0,442.0,1.0,0.5,29.469947,58.939894,22308.75,44617.5,0.000144,0.001189,0.001189,0.108964,0.89995,0.89995,0.89995,,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.001321,0.001321,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,


time: 363 ms


#### Fill in test set cases

Fit model on full training data and predict TARGET for the test cases

In [72]:
clf = LogisticRegressionCV(Cs=10, 
                           penalty="l2", 
                           fit_intercept=False, 
                           scoring="roc_auc", 
                           cv=kfold)
clf.fit(df, y)

LogisticRegressionCV(Cs=10, class_weight=None,
           cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
           dual=False, fit_intercept=False, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='roc_auc',
           solver='lbfgs', tol=0.0001, verbose=0)

time: 39.2 s


In [73]:
null_id = previous_agg["PREVIOUS_AGG_SYNTHETIC_TARGET"].isnull()
temp_frame = previous_agg[null_id].drop(["SK_ID_CURR", "PREVIOUS_AGG_SYNTHETIC_TARGET"], axis=1).replace([-np.inf, np.inf], np.nan)
arr = scale.fit_transform(impute.fit_transform(temp_frame))
previous_agg.loc[null_id, "PREVIOUS_AGG_SYNTHETIC_TARGET"] = clf.predict_proba(pd.DataFrame(arr, columns=temp_frame.columns))[:,1]

time: 958 ms


In [74]:
previous_agg.shape

(338857, 84)

time: 2.51 ms


#### Reappend principal components if necessary

In [None]:
prin_comp = pd.read_csv(path + "previous_agg.csv", usecols=lambda c: (c == "SK_ID_CURR") or ("AGG_COMP" in c))
previous_agg = previous_agg.merge(prin_comp, how="left", on="SK_ID_CURR")
del prin_comp
previous_agg.head()

In [75]:
previous_agg.shape

(338857, 84)

time: 3.63 ms


### AUC

In [76]:
temp = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"]).merge(previous_agg[["SK_ID_CURR", "PREVIOUS_AGG_SYNTHETIC_TARGET"]], how="left", on="SK_ID_CURR").dropna()
print(round(roc_auc_score(temp["TARGET"], temp["PREVIOUS_AGG_SYNTHETIC_TARGET"]), 4))
del temp
gc.collect()

0.6464


70

time: 5.93 s


## Credit Card Aggregate Synthetic Target

In [299]:
frame = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"])
credit_card_agg = pd.read_csv(path + "credit_card_agg.csv", usecols=lambda c: ("AGG_SYNTH" not in c) and ("AGG_COMP" not in c))

df = pd.merge(frame, credit_card_agg, how="left", on="SK_ID_CURR")
del frame
gc.collect()
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 6.34 s


In [300]:
df.head()

Unnamed: 0,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,6.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


time: 34.5 ms


In [301]:
df = pd.DataFrame(scale.fit_transform(impute.fit_transform(df.replace([-np.inf, np.inf], np.nan))), columns=df.columns)
df.head()

Unnamed: 0,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M
0,-0.045109,-0.048419,-0.203562,-0.245929,-0.268081,-0.282665,-0.171615,-0.19488,-0.149137,-0.080697,-0.250955,-0.307676,-0.205706,-0.141115,-0.221606,-0.229496,-0.326393,-0.235721,-0.117844,0.305828,-0.231616,-0.246752,-0.232725,-0.252816,-0.293747,-0.041852,-0.005999,-0.167533,-0.182542,-0.182952,-0.152257,-0.19792,-0.128588,-0.006867,-0.002019,-0.002097,-0.185024,-0.162354,-0.168614,-0.196211,-0.003362,-0.324033,-0.311702,-0.028594,-0.027655
1,-0.045109,-0.048419,-0.203562,-0.245929,-0.268081,-0.282665,-0.171615,-0.19488,-0.149137,-0.080697,-0.250955,-0.307676,-0.205706,-0.141115,-0.221606,-0.229496,-0.326393,-0.235721,-0.117844,0.305828,-0.231616,-0.246752,-0.232725,-0.252816,-0.293747,-0.041852,-0.005999,-0.167533,-0.182542,-0.182952,-0.152257,-0.19792,-0.128588,-0.006867,-0.002019,-0.002097,-0.185024,-0.162354,-0.168614,-0.196211,-0.003362,-0.324033,-0.311702,-0.028594,-0.027655
2,-0.045109,-0.048419,-0.203562,-0.245929,-0.268081,-0.282665,-0.171615,-0.19488,-0.149137,-0.080697,-0.250955,-0.307676,-0.205706,-0.141115,-0.221606,-0.229496,-0.326393,-0.235721,-0.117844,0.305828,-0.231616,-0.246752,-0.232725,-0.252816,-0.293747,-0.041852,-0.005999,-0.167533,-0.182542,-0.182952,-0.152257,-0.19792,-0.128588,-0.006867,-0.002019,-0.002097,-0.185024,-0.162354,-0.168614,-0.196211,-0.003362,-0.324033,-0.311702,-0.028594,-0.027655
3,-0.045109,-0.048419,-0.203562,-0.245929,-0.268081,-0.282665,-0.171615,-0.19488,-0.149137,-0.080697,-0.250955,-0.309062,-0.642337,-1.174465,-0.619479,-1.063212,-0.326393,-1.013479,-1.561794,-2.947945,-0.278867,-0.283795,-0.409247,-0.37944,-0.386429,-0.041852,-0.005999,-0.480582,-0.28791,-0.693545,-0.754986,-0.430944,-1.176035,-0.006867,-0.002019,-0.002097,-0.185024,-0.162354,-0.168614,-0.196211,-0.003362,-0.327419,-0.311702,-0.028594,-0.027655
4,-0.045109,-0.048419,-0.203562,-0.245929,-0.268081,-0.282665,-0.171615,-0.19488,-0.149137,-0.080697,-0.250955,-0.307676,-0.205706,-0.141115,-0.221606,-0.229496,-0.326393,-0.235721,-0.117844,0.305828,-0.231616,-0.246752,-0.232725,-0.252816,-0.293747,-0.041852,-0.005999,-0.167533,-0.182542,-0.182952,-0.152257,-0.19792,-0.128588,-0.006867,-0.002019,-0.002097,-0.185024,-0.162354,-0.168614,-0.196211,-0.003362,-0.324033,-0.311702,-0.028594,-0.027655


time: 2.14 s


#### Get out of fold predictions

In [302]:
kfold = StratifiedKFold(n_splits=3)
scores = pd.DataFrame({"SK_ID_CURR": [], "CREDIT_CARD_AGG_SYNTHETIC_TARGET": []})

time: 2.11 ms


In [303]:
for train_indx, test_indx in kfold.split(df.values, y.values):
    print("Determining lambda and fitting...")
    clf = LogisticRegressionCV(Cs=10, 
                               penalty="l2", 
                               fit_intercept=False, 
                               scoring="roc_auc", 
                               cv=kfold)
    clf.fit(df.iloc[train_indx], y.iloc[train_indx])
    print("Scoring...")
    fold_scores = pd.DataFrame({"SK_ID_CURR": sk_id_curr.iloc[test_indx], "CREDIT_CARD_AGG_SYNTHETIC_TARGET": clf.predict_proba(df.iloc[test_indx])[:,1]})
    scores = pd.concat([scores, fold_scores], axis=0)
    
scores.head()

Determining lambda and fitting...
Scoring...
Determining lambda and fitting...
Scoring...
Determining lambda and fitting...
Scoring...


Unnamed: 0,SK_ID_CURR,CREDIT_CARD_AGG_SYNTHETIC_TARGET
0,100002.0,0.497157
1,100003.0,0.497157
2,100004.0,0.497157
3,100006.0,0.471819
4,100007.0,0.497157


time: 1min 29s


#### Append

In [304]:
credit_card_agg = credit_card_agg.merge(scores, how="left", on="SK_ID_CURR")
credit_card_agg.head()

Unnamed: 0,SK_ID_CURR,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M,CREDIT_CARD_AGG_SYNTHETIC_TARGET
0,100006,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,6.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,,0.471819
1,100011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54482.111149,189000.0,4031676.225,75.0,2.0,73.0,0.302678,1.05,891.528045,2520.0,65973.075311,0.004953,0.014,0.0,0.0,4.0,0.054054,4.0,180000.0,2432.432432,180000.0,1.0,inf,inf,0.0,0.0,0.0,0.0,,0.0,0.0,,,0.483568
2,100013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18159.919219,161420.22,1743352.245,96.0,1.0,95.0,0.115301,1.02489,230.066978,1944.407308,22086.429911,0.001461,0.012345,0.014493,0.014493,23.0,0.239583,7.0,571500.0,5953.125,157500.0,0.0,inf,inf,0.0,0.0,0.0,0.0,,0.0,0.0,,,
3,100021,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,2.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,,0.474393
4,100023,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,4.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,,,,0.487986


time: 359 ms


#### Fill in test set cases

Fit model on full training data and predict TARGET for the test cases

In [305]:
clf = LogisticRegressionCV(Cs=10, 
                           penalty="l2", 
                           fit_intercept=False, 
                           scoring="roc_auc", 
                           cv=kfold)
clf.fit(df, y)

LogisticRegressionCV(Cs=10, class_weight=None,
           cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
           dual=False, fit_intercept=False, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='roc_auc',
           solver='lbfgs', tol=0.0001, verbose=0)

time: 58.8 s


In [306]:
null_id = credit_card_agg["CREDIT_CARD_AGG_SYNTHETIC_TARGET"].isnull()
temp_frame = credit_card_agg[null_id].drop(["SK_ID_CURR", "CREDIT_CARD_AGG_SYNTHETIC_TARGET"], axis=1).replace([-np.inf, np.inf], np.nan)
arr = scale.fit_transform(impute.fit_transform(temp_frame))
credit_card_agg.loc[null_id, "CREDIT_CARD_AGG_SYNTHETIC_TARGET"] = clf.predict_proba(pd.DataFrame(arr, columns=temp_frame.columns))[:,1]

time: 247 ms


#### Reappend principal components if necessary

In [307]:
prin_comp = pd.read_csv(path + "credit_card_agg.csv", usecols=lambda c: (c == "SK_ID_CURR") or ("AGG_COMP" in c))
credit_card_agg = credit_card_agg.merge(prin_comp, how="left", on="SK_ID_CURR")
del prin_comp
credit_card_agg.head()

Unnamed: 0,SK_ID_CURR,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M,CREDIT_CARD_AGG_SYNTHETIC_TARGET,CREDIT_CARD_AGG_COMP1,CREDIT_CARD_AGG_COMP2,CREDIT_CARD_AGG_COMP3,CREDIT_CARD_AGG_COMP4,CREDIT_CARD_AGG_COMP5,CREDIT_CARD_AGG_COMP6,CREDIT_CARD_AGG_COMP7
0,100006,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,6.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,,0.471819,-2.996367,0.956185,-1.143641,0.37796,0.322572,0.12285,-0.890276
1,100011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54482.111149,189000.0,4031676.225,75.0,2.0,73.0,0.302678,1.05,891.528045,2520.0,65973.075311,0.004953,0.014,0.0,0.0,4.0,0.054054,4.0,180000.0,2432.432432,180000.0,1.0,inf,inf,0.0,0.0,0.0,0.0,,0.0,0.0,,,0.483568,-1.91423,-1.336225,0.97039,-0.360336,-1.157258,-0.099205,0.683874
2,100013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18159.919219,161420.22,1743352.245,96.0,1.0,95.0,0.115301,1.02489,230.066978,1944.407308,22086.429911,0.001461,0.012345,0.014493,0.014493,23.0,0.239583,7.0,571500.0,5953.125,157500.0,0.0,inf,inf,0.0,0.0,0.0,0.0,,0.0,0.0,,,0.481502,-1.995274,-1.079984,1.425906,-0.458675,-1.500304,-0.189693,1.339213
3,100021,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,2.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,,0.474393,-3.130604,0.815268,-0.953739,0.421427,0.156045,0.118608,-0.890236
4,100023,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,4.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,,,,0.487986,-3.290014,1.021466,-1.043096,0.623813,0.148561,0.201433,-1.532772


time: 1.21 s


In [308]:
credit_card_agg.shape

(103558, 54)

time: 2.71 ms


### AUC

In [309]:
temp = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"]).merge(credit_card_agg[["SK_ID_CURR", "CREDIT_CARD_AGG_SYNTHETIC_TARGET"]], how="left", on="SK_ID_CURR").dropna()
print(round(roc_auc_score(temp["TARGET"], temp["CREDIT_CARD_AGG_SYNTHETIC_TARGET"]), 4))
del temp
gc.collect()

0.6549


76

time: 5.76 s


In [None]:
credit_card_agg.to_csv(path + "credit_card_agg.csv", index=False, header=True)

## Installments Aggregate Synthetic Target

In [311]:
frame = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"])
installment_agg = pd.read_csv(path + "installment_agg.csv", usecols=lambda c: ("AGG_SYNTH" not in c) and ("AGG_COMP" not in c))

df = pd.merge(frame, installment_agg, how="left", on="SK_ID_CURR")
del frame
gc.collect()
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 8.28 s


In [312]:
df.head()

Unnamed: 0,SUM_UNDERPAYMENT_12M,SUM_UNDERPAYMENT_6M,MAX_PAYMENT_SIZE_6M,MAX_PAYMENT_SIZE_12M,MIN_PAYMENT_SIZE_6M,MAX_ABS_DAYS_INSTALMENT,COUNT_UNDERPAYMENT,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_WEIGHTED,MAX_UNDERPAYMENT,AVG_PAYMENT_SIZE_WEIGHTED,AVG_PAYMENT_SIZE,MAX_PAYMENT_SIZE_WEIGHTED,MAX_PAYMENT_SIZE,MIN_PAYMENT_SIZE_WEIGHTED,MIN_PAYMENT_SIZE,SUM_PAYMENT_WEIGHTED,SUM_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,MAX_DAYS_ENTRY_PAYMENT,MIN_DAYS_ENTRY_PAYMENT,RANGE_DAYS_ENTRY_PAYMENT,MAX_UNDERPAYMENT_6M,MAX_UNDERPAYMENT_12M,SUM_PAYMENT_6M,SUM_PAYMENT_DIFF_6M_12M,MAX_AMT_INSTALMENT_6M,MIN_AMT_INSTALMENT_6M,MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M
0,0.0,0.0,53093.745,53093.745,9251.775,565.0,0.0,0.0,0.0,0.0,95.448632,11559.247105,1083.545816,53093.745,15.761116,9251.775,1813.524009,219625.695,0.0,-49.0,-587.0,538.0,0.0,0.0,90100.845,34590.195,53093.745,9251.775,-12.0,-31.0
1,0.0,0.0,,,,2310.0,0.0,0.0,0.0,0.0,100.798053,64754.586,1030.947353,560835.36,2.899015,6662.97,2519.951327,1618864.65,0.0,-544.0,-2324.0,1780.0,,,0.0,0.0,,,-1.0,-14.0
2,0.0,0.0,,,,784.0,0.0,0.0,0.0,0.0,9.434878,7096.155,14.544656,10573.965,6.738679,5357.25,28.304633,21288.465,0.0,-727.0,-795.0,68.0,,,0.0,0.0,,,-3.0,-11.0
3,0.0,0.0,691786.89,691786.89,29027.52,545.0,0.0,0.0,0.0,0.0,543.096731,62947.088438,3975.786724,691786.89,4.318122,2482.92,8689.547693,1007153.415,0.0,-12.0,-575.0,563.0,0.0,0.0,865952.01,749841.93,691786.89,29027.52,-1.0,-77.0
4,0.0,0.0,16037.64,16037.64,16037.64,2326.0,3.0,29857.365,25.402727,22655.655,49.833434,12214.060227,1145.545714,22678.785,0.000125,0.18,3289.00667,806127.975,16.0,-14.0,-2318.0,2304.0,0.0,0.0,96225.84,0.0,16037.64,16037.64,12.0,-31.0


time: 23.2 ms


In [313]:
df = pd.DataFrame(scale.fit_transform(impute.fit_transform(df.replace([-np.inf, np.inf], np.nan))), columns=df.columns)
df.head()

Unnamed: 0,SUM_UNDERPAYMENT_12M,SUM_UNDERPAYMENT_6M,MAX_PAYMENT_SIZE_6M,MAX_PAYMENT_SIZE_12M,MIN_PAYMENT_SIZE_6M,MAX_ABS_DAYS_INSTALMENT,COUNT_UNDERPAYMENT,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_WEIGHTED,MAX_UNDERPAYMENT,AVG_PAYMENT_SIZE_WEIGHTED,AVG_PAYMENT_SIZE,MAX_PAYMENT_SIZE_WEIGHTED,MAX_PAYMENT_SIZE,MIN_PAYMENT_SIZE_WEIGHTED,MIN_PAYMENT_SIZE,SUM_PAYMENT_WEIGHTED,SUM_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,MAX_DAYS_ENTRY_PAYMENT,MIN_DAYS_ENTRY_PAYMENT,RANGE_DAYS_ENTRY_PAYMENT,MAX_UNDERPAYMENT_6M,MAX_UNDERPAYMENT_12M,SUM_PAYMENT_6M,SUM_PAYMENT_DIFF_6M_12M,MAX_AMT_INSTALMENT_6M,MIN_AMT_INSTALMENT_6M,MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M
0,0.087124,0.057306,0.062455,-0.156028,-0.055344,-1.143503,-0.461146,0.03512,0.059059,-0.429348,0.025055,-0.280018,0.034783,-0.3293,0.008828,0.298862,-0.161664,-0.491631,-0.525024,0.504291,1.134204,-0.783105,-0.200594,-0.277533,-0.030966,0.231387,0.067885,-0.081803,-0.253549,0.229909
1,0.087124,0.057306,-0.213935,-0.303097,-0.113491,0.81744,-0.461146,0.03512,0.059059,-0.429348,0.036434,1.891991,0.017093,1.76566,-0.020582,0.114758,-0.019108,1.070434,-0.525024,-0.43673,-0.820883,0.566571,-0.200594,-0.277533,-0.455841,0.107633,-0.212541,-0.107967,-0.151926,0.831629
2,0.087124,0.057306,-0.213935,-0.303097,-0.113491,-0.897402,-0.461146,0.03512,0.059059,-0.429348,-0.15791,-0.46225,-0.324733,-0.504738,-0.011802,0.021902,-0.52192,-0.713049,-0.525024,-0.784623,0.900089,-1.293852,-0.200594,-0.277533,-0.455841,0.107633,-0.212541,-0.107967,-0.170403,0.937814
3,0.087124,0.057306,5.210003,3.270694,0.403188,-1.165978,-0.461146,0.03512,0.059059,-0.429348,0.977269,1.818189,1.007471,2.305971,-0.017337,-0.182507,1.225909,0.387539,-0.525024,0.57463,1.147711,-0.755937,-0.200594,-0.277533,3.627602,2.790347,5.254689,0.381197,-0.151926,-1.398273
4,0.087124,0.057306,-0.236199,-0.354842,0.101997,0.83542,0.291053,0.210329,0.083116,1.095966,-0.071976,-0.253282,0.055634,-0.454793,-0.02721,-0.359067,0.136086,0.163121,2.033883,0.570828,-0.814129,1.135999,-0.200594,-0.277533,-0.002083,0.107633,-0.233046,0.077071,-0.031826,0.229909


time: 1.81 s


#### Get out of fold predictions

In [314]:
kfold = StratifiedKFold(n_splits=3)
scores = pd.DataFrame({"SK_ID_CURR": [], "INSTALLMENT_AGG_SYNTHETIC_TARGET": []})

time: 2.36 ms


In [315]:
for train_indx, test_indx in kfold.split(df.values, y.values):
    print("Determining lambda and fitting...")
    clf = LogisticRegressionCV(Cs=10, 
                               penalty="l2", 
                               fit_intercept=False, 
                               scoring="roc_auc", 
                               cv=kfold)
    clf.fit(df.iloc[train_indx], y.iloc[train_indx])
    print("Scoring...")
    fold_scores = pd.DataFrame({"SK_ID_CURR": sk_id_curr.iloc[test_indx], "INSTALLMENT_AGG_SYNTHETIC_TARGET": clf.predict_proba(df.iloc[test_indx])[:,1]})
    scores = pd.concat([scores, fold_scores], axis=0)
    
scores.head()

Determining lambda and fitting...
Scoring...
Determining lambda and fitting...
Scoring...
Determining lambda and fitting...
Scoring...


Unnamed: 0,SK_ID_CURR,INSTALLMENT_AGG_SYNTHETIC_TARGET
0,100002.0,0.514141
1,100003.0,0.47399
2,100004.0,0.525224
3,100006.0,0.561795
4,100007.0,0.501933


time: 26.3 s


#### Append

In [316]:
installment_agg = installment_agg.merge(scores, how="left", on="SK_ID_CURR")
installment_agg.head()

Unnamed: 0,SK_ID_CURR,SUM_UNDERPAYMENT_12M,SUM_UNDERPAYMENT_6M,MAX_PAYMENT_SIZE_6M,MAX_PAYMENT_SIZE_12M,MIN_PAYMENT_SIZE_6M,MAX_ABS_DAYS_INSTALMENT,COUNT_UNDERPAYMENT,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_WEIGHTED,MAX_UNDERPAYMENT,AVG_PAYMENT_SIZE_WEIGHTED,AVG_PAYMENT_SIZE,MAX_PAYMENT_SIZE_WEIGHTED,MAX_PAYMENT_SIZE,MIN_PAYMENT_SIZE_WEIGHTED,MIN_PAYMENT_SIZE,SUM_PAYMENT_WEIGHTED,SUM_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,MAX_DAYS_ENTRY_PAYMENT,MIN_DAYS_ENTRY_PAYMENT,RANGE_DAYS_ENTRY_PAYMENT,MAX_UNDERPAYMENT_6M,MAX_UNDERPAYMENT_12M,SUM_PAYMENT_6M,SUM_PAYMENT_DIFF_6M_12M,MAX_AMT_INSTALMENT_6M,MIN_AMT_INSTALMENT_6M,MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,INSTALLMENT_AGG_SYNTHETIC_TARGET
0,100001,0.0,0.0,,,,2916.0,0.0,0.0,0.0,0.0,3.116986,5885.132143,10.686671,17397.9,1.365586,3951.0,21.8189,41195.925,1.0,-1628.0,-2916.0,1288.0,,,0.0,0.0,,,11.0,-36.0,
1,100002,0.0,0.0,53093.745,53093.745,9251.775,565.0,0.0,0.0,0.0,0.0,95.448632,11559.247105,1083.545816,53093.745,15.761116,9251.775,1813.524009,219625.695,0.0,-49.0,-587.0,538.0,0.0,0.0,90100.845,34590.195,53093.745,9251.775,-12.0,-31.0,0.514141
2,100003,0.0,0.0,,,,2310.0,0.0,0.0,0.0,0.0,100.798053,64754.586,1030.947353,560835.36,2.899015,6662.97,2519.951327,1618864.65,0.0,-544.0,-2324.0,1780.0,,,0.0,0.0,,,-1.0,-14.0,0.47399
3,100004,0.0,0.0,,,,784.0,0.0,0.0,0.0,0.0,9.434878,7096.155,14.544656,10573.965,6.738679,5357.25,28.304633,21288.465,0.0,-727.0,-795.0,68.0,,,0.0,0.0,,,-3.0,-11.0,0.525224
4,100005,0.0,0.0,,,,706.0,0.0,0.0,0.0,0.0,11.09417,6240.205,37.566479,17656.245,6.539674,4813.2,99.847528,56161.845,1.0,-470.0,-736.0,266.0,,,0.0,0.0,,,1.0,-37.0,


time: 213 ms


#### Fill in test set cases

Fit model on full training data and predict TARGET for the test cases

In [317]:
clf = LogisticRegressionCV(Cs=10, 
                           penalty="l2", 
                           fit_intercept=False, 
                           scoring="roc_auc", 
                           cv=kfold)
clf.fit(df, y)

LogisticRegressionCV(Cs=10, class_weight=None,
           cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
           dual=False, fit_intercept=False, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='roc_auc',
           solver='lbfgs', tol=0.0001, verbose=0)

time: 14.4 s


In [318]:
null_id = installment_agg["INSTALLMENT_AGG_SYNTHETIC_TARGET"].isnull()
temp_frame = installment_agg[null_id].drop(["SK_ID_CURR", "INSTALLMENT_AGG_SYNTHETIC_TARGET"], axis=1).replace([-np.inf, np.inf], np.nan)
arr = scale.fit_transform(impute.fit_transform(temp_frame))
installment_agg.loc[null_id, "INSTALLMENT_AGG_SYNTHETIC_TARGET"] = clf.predict_proba(pd.DataFrame(arr, columns=temp_frame.columns))[:,1]

time: 407 ms


#### Reappend principal components if necessary

In [319]:
prin_comp = pd.read_csv(path + "installment_agg.csv", usecols=lambda c: (c == "SK_ID_CURR") or ("AGG_COMP" in c))
installment_agg = installment_agg.merge(prin_comp, how="left", on="SK_ID_CURR")
del prin_comp
installment_agg.head()

Unnamed: 0,SK_ID_CURR,SUM_UNDERPAYMENT_12M,SUM_UNDERPAYMENT_6M,MAX_PAYMENT_SIZE_6M,MAX_PAYMENT_SIZE_12M,MIN_PAYMENT_SIZE_6M,MAX_ABS_DAYS_INSTALMENT,COUNT_UNDERPAYMENT,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_WEIGHTED,MAX_UNDERPAYMENT,AVG_PAYMENT_SIZE_WEIGHTED,AVG_PAYMENT_SIZE,MAX_PAYMENT_SIZE_WEIGHTED,MAX_PAYMENT_SIZE,MIN_PAYMENT_SIZE_WEIGHTED,MIN_PAYMENT_SIZE,SUM_PAYMENT_WEIGHTED,SUM_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,MAX_DAYS_ENTRY_PAYMENT,MIN_DAYS_ENTRY_PAYMENT,RANGE_DAYS_ENTRY_PAYMENT,MAX_UNDERPAYMENT_6M,MAX_UNDERPAYMENT_12M,SUM_PAYMENT_6M,SUM_PAYMENT_DIFF_6M_12M,MAX_AMT_INSTALMENT_6M,MIN_AMT_INSTALMENT_6M,MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,INSTALLMENT_AGG_SYNTHETIC_TARGET,INSTALLMENT_AGG_COMP1,INSTALLMENT_AGG_COMP2,INSTALLMENT_AGG_COMP3,INSTALLMENT_AGG_COMP4,INSTALLMENT_AGG_COMP5,INSTALLMENT_AGG_COMP6,INSTALLMENT_AGG_COMP7
0,100001,0.0,0.0,,,,2916.0,0.0,0.0,0.0,0.0,3.116986,5885.132143,10.686671,17397.9,1.365586,3951.0,21.8189,41195.925,1.0,-1628.0,-2916.0,1288.0,,,0.0,0.0,,,11.0,-36.0,0.485121,-1.311336,-0.289649,-1.265137,0.259213,1.925718,0.03656,1.028292
1,100002,0.0,0.0,53093.745,53093.745,9251.775,565.0,0.0,0.0,0.0,0.0,95.448632,11559.247105,1083.545816,53093.745,15.761116,9251.775,1813.524009,219625.695,0.0,-49.0,-587.0,538.0,0.0,0.0,90100.845,34590.195,53093.745,9251.775,-12.0,-31.0,0.514141,-0.752288,1.616157,0.754784,-0.702447,-0.535793,0.056893,0.006947
2,100003,0.0,0.0,,,,2310.0,0.0,0.0,0.0,0.0,100.798053,64754.586,1030.947353,560835.36,2.899015,6662.97,2519.951327,1618864.65,0.0,-544.0,-2324.0,1780.0,,,0.0,0.0,,,-1.0,-14.0,0.47399,0.659627,-0.035899,-0.624043,1.42402,0.239304,-0.727819,1.224179
3,100004,0.0,0.0,,,,784.0,0.0,0.0,0.0,0.0,9.434878,7096.155,14.544656,10573.965,6.738679,5357.25,28.304633,21288.465,0.0,-727.0,-795.0,68.0,,,0.0,0.0,,,-3.0,-11.0,0.525224,-1.700891,1.634542,0.456757,-0.633019,-0.255442,-0.037613,0.026482
4,100005,0.0,0.0,,,,706.0,0.0,0.0,0.0,0.0,11.09417,6240.205,37.566479,17656.245,6.539674,4813.2,99.847528,56161.845,1.0,-470.0,-736.0,266.0,,,0.0,0.0,,,1.0,-37.0,0.525405,-1.571981,1.428804,0.404391,-0.61607,-0.355109,-0.043216,-0.162738


time: 1.65 s


In [320]:
installment_agg.shape

(339587, 39)

time: 2.52 ms


### AUC

In [321]:
temp = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"]).merge(installment_agg[["SK_ID_CURR", "INSTALLMENT_AGG_SYNTHETIC_TARGET"]], how="left", on="SK_ID_CURR").dropna()
print(round(roc_auc_score(temp["TARGET"], temp["INSTALLMENT_AGG_SYNTHETIC_TARGET"]), 4))
del temp
gc.collect()

0.6228


70

time: 6.67 s


In [None]:
installment_agg.to_csv(path + "installment_agg.csv", index=False, header=True)

## Point of Sale Aggregate Synthetic Target

In [87]:
frame = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"])
pos_cash_agg = pd.read_csv(path + "pos_cash_agg.csv", usecols=lambda c: ("AGG_SYNTH" not in c) and ("AGG_COMP" not in c))

df = pd.merge(frame, pos_cash_agg, how="left", on="SK_ID_CURR")
del frame
gc.collect()
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 6.76 s


In [88]:
df.head()

Unnamed: 0,MAX_POS_DPD,MAX_POS_DPD_DEF,NUM_POS_CASH,MIN_CNT_INSTALMENT_FUTURE_6M,MAX_CNT_INSTALMENT_FUTURE_6M,MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_12M
0,0.0,0.0,1.0,6.0,11.0,0.0
1,0.0,0.0,3.0,,,
2,0.0,0.0,1.0,,,
3,0.0,0.0,3.0,0.0,48.0,0.0
4,0.0,0.0,5.0,13.0,18.0,0.0


time: 11.2 ms


In [89]:
df = pd.DataFrame(scale.fit_transform(impute.fit_transform(df.replace([-np.inf, np.inf], np.nan))), columns=df.columns)
df.head()

Unnamed: 0,MAX_POS_DPD,MAX_POS_DPD_DEF,NUM_POS_CASH,MIN_CNT_INSTALMENT_FUTURE_6M,MAX_CNT_INSTALMENT_FUTURE_6M,MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_12M
0,-0.099399,-0.046552,-0.899183,0.16206,-0.159063,-0.054276
1,-0.099399,-0.046552,0.146297,-0.309552,-0.255013,-0.054276
2,-0.099399,-0.046552,-0.899183,-0.309552,-0.255013,-0.054276
3,-0.099399,-0.046552,0.146297,-0.545357,3.391078,-0.054276
4,-0.099399,-0.046552,1.191777,0.98738,0.512585,-0.054276


time: 461 ms


#### Get out of fold predictions

In [90]:
kfold = StratifiedKFold(n_splits=3)
scores = pd.DataFrame({"SK_ID_CURR": [], "POS_AGG_SYNTHETIC_TARGET": []})

time: 8.96 ms


In [91]:
for train_indx, test_indx in kfold.split(df.values, y.values):
    print("Determining lambda and fitting...")
    clf = LogisticRegressionCV(Cs=10, 
                               penalty="l2", 
                               fit_intercept=False, 
                               scoring="roc_auc", 
                               cv=kfold)
    clf.fit(df.iloc[train_indx], y.iloc[train_indx])
    print("Scoring...")
    fold_scores = pd.DataFrame({"SK_ID_CURR": sk_id_curr.iloc[test_indx], "POS_AGG_SYNTHETIC_TARGET": clf.predict_proba(df.iloc[test_indx])[:,1]})
    scores = pd.concat([scores, fold_scores], axis=0)
    
scores.head()

Determining lambda and fitting...
Scoring...
Determining lambda and fitting...
Scoring...
Determining lambda and fitting...
Scoring...


Unnamed: 0,SK_ID_CURR,POS_AGG_SYNTHETIC_TARGET
0,100002.0,0.510127
1,100003.0,0.495285
2,100004.0,0.509493
3,100006.0,0.523314
4,100007.0,0.48669


time: 5.57 s


#### Append

In [92]:
pos_cash_agg = pos_cash_agg.merge(scores, how="left", on="SK_ID_CURR")
pos_cash_agg.head()

Unnamed: 0,SK_ID_CURR,MAX_POS_DPD,MAX_POS_DPD_DEF,NUM_POS_CASH,MIN_CNT_INSTALMENT_FUTURE_6M,MAX_CNT_INSTALMENT_FUTURE_6M,MAX_CNT_INSTALMENT_FUTURE_PROD_SK_DPD_12M,POS_AGG_SYNTHETIC_TARGET
0,100001,7,7,2,,,,
1,100002,0,0,1,6.0,11.0,0.0,0.510127
2,100003,0,0,3,,,,0.495285
3,100004,0,0,1,,,,0.509493
4,100005,0,0,1,,,,


time: 201 ms


#### Fill in test set cases

Fit model on full training data and predict TARGET for the test cases

In [93]:
clf = LogisticRegressionCV(Cs=10, 
                           penalty="l2", 
                           fit_intercept=False, 
                           scoring="roc_auc", 
                           cv=kfold)
clf.fit(df, y)

LogisticRegressionCV(Cs=10, class_weight=None,
           cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
           dual=False, fit_intercept=False, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring='roc_auc',
           solver='lbfgs', tol=0.0001, verbose=0)

time: 3.04 s


In [94]:
null_id = pos_cash_agg["POS_AGG_SYNTHETIC_TARGET"].isnull()
temp_frame = pos_cash_agg[null_id].drop(["SK_ID_CURR", "POS_AGG_SYNTHETIC_TARGET"], axis=1).replace([-np.inf, np.inf], np.nan)
arr = scale.fit_transform(impute.fit_transform(temp_frame))
pos_cash_agg.loc[null_id, "POS_AGG_SYNTHETIC_TARGET"] = clf.predict_proba(pd.DataFrame(arr, columns=temp_frame.columns))[:,1]

time: 162 ms


#### Reappend principal components if necessary

In [332]:
pos_cash_agg.shape

(337252, 8)

time: 2.47 ms


### AUC

In [96]:
temp = pd.read_csv(path + "train.csv", usecols=["SK_ID_CURR", "TARGET"]).merge(pos_cash_agg[["SK_ID_CURR", "POS_AGG_SYNTHETIC_TARGET"]], how="left", on="SK_ID_CURR").dropna()
print(round(roc_auc_score(temp["TARGET"], temp["POS_AGG_SYNTHETIC_TARGET"]), 4))
del temp
gc.collect()

0.5658


111

time: 5.84 s


# LDA on all features

In [13]:
df = pd.read_csv(path + "train.csv")
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 28.7 s


In [6]:
scaled_df = pd.DataFrame(scale.fit_transform(quant.fit_transform(impute.fit_transform(df))), 
                        columns=df.columns, 
                        index=df.index)
del df
gc.collect()

113

time: 1min 26s


In [7]:
lda_score = pd.Series(np.empty(len(y)), 
                     index=scaled_df.index, 
                     name="LDA_SCORE")

time: 1.46 ms


In [8]:
kfold = StratifiedKFold(n_splits=3)

time: 716 µs


In [9]:
for train_indx, test_indx in kfold.split(scaled_df.values, y.values):
    print("Fitting...")
    clf = LinearDiscriminantAnalysis()
    clf.fit(scaled_df.iloc[train_indx], y.iloc[train_indx])
    print("Scoring...")
    lda_score.iloc[test_indx] = clf.predict_proba(scaled_df.iloc[test_indx])[:,1]

Fitting...
Scoring...
Fitting...
Scoring...
Fitting...
Scoring...
time: 1min 9s


In [10]:
roc_auc_score(y, lda_score)

0.776766596344993

time: 94.2 ms


In [12]:
del scaled_df
gc.collect()

134

time: 163 ms


# LightGBM

#### Select features

In [14]:
clf = lgb.LGBMClassifier(n_estimators=1000, num_leaves=23, subsample=0.5)
clf.fit(df.join(lda_score), y)

lgb_cols = df.join(lda_score).columns[clf.feature_importances_ > 0]
len(lgb_cols)

482

time: 1min 54s


#### CV performance

In [None]:
params = {"n_estimators": 5000, 
          "num_leaves": 500, 
          "min_data_in_leaf": 1000,
          "learning_rate": 0.005, 
          "bagging_fraction": 0.5, 
          "bagging_freq": 1, 
          "feature_fraction": 0.5, 
          "lambda_l2": 1}

to_drop = ["LDA_SCORE"]

lgb_data = lgb.Dataset(data=df.join(lda_score).drop(to_drop, axis=1), 
                       label=y)

cv_result = lgb.cv(params=params, 
                   train_set=lgb_data, 
                   nfold=5, 
                   metrics="auc", 
                   early_stopping_rounds=200, 
                   stratified=True, 
                   shuffle=True, 
                   verbose_eval=100, 
                   show_stdv=True, 
                   seed=2357)

cv_result = pd.DataFrame(cv_result)

[100]	cv_agg's auc: 0.766022 + 0.00185066
[200]	cv_agg's auc: 0.770263 + 0.00171526
[300]	cv_agg's auc: 0.773939 + 0.00163015
[400]	cv_agg's auc: 0.777171 + 0.00157386
[500]	cv_agg's auc: 0.779789 + 0.00149443


In [None]:
cv_result.tail()

With LDA_SCORE: 0.792666

# Principal Components

## Credit Card

In [3]:
credit_card_agg = pd.read_csv(path + "credit_card_agg.csv")
frame = credit_card_agg.drop(["SK_ID_CURR", "CREDIT_CARD_AGG_SYNTHETIC_TARGET"], axis=1)
frame = pd.DataFrame(scale.fit_transform(impute.fit_transform(frame.replace([-np.inf, np.inf], np.nan))), columns=frame.columns)
frame.head()

Unnamed: 0,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M
0,-0.083113,-0.088853,-0.398413,-0.499736,-0.55513,-0.592882,-0.334658,-0.384997,-0.286135,-0.146678,-0.51241,-0.658542,-0.650688,-0.830574,-0.661302,-0.944723,-0.706254,-0.928712,-0.990213,-1.301943,-0.494588,-0.525476,-0.568979,-0.590138,-0.676992,-0.076963,-0.012305,-0.489621,-0.414114,-0.637581,-0.604955,-0.513358,-0.80724,-0.01321,-0.003539,-0.003716,-0.355897,-0.308925,-0.321915,-0.380391,-0.00585,-0.707062,-0.671476,-0.045806,-0.051199
1,-0.083113,-0.088853,-0.398413,-0.499736,-0.55513,-0.592882,-0.334658,-0.384997,-0.286135,-0.146678,-0.51241,-0.658542,-0.144053,0.272594,0.571896,1.122404,0.654635,1.10744,-0.050815,0.88148,-0.462704,-0.506566,-0.409983,-0.536243,-0.63955,-0.076963,-0.012305,-0.414512,-0.399325,-0.259663,-0.209929,-0.421317,0.669207,0.001945,-0.003539,-0.003716,-0.355897,-0.308925,-0.321915,-0.380391,-0.00585,-0.707062,-0.671476,-0.045806,-0.051199
2,-0.083113,-0.088853,-0.398413,-0.499736,-0.55513,-0.592882,-0.334658,-0.384997,-0.286135,-0.146678,-0.51241,-0.658542,-0.481817,0.111614,-0.12805,1.75153,-0.706254,1.766195,-0.632363,0.829266,-0.48636,-0.510885,-0.515751,-0.574243,-0.643975,-0.076766,-0.011492,-0.057743,-0.348563,0.023775,0.649253,-0.288096,0.484651,-0.015844,-0.003539,-0.003716,-0.355897,-0.308925,-0.321915,-0.380391,-0.00585,-0.707062,-0.671476,-0.045806,-0.051199
3,-0.083113,-0.088853,-0.398413,-0.499736,-0.55513,-0.592882,-0.334658,-0.384997,-0.286135,-0.146678,-0.51241,-0.658542,-0.650688,-0.830574,-0.661302,-0.585223,0.654635,-0.599334,-0.990213,-1.301943,-0.494588,-0.525476,-0.568979,-0.590138,-0.676992,-0.076963,-0.012305,-0.489621,-0.414114,-0.637581,-0.604955,-0.513358,-0.80724,-0.01321,-0.003539,-0.003716,-0.355897,-0.308925,-0.321915,-0.380391,-0.00585,-0.707062,-0.671476,-0.045806,-0.051199
4,-0.083113,-0.088853,-0.398413,-0.499736,-0.55513,-0.592882,-0.334658,-0.384997,-0.286135,-0.146678,-0.51241,-0.658542,-0.650688,-0.830574,-0.661302,-0.794931,3.376414,-0.868825,-0.990213,-1.301943,-0.494588,-0.525476,-0.568979,-0.590138,-0.676992,-0.076963,-0.012305,-0.489621,-0.414114,-0.637581,-0.604955,-0.513358,-0.80724,-0.01321,-0.003539,-0.003716,-0.355897,-0.308925,-0.321915,-0.380391,-0.00585,-0.707062,-0.671476,-0.045806,-0.051199


time: 1.89 s


In [4]:
num_comp = 7
pca = PCA(n_components=num_comp)
pca.fit(frame)
princomp = pd.DataFrame(np.dot(frame, pca.components_.T), columns=["CREDIT_CARD_AGG_COMP" + str(i+1) for i in range(num_comp)])
credit_card_agg = pd.concat([credit_card_agg, princomp], axis=1)
credit_card_agg.head()

Unnamed: 0,SK_ID_CURR,MAX_CREDIT_CARD_SK_DPD_6M,MAX_CREDIT_CARD_SK_DPD_12M,MAX_AMT_DRAWINGS_CURRENT_6M,MAX_AMT_DRAWINGS_CURRENT_12M,MAX_AMT_INST_MIN_REGULARITY_6M,MAX_AMT_INST_MIN_REGULARITY_12M,MAX_CNT_DRAWINGS_POS_CURRENT_6M,MAX_CNT_DRAWINGS_POS_CURRENT_12M,SUM_CC_PAYMENT_DIFF_12M,DIFF_AVG_BALANCE_6M_12M,AVG_BALANCE_6M,AVG_UTILIZATION_6M,AVG_BALANCE,MAX_BALANCE,SUM_BALANCE,MAX_MONTHS_BALANCE,MIN_MONTHS_BALANCE,RANGE_MONTHS_BALANCE,AVG_UTILIZATION,MAX_UTILIZATION,AVG_BALANCE_WEIGHTED,MAX_BALANCE_WEIGHTED,SUM_BALANCE_WEIGHTED,AVG_UTILIZATION_WEIGHTED,MAX_UTILIZATION_WEIGHTED,MAX_DPD_WEIGHTED,MAX_DPD_DEF_WEIGHTED,SUM_CNT_DRAWINGS_CURRENT,AVG_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_CURRENT,AVG_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MIN_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT_DIV_AMT_INST_MIN_REGULARITY,SUM_CNT_DRAWINGS_ATM_CURRENT_6M,SUM_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_DRAWINGS_ATM_CURRENT_6M,MAX_CNT_DRAWINGS_ATM_CURRENT_6M,MAX_AMT_RECEIVABLE_DIV_AMT_RECEIVABLE_PRINCIPAL_6M,MAX_UTILIZATION_6M,MAX_UTILIZATION_3M,MAX_CREDIT_CARD_INST_AMT_PAST_DUE_6M,MIN_CREDIT_CARD_INST_AMT_PAST_DUE_12M,CREDIT_CARD_AGG_SYNTHETIC_TARGET,CREDIT_CARD_AGG_COMP1,CREDIT_CARD_AGG_COMP2,CREDIT_CARD_AGG_COMP3,CREDIT_CARD_AGG_COMP4,CREDIT_CARD_AGG_COMP5,CREDIT_CARD_AGG_COMP6,CREDIT_CARD_AGG_COMP7
0,100006,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,6.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,,0.055377,-2.996367,0.956185,-1.143641,0.37796,0.322572,0.12285,-0.890276
1,100011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54482.111149,189000.0,4031676.225,75.0,2.0,73.0,0.302678,1.05,891.528045,2520.0,65973.075311,0.004953,0.014,0.0,0.0,4.0,0.054054,4.0,180000.0,2432.432432,180000.0,1.0,inf,inf,0.0,0.0,0.0,0.0,,0.0,0.0,,,0.068132,-1.91423,-1.336225,0.97039,-0.360336,-1.157258,-0.099205,0.683874
2,100013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18159.919219,161420.22,1743352.245,96.0,1.0,95.0,0.115301,1.02489,230.066978,1944.407308,22086.429911,0.001461,0.012345,0.014493,0.014493,23.0,0.239583,7.0,571500.0,5953.125,157500.0,0.0,inf,inf,0.0,0.0,0.0,0.0,,0.0,0.0,,,0.067048,-1.995274,-1.079984,1.425906,-0.458675,-1.500304,-0.189693,1.339213
3,100021,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,2.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,0.0,,,0.056155,-3.130604,0.815268,-0.953739,0.421427,0.156045,0.118608,-0.890236
4,100023,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,4.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,0.0,,,,0.065454,-3.290014,1.021466,-1.043096,0.623813,0.148561,0.201433,-1.532772


time: 748 ms


In [5]:
credit_card_agg.shape

(103558, 54)

time: 2.29 ms


## Previous Application

In [80]:
previous_agg = pd.read_csv(path + "previous_agg.csv", usecols=lambda c: "AGG_COMP" not in c)
frame = previous_agg.drop(["SK_ID_CURR", "PREVIOUS_AGG_SYNTHETIC_TARGET"], axis=1)
frame = pd.DataFrame(scale.fit_transform(impute.fit_transform(frame.replace([-np.inf, np.inf], np.nan))), columns=frame.columns)
frame.head()

Unnamed: 0,MIN_PREV_AMT_ANNUITY_12M,MIN_PREV_AMT_ANNUITY_24M,MIN_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_24M,MAX_PREV_PROP_APPROVED_12M,MAX_PREV_PROP_APPROVED_24M,COUNT_PREV_APP,MIN_PREV_DAYS_TERMINATION,MAX_PREV_DAYS_TERMINATION,AVG_PREV_DAYS_TERMINATION,RANGE_PREV_DAYS_TERMINATION,MIN_PREV_AMT_CREDIT,MAX_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT,MIN_PREV_AMT_CREDIT_WEIGHTED,MAX_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,MIN_PREV_AMT_CREDIT_DIV_ANNUITY,MAX_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MIN_PREV_AMT_ANNUITY,MAX_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY,MIN_PREV_AMT_ANNUITY_WEIGHTED,MAX_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_ANNUITY_WEIGHTED,MIN_DAYS_DECISION,MAX_DAYS_DECISION,RANGE_DAYS_DECISION,SUM_DAYS_LAST_DUE_NULL,AVG_DAYS_LAST_DUE_NULL,AVG_PREV_REQ_AMOUNT_WEIGHTED,MAX_PREV_REQ_AMOUNT_WEIGHTED,AVG_PREV_REQ_AMOUNT,MAX_PREV_REQ_AMOUNT,AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED,AVG_PREV_PROP_APPROVED_WEIGHTED,MAX_PREV_PROP_APPROVED_WEIGHTED,AVG_PREV_RATE_DOWNPAYMENT,AVG_PREV_PROP_APPROVED,MAX_PREV_PROP_APPROVED,MIN_PREV_PROP_APPROVED,AVG_PREV_INT_RATE,SUM_PREV_URGENT_NEEDS,SUM_PREV_REPAIRS,SUM_PREV_OTHER,SUM_PREV_LIMIT_REJECT,SUM_REFUSED_CONTRACT,SUM_CANC_CONTRACT,SUM_APPR_CONTRACT,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,COUNT_PREV_WALK_IN,COUNT_PREV_HIGH_YIELD,COUNT_PREV_LOW_YIELD,SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE,SUM_DAYS_FIRST_DRAWING_SENTINEL,SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,SUM_DAYS_LAST_DUE_LT_FIRST_VERSION,MIN_RATE_INTEREST_PRIMARY_12M,AVG_RATE_INTEREST_PRIVILEGED_12M,SUM_REFUSED_CONTRACT_6M,SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M,SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M,SUM_NAME_GOODS_CATEGORY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_CSTR_6M,SUM_NAME_PAYMENT_TYPE_XNA_6M,COUNT_NAME_CLIENT_TYPE_REPEATER_12M,COUNT_NAME_CLIENT_TYPE_NEW_12M,AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M
0,-0.20337,-0.269348,-0.091275,-0.038181,0.010188,-0.043361,-0.031073,-0.930878,-0.290102,-1.046355,-0.806661,-0.888488,-0.253118,-0.859038,-0.888254,-0.061548,-0.146919,-0.15241,-0.48176,-1.247146,-1.274387,-0.102814,-0.203811,-0.215664,-0.521022,-1.038093,-1.060894,-0.088357,-0.183755,-0.191317,-0.228004,-2.343923,-1.182417,-0.652054,-1.015875,-0.152471,-0.147753,-0.849415,-0.835933,-0.097335,-0.264047,-0.235391,0.274481,-0.598271,-0.830647,0.220864,0.000575,-0.119362,-0.180841,-0.15509,-0.253959,-0.468607,-0.54178,-0.964522,-0.369597,-0.740713,-0.405002,-0.030494,-0.798083,-1.002924,-0.890906,-0.324769,-0.236958,-0.120713,-0.00144,0.014483,-0.309833,-0.286025,-0.265768,-0.373496,-0.364695,-0.068306,-0.39633,-0.584399,-0.226737,-0.134695,-0.190281,-0.083524,-0.065043,-0.030225,-0.17607,-0.174794
1,-0.20337,-0.283452,-0.091275,-0.038181,-0.471055,-0.043361,-0.601476,-0.930878,-0.273684,-1.037628,-0.792062,-0.888488,1.356782,-0.519619,0.052874,0.18822,-0.13332,-0.095998,2.722014,0.144542,1.363771,0.228852,-0.156504,-0.082367,0.176605,-0.762751,-0.535396,0.014917,-0.168238,-0.147301,1.082426,-0.231896,-1.182417,-0.652054,-1.015875,-0.091135,-0.132774,0.161104,-0.47043,-0.178424,-0.15959,-0.196526,-0.925028,-0.161182,-0.580702,0.522011,0.000575,-0.119362,-0.180841,-0.15509,-0.253959,-0.468607,-0.54178,-0.964522,-0.369597,-0.740713,-0.405002,-0.744408,-0.14507,-1.002924,-0.890906,-0.268249,-0.174918,-0.120713,-0.00144,0.014483,-0.309833,-0.286025,-0.265768,-0.373496,-0.364695,-0.068306,-0.39633,-0.584399,-0.226737,-0.134695,-0.190281,-0.083524,-0.065043,-0.030225,-0.17607,-0.174794
2,-0.20337,-0.269348,-0.091275,-0.038181,0.010188,-0.043361,-0.031073,-0.457024,-0.293849,-1.040419,-0.801492,-0.880512,0.20586,1.353425,1.902399,-0.047899,-0.08057,-0.032452,-0.630963,-0.776257,-0.748664,-0.092859,-0.186106,-0.192924,-0.15432,3.8657,4.153957,-0.083532,-0.029048,0.041124,-0.922508,-0.49264,0.594767,-0.652054,-1.015875,-0.036092,-0.084246,1.841035,1.238222,-0.149521,-0.214605,-0.200318,-0.349796,0.435829,0.313158,0.44364,0.000575,-0.119362,-0.180841,-0.15509,-0.253959,-0.468607,-0.54178,-0.02793,-0.369597,0.773034,-0.405002,-0.744408,-0.14507,0.472577,0.122849,-0.198602,-0.192782,-0.120713,-0.00144,0.014483,-0.309833,-0.286025,-0.265768,-0.373496,-0.364695,-0.068306,-0.39633,-0.584399,-0.226737,-0.134695,-0.190281,-0.083524,-0.065043,-0.030225,-0.17607,-0.174794
3,-0.20337,-0.269348,-0.091275,-0.038181,0.010188,-0.043361,-0.031073,-0.930878,-0.280858,-1.041442,-0.798441,-0.888488,-0.291285,-0.867085,-0.910566,-0.051799,-0.146388,-0.150208,-1.026602,-1.483819,-1.723039,-0.08948,-0.201909,-0.210304,-0.335947,-0.965047,-0.921484,-0.054167,-0.178618,-0.176745,0.840909,-0.621149,-1.182417,-0.652054,-1.015875,-0.149086,-0.146926,-0.853041,-0.837245,0.173388,-0.219829,-0.218939,1.512567,-1.941717,-1.598884,-0.704748,0.000575,-0.119362,-0.180841,-0.15509,-0.253959,-0.468607,-0.54178,-0.964522,-0.369597,-0.740713,-0.405002,-0.744408,-0.798083,-1.002924,-0.890906,-0.290488,-0.199329,-0.120713,-0.00144,0.014483,-0.309833,-0.286025,-0.265768,-0.373496,-0.364695,-0.068306,-0.39633,-0.584399,-0.226737,-0.134695,-0.190281,-0.083524,-0.065043,-0.030225,-0.17607,-0.174794
4,-0.20337,-0.269348,-0.091275,-0.038181,0.010188,-0.043361,-0.031073,-0.693951,-0.278244,-1.040052,-0.796117,-0.888488,-0.499755,-0.823261,-0.910743,-0.073665,-0.145019,-0.149837,0.076156,-1.004792,-0.814969,-0.01476,-0.191251,-0.180275,-0.407549,-0.993307,-0.975419,-0.055876,-0.178875,-0.177473,0.907933,0.31008,-0.689931,-0.323775,0.835001,-0.149157,-0.145374,-0.865971,-0.78905,0.016248,-0.20341,-0.21283,0.327805,-1.197024,-1.173038,-0.191667,0.000575,-0.119362,-0.180841,-0.15509,-0.253959,-0.468607,0.038602,-0.964522,-0.369597,-0.740713,-0.405002,-0.030494,-0.798083,-1.002924,-0.890906,-0.285548,-0.193906,-0.120713,-0.00144,0.014483,-0.309833,-0.286025,-0.265768,-0.373496,-0.364695,-0.068306,-0.39633,-0.180272,-0.226737,-0.134695,-0.190281,-0.083524,-0.065043,-0.030225,-0.17607,-0.174794


time: 12.5 s


In [81]:
previous_agg.shape

(338857, 84)

time: 2.71 ms


In [82]:
num_comp = 7
pca = PCA(n_components=num_comp)
pca.fit(frame)
princomp = pd.DataFrame(np.dot(frame, pca.components_.T), columns=["PREVIOUS_AGG_COMP" + str(i+1) for i in range(num_comp)])
previous_agg = pd.concat([previous_agg, princomp], axis=1)
previous_agg.head()

Unnamed: 0,SK_ID_CURR,MIN_PREV_AMT_ANNUITY_12M,MIN_PREV_AMT_ANNUITY_24M,MIN_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_12M,AVG_PREV_PROP_APPROVED_24M,MAX_PREV_PROP_APPROVED_12M,MAX_PREV_PROP_APPROVED_24M,COUNT_PREV_APP,MIN_PREV_DAYS_TERMINATION,MAX_PREV_DAYS_TERMINATION,AVG_PREV_DAYS_TERMINATION,RANGE_PREV_DAYS_TERMINATION,MIN_PREV_AMT_CREDIT,MAX_PREV_AMT_CREDIT,AVG_PREV_AMT_CREDIT,MIN_PREV_AMT_CREDIT_WEIGHTED,MAX_PREV_AMT_CREDIT_WEIGHTED,AVG_PREV_AMT_CREDIT_WEIGHTED,MIN_PREV_AMT_CREDIT_DIV_ANNUITY,MAX_PREV_AMT_CREDIT_DIV_ANNUITY,AVG_PREV_AMT_CREDIT_DIV_ANNUITY,MIN_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MAX_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,AVG_PREV_AMT_CREDIT_DIV_ANNUITY_WEIGHTED,MIN_PREV_AMT_ANNUITY,MAX_PREV_AMT_ANNUITY,AVG_PREV_AMT_ANNUITY,MIN_PREV_AMT_ANNUITY_WEIGHTED,MAX_PREV_AMT_ANNUITY_WEIGHTED,AVG_PREV_AMT_ANNUITY_WEIGHTED,MIN_DAYS_DECISION,MAX_DAYS_DECISION,RANGE_DAYS_DECISION,SUM_DAYS_LAST_DUE_NULL,AVG_DAYS_LAST_DUE_NULL,AVG_PREV_REQ_AMOUNT_WEIGHTED,MAX_PREV_REQ_AMOUNT_WEIGHTED,AVG_PREV_REQ_AMOUNT,MAX_PREV_REQ_AMOUNT,AVG_PREV_RATE_DOWNPAYMENT_WEIGHTED,AVG_PREV_PROP_APPROVED_WEIGHTED,MAX_PREV_PROP_APPROVED_WEIGHTED,AVG_PREV_RATE_DOWNPAYMENT,AVG_PREV_PROP_APPROVED,MAX_PREV_PROP_APPROVED,MIN_PREV_PROP_APPROVED,AVG_PREV_INT_RATE,SUM_PREV_URGENT_NEEDS,SUM_PREV_REPAIRS,SUM_PREV_OTHER,SUM_PREV_LIMIT_REJECT,SUM_REFUSED_CONTRACT,SUM_CANC_CONTRACT,SUM_APPR_CONTRACT,SUM_PREV_HC_REJECT,SUM_PREV_INSURE_REQ,COUNT_PREV_WALK_IN,COUNT_PREV_HIGH_YIELD,COUNT_PREV_LOW_YIELD,SUM_DAYS_LAST_DUE_1ST_VERSION_EQ_DAYS_LAST_DUE,SUM_DAYS_FIRST_DRAWING_SENTINEL,SUM_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,MAX_DAYS_FIRST_DRAWING_SENTINEL_WEIGHTED,SUM_DAYS_LAST_DUE_LT_FIRST_VERSION,MIN_RATE_INTEREST_PRIMARY_12M,AVG_RATE_INTEREST_PRIVILEGED_12M,SUM_REFUSED_CONTRACT_6M,SUM_PRODUCT_COMBINATION_POS_HOUSE_INTEREST_12M,SUM_PRODUCT_COMBINATION_POS_MOBILE_INTEREST_12M,SUM_NAME_GOODS_CATEGORY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_XNA_6M,SUM_NAME_SELLER_INDUSTRY_CSTR_6M,SUM_NAME_PAYMENT_TYPE_XNA_6M,COUNT_NAME_CLIENT_TYPE_REPEATER_12M,COUNT_NAME_CLIENT_TYPE_NEW_12M,AVG_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_ANNUITY_6M,AVG_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,MAX_PREV_AMT_CREDIT_DIV_AMT_GOODS_PRICE_6M,AVG_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,MIN_PREV_AMT_CREDIT_PLUS_AMT_ANNUITY_6M,PREVIOUS_AGG_SYNTHETIC_TARGET,PREVIOUS_AGG_COMP1,PREVIOUS_AGG_COMP2,PREVIOUS_AGG_COMP3,PREVIOUS_AGG_COMP4,PREVIOUS_AGG_COMP5,PREVIOUS_AGG_COMP6,PREVIOUS_AGG_COMP7
0,100001,,,,,,,,1.0,-1612.0,-1612.0,-1612.0,0.0,23787.0,23787.0,23787.0,13.67069,13.67069,13.67069,6.020501,6.020501,6.020501,0.00346,0.00346,0.00346,3951.0,3951.0,3951.0,2.27069,2.27069,2.27069,-1740.0,-1740.0,0.0,0.0,0.0,14.273276,14.273276,24835.5,24835.5,6e-05,0.00055,0.00055,0.104326,0.957782,0.957782,0.957782,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.000575,0.000575,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.513661,-3.921423,0.977275,-0.190697,1.345474,-0.545294,1.929032,1.411176
1,100002,,9251.775,,,1.0,,1.0,1.0,-17.0,-17.0,-17.0,0.0,179055.0,179055.0,179055.0,295.470297,295.470297,295.470297,19.353584,19.353584,19.353584,0.031937,0.031937,0.031937,9251.775,9251.775,9251.775,15.266955,15.266955,15.266955,-606.0,-606.0,0.0,0.0,0.0,295.470297,295.470297,179055.0,179055.0,0.0,0.00165,0.00165,0.0,1.0,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.00165,0.00165,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.487067,-2.72488,1.59177,2.395666,0.118051,-0.42582,-0.376677,0.405732
2,100003,,,,,,,,3.0,-1976.0,-527.0,-1047.333333,1449.0,68053.5,1035882.0,484191.0,29.070269,1388.581769,612.90394,5.399568,10.531859,8.677472,0.004315,0.014118,0.008318,6737.31,98356.995,56553.99,2.877962,131.845838,70.901357,-2341.0,-746.0,1595.0,0.0,0.0,547.812073,1206.434316,435436.5,900000.0,2.1e-05,0.001071,0.001543,0.05003,1.057664,1.15098,0.989013,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,2.0,3.0,0.002975,0.00134,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.42019,0.428183,-0.535911,2.110056,-4.18119,1.337309,1.599647,0.545277
3,100004,,,,,,,,1.0,-714.0,-714.0,-714.0,0.0,20106.0,20106.0,20106.0,24.669939,24.669939,24.669939,3.753045,3.753045,3.753045,0.004605,0.004605,0.004605,5357.25,5357.25,5357.25,6.573313,6.573313,6.573313,-815.0,-815.0,0.0,0.0,0.0,29.793865,29.793865,24282.0,24282.0,0.00026,0.001016,0.001016,0.212008,0.828021,0.828021,0.828021,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.001227,0.001227,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.506853,-4.125854,1.233849,-0.810943,0.819096,-1.583013,1.885414,1.163706
4,100005,,,,,,,,2.0,-460.0,-460.0,-460.0,0.0,0.0,40153.5,20076.75,0.0,53.042933,26.521466,8.342371,8.342371,8.342371,0.01102,0.01102,0.01102,4813.2,4813.2,4813.2,6.358256,6.358256,6.358256,-757.0,-315.0,442.0,1.0,0.5,29.469947,58.939894,22308.75,44617.5,0.000144,0.001189,0.001189,0.108964,0.89995,0.89995,0.89995,,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.001321,0.001321,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,0.529786,-3.004413,0.852979,-0.301994,1.40546,-1.294547,0.780261,1.127595


time: 4.13 s


In [86]:
previous_agg.shape

(338857, 91)

time: 2.45 ms


## Bureau

In [21]:
bureau_agg = pd.read_csv(path + "bureau_agg.csv")
frame = bureau_agg.drop(["SK_ID_CURR", "BUREAU_AGG_SYNTHETIC_TARGET"], axis=1)
frame = pd.DataFrame(scale.fit_transform(impute.fit_transform(frame.replace([-np.inf, np.inf], np.nan))), columns=frame.columns)
frame.head()

Unnamed: 0,SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M,SUM_CC_DEBT_6M,SUM_CC_DEBT_12M,MAX_WORST_DQ_BUREAU_BALANCE_6M,MAX_WORST_DQ_BUREAU_BALANCE_12M,MAX_BUREAU_UTILIZATION_6M,MAX_BUREAU_UTILIZATION_12M,COUNT_ACTIVE_6M,COUNT_ACTIVE_12M,COUNT_ACTIVE_24M,DAYS_REMAINING_ACTIVE,MAX_CREDIT_DAY_OVERDUE_6M,MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_12M_24M,BUREAU_SUM_DEBT_DIFF_6M_12M,BUREAU_SUM_DEBT_DIFF_12M_24M,MAX_CNT_CREDIT_PROLONG,AVG_LEN_BUREAU_BALANCE,PROP_CURRENT,PROP_CLOSED,PROP_CURRENT_WEIGHTED,MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE,MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE,SUM_SUM_CURRENT_BUREAU_BALANCE,AVG_PROP_CURRENT,AVG_PROP_DQ,MAX_PROP_DQ,AVG_PROP_CURRENT_WEIGHTED,MIN_PROP_CURRENT_WEIGHTED,AVG_PROP_DQ_WEIGHTED,MAX_PROP_DQ_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,MIN_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ_WEIGHTED_AMT,MAX_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,TOTAL_AMT_CREDIT_SUM_POS_DAYS,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_LEN_BUREAU_BALANCE,SUM_LEN_BUREAU_BALANCE,MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE,MIN_DAYS_CREDIT_ENDDATE,MAX_DAYS_CREDIT_ENDDATE,SUM_DAYS_CREDIT_ENDDATE,SUM_NULL_DAYS_ENDDATE_FACT,COUNT_BUREAU_RECORDS,COUNT_ACTIVE,MAX_CREDIT_DAY_OVERDUE_WEIGHTED,SUM_CREDIT_DAY_OVERDUE_WEIGHTED,MAX_CREDIT_DAY_OVERDUE,SUM_CREDIT_DAY_OVERDUE,DAYS_SINCE_APPLIED,SUM_INVERSE_DAYS_CREDIT,MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,MAX_AMT_CREDIT_MAX_OVERDUE,SUM_AMT_CREDIT_MAX_OVERDUE,SUM_CNT_CREDIT_PROLONG,SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED,SUM_AMT_CREDIT_SUM_DEBT,BUREAU_UTILIZATION_AVG,BUREAU_UTILIZATION_MAX,BUREAU_PROP_SUM_OVERDUE_AVG,BUREAU_PROP_MAX_OVERDUE_AVG,MAX_DAYS_CREDIT_UPDATE,RANGE_DAYS_CREDIT_UPDATE,DAYS_CREDIT_RANGE,TOTAL_AMT_CREDIT_SUM_WEIGHTED,TOTAL_AMT_CREDIT_SUM,COUNT_CREDIT_CARD,COUNT_CAR_LOAN,COUNT_MORTGAGE,SUM_AMT_ANNUITY,BUREAU_AGG_COMP1,BUREAU_AGG_COMP2,BUREAU_AGG_COMP3,BUREAU_AGG_COMP4,BUREAU_AGG_COMP5,BUREAU_AGG_COMP6,BUREAU_AGG_COMP7
0,0.008409,-0.404347,-0.408288,2.979738,2.395456,-0.007603,-0.007255,0.794515,0.745754,0.597533,-0.142532,-0.03202,-0.007135,0.001625,-0.001846,-0.007685,-0.007685,-0.156358,-0.736477,-0.070091,0.426421,-0.151689,-0.03988,-0.716668,0.510476,0.268297,0.085164,-0.085164,0.169495,2.141916,-0.134742,-0.028612,0.073509,0.698633,-0.050086,-0.026582,0.001562,0.187357,1.026268,0.318205,-0.148818,-0.242212,-0.488278,-0.488278,0.355103,-0.113884,-0.287609,-0.166321,0.515261,0.313134,0.523474,-0.036308,-0.035193,-0.051846,-0.051706,-0.828339,0.579231,-0.097297,-0.102979,-0.021913,-0.025746,-0.154452,0.012633,-0.03488,-0.009142,-0.007649,-0.005878,0.005976,0.386648,-0.81578,0.271683,0.047063,-0.129411,-0.869386,-0.2578,-0.229794,-0.00076,0.346875,0.575874,0.662495,-0.323353,-1.129956,0.010354,0.152581
1,-0.022963,-0.404347,-0.408288,-0.198581,-0.235108,-0.011793,-0.011446,0.159632,0.123802,0.019504,-0.353989,-0.03202,-0.007135,0.001625,-0.001846,-0.247348,-0.247348,-0.156358,-0.475448,-5.494959,-0.424205,-0.190473,-0.435191,-0.611796,0.080166,0.255364,-5.329139,5.329139,4.20728,0.088639,-0.210269,1.034886,0.792557,-0.235326,-0.069023,0.174253,0.186197,2.101018,0.352518,0.693025,-0.24541,-0.41464,-0.424878,-0.424878,0.572956,-0.01542,-0.393909,-0.376646,-0.040002,0.53885,-0.034649,-0.036308,-0.035193,-0.051846,-0.051706,-0.727126,0.136671,0.03372,0.027172,-0.004304,0.003479,-0.154452,-0.075583,-0.248757,-0.009142,-0.007649,-0.005878,0.005976,0.383748,0.131473,0.065945,-0.063256,-0.270634,1.774782,-0.2578,-0.229794,-0.056204,0.327436,2.872822,0.236537,-0.400444,-0.335987,-0.167222,0.065951
2,-0.057216,-0.404347,-0.408288,-0.198581,-0.235108,-0.01698,-0.016633,-0.47525,-0.49815,-0.558524,-0.314095,-0.03202,-0.007135,0.001625,-0.001846,-0.415213,-0.415213,-0.156358,-0.16822,0.227302,-0.169636,-0.138085,-0.011643,-0.244743,-0.027412,-0.520616,0.23292,-0.23292,-0.305538,-0.185166,-0.143995,-0.120773,-0.138937,-0.113569,-0.056812,-0.068996,-0.070805,-0.262917,-0.224982,-0.191658,-0.177909,-0.391612,-0.171278,-0.171278,-0.02614,-0.537241,-0.34747,-0.383261,-0.595264,-0.364014,-0.592771,-0.036308,-0.035193,-0.051846,-0.051706,0.215654,-0.390123,-0.097297,-0.102979,-0.021913,-0.025746,-0.154452,-0.246663,-0.398561,-0.009577,-0.007983,-0.005878,0.005976,0.279337,0.96918,0.769155,-0.242179,-0.234064,0.452698,-0.2578,-0.229794,-0.056204,-0.436946,-0.196579,-0.36802,-0.118736,-0.403783,0.040286,0.091427
3,-0.057216,-0.404347,-0.408288,-0.198581,-0.235108,-0.008937,-0.008752,-1.110132,-1.120102,-1.136553,-0.425359,-0.03202,-0.007135,0.001625,-0.001846,-0.415213,-0.415213,-0.156358,-0.16822,0.227302,-0.169636,-0.138085,-0.011643,-0.244743,-0.027412,-0.520616,0.23292,-0.23292,-0.305538,-0.185166,-0.143995,-0.120773,-0.138937,-0.113569,-0.056812,-0.068996,-0.070805,-0.262917,-0.224982,-0.191658,-0.496225,-0.488503,-0.171278,-0.171278,-0.02614,0.167333,-0.517677,-0.288689,-1.150527,-0.815446,-1.150894,-0.036308,-0.035193,-0.051846,-0.051706,-0.155461,-0.418082,-0.097297,-0.102979,-0.021913,-0.025746,-0.154452,-0.246663,-0.398561,-0.009142,-0.007649,-0.005878,0.005976,-0.70386,-0.676776,-0.386897,-0.309133,-0.432911,-0.869386,-0.2578,-0.229794,-0.056204,-1.011322,-0.058936,-0.232194,0.132002,0.053512,-0.026916,0.019336
4,0.009934,-0.404347,-0.408288,-0.198581,-0.235108,-0.007912,-0.007565,0.159632,0.123802,0.019504,-0.29305,-0.03202,-0.007135,0.001625,-0.001846,-0.026999,-0.026999,-0.156358,-1.192315,0.227302,-0.397087,0.743468,-2.242331,-0.664232,-1.614183,-0.313688,0.23292,-0.23292,-0.305538,3.763111,0.915982,-0.120773,-0.138937,0.587925,0.00338,-0.068996,-0.070805,-0.262917,-0.224982,-0.191658,-0.260975,-0.373286,-1.18568,-1.18568,2.479168,0.346254,-0.335966,-0.107972,-0.040002,-0.58973,-0.034649,-0.036308,-0.035193,-0.051846,-0.051706,-0.803973,0.455183,-0.097297,-0.102979,-0.021913,-0.025746,-0.154452,-0.002122,-0.052116,-0.009142,-0.007649,-0.005878,0.005976,0.372147,-0.851682,-1.047653,-0.121223,-0.320547,-0.208344,-0.2578,-0.229794,-0.046684,-0.614042,-0.327899,2.136838,-0.300343,-0.836535,0.108691,0.118957


time: 10.6 s


In [22]:
num_comp = 7
pca = PCA(n_components=num_comp)
pca.fit(frame)
princomp = pd.DataFrame(np.dot(frame, pca.components_.T), columns=["BUREAU_AGG_COMP" + str(i+1) for i in range(num_comp)])
bureau_agg = pd.concat([bureau_agg, princomp], axis=1)
bureau_agg.head()

Unnamed: 0,SK_ID_CURR,SUM_AMT_CREDIT_SUM_DEBT_DIV_DAYS_CREDIT_ENDDATE_ACTIVE_12M,SUM_CC_DEBT_6M,SUM_CC_DEBT_12M,MAX_WORST_DQ_BUREAU_BALANCE_6M,MAX_WORST_DQ_BUREAU_BALANCE_12M,MAX_BUREAU_UTILIZATION_6M,MAX_BUREAU_UTILIZATION_12M,COUNT_ACTIVE_6M,COUNT_ACTIVE_12M,COUNT_ACTIVE_24M,DAYS_REMAINING_ACTIVE,MAX_CREDIT_DAY_OVERDUE_6M,MAX_CREDIT_DAY_OVERDUE_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_6M_12M,BUREAU_UTILIZATION_DIFF_12M_24M,BUREAU_SUM_DEBT_DIFF_6M_12M,BUREAU_SUM_DEBT_DIFF_12M_24M,MAX_CNT_CREDIT_PROLONG,AVG_LEN_BUREAU_BALANCE,PROP_CURRENT,PROP_CLOSED,PROP_CURRENT_WEIGHTED,MAX_AVG_MONTHS_BALANCE_BUREAU_BALANCE,MIN_AVG_MONTHS_BALANCE_BUREAU_BALANCE,RANGE_AVG_MONTHS_BALANCE_BUREAU_BALANCE,SUM_SUM_CURRENT_BUREAU_BALANCE,AVG_PROP_CURRENT,AVG_PROP_DQ,MAX_PROP_DQ,AVG_PROP_CURRENT_WEIGHTED,MIN_PROP_CURRENT_WEIGHTED,AVG_PROP_DQ_WEIGHTED,MAX_PROP_DQ_WEIGHTED,AVG_PROP_CURRENT_WEIGHTED_AMT,MIN_PROP_CURRENT_WEIGHTED_AMT,AVG_PROP_DQ_WEIGHTED_AMT,MAX_PROP_DQ_WEIGHTED_AMT,AVG_WORST_DQ_BUREAU_BALANCE,MAX_WORST_DQ_BUREAU_BALANCE_WEIGHTED,AVG_WORST_DQ_BUREAU_BALANCE_WEIGHTED,TOTAL_AMT_CREDIT_SUM_POS_DAYS,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,MAX_LEN_BUREAU_BALANCE,SUM_LEN_BUREAU_BALANCE,MIN_MIN_MONTHS_BALANCE_BUREAU_BALANCE,MIN_DAYS_CREDIT_ENDDATE,MAX_DAYS_CREDIT_ENDDATE,SUM_DAYS_CREDIT_ENDDATE,SUM_NULL_DAYS_ENDDATE_FACT,COUNT_BUREAU_RECORDS,COUNT_ACTIVE,MAX_CREDIT_DAY_OVERDUE_WEIGHTED,SUM_CREDIT_DAY_OVERDUE_WEIGHTED,MAX_CREDIT_DAY_OVERDUE,SUM_CREDIT_DAY_OVERDUE,DAYS_SINCE_APPLIED,SUM_INVERSE_DAYS_CREDIT,MAX_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,SUM_AMT_CREDIT_MAX_OVERDUE_WEIGHTED,MAX_AMT_CREDIT_MAX_OVERDUE,SUM_AMT_CREDIT_MAX_OVERDUE,SUM_CNT_CREDIT_PROLONG,SUM_AMT_CREDIT_SUM_DEBT_WEIGHTED,SUM_AMT_CREDIT_SUM_DEBT,BUREAU_UTILIZATION_AVG,BUREAU_UTILIZATION_MAX,BUREAU_PROP_SUM_OVERDUE_AVG,BUREAU_PROP_MAX_OVERDUE_AVG,MAX_DAYS_CREDIT_UPDATE,RANGE_DAYS_CREDIT_UPDATE,DAYS_CREDIT_RANGE,TOTAL_AMT_CREDIT_SUM_WEIGHTED,TOTAL_AMT_CREDIT_SUM,COUNT_CREDIT_CARD,COUNT_CAR_LOAN,COUNT_MORTGAGE,SUM_AMT_ANNUITY,BUREAU_AGG_SYNTHETIC_TARGET,BUREAU_AGG_COMP1,BUREAU_AGG_COMP2,BUREAU_AGG_COMP3,BUREAU_AGG_COMP4,BUREAU_AGG_COMP5,BUREAU_AGG_COMP6,BUREAU_AGG_COMP7,BUREAU_AGG_COMP1.1,BUREAU_AGG_COMP2.1,BUREAU_AGG_COMP3.1,BUREAU_AGG_COMP4.1,BUREAU_AGG_COMP5.1,BUREAU_AGG_COMP6.1,BUREAU_AGG_COMP7.1
0,100001,603.706712,0.0,0.0,1.0,1.0,0.987405,0.987405,3.0,3.0,3.0,3091.0,0.0,,,,596686.5,596686.5,0.0,8.857143,0.983871,1.774194,0.007155,47.5,0.5,47.0,61.0,0.992481,0.007519,0.052632,0.349547,0.021053,0.000835,0.005848,120775.784672,1800.0,282.105263,1974.736842,0.142857,0.111111,0.015873,884025.0,3091.0,19.0,19.0,-51.0,-1329.0,1778.0,577.0,3.0,7.0,3.0,0.0,0.0,0.0,0.0,49.0,0.029363,,0.0,,0.0,0.0,53216.5875,596686.5,inf,inf,0.0,,-6.0,149.0,1523.0,100412.66129,1453365.0,0.0,0.0,0.0,24817.5,0.102038,1.10057,1.523734,1.496514,-0.660367,-1.992638,0.017996,0.257508,1.153939,1.628928,1.6366,-0.735256,-2.289269,0.020571,0.297415
1,100002,315.103846,0.0,0.0,0.0,0.0,0.54618,0.54618,2.0,2.0,2.0,780.0,0.0,,,,245781.0,245781.0,0.0,10.875,0.689655,0.264368,0.003698,40.5,1.5,39.0,60.0,0.716964,0.283036,0.5,0.109328,0.014109,0.010476,0.025641,4863.768166,0.0,1617.905476,7012.987013,0.75,0.051282,0.027542,638235.0,927.0,20.0,20.0,-47.0,-1072.0,780.0,-2094.0,2.0,8.0,2.0,0.0,0.0,0.0,0.0,103.0,0.017755,148.3425,153.695563,5043.645,8405.145,0.0,35111.571429,245781.0,inf,inf,0.0,inf,-7.0,1178.0,1334.0,69432.89321,865055.565,4.0,0.0,0.0,0.0,0.110174,1.038894,7.601345,0.534316,-0.817806,-0.592501,-0.290651,0.111304,1.089273,8.126102,0.584333,-0.910594,-0.682808,-0.335013,0.132145
2,100003,0.0,0.0,0.0,,,0.0,0.0,1.0,1.0,1.0,1216.0,0.0,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,810000.0,1216.0,,,,-2434.0,1216.0,-2178.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,606.0,0.003938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-43.0,2088.0,1980.0,19188.078259,1017400.5,2.0,0.0,0.0,0.0,0.051376,-1.386349,-0.52014,-0.831323,-0.242488,-0.712057,0.070022,0.154299,-1.453577,-0.556047,-0.909142,-0.270006,-0.818596,0.080747,0.179907
3,100004,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,,,,,,,,0.0,,,,,,,,,,,,,,,0.0,0.0,,,,-595.0,-382.0,-977.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,408.0,0.003205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-382.0,300.0,918.0,386.044202,189037.8,0.0,0.0,0.0,0.0,0.06984,-3.20874,-0.155941,-0.524505,0.269582,0.094367,-0.046783,0.032632,-3.36434,-0.166707,-0.573602,0.300168,0.108468,-0.053954,0.037803
4,100005,617.739835,0.0,0.0,0.0,0.0,0.954794,0.954794,2.0,2.0,2.0,1446.0,0.0,,,,568408.5,568408.5,0.0,5.333333,1.0,0.3125,0.086957,8.5,1.0,7.5,16.0,1.0,0.0,0.0,0.539216,0.117647,0.0,0.0,107036.117647,6882.352941,0.0,0.0,0.0,0.0,0.0,598626.0,1446.0,8.0,8.0,-12.0,-128.0,1324.0,1318.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,62.0,0.026109,0.0,0.0,0.0,0.0,0.0,50188.368035,568408.5,inf,inf,0.0,0.0,-11.0,110.0,311.0,53154.691016,657126.0,1.0,0.0,0.0,4261.5,0.130488,-1.948244,-0.867605,4.826918,-0.613374,-1.4752,0.188919,0.200762,-2.042719,-0.9275,5.278753,-0.682956,-1.69583,0.217995,0.233441


time: 3.7 s


## Installments

In [25]:
installment_agg = pd.read_csv(path + "installment_agg.csv")
frame = installment_agg.drop(["SK_ID_CURR", "INSTALLMENT_AGG_SYNTHETIC_TARGET"], axis=1)
frame = pd.DataFrame(scale.fit_transform(impute.fit_transform(frame.replace([-np.inf, np.inf], np.nan))), columns=frame.columns)
frame.head()

Unnamed: 0,SUM_UNDERPAYMENT_12M,SUM_UNDERPAYMENT_6M,MAX_PAYMENT_SIZE_6M,MAX_PAYMENT_SIZE_12M,MIN_PAYMENT_SIZE_6M,MAX_ABS_DAYS_INSTALMENT,COUNT_UNDERPAYMENT,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_WEIGHTED,MAX_UNDERPAYMENT,AVG_PAYMENT_SIZE_WEIGHTED,AVG_PAYMENT_SIZE,MAX_PAYMENT_SIZE_WEIGHTED,MAX_PAYMENT_SIZE,MIN_PAYMENT_SIZE_WEIGHTED,MIN_PAYMENT_SIZE,SUM_PAYMENT_WEIGHTED,SUM_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,MAX_DAYS_ENTRY_PAYMENT,MIN_DAYS_ENTRY_PAYMENT,RANGE_DAYS_ENTRY_PAYMENT,MAX_UNDERPAYMENT_6M,MAX_UNDERPAYMENT_12M,SUM_PAYMENT_6M,SUM_PAYMENT_DIFF_6M_12M,MAX_AMT_INSTALMENT_6M,MIN_AMT_INSTALMENT_6M,MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M
0,0.091688,0.060134,-0.217287,-0.310109,-0.116706,1.450319,-0.474592,0.042166,0.063254,-0.444175,-0.181221,-0.519936,-0.333909,-0.491256,-0.02592,-0.091849,-0.533982,-0.697498,-0.372866,-2.418434,-1.439745,0.016456,-0.20703,-0.280256,-0.461958,0.11574,-0.215724,-0.110836,-0.043183,0.067345
1,0.091688,0.060134,0.059417,-0.164289,-0.059087,-1.119002,-0.474592,0.042166,0.063254,-0.444175,0.022495,-0.295049,0.025893,-0.348274,0.008641,0.27376,-0.175815,-0.505823,-0.529753,0.519997,1.109948,-0.77654,-0.20703,-0.280256,-0.040331,0.237362,0.064922,-0.084646,-0.254742,0.240574
2,0.091688,0.060134,-0.217287,-0.310109,-0.116706,0.788044,-0.474592,0.042166,0.063254,-0.444175,0.034298,1.813283,0.008253,1.685522,-0.022239,0.095203,-0.034598,0.997287,-0.529753,-0.40117,-0.791648,0.536661,-0.20703,-0.280256,-0.461958,0.11574,-0.215724,-0.110836,-0.153562,0.829552
3,0.091688,0.060134,-0.217287,-0.310109,-0.116706,-0.879665,-0.474592,0.042166,0.063254,-0.444175,-0.167281,-0.471938,-0.332615,-0.51859,-0.013021,0.005144,-0.532685,-0.718883,-0.529753,-0.741723,0.882238,-1.273484,-0.20703,-0.280256,-0.461958,0.11574,-0.215724,-0.110836,-0.171958,0.933489
4,0.091688,0.060134,-0.217287,-0.310109,-0.116706,-0.964908,-0.474592,0.042166,0.063254,-0.444175,-0.16362,-0.505863,-0.324894,-0.490221,-0.013498,-0.032381,-0.518384,-0.681421,-0.372866,-0.26346,0.946829,-1.064133,-0.20703,-0.280256,-0.461958,0.11574,-0.215724,-0.110836,-0.135165,0.032699


time: 4.56 s


In [26]:
num_comp = 7
pca = PCA(n_components=num_comp)
pca.fit(frame)
princomp = pd.DataFrame(np.dot(frame, pca.components_.T), columns=["INSTALLMENT_AGG_COMP" + str(i+1) for i in range(num_comp)])
installment_agg = pd.concat([installment_agg, princomp], axis=1)
installment_agg.head()

Unnamed: 0,SK_ID_CURR,SUM_UNDERPAYMENT_12M,SUM_UNDERPAYMENT_6M,MAX_PAYMENT_SIZE_6M,MAX_PAYMENT_SIZE_12M,MIN_PAYMENT_SIZE_6M,MAX_ABS_DAYS_INSTALMENT,COUNT_UNDERPAYMENT,SUM_UNDERPAYMENT,SUM_UNDERPAYMENT_WEIGHTED,MAX_UNDERPAYMENT,AVG_PAYMENT_SIZE_WEIGHTED,AVG_PAYMENT_SIZE,MAX_PAYMENT_SIZE_WEIGHTED,MAX_PAYMENT_SIZE,MIN_PAYMENT_SIZE_WEIGHTED,MIN_PAYMENT_SIZE,SUM_PAYMENT_WEIGHTED,SUM_PAYMENT,SUM_DAYS_ENTRY_PAYMENT_GT_DAYS_INSTALMENT,MAX_DAYS_ENTRY_PAYMENT,MIN_DAYS_ENTRY_PAYMENT,RANGE_DAYS_ENTRY_PAYMENT,MAX_UNDERPAYMENT_6M,MAX_UNDERPAYMENT_12M,SUM_PAYMENT_6M,SUM_PAYMENT_DIFF_6M_12M,MAX_AMT_INSTALMENT_6M,MIN_AMT_INSTALMENT_6M,MAX_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,MIN_DAYS_ENTRY_PAYMENT_DIFF_DAYS_INSTALMENT_12M,INSTALLMENT_AGG_SYNTHETIC_TARGET,INSTALLMENT_AGG_COMP1,INSTALLMENT_AGG_COMP2,INSTALLMENT_AGG_COMP3,INSTALLMENT_AGG_COMP4,INSTALLMENT_AGG_COMP5,INSTALLMENT_AGG_COMP6,INSTALLMENT_AGG_COMP7
0,100001,0.0,0.0,,,,2916.0,0.0,0.0,0.0,0.0,3.116986,5885.132143,10.686671,17397.9,1.365586,3951.0,21.8189,41195.925,1.0,-1628.0,-2916.0,1288.0,,,0.0,0.0,,,11.0,-36.0,0.06575,-1.311336,-0.289649,-1.265137,0.259213,1.925718,0.03656,1.028292
1,100002,0.0,0.0,53093.745,53093.745,9251.775,565.0,0.0,0.0,0.0,0.0,95.448632,11559.247105,1083.545816,53093.745,15.761116,9251.775,1813.524009,219625.695,0.0,-49.0,-587.0,538.0,0.0,0.0,90100.845,34590.195,53093.745,9251.775,-12.0,-31.0,0.087013,-0.752288,1.616157,0.754784,-0.702447,-0.535793,0.056893,0.006947
2,100003,0.0,0.0,,,,2310.0,0.0,0.0,0.0,0.0,100.798053,64754.586,1030.947353,560835.36,2.899015,6662.97,2519.951327,1618864.65,0.0,-544.0,-2324.0,1780.0,,,0.0,0.0,,,-1.0,-14.0,0.050903,0.659627,-0.035899,-0.624043,1.42402,0.239304,-0.727819,1.224179
3,100004,0.0,0.0,,,,784.0,0.0,0.0,0.0,0.0,9.434878,7096.155,14.544656,10573.965,6.738679,5357.25,28.304633,21288.465,0.0,-727.0,-795.0,68.0,,,0.0,0.0,,,-3.0,-11.0,0.111008,-1.700891,1.634542,0.456757,-0.633019,-0.255442,-0.037613,0.026482
4,100005,0.0,0.0,,,,706.0,0.0,0.0,0.0,0.0,11.09417,6240.205,37.566479,17656.245,6.539674,4813.2,99.847528,56161.845,1.0,-470.0,-736.0,266.0,,,0.0,0.0,,,1.0,-37.0,0.114271,-1.571981,1.428804,0.404391,-0.61607,-0.355109,-0.043216,-0.162738


time: 1.96 s


In [27]:
installment_agg.shape

(339587, 39)

time: 2.53 ms
