# Home Credit EDA

In [2]:
import gc
import os
import pickle
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy.stats import iqr, randint, uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective

# Notebook-wide runtime setup.
RANDOM_SEED = 0

np.random.seed(RANDOM_SEED)        # seed NumPy's global random number generator
warnings.filterwarnings("ignore")  # suppress every warning category
gc.enable()                        # make sure automatic garbage collection is on

In [3]:
# Directory holding the Home Credit CSV files. Set the HOME_CREDIT_DATA_DIR
# environment variable to run the notebook on another machine; the original
# local folder is kept as the fallback so existing runs are unaffected.
DATA_DIR = os.environ.get("HOME_CREDIT_DATA_DIR", "/Users/danielsaxton/home_credit_default_risk")

# Main application tables (train and test files).
application_train = pd.read_csv(os.path.join(DATA_DIR, "application_train.csv"))
application_test = pd.read_csv(os.path.join(DATA_DIR, "application_test.csv"))

In [4]:
# Directory holding the Home Credit CSV files. Set the HOME_CREDIT_DATA_DIR
# environment variable to run the notebook on another machine; the original
# local folder is kept as the fallback so existing runs are unaffected.
DATA_DIR = os.environ.get("HOME_CREDIT_DATA_DIR", "/Users/danielsaxton/home_credit_default_risk")

# Auxiliary tables, one CSV file per frame.
bureau_balance = pd.read_csv(os.path.join(DATA_DIR, "bureau_balance.csv"))
bureau = pd.read_csv(os.path.join(DATA_DIR, "bureau.csv"))
credit_card = pd.read_csv(os.path.join(DATA_DIR, "credit_card_balance.csv"))
installments = pd.read_csv(os.path.join(DATA_DIR, "installments_payments.csv"))
pos_cash = pd.read_csv(os.path.join(DATA_DIR, "POS_CASH_balance.csv"))
previous_application = pd.read_csv(os.path.join(DATA_DIR, "previous_application.csv"))

# Previously preprocessed training frame, read from the same folder.
df = pd.read_csv(os.path.join(DATA_DIR, "preprocessed_train.csv"))

In [67]:
# Dimensions of the training application table as (rows, columns).
application_train.shape

(307511, 122)

In [89]:
# Count application_train rows whose SK_ID_CURR also appears in credit_card.
(
    application_train["SK_ID_CURR"]
    .loc[lambda ids: ids.isin(credit_card["SK_ID_CURR"])]
    .dropna()
    .shape
)

(86905,)

In [90]:
# Count application_train rows whose SK_ID_CURR also appears in pos_cash.
(
    application_train["SK_ID_CURR"]
    .loc[lambda ids: ids.isin(pos_cash["SK_ID_CURR"])]
    .dropna()
    .shape
)

(289444,)

In [91]:
# Count application_train rows whose SK_ID_CURR also appears in installments.
(
    application_train["SK_ID_CURR"]
    .loc[lambda ids: ids.isin(installments["SK_ID_CURR"])]
    .dropna()
    .shape
)

(291643,)

### Installments / POS / credit card

* Is the customer paying down debt?  Aggregate to the SK_ID_PREV level?
* Rate of change, variance
* Change in utilization

* For each credit card trade, get the change in average balance across 6-month intervals

In [39]:
# Order payments by applicant, previous loan and instalment number, then show
# the last 20 of the first 100 ordered rows.
(
    installments
    .sort_values(by=["SK_ID_CURR", "SK_ID_PREV", "NUM_INSTALMENT_NUMBER"])
    .head(100)
    .tail(20)
)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
470118,1692033,100007,1.0,2,-314.0,-317.0,16037.64,16037.64
3349535,1692033,100007,1.0,3,-284.0,-287.0,16037.64,16037.64
3797824,1692033,100007,1.0,4,-254.0,-258.0,16037.64,16037.64
1421369,1692033,100007,1.0,5,-224.0,-226.0,16037.64,16037.64
1350720,1692033,100007,1.0,6,-194.0,-198.0,16037.64,16037.64
1337649,1692033,100007,1.0,7,-164.0,-166.0,16037.64,16037.64
3713578,1692033,100007,1.0,8,-134.0,-136.0,16037.64,16037.64
199642,1692033,100007,1.0,9,-104.0,-106.0,16037.64,16037.64
3009235,1692033,100007,1.0,10,-74.0,-75.0,16037.64,16037.64
3573177,1692033,100007,1.0,11,-44.0,-44.0,16037.64,16037.64


In [43]:
# Order monthly balances by applicant, previous loan and month, then show the
# last 20 of the first 150 ordered rows.
(
    pos_cash
    .sort_values(by=["SK_ID_CURR", "SK_ID_PREV", "MONTHS_BALANCE"])
    .head(150)
    .tail(20)
)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
4906366,2001242,100007,-15,17.0,4.0,Active,0,0
1690801,2001242,100007,-14,17.0,3.0,Active,0,0
2888418,2001242,100007,-13,17.0,2.0,Active,0,0
2196414,2001242,100007,-12,17.0,1.0,Active,0,0
1694281,2001242,100007,-11,17.0,0.0,Completed,0,0
2463436,2119973,100007,-77,10.0,10.0,Active,0,0
1987670,2119973,100007,-76,10.0,9.0,Active,0,0
3199534,2119973,100007,-75,10.0,8.0,Active,0,0
2553642,2119973,100007,-74,10.0,7.0,Active,0,0
76767,2119973,100007,-73,10.0,6.0,Active,0,0


In [11]:
# Order balances by applicant, previous loan and month, keep rows with
# MONTHS_BALANCE of -12 or later, then show the last 20 of the first 200
# remaining rows.
(
    credit_card
    .sort_values(by=["SK_ID_CURR", "SK_ID_PREV", "MONTHS_BALANCE"])
    .query("MONTHS_BALANCE >= -12")
    .head(200)
    .tail(20)
)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
1635325,1691588,100077,-4,0.0,0,,0.0,,,0.0,...,0.0,0.0,,0,,,0.0,Active,0,0
640195,2628319,100082,-12,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,30.0,Active,0,0
3745556,2628319,100082,-11,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,30.0,Active,0,0
2997646,2628319,100082,-10,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,30.0,Active,0,0
1626848,2628319,100082,-9,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,30.0,Active,0,0
2739629,2628319,100082,-8,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,30.0,Active,0,0
3357437,2628319,100082,-7,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,30.0,Active,0,0
3601018,2628319,100082,-6,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,30.0,Active,0,0
1281205,2628319,100082,-5,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,30.0,Active,0,0
3817265,2628319,100082,-4,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,30.0,Active,0,0


In [12]:
# Peek at the first five rows of the bureau table.
bureau.head(n=5)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,
