# Home Credit Modeling

In [31]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective
import pickle
import gc

gc.enable()
np.random.seed(235)

path = "/Users/danielsaxton/home_credit_default_risk/"

#### Load the data

In [5]:
df = pd.read_csv(path + "_preprocessed_train.csv")

In [88]:
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

#### Select features

In [89]:
clf = lgb.LGBMClassifier(n_estimators=1000, num_leaves=23, subsample=0.5)
clf.fit(df, y)

lgb_cols = df.columns[clf.feature_importances_ > 0]
len(lgb_cols)

382

In [90]:
var_imp = pd.DataFrame({"Feature": df.columns, "Importance": clf.feature_importances_})[["Feature", "Importance"]].sort_values("Importance", ascending=False)

#### Most important features

In [98]:
var_imp.head(20)

Unnamed: 0,Feature,Importance
5,AMT_ANNUITY,135
136,MAX_PREV_AMT_ANNUITY_WEIGHTED,135
10,DAYS_REGISTRATION,134
153,MIN_PREV_PROP_APPROVED,133
284,SUM_PAYMENT,131
146,MAX_SYNTH_TARGET,130
312,OWN_CAR_AGE_DIV_DAYS_BIRTH,129
320,REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULAT...,129
195,DAYS_CREDIT_RANGE,127
297,AMT_CREDIT_DIV_AVG_PREV_REQ_AMOUNT,127


#### Average importance over non-zero importance features

In [92]:
int(np.nanmean(var_imp["Importance"].where(var_imp["Importance"] > 0)))

57

#### Importance of selected features

In [93]:
features = []

var_imp[var_imp["Feature"].isin(features)]

Unnamed: 0,Feature,Importance
425,TOTAL_AMT_CREDIT_SUM_POS_DAYS_DIV_SUM_DAYS_CRE...,196
424,TOTAL_AMT_CREDIT_SUM_DIV_SUM_DAYS_CREDIT_ENDDATE,185
126,AVG_SYNTH_TARGET_12M,164
280,MIN_PAYMENT_SIZE_6M,156
275,MAX_PAYMENT_SIZE_6M,154
237,TOTAL_AMT_CREDIT_SUM_POS_DAYS,106
286,SUM_UNDERPAYMENT,90
178,AVG_PROP_CURRENT_WEIGHTED_AMT,86
175,AVG_LEN_BUREAU_BALANCE,83
230,SUM_DAYS_CREDIT_ENDDATE_POS_DAYS,83


#### Features with zero importance

In [95]:
var_imp.query("Importance == 0")

Unnamed: 0,Feature,Importance
411,HOUSETYPE_MODE_terraced_house,0
13,FLAG_MOBIL,0
419,WALLSMATERIAL_MODE_Wooden,0
409,HOUSETYPE_MODE_block_of_flats,0
421,EMERGENCYSTATE_MODE_No,0
422,EMERGENCYSTATE_MODE_Yes,0
403,WEEKDAY_APPR_PROCESS_START_nan,0
415,WALLSMATERIAL_MODE_Monolithic,0
374,NAME_HOUSING_TYPE_Rented_apartment,0
80,FLAG_DOCUMENT_2,0


#### LightGBM CV

In [96]:
params = {"n_estimators": 20000, 
          "num_leaves": 113, 
          "learning_rate": 0.005, 
          "subsample": 0.5, 
          "colsample_bytree": 0.5, 
          "reg_lambda": 0.9}

lgb_data = lgb.Dataset(data=df[lgb_cols], 
                       label=y)

cv_result = lgb.cv(params=params, 
                   train_set=lgb_data, 
                   nfold=5, 
                   metrics="auc", 
                   early_stopping_rounds=200, 
                   stratified=True, 
                   shuffle=True, 
                   verbose_eval=100, 
                   show_stdv=True, 
                   seed=123)

cv_result = pd.DataFrame(cv_result)

[100]	cv_agg's auc: 0.757383 + 0.00262752
[200]	cv_agg's auc: 0.761646 + 0.00256754
[300]	cv_agg's auc: 0.76526 + 0.00259728
[400]	cv_agg's auc: 0.76808 + 0.00244443
[500]	cv_agg's auc: 0.77055 + 0.00240747
[600]	cv_agg's auc: 0.772851 + 0.0022421
[700]	cv_agg's auc: 0.774789 + 0.00210225
[800]	cv_agg's auc: 0.77638 + 0.00199247
[900]	cv_agg's auc: 0.777787 + 0.00188086
[1000]	cv_agg's auc: 0.778992 + 0.00181389
[1100]	cv_agg's auc: 0.780142 + 0.00167474
[1200]	cv_agg's auc: 0.78106 + 0.00160194
[1300]	cv_agg's auc: 0.781893 + 0.00160379
[1400]	cv_agg's auc: 0.782606 + 0.00158053
[1500]	cv_agg's auc: 0.783205 + 0.00158437
[1600]	cv_agg's auc: 0.783715 + 0.00155639
[1700]	cv_agg's auc: 0.784115 + 0.00157239
[1800]	cv_agg's auc: 0.784507 + 0.00155631
[1900]	cv_agg's auc: 0.784846 + 0.00155919
[2000]	cv_agg's auc: 0.785111 + 0.00157966
[2100]	cv_agg's auc: 0.785391 + 0.00158255
[2200]	cv_agg's auc: 0.785584 + 0.00154804
[2300]	cv_agg's auc: 0.785795 + 0.00151811
[2400]	cv_agg's auc: 0.785

In [97]:
cv_result.tail()

Unnamed: 0,auc-mean,auc-stdv
5193,0.787511,0.001523
5194,0.787513,0.001523
5195,0.787514,0.001524
5196,0.787513,0.001525
5197,0.787519,0.001522


#### Fit chosen model

In [None]:
lgb_model = lgb.LGBMClassifier(n_estimators=5300, 
                         num_leaves=113, 
                         learning_rate=0.005, 
                         subsample=0.5, 
                         colsample_bytree=0.5, 
                         reg_lambda=0.9)

lgb_model.fit(df[lgb_cols], y)

#### Score test data and generate submission

In [None]:
df = pd.read_csv(path + "preprocessed_test.csv")

In [None]:
df["NAME_INCOME_TYPE_Maternity_leave"] = 0

In [None]:
submission = pd.DataFrame({"SK_ID_CURR": df["SK_ID_CURR"], 
                           "TARGET": lgb_model.predict_proba(df[lgb_cols])[:,1]})

submission.head()

In [None]:
submission.to_csv(path + "submission.csv", index=False)