# Home Credit Modeling

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective
import pickle
import gc

%load_ext autotime

gc.enable()
np.random.seed(235)

path = "/Users/dsaxton/home_credit_default/"

#### Load the data

In [3]:
df = pd.read_csv(path + "preprocessed_train.csv")

time: 24.7 s


In [4]:
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 653 ms


#### Select features

In [5]:
clf = lgb.LGBMClassifier(n_estimators=1000, num_leaves=23, subsample=0.5)
clf.fit(df, y)

lgb_cols = df.columns[clf.feature_importances_ > 0]
len(lgb_cols)

453

time: 1min 53s


In [6]:
var_imp = pd.DataFrame({"Feature": df.columns, "Importance": clf.feature_importances_})[["Feature", "Importance"]].sort_values("Importance", ascending=False)

time: 17 ms


#### View important features

In [35]:
var_imp.head(60).tail(20)

Unnamed: 0,Feature,Importance
110,MAX_PREV_PROP_APPROVED_24M,133
217,MAX_DAYS_CREDIT_ENDDATE,130
304,RANGE_DAYS_ENTRY_PAYMENT,130
218,SUM_DAYS_CREDIT_ENDDATE,129
297,MIN_PAYMENT_SIZE_WEIGHTED,129
313,AMT_CREDIT_DIV_SUM_PAYMENT,129
161,AVG_SYNTH_TARGET,128
10,DAYS_REGISTRATION,128
340,REGION_RATING_CLIENT_W_CITY_DIV_REGION_POPULAT...,127
239,MAX_DAYS_CREDIT_UPDATE,127


time: 7.22 ms


#### Average importance over non-zero importance features

In [8]:
int(np.nanmean(var_imp["Importance"].where(var_imp["Importance"] > 0)))

48

time: 210 ms


#### Importance of selected features

In [33]:
features = ["MAX_PAYMENT_SIZE_12M"]

var_imp[var_imp["Feature"].isin(features)]

Unnamed: 0,Feature,Importance
286,MAX_PAYMENT_SIZE_12M,172


time: 6.6 ms


#### Features with zero importance

In [10]:
var_imp.query("Importance == 0")

Unnamed: 0,Feature,Importance
370,NAME_INCOME_TYPE_Maternity_leave,0
368,NAME_INCOME_TYPE_Businessman,0
442,ORGANIZATION_TYPE_Industry:_type_13,0
488,HOUSETYPE_MODE_block_of_flats,0
481,ORGANIZATION_TYPE_XNA,0
359,CODE_GENDER_nan,0
80,FLAG_DOCUMENT_2,0
82,FLAG_DOCUMENT_4,0
482,ORGANIZATION_TYPE_nan,0
373,NAME_INCOME_TYPE_Student,0


time: 36.2 ms


#### LightGBM CV

In [11]:
params = {"n_estimators": 20000, 
          "num_leaves": 113, 
          "learning_rate": 0.005, 
          "subsample": 0.5, 
          "colsample_bytree": 0.5, 
          "reg_lambda": 0.9}

lgb_data = lgb.Dataset(data=df[lgb_cols], 
                       label=y)

cv_result = lgb.cv(params=params, 
                   train_set=lgb_data, 
                   nfold=5, 
                   metrics="auc", 
                   early_stopping_rounds=200, 
                   stratified=True, 
                   shuffle=True, 
                   verbose_eval=100, 
                   show_stdv=True, 
                   seed=123)

cv_result = pd.DataFrame(cv_result)



[100]	cv_agg's auc: 0.759608 + 0.00295035
[200]	cv_agg's auc: 0.76411 + 0.00284402
[300]	cv_agg's auc: 0.767375 + 0.00279802
[400]	cv_agg's auc: 0.770568 + 0.00266559
[500]	cv_agg's auc: 0.773288 + 0.00253967
[600]	cv_agg's auc: 0.775337 + 0.00249169
[700]	cv_agg's auc: 0.777158 + 0.00235195
[800]	cv_agg's auc: 0.778781 + 0.00226231
[900]	cv_agg's auc: 0.780095 + 0.00223199
[1000]	cv_agg's auc: 0.781267 + 0.00216614
[1100]	cv_agg's auc: 0.782298 + 0.00210299
[1200]	cv_agg's auc: 0.783181 + 0.00206149
[1300]	cv_agg's auc: 0.783987 + 0.00207398
[1400]	cv_agg's auc: 0.784603 + 0.00204576
[1500]	cv_agg's auc: 0.785203 + 0.00203132
[1600]	cv_agg's auc: 0.785651 + 0.00199863
[1700]	cv_agg's auc: 0.786083 + 0.00196518
[1800]	cv_agg's auc: 0.786514 + 0.00194148
[1900]	cv_agg's auc: 0.786878 + 0.0019564
[2000]	cv_agg's auc: 0.78716 + 0.00194898
[2100]	cv_agg's auc: 0.787399 + 0.00195082
[2200]	cv_agg's auc: 0.787612 + 0.00191704
[2300]	cv_agg's auc: 0.787814 + 0.00189904
[2400]	cv_agg's auc: 0.

In [12]:
cv_result.tail()

Unnamed: 0,auc-mean,auc-stdv
5041,0.789198,0.001858
5042,0.7892,0.001855
5043,0.789199,0.001855
5044,0.7892,0.001857
5045,0.789202,0.001854


time: 19.1 ms


#### Fit chosen model

In [13]:
lgb_model = lgb.LGBMClassifier(n_estimators=5045, 
                         num_leaves=113, 
                         learning_rate=0.005, 
                         subsample=0.5, 
                         colsample_bytree=0.5, 
                         reg_lambda=0.9)

lgb_model.fit(df[lgb_cols], y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
        learning_rate=0.005, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=5045,
        n_jobs=-1, num_leaves=113, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.9, silent=True, subsample=0.5,
        subsample_for_bin=200000, subsample_freq=1)

time: 18min 40s


#### Score test data and generate submission

In [14]:
df = pd.read_csv(path + "preprocessed_test.csv")

time: 3.44 s


In [15]:
df["NAME_INCOME_TYPE_Maternity_leave"] = 0

time: 12.4 ms


In [16]:
submission = pd.DataFrame({"SK_ID_CURR": df["SK_ID_CURR"], 
                           "TARGET": lgb_model.predict_proba(df[lgb_cols])[:,1]})

submission.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.028409
1,100005,0.131907
2,100013,0.022533
3,100028,0.034576
4,100038,0.182258


time: 11.2 s


In [17]:
submission.to_csv(path + "submission.csv", index=False)

time: 163 ms
