# Home Credit Modeling

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective
import pickle
import gc

gc.enable()
np.random.seed(235)

path = "/Users/danielsaxton/home_credit_default_risk/"

#### Load the data

In [2]:
df = pd.read_csv(path + "_preprocessed_train.csv")

In [3]:
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

#### Select features

In [4]:
clf = lgb.LGBMClassifier(n_estimators=1000, num_leaves=23, subsample=0.5)
clf.fit(df, y)

lgb_cols = df.columns[clf.feature_importances_ > 0]
len(lgb_cols)

434

In [5]:
var_imp = pd.DataFrame({"Feature": df.columns, "Importance": clf.feature_importances_})[["Feature", "Importance"]].sort_values("Importance", ascending=False)

#### View important features

In [6]:
var_imp.head(20)

Unnamed: 0,Feature,Importance
31,EXT_SOURCE_3,389
30,EXT_SOURCE_2,377
300,AMT_CREDIT_DIV_AMT_ANNUITY,303
335,EXT_SOURCE_1_DIV_DAYS_BIRTH,241
323,DAYS_REGISTRATION_DIV_DAYS_LAST_PHONE_CHANGE,236
8,DAYS_BIRTH,213
294,TOTAL_AMT_CREDIT_SUM_POS_DAYS_DIV_SUM_DAYS_CRE...,203
11,DAYS_ID_PUBLISH,203
303,EXT_SOURCE_PROD,199
226,SUM_INVERSE_DAYS_CREDIT,196


#### Average importance over non-zero importance features

In [7]:
int(np.nanmean(var_imp["Importance"].where(var_imp["Importance"] > 0)))

50

#### Importance of selected features

In [8]:
features = []

var_imp[var_imp["Feature"].isin(features)]

Unnamed: 0,Feature,Importance


#### Features with zero importance

In [None]:
var_imp.query("Importance == 0")

Unnamed: 0,Feature,Importance
132,MAX_DAYS_FIRST_SENTINEL_COMP_DAYS_LAST_SENTINEL,0
130,MAX_DAYS_FIRST_DRAWING_DAYS_DUE_SENTINEL,0
227,SUM_LEN_BUREAU_BALANCE,0
464,ORGANIZATION_TYPE_University,0
465,ORGANIZATION_TYPE_XNA,0
458,ORGANIZATION_TYPE_Trade:_type_6,0
466,ORGANIZATION_TYPE_nan,0
98,FLAG_DOCUMENT_20,0
418,ORGANIZATION_TYPE_Emergency,0
95,FLAG_DOCUMENT_17,0


#### LightGBM CV

In [None]:
params = {"n_estimators": 20000, 
          "num_leaves": 113, 
          "learning_rate": 0.005, 
          "subsample": 0.5, 
          "colsample_bytree": 0.5, 
          "reg_lambda": 0.9}

lgb_data = lgb.Dataset(data=df[lgb_cols], 
                       label=y)

cv_result = lgb.cv(params=params, 
                   train_set=lgb_data, 
                   nfold=5, 
                   metrics="auc", 
                   early_stopping_rounds=200, 
                   stratified=True, 
                   shuffle=True, 
                   verbose_eval=100, 
                   show_stdv=True, 
                   seed=123)

cv_result = pd.DataFrame(cv_result)



[100]	cv_agg's auc: 0.758761 + 0.00262195


In [None]:
cv_result.tail()

#### Fit chosen model

In [None]:
lgb_model = lgb.LGBMClassifier(n_estimators=5300, 
                         num_leaves=113, 
                         learning_rate=0.005, 
                         subsample=0.5, 
                         colsample_bytree=0.5, 
                         reg_lambda=0.9)

lgb_model.fit(df[lgb_cols], y)

#### Score test data and generate submission

In [None]:
df = pd.read_csv(path + "preprocessed_test.csv")

In [None]:
df["NAME_INCOME_TYPE_Maternity_leave"] = 0

In [None]:
submission = pd.DataFrame({"SK_ID_CURR": df["SK_ID_CURR"], 
                           "TARGET": lgb_model.predict_proba(df[lgb_cols])[:,1]})

submission.head()

In [None]:
submission.to_csv(path + "submission.csv", index=False)