# Home Credit Modeling

In [35]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective
import pickle
import gc

%load_ext autotime

gc.enable()
np.random.seed(235)

path = "/Users/dsaxton/home_credit_default/"

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 35.9 ms


#### Load the data

In [36]:
df = pd.read_csv(path + "train.csv")

time: 28 s


In [37]:
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 469 ms


#### Select features

In [38]:
clf = lgb.LGBMClassifier(n_estimators=1000, num_leaves=23, subsample=0.5)
clf.fit(df, y)

lgb_cols = df.columns[clf.feature_importances_ > 0]
len(lgb_cols)

485

time: 2min 10s


In [39]:
var_imp = pd.DataFrame({"Feature": df.columns, "Importance": clf.feature_importances_})[["Feature", "Importance"]].sort_values("Importance", ascending=False)

time: 6.57 ms


#### View important features

Max importance

In [46]:
var_imp.head(20)

Unnamed: 0,Feature,Importance
32,EXT_SOURCE_3,383
31,EXT_SOURCE_2,382
387,LDA_SYNTHETIC_TARGET,381
396,AMT_CREDIT_DIV_AMT_ANNUITY,277
436,EXT_SOURCE_1_DIV_DAYS_BIRTH,232
424,DAYS_REGISTRATION_DIV_DAYS_LAST_PHONE_CHANGE,204
8,DAYS_BIRTH,165
9,DAYS_EMPLOYED,158
11,DAYS_ID_PUBLISH,158
399,EXT_SOURCE_PROD,153


time: 7.06 ms


Average importance given positive

In [40]:
round(var_imp.query("Importance > 0")["Importance"].mean())

45

time: 11.9 ms


In [41]:
var_imp[var_imp["Feature"].map(lambda x: ("AGG_COMP" in x) or ("LDA" in x))].head(10)

Unnamed: 0,Feature,Importance
387,LDA_SYNTHETIC_TARGET,381
378,INSTALLMENT_AGG_COMP6,122
285,BUREAU_AGG_COMP4,112
203,PREVIOUS_AGG_COMP7,110
287,BUREAU_AGG_COMP6,102
201,PREVIOUS_AGG_COMP5,100
286,BUREAU_AGG_COMP5,98
199,PREVIOUS_AGG_COMP3,94
379,INSTALLMENT_AGG_COMP7,89
284,BUREAU_AGG_COMP3,87


time: 14.9 ms


#### LightGBM CV

In [None]:
params = {"n_estimators": 20000, 
          "num_leaves": 500, 
          "min_data_in_leaf": 1000,
          "learning_rate": 0.005, 
          "bagging_fraction": 0.5, 
          "bagging_freq": 3, 
          "feature_fraction": 0.5, 
          "lambda_l2": 1}

lgb_data = lgb.Dataset(data=df, 
                       label=y)

cv_result = lgb.cv(params=params, 
                   train_set=lgb_data, 
                   nfold=5, 
                   metrics="auc", 
                   early_stopping_rounds=200, 
                   stratified=True, 
                   shuffle=True, 
                   verbose_eval=100, 
                   show_stdv=True, 
                   seed=2357)

cv_result = pd.DataFrame(cv_result)

In [None]:
cv_result.tail()

CV mean: 0.792743

```
params = {"n_estimators": 3183, 
          "num_leaves": 500, 
          "min_data_in_leaf": 1000,
          "learning_rate": 0.005, 
          "bagging_fraction": 0.5, 
          "bagging_freq": 1, 
          "feature_fraction": 0.5, 
          "lambda_l2": 1}
```

#### Fit chosen model

In [176]:
lgb_model = lgb.LGBMClassifier(n_estimators=2767, 
                         num_leaves=500, 
                         min_data_in_leaf=1000,
                         learning_rate=0.005, 
                         bagging_fraction=0.5, 
                         feature_fraction=0.5, 
                         lambda_l2=0.8)

lgb_model.fit(df[lgb_cols], y)

LGBMClassifier(bagging_fraction=0.5, boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, feature_fraction=0.5, lambda_l2=0.8,
        learning_rate=0.005, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_data_in_leaf=1000, min_split_gain=0.0,
        n_estimators=2767, n_jobs=-1, num_leaves=500, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=1)

time: 17min 56s


#### Score test data and generate submission

In [177]:
df = pd.read_csv(path + "test.csv")

time: 4.12 s


In [178]:
submission = pd.DataFrame({"SK_ID_CURR": df["SK_ID_CURR"], 
                           "TARGET": lgb_model.predict_proba(df[lgb_cols])[:,1]})

submission.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.043615
1,100005,0.13638
2,100013,0.023205
3,100028,0.035034
4,100038,0.222979


time: 8.85 s


In [179]:
submission.to_csv(path + "submission.csv", index=False)

time: 178 ms
