# Home Credit Modeling

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective
import pickle
import gc

%load_ext autotime

gc.enable()
np.random.seed(235)

path = "/Users/dsaxton/home_credit_default/"

#### Load the data

In [2]:
df = pd.read_csv(path + "train.csv")
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 27 s


# Extra Trees

# LightGBM

#### Select features

In [3]:
clf = lgb.LGBMClassifier(n_estimators=1000, num_leaves=23, subsample=0.5)
clf.fit(df, y)

lgb_cols = df.columns[clf.feature_importances_ > 0]
len(lgb_cols)

486

time: 2min 15s


#### LightGBM CV

* With SYNTH_TARGET: 0.792565
* Without SYNTH_TARGET: 0.792550

In [None]:
params = {"n_estimators": 5000, 
          "num_leaves": 500, 
          "min_data_in_leaf": 1000,
          "learning_rate": 0.005, 
          "bagging_fraction": 0.5, 
          "bagging_freq": 1, 
          "feature_fraction": 0.5, 
          "lambda_l2": 1}

lgb_data = lgb.Dataset(data=df, 
                       label=y)

cv_result = lgb.cv(params=params, 
                   train_set=lgb_data, 
                   nfold=5, 
                   metrics="auc", 
                   early_stopping_rounds=200, 
                   stratified=True, 
                   shuffle=True, 
                   verbose_eval=100, 
                   show_stdv=True, 
                   seed=2357)

cv_result = pd.DataFrame(cv_result)



[100]	cv_agg's auc: 0.766022 + 0.00185066
[200]	cv_agg's auc: 0.770263 + 0.00171526
[300]	cv_agg's auc: 0.773939 + 0.00163015
[400]	cv_agg's auc: 0.777171 + 0.00157386


In [None]:
cv_result.tail()

CV mean: 0.792743

```
params = {"n_estimators": 3183, 
          "num_leaves": 500, 
          "min_data_in_leaf": 1000,
          "learning_rate": 0.005, 
          "bagging_fraction": 0.5, 
          "bagging_freq": 1, 
          "feature_fraction": 0.5, 
          "lambda_l2": 1}
```

#### Fit chosen model

In [None]:
lgb_model = lgb.LGBMClassifier(n_estimators=2767, 
                         num_leaves=500, 
                         min_data_in_leaf=1000,
                         learning_rate=0.005, 
                         bagging_fraction=0.5, 
                         feature_fraction=0.5, 
                         lambda_l2=0.8)

lgb_model.fit(df[lgb_cols], y)

#### Score test data and generate submission

In [None]:
df = pd.read_csv(path + "test.csv")

In [None]:
submission = pd.DataFrame({"SK_ID_CURR": df["SK_ID_CURR"], 
                           "TARGET": lgb_model.predict_proba(df[lgb_cols])[:,1]})

submission.head()

In [None]:
submission.to_csv(path + "submission.csv", index=False)