# Home Credit Modeling

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import iqr, randint, uniform
import lightgbm as lgb
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.preprocessing import Imputer, StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
import pickle
from skopt import gp_minimize
from skopt.plots import plot_convergence
import gc
from IPython.display import display
import warnings

warnings.simplefilter("ignore")

pd.options.display.max_columns = None

%load_ext autotime

gc.enable()
np.random.seed(235)

path = "/Users/dsaxton/home_credit_default/"

impute = Imputer(strategy="median")
quant = QuantileTransformer(output_distribution="normal")
stand = StandardScaler()

def get_imp(clf, cols):
    frame = (pd.DataFrame({"Variable": cols, 
                        "Importance": clf.feature_importances_}).
             sort_values(by="Importance", ascending=False).
             reset_index(drop=True))
    
    return frame

#### Load the data

In [2]:
df = pd.read_csv(path + "train.csv")
sk_id_curr = df.pop("SK_ID_CURR")
y = df.pop("TARGET")

time: 48.1 s


# LightGBM

#### Select features

In [3]:
clf = lgb.LGBMClassifier(n_estimators=1000, num_leaves=23, subsample=0.5)
clf.fit(df, y)

lgb_cols = df.columns[clf.feature_importances_ > 0]
len(lgb_cols)

519

time: 4min 4s


In [4]:
var_imp = get_imp(clf, df.columns)
var_imp.head()

Unnamed: 0,Variable,Importance
0,EXT_SOURCE_3,325
1,EXT_SOURCE_2,320
2,EXT_SOURCE_1_DIV_DAYS_BIRTH,217
3,AMT_CREDIT_DIV_AMT_ANNUITY,210
4,AVG_AGG_SYNTHETIC_TARGET,194


time: 40 ms


#### CV performance

In [None]:
params = {"n_estimators": 5000, 
          "num_leaves": 500, 
          "min_data_in_leaf": 1000,
          "learning_rate": 0.01, 
          "bagging_fraction": 0.5, 
          "bagging_freq": 1, 
          "feature_fraction": 0.5, 
          "lambda_l2": 1}

to_drop = [c for c in df if "PROJ" in c]

lgb_data = lgb.Dataset(data=df[lgb_cols].drop(to_drop, axis=1), 
                       label=y)

cv_result = lgb.cv(params=params, 
                   train_set=lgb_data, 
                   nfold=5, 
                   metrics="auc", 
                   early_stopping_rounds=200, 
                   stratified=True, 
                   shuffle=True, 
                   verbose_eval=100, 
                   show_stdv=True, 
                   seed=2357)

cv_result = pd.DataFrame(cv_result)

[100]	cv_agg's auc: 0.769725 + 0.00193869
[200]	cv_agg's auc: 0.776739 + 0.00170675
[300]	cv_agg's auc: 0.781761 + 0.00171266
[400]	cv_agg's auc: 0.784902 + 0.00175227


In [None]:
cv_result.tail()

#### Fit full models

In [6]:
lgb1 = lgb.LGBMClassifier(n_estimators=2500, 
                               num_leaves=500, 
                               min_data_in_leaf=1000,
                               learning_rate=0.005, 
                               bagging_fraction=0.5, 
                               bagging_freq=1, 
                               feature_fraction=0.5, 
                               lambda_l2=1, 
                               random_state=2357) 

print("Fitting model 1...")
lgb1.fit(df, y)

lgb2 = lgb.LGBMClassifier(n_estimators=2500, 
                               num_leaves=500, 
                               min_data_in_leaf=1000,
                               learning_rate=0.005, 
                               bagging_fraction=0.5, 
                               bagging_freq=1, 
                               feature_fraction=0.5, 
                               lambda_l2=1, 
                               random_state=7235) 

print("Fitting model 2...")
lgb2.fit(df, y)

lgb3 = lgb.LGBMClassifier(n_estimators=2500, 
                               num_leaves=500, 
                               min_data_in_leaf=1000,
                               learning_rate=0.005, 
                               bagging_fraction=0.5, 
                               bagging_freq=1, 
                               feature_fraction=0.5, 
                               lambda_l2=1, 
                               random_state=5723) 

print("Fitting model 3...")
lgb3.fit(df, y)

lgb4 = lgb.LGBMClassifier(n_estimators=2500, 
                               num_leaves=500, 
                               min_data_in_leaf=1000,
                               learning_rate=0.005, 
                               bagging_fraction=0.5, 
                               bagging_freq=1, 
                               feature_fraction=0.5, 
                               lambda_l2=1, 
                               random_state=3572) 

print("Fitting model 4...")
lgb4.fit(df, y)

Fitting model 1...
Fitting model 2...
Fitting model 3...
Fitting model 4...


LGBMClassifier(bagging_fraction=0.5, bagging_freq=1, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
        lambda_l2=1, learning_rate=0.005, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001,
        min_data_in_leaf=1000, min_split_gain=0.0, n_estimators=2500,
        n_jobs=-1, num_leaves=500, objective=None, random_state=3572,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

time: 1h 4min 24s


In [7]:
del df
gc.collect()

110

time: 263 ms


In [8]:
test = pd.read_csv(path + "test.csv")

time: 4.36 s


In [9]:
sk_id_curr = test.pop("SK_ID_CURR")

scores = pd.DataFrame({"SK_ID_CURR": sk_id_curr, 
                      "LGB_SCORE1": lgb1.predict_proba(test)[:,1], 
                      "LGB_SCORE2": lgb2.predict_proba(test)[:,1], 
                      "LGB_SCORE3": lgb3.predict_proba(test)[:,1], 
                      "LGB_SCORE4": lgb4.predict_proba(test)[:,1]}, 
                     index=test.index)

time: 30.4 s


In [19]:
scores["TARGET"] = scores[["LGB_SCORE" + str(i+1) for i in range(4)]].rank().apply(np.mean, axis=1)

time: 2.22 s


In [21]:
scores[["SK_ID_CURR", "TARGET"]].to_csv(path + "submission.csv", index=False, header=True)

time: 187 ms
