# Home Credit Data Preprocessing  <a class='anchor' id='toc'></a>

#### I. [Import Data & Libraries](#idl)
#### II. [Data Cleaning](#dc)
#### III. [Simple Model CV](#smcv)
#### IV. [Stacking](#s)

## I. Import Data & Libraries <a class="anchor" id="idl"></a>
**[Back to top](#toc)**

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Modeling
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR

from mlxtend.classifier import StackingCVClassifier

from keras import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.impute import SimpleImputer

# Data Viz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Change directory
import os
directory = 'C:/Users/sysux/Desktop/Home Credit/home-credit-default-risk'
os.chdir(directory)

# Ignore warnings
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

Using TensorFlow backend.


In [2]:
# Read the encoded training and testing tables from the working directory
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('encoded_train_if.csv', 'encoded_test_if.csv')
)

In [3]:
# Keep the TARGET column as the label and remove it from the feature table
train_label = train['TARGET']
train = train.drop(columns=['TARGET'])

In [4]:
# Remove the 'Unnamed: 0' column from both tables
train, test = (
    frame.drop(columns=['Unnamed: 0'])
    for frame in (train, test)
)

In [5]:
# Preview the first rows of the training features
train.head()

Unnamed: 0,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,EXT_SOURCE_1,EXT_SOURCE_2,...,installment_NUM_INSTALMENT_NUMBER_mean,installment_AMT_INSTALMENT_mean,installment_AMT_PAYMENT_sum,p_app_AMT_ANNUITY_mean,p_app_AMT_DOWN_PAYMENT_mean,p_app_HOUR_APPR_PROCESS_START_mean,p_app_RATE_DOWN_PAYMENT_mean,p_app_SELLERPLACE_AREA_mean,p_app_SELLERPLACE_AREA_sum,p_app_CNT_PAYMENT_mean
0,202500.0,406597.5,24700.5,0.018801,-9461,-637.0,-3648.0,-2120,0.083037,0.262949,...,10.0,11559.247105,219625.695,9251.775,0.0,9.0,0.0,500.0,500.0,24.0
1,270000.0,1293502.5,35698.5,0.003541,-16765,-1188.0,-1186.0,-291,0.311267,0.622246,...,5.08,64754.586,1618864.65,56553.99,3442.5,14.666667,0.05003,533.0,1599.0,10.0
2,67500.0,135000.0,6750.0,0.010032,-19046,-225.0,-4260.0,-2531,,0.555912,...,2.0,7096.155,21288.465,5357.25,4860.0,5.0,0.212008,30.0,30.0,4.0
3,135000.0,312682.5,29686.5,0.008019,-19005,-3039.0,-9833.0,-2437,,0.650442,...,4.4375,62947.088438,1007153.415,23651.175,34840.17,14.666667,0.163412,894.222222,8048.0,23.0
4,121500.0,513000.0,21865.5,0.028663,-19932,-3038.0,-4311.0,-3458,,0.322738,...,7.045455,12666.444545,806127.975,12278.805,3390.75,12.333333,0.159516,409.166667,2455.0,20.666667


In [6]:
# Report the (rows, columns) shape of each table
for caption, frame in (('Training set full shape: ', train),
                       ('Testing set full shape: ', test)):
    print(caption, frame.shape)

Training set full shape:  (307511, 26)
Testing set full shape:  (48744, 26)


## II. Data Cleaning <a class="anchor" id="dc"></a>
**[Back to top](#toc)**

In [7]:
# Count the missing (NaN) values in each column of the training table
train.isna().sum()

AMT_INCOME_TOTAL                                  0
AMT_CREDIT                                        0
AMT_ANNUITY                                      12
REGION_POPULATION_RELATIVE                        0
DAYS_BIRTH                                        0
DAYS_EMPLOYED                                 55374
DAYS_REGISTRATION                                 0
DAYS_ID_PUBLISH                                   0
EXT_SOURCE_1                                 173378
EXT_SOURCE_2                                    660
EXT_SOURCE_3                                  60965
DAYS_LAST_PHONE_CHANGE                            1
os_NAME_CONTRACT_STATUS_Active_count_norm     18067
pos_MONTHS_BALANCE_mean                       18067
pos_CNT_INSTALMENT_FUTURE_mean                18091
installment_NUM_INSTALMENT_VERSION_mean       15868
installment_NUM_INSTALMENT_NUMBER_mean        15868
installment_AMT_INSTALMENT_mean               15868
installment_AMT_PAYMENT_sum                   15868
p_app_AMT_AN

In [8]:
# Fill missing values with per-column medians learned from the training table
# only; the same fitted imputer is then applied to both tables.
imputer = SimpleImputer(strategy='median')
imputer.fit(train)
train, test = imputer.transform(train), imputer.transform(test)

## III. Simple Model CV <a class="anchor" id="smcv"></a>
**[Back to top](#toc)**

### CV Pipeline

In [None]:
# Name the modeling inputs: training features, training labels, test features
X_train, y_train, X_test = train, train_label, test

In [None]:
# create a grid/randomized search cross validation pipeline
def search_pipeline(X_train_data, X_test_data, y_train_data,
                       model, param_grid, cv=10, scoring_fit='roc_auc',
                       do_probabilities = False, search_mode = 'GridSearchCV', n_iterations = 0):
    """Run a hyper-parameter search on the training data and predict the test data.

    Parameters
    ----------
    X_train_data, y_train_data : training features and labels passed to ``fit``.
    X_test_data : features passed to ``predict`` / ``predict_proba`` after the search.
    model : estimator to tune.
    param_grid : parameter grid (grid search) or parameter distributions
        (randomized search).
    cv : cross-validation setting forwarded to the search object.
    scoring_fit : scoring forwarded to the search object. The default is the
        ``'roc_auc'`` scorer, which ranks by predicted scores; a scorer built
        with ``make_scorer(roc_auc_score)`` computes AUC on hard class labels.
    do_probabilities : if True return ``predict_proba`` output, otherwise
        ``predict`` output.
    search_mode : ``'GridSearchCV'`` or ``'RandomizedSearchCV'``.
    n_iterations : ``n_iter`` for the randomized search (ignored by grid search).

    Returns
    -------
    list
        ``[fitted_search, predictions]``.

    Raises
    ------
    ValueError
        If ``search_mode`` is not one of the two supported values. Previously
        this case returned ``None`` silently.
    """
    # Options shared by both search flavours.
    shared_options = dict(
        estimator=model,
        cv=cv,
        n_jobs=1,
        scoring=scoring_fit,
        verbose=2
    )

    if search_mode == 'GridSearchCV':
        search = GridSearchCV(param_grid=param_grid, **shared_options)
    elif search_mode == 'RandomizedSearchCV':
        search = RandomizedSearchCV(
            param_distributions=param_grid,
            n_iter=n_iterations,
            **shared_options
        )
    else:
        raise ValueError(
            "search_mode must be 'GridSearchCV' or 'RandomizedSearchCV', got {!r}".format(search_mode)
        )

    fitted_model = search.fit(X_train_data, y_train_data)

    if do_probabilities:
        pred = fitted_model.predict_proba(X_test_data)
    else:
        pred = fitted_model.predict(X_test_data)

    return [fitted_model, pred]

### LightGBM

In [None]:
model = lgb.LGBMClassifier()
param_grid = {
    'n_estimators': [100, 300, 600],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15,20,25],
    'num_leaves': [50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20]
}

# Score with the 'roc_auc' scorer so AUC is computed from predicted scores;
# make_scorer(roc_auc_score) would compute it on hard class labels instead.
result = search_pipeline(X_train, X_test, y_train, model,
                              param_grid, cv=5, scoring_fit='roc_auc',
                              do_probabilities = True, search_mode = 'RandomizedSearchCV', n_iterations = 5)

# Best mean cross-validated AUC and the parameters that produced it
print(result[0].best_score_)
print(result[0].best_params_)

### XGBoost

In [None]:
model = xgb.XGBClassifier()
param_grid = {
    'n_estimators': [100, 300, 600],
    'max_depth': [15,20,25],
}

# Score with the 'roc_auc' scorer so AUC is computed from predicted scores;
# make_scorer(roc_auc_score) would compute it on hard class labels instead.
result = search_pipeline(X_train, X_test, y_train, model,
                              param_grid, cv=5, scoring_fit='roc_auc',
                              do_probabilities = True, search_mode = 'RandomizedSearchCV', n_iterations = 2)

# Best mean cross-validated AUC (not an RMSE) and the parameters that produced it
print(result[0].best_score_)
print(result[0].best_params_)

### Random Forest

In [None]:
model = RandomForestClassifier()
param_grid = {
    'n_estimators': [100, 300, 600],
    'max_depth': [15,20,25],
    'max_leaf_nodes': [50, 100, 200]
}

# Score with the 'roc_auc' scorer so AUC is computed from predicted scores;
# make_scorer(roc_auc_score) would compute it on hard class labels instead.
result = search_pipeline(X_train, X_test, y_train, model,
                              param_grid, cv=5, scoring_fit='roc_auc',
                              do_probabilities = True, search_mode = 'RandomizedSearchCV', n_iterations = 5)

# Best mean cross-validated AUC and the parameters that produced it
print(result[0].best_score_)
print(result[0].best_params_)

### Neural Network

In [None]:
# Standardize the features; the scaler is fitted on the training features only
# and the same fitted scaler is applied to both sets.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Size of the second dimension of the scaled training matrix
# (build_cnn reads this value as the network's input_dim)
X_train_scaled.shape[1]

In [None]:
def build_cnn(activation = 'relu',
              optimizer = 'Adam'):
    """Build and compile the Keras network used for binary classification.

    Despite the name, the network is fully connected: two hidden Dense layers
    (10 and 4 units) followed by a single sigmoid output unit. The input width
    is read from the notebook-level ``X_train_scaled``.

    Parameters
    ----------
    activation : activation function of the two hidden layers.
    optimizer : optimizer passed to ``model.compile``.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    model = Sequential()

    # Hidden layers use the `activation` argument (previously it was ignored
    # and 'relu' was hard-coded).
    model.add(Dense(10, activation=activation, kernel_initializer='random_normal', input_dim=X_train_scaled.shape[1]))
    model.add(Dense(4, activation=activation, kernel_initializer='random_normal'))
    model.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

    # A single sigmoid output needs binary cross-entropy; categorical
    # cross-entropy expects one output unit per class.
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy']
    )

    return model

In [None]:
# Small grid for a quick run. Only parameters accepted by KerasClassifier.fit
# (epochs, batch_size) or by build_cnn (activation, optimizer) can be tuned here.
# Candidates for a larger search:
#   'epochs': [100, 150, 200], 'batch_size': [32, 128],
#   'optimizer': ['Adam', 'Nadam'], 'activation': ['relu', 'elu']
param_grid = {
              'epochs':[1,2,3],
              'batch_size':[128]
             }

model = KerasClassifier(build_fn = build_cnn, verbose=0)

# Score with the 'roc_auc' scorer so AUC is computed from predicted scores;
# make_scorer(roc_auc_score) would compute it on hard class labels instead.
# n_iterations is omitted because a grid search does not use it.
result = search_pipeline(X_train_scaled, X_test_scaled, y_train, model,
                              param_grid, cv=5, scoring_fit='roc_auc',
                              do_probabilities = True, search_mode = 'GridSearchCV')

# Best mean cross-validated AUC and the parameters that produced it
print(result[0].best_score_)
print(result[0].best_params_)

## IV. Stacking <a class="anchor" id="s"></a>
**[Back to top](#toc)**

In [None]:
# Estimators to tune; grid_parameters below lists one grid per estimator,
# in the same order.
models_to_train = [xgb.XGBClassifier(), lgb.LGBMClassifier(), RandomForestClassifier()]

# Defining the hyperparameters to optimize
grid_parameters = [
    { # XGBoost
        'n_estimators': [100, 300, 600],
        'max_depth': [15,20,25],
    },
    { # LightGBM
        # 600 (was 6000): matches the 100/300/600 ladder used by the other grids
        'n_estimators': [100, 300, 600],
        'learning_rate': [0.12],
        'max_depth': [4],
        'num_leaves': [10, 20],
    },
    { # Random Forest
        'max_depth':[3, 5, 10, 13],
        'n_estimators':[100, 300, 600],
        'max_features':[2, 4, 6, 8, 10]
    }
]

In [None]:
# Tune every estimator against its own grid (matched by position) and collect
# each [fitted_search, predictions] pair.
models_preds_scores = []

for estimator, estimator_grid in zip(models_to_train, grid_parameters):
    models_preds_scores.append(
        search_pipeline(X_train, X_test, y_train,
                        estimator, estimator_grid, cv=2)
    )

In [None]:
# show the result as a baseline
# `y_test` is never defined in this notebook, so the test predictions cannot be
# scored here; report the best cross-validated score of each search instead.
for result in models_preds_scores:
    print('Model: {0}, Score: {1}'.format(type(result[0].best_estimator_).__name__,
                                          result[0].best_score_))

Improve the performance using stacking

In [None]:
xgboost = xgb.XGBClassifier()
lgbm = lgb.LGBMClassifier()
rf = RandomForestClassifier()
lg = LogisticRegression()

# The SVR base learner was removed: it is a regressor and does not belong in a
# stack of classifiers.
stack = StackingCVClassifier(classifiers=(xgboost, lgbm, rf, lg),
                            meta_classifier=xgboost, cv=12,
                            use_features_in_secondary=True,
                            store_train_meta_features=True,
                            shuffle=False,
                            random_state=42)

# `y_test` is never defined in this notebook, so the stack is evaluated on a
# hold-out fold (one fifth) carved out of the training data.
features = np.asarray(X_train)
labels = np.asarray(y_train)
holdout_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
fit_idx, holdout_idx = next(holdout_splitter.split(features))

stack.fit(features[fit_idx], labels[fit_idx])

# ROC AUC on the predicted probability of the positive class; this replaces
# r2_score, which is a regression metric and was never imported.
holdout_proba = stack.predict_proba(features[holdout_idx])[:, 1]
score = roc_auc_score(labels[holdout_idx], holdout_proba)
print(score)

# Refit on the full training set before predicting the test set
stack.fit(features, labels)
pred = stack.predict(np.asarray(X_test))