# Home Credit Default Risk - Team 3 (Kahsai, Nichols, Pellerito)

### Import packages

In [None]:
# standard Python tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# special tools for working in Kaggle
import joblib   # save and load ML models
import gc       # garbage collection
import os 
import sklearn

# preprocessing steps
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# machine learning models and tools
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from xgboost import plot_importance

# cross validation and metrics - remember this competition is scored as area under curve
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold

# Free anything left over from a previous run of the notebook.
gc.collect()

# Silence every warning for the rest of the session.
# The original author marks this as "Don't do this!" - it also hides useful pandas warnings.
import warnings
warnings.simplefilter("ignore")

# Proportion of the rows used for training; the remainder is held out for validation.
# Kept here, near the top, so it is easy to find and change.
train_size = 0.75

# Directory that holds the competition files; list it to confirm the data is there.
MainDir = "../input/../input/home-credit-default-risk"
print(os.listdir(MainDir))

# Building the training data set

### Read the training data

In [None]:
# Load the main application table; lift the display limit so every column is shown.
pd.set_option('display.max_columns', None)
train = pd.read_csv(f'{MainDir}/application_train.csv')
train.head()

### First look at bureau.csv table

In [None]:
# Read bureau.csv. Per the author's notes it carries two keys:
#   SK_ID_CURR   - links each row to the main training table
#   SK_ID_BUREAU - links each row to the bureau_balance table
bureau = pd.read_csv(f'{MainDir}/bureau.csv')
print(bureau.shape, "- shape of bureau table")
bureau.head()

### first look at bureau_balance.csv

In [None]:
# Read bureau_balance.csv (only a few columns).
# Plan: pivot it so each SK_ID_BUREAU becomes one row holding the count of every STATUS code,
# plus a total column - the author expects eight distinct codes, hence nine features.

bureau_balance = pd.read_csv(f'{MainDir}/bureau_balance.csv')
print(bureau_balance.shape, "- shape of bureau_balance table")
print(bureau_balance['STATUS'].nunique(), "unique codes in STATUS column")
bureau_balance.head()

### Create features from bureau_balance

In [None]:
# Build the bureau_balance features: for every SK_ID_BUREAU, count its records per STATUS code.
# BB_ prefix identifies features that came from bureau_balance.
bb_status = pd.crosstab(bureau_balance['SK_ID_BUREAU'], bureau_balance['STATUS'])

# Total number of records per SK_ID_BUREAU. This is computed directly rather than with
# margins=True, because margins also appends an 'All' *row*; that row turns the SK_ID_BUREAU
# index into a mix of integers and a string, which is then used as a merge key.
bb_status['All'] = bb_status.sum(axis=1)

bb_status.columns = ['BB_' + column for column in bb_status.columns]
print(bb_status.shape, "- shape of bb_status table")

# Author's note: row proportions (e.g. 0, 0, 0, 0, 0, 0, 0.887, 0.113 for loan 5001709) were
# also tried, but those factors didn't help the model. The count data proved to be much more
# valuable in improving model score.
bb_status.head(5)

### Merge our bb_status feature table into bureau table on SK_ID_BUREAU

In [None]:
# bureau_balance is keyed only by SK_ID_BUREAU, so bureau.csv is the one table it can be joined to.
# The join is an inner join (the pandas default): bureau rows without a match in bb_status are dropped.
bureau = pd.merge(bureau, bb_status, how='inner', left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU')
bureau = bureau.drop(columns=['SK_ID_BUREAU'])      # the key is not needed after the join
print(bureau.shape, "- shape of bureau table after merging in bureau_balance")

# Tag every column except the SK_ID_CURR key with BU_ so its origin (bureau) stays visible.
bureau.columns = [column if column == 'SK_ID_CURR' else 'BU_' + column for column in bureau.columns]
bureau.head()

### Create features for bureau data

In [None]:
# Build three per-customer (SK_ID_CURR) feature tables from the bureau data.
# Author's note: aggregating by sum instead of mean made virtually no difference to the result.

# Numeric features: group on SK_ID_CURR and take the mean of every numeric column.
# numeric_only=True states explicitly that the text columns are left out here (they are handled
# through dummies below); newer pandas raises a TypeError instead of silently skipping them.
bureau_num = bureau.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()
print(bureau_num.shape, "- shape of numeric bureau features (incl index)")          # author's run: 134,542 x 22

# Categorical features: one-hot encode the text columns, then take group means, which gives the
# share of the customer's loans that fall in each category.
bureau_cat = pd.get_dummies(bureau.select_dtypes('object'))     # select_dtypes dropped SK_ID_CURR ...
bureau_cat['SK_ID_CURR'] = bureau['SK_ID_CURR']                 # ... so put it back (aligned on the row index)
bureau_cat = bureau_cat.groupby(by=['SK_ID_CURR']).mean().reset_index()
print(bureau_cat.shape, "- shape of categorical bureau features (incl index)")      # author's run: 134,542 x 23

# Number of past loans per customer (just one feature).
# The rename result must be kept: DataFrame.rename returns a new frame, so the original
# un-assigned call left the column named BU_CREDIT_ACTIVE.
bureau_count = (
    bureau.groupby(by=['SK_ID_CURR'])['BU_CREDIT_ACTIVE']
    .count()
    .reset_index()
    .rename(columns={'BU_CREDIT_ACTIVE': 'COUNT_of_BUREAU'})
)

### Merge our new features into training data on SK_ID_CURR

In [None]:
# Attach the three per-customer bureau feature tables to the training data.
# Left joins keep every application row; rows without bureau features get NaN in the new columns.
train = (
    train
    .merge(bureau_num, how='left', on='SK_ID_CURR')        # numeric features
    .merge(bureau_cat, how='left', on='SK_ID_CURR')        # categorical features
    .merge(bureau_count, how='left', on='SK_ID_CURR')      # count feature
)
print(train.shape, "- shape of training data after merges with bureau features")

# The bureau tables and their intermediates are not used again below, so release the memory.
del bureau, bureau_num, bureau_cat, bureau_count, bureau_balance, bb_status
gc.collect()
train.head()

### First look at previous.csv

### Extra features on previous, including total interest paid and interest rate on loan (heavy calculation)

In [None]:
%%time
# Wall time for refreshing this was around 38 minutes with no accelerator. I want to run this ONE time and save the output.
previous = pd.read_csv(f'{MainDir}/previous_application.csv')


def calc_rate(row):
    """Return the per-period interest rate implied by one previous_application row.

    The rate is solved from the number of payments (CNT_PAYMENT), the payment per period
    (AMT_ANNUITY, passed as a negative cash flow) and the amount borrowed (AMT_CREDIT), with a
    future value of 0. The solver starts from a 5% guess and runs at most 10 iterations.

    NOTE(review): the author's own comment further down says the financial functions are
    "supposedly deprecated in numpy" and that numpy_financial may be needed in the future -
    confirm that np.rate exists in the installed NumPy.
    """
    n_periods = row['CNT_PAYMENT']
    payment = -row['AMT_ANNUITY']
    principal = row['AMT_CREDIT']
    return np.rate(n_periods, payment, principal, 0, guess = 0.05, maxiter = 10)


# Row-by-row solve: this is the slow step the author's timing note refers to.
previous['CALC_RATE'] = previous.apply(calc_rate, axis=1)

# Total interest = all payments minus the amount borrowed; INT_PRINC expresses it per unit of credit.
previous['INTEREST_PAID'] = (previous['AMT_ANNUITY'] * previous['CNT_PAYMENT']) - previous['AMT_CREDIT']
previous['INT_PRINC'] = previous['INTEREST_PAID'] / previous['AMT_CREDIT']

# import numpy_financial as npf           # financial functions are supposedly deprecated in numpy - might need this package in the future
# Example for SK_ID_PREV 2030495
# pv = 17145                                # AMT_CREDIT
# pmt = 1730.43                             # AMT_ANNUITY
# nper = 12                                 # CNT_PAYMENT
# print(np.rate(nper, -pmt, pv, 0), "is the interest rate per period")

#https://numpy.org/numpy-financial/latest/
#rate(nper, pmt, pv, fv[, when, guess, tol, …])   Compute the rate of interest per period.

### POS_CASH_balance

In [None]:
# POS_CASH_balance has both SK_ID_PREV and SK_ID_CURR, so there are two ways to use it:
#   (a) merge into previous_application on SK_ID_PREV, which is merged into the training data later, or
#   (b) merge straight into the training data on SK_ID_CURR.
# Approach (a) is used here, which is why SK_ID_CURR is dropped below.

pos = pd.read_csv(f'{MainDir}/POS_CASH_balance.csv')
pos = pos.drop(columns=['SK_ID_CURR'])
# PO_ prefix marks the columns that came from POS_CASH_balance; the join key keeps its name.
pos.columns = [column if column == 'SK_ID_PREV' else 'PO_' + column for column in pos.columns]
pos.head()

### Create features from POS_CASH_balance

In [None]:
# Build per-loan (SK_ID_PREV) features from POS_CASH_balance and attach them to `previous`.
# Author's note: aggregating by mean or by sum doesn't make much of a difference.

# Numeric features: group on SK_ID_PREV and take the mean of every numeric column.
# numeric_only=True states explicitly that the text columns are left out here (they are handled
# through dummies below); newer pandas raises a TypeError instead of silently skipping them.
pos_num = pos.groupby(by=['SK_ID_PREV']).mean(numeric_only=True).reset_index()
print(pos_num.shape, "- shape of numeric features (incl index)")               # author's run: 936,325 x 7

# Categorical features: one-hot encode the text columns, then take group means.
pos_cat = pd.get_dummies(pos.select_dtypes('object'))                          # select_dtypes dropped SK_ID_PREV ...
pos_cat['SK_ID_PREV'] = pos['SK_ID_PREV']                                      # ... so put it back
pos_cat = pos_cat.groupby(by = ['SK_ID_PREV']).mean().reset_index()            # could try sum as well.
print(pos_cat.shape, "- shape of categorical features (incl index)")           # author's run: 936,325 x 10

# Merge pos_num and pos_cat into the previous_application data (left joins keep every previous row).
previous = previous.merge(pos_num, on='SK_ID_PREV', how='left')                # numeric features
previous = previous.merge(pos_cat, on='SK_ID_PREV', how='left')                # categorical features
print(previous.shape, "- shape of previous data after merges")

# pos, pos_num and pos_cat are not used again below.
del pos
del pos_num
del pos_cat
gc.collect()

### installment_payments

In [None]:
# installments_payments holds numeric columns only - nothing categorical (author's note).
inst = pd.read_csv(f'{MainDir}/installments_payments.csv')
inst = inst.drop(columns=['SK_ID_CURR'])       # features are aggregated per SK_ID_PREV, so this key is not needed
# IP_ prefix marks the columns that came from installments_payments; the join key keeps its name.
inst.columns = [column if column == 'SK_ID_PREV' else 'IP_' + column for column in inst.columns]
inst.head()

### Create features from installment_payments

In [None]:
# Per-loan means of the installment columns (the table has numeric features only).
inst_num = inst.groupby('SK_ID_PREV').mean().reset_index()
print(inst_num.shape, "- shape of numeric features (incl index)")       # author's run: 997,752 x 8

# Attach the installment features to previous_application; the left join keeps every previous row.
previous = pd.merge(previous, inst_num, how='left', left_on='SK_ID_PREV', right_on='SK_ID_PREV')
print(previous.shape, "- shape of previous data after merges")          # author's run: 1,670,214 x 59

# inst and inst_num are not used again below.
del inst, inst_num
gc.collect()

### Let's try installment_payments direct to training

In [None]:
# Instead of merging installment_payments into previous_application and then merging previous_application into application_train,
# we can also try merging these features directly into application_train because this dataset has both keys. I am leaving these out
# because they aren't improving the model.

#inst2 = pd.read_csv(f'{MainDir}/installments_payments.csv')
#inst2.drop(['SK_ID_PREV'], axis=1, inplace = True)
#inst2.columns = ['IPX_'+column if column != 'SK_ID_CURR' else column for column in inst2.columns]
#inst2.head(5)

#inst2_num = inst2.groupby(by=['SK_ID_CURR']).mean().reset_index()         # group the numeric features by SK_ID_CURR
#print(inst2_num.shape, "- shape of numeric features (incl index)")       # should be 339,587 x 7

#train = train.merge(inst2_num, left_on='SK_ID_CURR', right_on = 'SK_ID_CURR', how='left')         
#print(train.shape, "- shape of train data after merges")          # should be 1,670,214 x 83

# don't need these anymore: inst, inst_num
#del inst2
#del inst2_num
#gc.collect()

### POS_CASH_balance direct to training

In [None]:
# Likewise I am leaving these out of the model
#pos2 = pd.read_csv(f'{MainDir}/POS_CASH_balance.csv')
#pos2.drop(['SK_ID_PREV'], axis=1, inplace = True)
#pos2.columns = ['POX_'+column if column != 'SK_ID_CURR' else column for column in pos2.columns]
#pos2.head(5)

#pos2_num = pos2.groupby(by=['SK_ID_CURR']).mean().reset_index()         # group the numeric features by SK_ID_CURR
#print(pos2_num.shape, "- shape of numeric features (incl index)")       # should be 339,587 x 7

#train = train.merge(pos2_num, left_on='SK_ID_CURR', right_on = 'SK_ID_CURR', how='left')         
#print(train.shape, "- shape of train data after merges")          # should be 1,670,214 x 83

# don't need these anymore: inst, inst_num
#del pos2
#del pos2_num
#gc.collect()

### credit_card_balance

In [None]:
# Author's note: coverage is thin here - only about 100,000 SK_ID_PREVs out of a million possible.
ccb = pd.read_csv(f'{MainDir}/credit_card_balance.csv')
ccb = ccb.drop(columns=['SK_ID_CURR'])         # features are aggregated per SK_ID_PREV, so this key is not needed
# CC_ prefix marks the columns that came from credit_card_balance; the join key keeps its name.
ccb.columns = [column if column == 'SK_ID_PREV' else 'CC_' + column for column in ccb.columns]
ccb.head()

### Create features from credit_card_balance

In [None]:
# Build per-loan (SK_ID_PREV) features from credit_card_balance and attach them to `previous`.

# Numeric features: group on SK_ID_PREV and take the mean of every numeric column.
# numeric_only=True states explicitly that the text columns are left out here (they are handled
# through dummies below); newer pandas raises a TypeError instead of silently skipping them.
ccb_num = ccb.groupby(by=['SK_ID_PREV']).mean(numeric_only=True).reset_index()
print(ccb_num.shape, "- shape of numeric features (incl index)")      # author's run: 104,307 x 22

# Categorical features: one-hot encode the text columns, then take group means.
ccb_cat = pd.get_dummies(ccb.select_dtypes('object'))                 # select_dtypes dropped SK_ID_PREV ...
ccb_cat['SK_ID_PREV'] = ccb['SK_ID_PREV']                             # ... so put it back
ccb_cat = ccb_cat.groupby(by = ['SK_ID_PREV']).mean().reset_index()   # could try sum as well.
print(ccb_cat.shape, "- shape of categorical features (incl index)")

# Merge ccb_num and ccb_cat into the previous_application data (left joins keep every previous row).
previous = previous.merge(ccb_num, on='SK_ID_PREV', how='left')       # numeric features
previous = previous.merge(ccb_cat, on='SK_ID_PREV', how='left')       # categorical features
print(previous.shape, "- shape of previous data after merges")

# ccb, ccb_num and ccb_cat are not used again below.
del ccb
del ccb_num
del ccb_cat
gc.collect()

### Merge previous table into training data

In [None]:
# Aggregate the enriched previous_application table per customer and attach it to the training data.

# SK_ID_PREV is no longer needed - all further joins will be on SK_ID_CURR.
previous = previous.drop(columns=['SK_ID_PREV'])

# PR_ prefix marks the columns that came from previous_application; the join key keeps its name.
previous.columns = ['PR_'+column if column !='SK_ID_CURR' else column for column in previous.columns]

# 365243 is treated as a placeholder in these day-count columns and replaced with NaN.
# The result is assigned back explicitly: replace(..., inplace=True) on a column selection
# depends on pandas modifying the parent frame through an intermediate object.
sentinel_cols = ['PR_DAYS_LAST_DUE', 'PR_DAYS_TERMINATION', 'PR_DAYS_FIRST_DRAWING']
previous[sentinel_cols] = previous[sentinel_cols].replace(365243, np.nan)

# Numeric features: group on SK_ID_CURR and take the mean of every numeric column.
# numeric_only=True states explicitly that the text columns are left out here (they are handled
# through dummies below); newer pandas raises a TypeError instead of silently skipping them.
previous_num = previous.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()
print(previous_num.shape, "- shape of numeric features (incl index)")

# Categorical features: one-hot encode the text columns, then take group means.
previous_cat = pd.get_dummies(previous.select_dtypes('object'))                 # select_dtypes dropped SK_ID_CURR ...
previous_cat['SK_ID_CURR'] = previous['SK_ID_CURR']                             # ... so put it back
previous_cat = previous_cat.groupby(by = ['SK_ID_CURR']).mean().reset_index()   # could try sum as well.
print(previous_cat.shape, "- shape of categorical features (incl index)")

# Merge previous_num and previous_cat into the training data (left joins keep every application row).
train = train.merge(previous_num, on='SK_ID_CURR', how='left')                  # numeric features
train = train.merge(previous_cat, on='SK_ID_CURR', how='left')                  # categorical features
print(train.shape, "- shape of training data after ALL merges")

# previous, previous_num and previous_cat are not used again below.
del previous
del previous_num
del previous_cat
gc.collect()

### data cleansing and feature engineering: create new features based on ratios, logs, etc.

In [None]:
# Data cleansing and engineered features on the merged training table.
# Every column assignment below writes the result back explicitly instead of calling
# fillna/replace(..., inplace=True) on a column selection, which depends on pandas modifying
# the parent frame through an intermediate object.

# Author's note: over 50,000 DAYS_EMPLOYED entries have the value 365,243 days.
# Replace those with NaN and let the imputer deal with them.
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace({365243: np.nan})

# ratio features
train['CI_ratio'] = train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL']        # credit-to-income ratio
train['AI_ratio'] = train['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']       # annuity-to-income ratio
train['AC_ratio'] = train['AMT_CREDIT'] / train['AMT_ANNUITY']             # credit to annuity - basically the term of the loan
train['CG_ratio'] = train['AMT_CREDIT'] / train['AMT_GOODS_PRICE']         # credit to goods price ratio - how much was financed?

# log features
train['log_INCOME'] = np.log(train['AMT_INCOME_TOTAL'])                    # log of income
train['log_ANNUITY'] = np.log(train['AMT_ANNUITY'])                        # log of annuity
train['log_CREDIT'] = np.log(train['AMT_CREDIT'])                          # log of credit
train['log_GOODS'] = np.log(train['AMT_GOODS_PRICE'])                      # log of goods price

# The three external scores, selected by name. The original code selected them by position
# (iloc[:, 41:44]), which silently picks different columns if the column order ever changes.
# NOTE(review): assumes positions 41-43 of the merged frame were EXT_SOURCE_1..3 - confirm.
ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# flag features
train['MissingBureau'] = train[ext_cols].isnull().sum(axis=1).astype("category")   # number of missing external scores (before filling)
train['FLAG_CG_ratio'] = train['AMT_CREDIT'] > train['AMT_GOODS_PRICE']            # borrowed more than the price of the item
train['DAYS_ID_4200'] = train['DAYS_ID_PUBLISH'] < -4200                           # DAYS_ID_PUBLISH is below -4200

# EXT_SOURCE_x variables are very important, so missing values are not left to the imputer.
# Instead of a column mean/median, a missing score is filled with the row average of the scores
# that ARE present. Rows with no score at all get 0.2 for now.
train['AVG_EXT'] = train[ext_cols].mean(axis=1).fillna(0.2)   # mean skips NaN; all-NaN rows yield NaN -> 0.2
for ext_col in ext_cols:
    train[ext_col] = train[ext_col].fillna(train['AVG_EXT'])

# Largest of the three (now filled) scores. This must be created with [] - assigning to
# train.EXT_SOURCE_MAX only sets a Python attribute on the frame and adds no column.
train['EXT_SOURCE_MAX'] = train[ext_cols].max(axis=1)
# train.drop(['AVG_EXT'], axis = 1, inplace = True)

# Author's note: ORGANIZATION_TYPE expands to 58 dummies and does not help the model - drop it now.
train.drop(['ORGANIZATION_TYPE'], axis = 1, inplace = True)

# Ratio features from the bureau aggregates; NaN and +/-inf (missing data or division by zero) become 0.
train['OD_ratio'] = train['BU_AMT_CREDIT_SUM_OVERDUE'] / train['BU_AMT_CREDIT_SUM_DEBT']   # proportion of debt that is overdue
train['OD_ratio'] = train['OD_ratio'].replace([np.nan, np.inf, -np.inf], 0)
train['Credit_ratio'] = train['BU_AMT_CREDIT_SUM'] / train['BU_AMT_CREDIT_SUM_LIMIT']      # proportion of credit line used
train['Credit_ratio'] = train['Credit_ratio'].replace([np.nan, np.inf, -np.inf], 0)
train['Debt_ratio'] = train['BU_AMT_CREDIT_SUM_DEBT'] / train['BU_AMT_CREDIT_SUM']         # debt percentage
train['Debt_ratio'] = train['Debt_ratio'].replace([np.nan, np.inf, -np.inf], 0)

### split the data into training and validation sets - the proportion is set by train_size (currently 75% training / 25% validation)

In [None]:
# Target vector, then a stratified split of the predictors.
# TARGET and the SK_ID_CURR key are removed from the predictors; the validation share is
# whatever train_size leaves over, and random_state fixes the shuffle.
y = train['TARGET'].values
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns=['TARGET', 'SK_ID_CURR']),
    y,
    test_size=1 - train_size,
    stratify=y,
    random_state=1,
)
print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of X_valid:', X_valid.shape)
print('Shape of y_valid:', y_valid.shape)

### make lists of cat and num features for pipeline, based on dtype

In [None]:
# Split the column names into two lists by dtype, keeping the original column order:
# object-dtype columns are treated as categorical, every other dtype as numeric.
num_features = X_train.select_dtypes(exclude='object').columns.tolist()
cat_features = X_train.select_dtypes(include='object').columns.tolist()

print(len(num_features), "numeric features")
print(len(cat_features), "categorical features")

### build model pipeline based on num_cols and cat_cols lists

In [None]:
# Preprocessing: impute + scale the numeric columns, impute + one-hot encode the categorical
# columns, then replace X_train / X_valid with the transformed arrays.
features = num_features + cat_features

Pipe_num = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy = 'median')),        # tried median, mean, constant strategies
    ('scaler', StandardScaler())       ])

Pipe_cat = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'Unknown')),
    # handle_unknown='ignore': a category that only occurs in the validation rows is encoded
    # as all zeros instead of raising, now that the encoder is fitted on training rows only.
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))        ])

preprocessor = ColumnTransformer(
    transformers = [
        ('num', Pipe_num, num_features),
        ('cat', Pipe_cat, cat_features)])

# Fit on the TRAINING rows only. Fitting on the full `train` frame (as before) lets the
# validation rows influence the medians, scaling statistics and category lists, which leaks
# validation information into the model and makes the validation AUC optimistic.
preprocessor.fit(X_train[features])
X_train = preprocessor.transform(X_train[features])
X_valid = preprocessor.transform(X_valid[features])

print('Shape of X_train:',X_train.shape)
print('Shape of y_train:',y_train.shape)

# The information needed from `train` now lives in X_train / X_valid / y_train / y_valid.
del train
gc.collect()

### Build Model Scoreboard
Each time we run a new model, we will append a new row to the scoreboard.

In [None]:
# Never truncate cell contents on display - hyperparameter strings (LGBM in particular) are long.
pd.options.display.max_colwidth = None

# Empty scoreboard; each model run adds one row.
results = pd.DataFrame(
    columns=[
        'Model Type',
        'AUC - 10xv',
        'AUC - Valid',
        'Hyperparameters',
    ]
)

### Memory Management
The only data frame open right now should be the model scoreboard 'results' that we just created. Even the train dataframe can be closed now because the information it contained is stored in arrays X_train, X_valid, etc. We should have about 12 GB free at this point.

In [None]:
%who_ls DataFrame

In [None]:
import psutil

# Report the memory psutil lists as available (second field of virtual_memory()), in GB.
available_bytes = psutil.virtual_memory().available
print(available_bytes/1E9, "GB free")

# Models

### Logistic Regression
I would very much like for this model to finish running before the sun turns into a red giant and swallows up the earth, so I am limiting the training data to 10,000 records. Obviously we could improve our score with a larger training set.

In [None]:
%%time

# Logistic regression (saga solver, elastic-net penalty), scored by 10-fold cross-validated ROC AUC.
# Only the first 10,000 training rows are used to keep the run time manageable.
lr_clf = LogisticRegression(max_iter=1000, solver='saga', penalty = 'elasticnet')
lr_parameters = {'l1_ratio':[1], 'C': [1]}
# refit=True (a real boolean, not the string 'True'): retrain the best configuration on all rows passed to fit().
lr_grid = GridSearchCV(lr_clf, lr_parameters, cv=10, refit=True, n_jobs=-1, verbose=1, scoring='roc_auc')
lr_grid.fit(X_train[0:10000], y_train[0:10000])

lr_model = lr_grid.best_estimator_

# Update the model scoreboard. pd.concat is used because DataFrame.append was deprecated
# and then removed from pandas.
lr_row = {'Model Type' : 'Logistic Regression',
          'AUC - 10xv' : lr_grid.best_score_,
          'AUC - Valid' : roc_auc_score(y_valid, lr_model.predict_proba(X_valid)[:, 1]),
          'Hyperparameters' : lr_grid.best_params_}
results = pd.concat([results, pd.DataFrame([lr_row])], ignore_index=True)
results

### Support Vector Machine
Once again we need to limit the amount of training data. SVM model runtime varies with the square of the number of training observations (so training on 2000 obs should take 4x as long as training on 1000, etc.) But SVM does have potential for training a 'wide' (many-featured) data set without using too many observations.


In [None]:
# Support vector classifier with probability estimates, scored by 10-fold cross-validated ROC AUC.
# Only the first 5,000 training rows are used to keep the run time manageable.
from sklearn.svm import SVC
sv_clf = SVC(probability = True)
sv_parameters = {
    'kernel' : ['linear'],   # 'sigmoid', 'poly', 'rbf'],
    'gamma' : [0.3],
    'C' : [0.01]
}
# refit=True (a real boolean, not the string 'True'): retrain the best configuration on all rows passed to fit().
sv_grid = GridSearchCV(sv_clf, sv_parameters, cv=10, refit=True, n_jobs=-1, verbose=1, scoring='roc_auc')
sv_grid.fit(X_train[0:5000], y_train[0:5000])
sv_model = sv_grid.best_estimator_

# Update the model scoreboard. pd.concat is used because DataFrame.append was deprecated
# and then removed from pandas.
sv_row = {'Model Type' : 'Support Vector Machine',
          'AUC - 10xv' : sv_grid.best_score_,
          'AUC - Valid' : roc_auc_score(y_valid, sv_model.predict_proba(X_valid)[:, 1]),
          'Hyperparameters' : sv_grid.best_params_}
results = pd.concat([results, pd.DataFrame([sv_row])], ignore_index=True)
gc.collect()
results

### Random Forest
This is our best performing model so far.

In [None]:
%%time
# Random forest (100 trees), scored by 10-fold cross-validated ROC AUC on the full training set.
rf_clf = RandomForestClassifier(random_state=1, n_estimators=100)
rf_parameters = {'max_depth': [32],  'min_samples_leaf': [34]}        # results from previous run - save a little time
# refit=True (a real boolean, not the string 'True'): retrain the best configuration on all rows passed to fit().
rf_grid = GridSearchCV(rf_clf, rf_parameters, cv=10, refit=True, n_jobs=-1, verbose=1, scoring='roc_auc')
rf_grid.fit(X_train, y_train)
rf_model = rf_grid.best_estimator_

# Update the model scoreboard. pd.concat is used because DataFrame.append was deprecated
# and then removed from pandas.
rf_row = {'Model Type' : 'Random Forest',
          'AUC - 10xv' : rf_grid.best_score_,
          'AUC - Valid' : roc_auc_score(y_valid, rf_model.predict_proba(X_valid)[:, 1]),
          'Hyperparameters' : rf_grid.best_params_}
results = pd.concat([results, pd.DataFrame([rf_row])], ignore_index=True)
gc.collect()
results

### Decision Tree

In [None]:
%%time
# Single decision tree, scored by 10-fold cross-validated ROC AUC on the full training set.
dt_clf = DecisionTreeClassifier(random_state=1)
dt_parameters = {
    'max_depth': [12],
    'min_samples_leaf': [8]
}

# refit=True (a real boolean, not the string 'True'): retrain the best configuration on all rows passed to fit().
dt_grid = GridSearchCV(dt_clf, dt_parameters, cv=10, refit=True, n_jobs=-1, verbose=0, scoring='roc_auc')
dt_grid.fit(X_train, y_train)

dt_model = dt_grid.best_estimator_

# Update the model scoreboard. pd.concat is used because DataFrame.append was deprecated
# and then removed from pandas.
dt_row = {'Model Type' : 'Decision Tree',
          'AUC - 10xv' : dt_grid.best_score_,
          'AUC - Valid' : roc_auc_score(y_valid, dt_model.predict_proba(X_valid)[:, 1]),
          'Hyperparameters' : dt_grid.best_params_}
results = pd.concat([results, pd.DataFrame([dt_row])], ignore_index=True)
gc.collect()
results

### XG Boost

In [None]:
%%time
# XGBoost classifier, scored by 10-fold cross-validated ROC AUC on the full training set.
XGB_clf = XGBClassifier(objective='binary:logistic', use_label_encoder=False)
XGB_parameters = {
    'max_depth': [10],
    'n_estimators': [100],
    'learning_rate': [0.5]
}

XGB_grid = GridSearchCV(XGB_clf, XGB_parameters, cv=10, n_jobs=1, verbose=True, scoring= 'roc_auc')
XGB_grid.fit(X_train, y_train)

XGB_model = XGB_grid.best_estimator_

# Update the model scoreboard. pd.concat is used because DataFrame.append was deprecated
# and then removed from pandas.
XGB_row = {'Model Type' : 'XG Boost',
           'AUC - 10xv' : XGB_grid.best_score_,
           'AUC - Valid' : roc_auc_score(y_valid, XGB_model.predict_proba(X_valid)[:, 1]),
           'Hyperparameters' : XGB_grid.best_params_}
results = pd.concat([results, pd.DataFrame([XGB_row])], ignore_index=True)
gc.collect()
results

### Light GBM
Light Gradient Boost Machine runs fast and produces our best predictive model.

In [None]:
%%time
# LightGBM classifier, scored by 10-fold cross-validated ROC AUC on the full training set.
import lightgbm as lgb

# Fixed booster settings; max_depth and n_estimators are supplied through the grid below.
params = {'boosting_type': 'gbdt', 'objective': 'binary',                           # choices for boosting type are 'gbdt', 'rf', 'dart', 'goss'
          'nthread': -1, 'num_leaves': 35, 'learning_rate': 0.02,                   # default learning rate is 0.1 - lower numbers are working well
          'max_bin': 512, 'subsample_for_bin': 200, 'subsample': 0.88,
          'subsample_freq': 1, 'colsample_bytree': 0.8, 'reg_alpha': 20,
          'reg_lambda': 20, 'min_split_gain': 0.5, 'min_child_weight': 1,
          'min_child_samples': 10, 'scale_pos_weight': 11.5, 'num_class' : 1,       # about 92% target=0 to 8% target=1 - ratio is about 11.5 to 1
          'metric' : 'auc'
          }

LGB_clf = lgb.LGBMClassifier(**params)
LGB_parameters = {
    'max_depth': [15],
    'n_estimators': [2000]
}

LGB_grid = GridSearchCV(LGB_clf, LGB_parameters, cv=10, scoring= 'roc_auc')
LGB_grid.fit(X_train, y_train)
LGB_model = LGB_grid.best_estimator_

# Update the model scoreboard. pd.concat is used because DataFrame.append was deprecated
# and then removed from pandas.
LGB_row = {'Model Type' : 'Light GBM',
           'AUC - 10xv' : LGB_grid.best_score_,
           'AUC - Valid' : roc_auc_score(y_valid, LGB_model.predict_proba(X_valid)[:, 1]),
           'Hyperparameters' : LGB_grid.best_params_}
results = pd.concat([results, pd.DataFrame([LGB_row])], ignore_index=True)
gc.collect()
results

# Visualizing Model Outputs

### Plotting the ROC curve (for AUC score on validation data)
Good information on derivation of roc_auc: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html


In [None]:
# ROC curve of the LightGBM model on the validation data, with the AUC shown in the legend.
probabilities = LGB_model.predict_proba(X_valid)[:,1]     # predicted probability of class 1
auc = roc_auc_score(y_valid, probabilities)
fpr, tpr, thresholds = roc_curve(y_valid, probabilities)

plt.figure(figsize=(6,6))
plt.plot(fpr, tpr)                                        # the model's ROC curve (first line, gets the legend entry)
plt.plot([0, 1], [0, 1])                                  # 45 degree reference line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Receiver operating characteristic curve')
plt.legend(["AUC = %.6f"%auc])
plt.show()

### Feature Importance
AC_ratio (basically the term of the loan) is the most important feature, followed by age, length of time since ID changed, then the credit bureau scores. Creating the right engineered features was the key to a good performing model; the top 50 contains many of the features that we created.

In [None]:
# Bar chart of the 50 most important features of the LightGBM model.
#
# LGB_model is GridSearchCV's best_estimator_, which the grid search (default refit) has already
# fitted on the training data, so the model is not fitted again here.

# The model was trained on the preprocessor's OUTPUT columns: the numeric columns first, then one
# column per category level of each categorical column. Zipping the importances with the raw
# `features` list therefore mislabels everything after the numeric block, so the encoded column
# names are rebuilt from the fitted one-hot encoder instead.
onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
encoded_features = list(num_features)
for cat_column, levels in zip(cat_features, onehot.categories_):
    encoded_features.extend(f'{cat_column}_{level}' for level in levels)

importances = LGB_model.feature_importances_
if len(encoded_features) != len(importances):
    # Fail loudly rather than plot importances against the wrong names.
    raise ValueError(f"{len(encoded_features)} feature names for {len(importances)} importances - "
                     "the preprocessor output no longer matches the reconstructed names")

feature_imp = pd.DataFrame({'Value': importances, 'Feature': encoded_features}).sort_values(by="Value", ascending=False)
feature_imp = feature_imp.iloc[0:50,:]
plt.figure(figsize=[6,10])
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.barplot(x=feature_imp['Value'], y=feature_imp['Feature'], orient = "h", color = "lightsteelblue")
plt.title("Most important features (top 50)")
plt.show()

### Creating separation between classes

In [None]:
# Box plot of the predicted default probabilities on the validation data, one box per true class.
output_data = pd.DataFrame({'prediction' : LGB_model.predict_proba(X_valid)[:,1], 'target' : y_valid})
plt.figure(figsize=[12,3])
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.boxplot(x=output_data.prediction, y=output_data.target, orient = "h")
plt.title('Distribution of predicted probabilities - validation data')
plt.show()

# This is more like it - earlier models were showing that the '0' class had a 7% chance of default and the '1' class had a 9% chance, something like that.
# By adding the class weights, we have a lot more separation between the classes.

In [None]:
# Alternative view of the same separation: one density curve of the predicted probability per
# true class, each normalised on its own (common_norm=False).
valid_probability = LGB_model.predict_proba(X_valid)[:,1]
plt.figure(figsize=[12,3])
sns.kdeplot(x=valid_probability, hue=y_valid, common_norm=False, fill=True)
plt.xlim(0,1)
plt.xlabel('probability')
plt.title('KDE plots by of predicted probabilities by target - validation data')
plt.show()

### Confusion, recall, precision matrices
RECALL on the default class is much more important than precision or accuracy for this application, i.e. we need to flag as many of the customers who actually default as possible - if we predict that someone will pay, we need to be right! It is quite easy to make a model that achieves 90-92% "accuracy" simply because the targets are unbalanced; but that is not what we are trying to do.

In [None]:
# Confusion, recall and precision matrices for the LightGBM model on the validation data.
labels = ['Paid -0-','Default -1-']

# `boxdata` was referenced here without ever being defined; build it from the validation
# predictions so the cell runs on its own.
boxdata = pd.DataFrame({'prediction' : LGB_model.predict_proba(X_valid)[:,1], 'target' : y_valid})
# One of many ways to convert probabilities to 0/1 predictions: round at 0.5.
# Cast to int so the predicted labels have the same type as the targets.
boxdata['binpred'] = np.floor(boxdata['prediction']+0.5).astype(int)

CM = confusion_matrix(boxdata.target, boxdata.binpred)       # confusion matrix: rows = actual class, columns = predicted class
RM = (((CM.T)/(CM.sum(axis=1))).T)                           # recall matrix: each row divided by its row total
PM = (CM/CM.sum(axis=0))                                     # precision matrix: each column divided by its column total

plots = [CM, RM, PM]
plot_titles = ['Confusion Matrix', 'Recall Matrix', 'Precision Matrix']
formats = ["d", ".4f", ".4f"]                                # integer counts, then four-decimal rates
plt.figure(figsize=(12,4))
for position, (matrix, title, fmt) in enumerate(zip(plots, plot_titles, formats), start=1):
    plt.subplot(1, 3, position)
    sns.heatmap(matrix, annot=True, cmap='BuPu', fmt=fmt, xticklabels=labels, yticklabels=labels, cbar=False)
    plt.xlabel('Predicted Class')
    plt.ylabel('Orignal Class')
    plt.title(title)
plt.tight_layout()
plt.show()

### Store model results for outputs notebook

In [None]:
# Collect the validation targets and the LightGBM probabilities, add a 0/1 prediction
# (probability rounded at 0.5), and pickle the frame for the outputs notebook.
output_data = pd.DataFrame({'target': y_valid, 'prediction' : LGB_model.predict_proba(X_valid)[:,1]})
output_data = output_data.assign(binary=np.floor(output_data['prediction'] + 0.5))
output_data.head(5)
output_data.to_pickle("./output.pkl", compression='infer', storage_options=None)

# Final Model Selection - save data

In [None]:
# Run a garbage-collection pass before the (currently commented-out) final-model steps below.
gc.collect()

#final_model = lgb.LGBMClassifier(**params, n_estimators= 2000, max_depth = 15)
#final_model.fit(X_train, y_train)

#joblib.dump(preprocessor, 'default_preprocessor_08.joblib') 
#joblib.dump(final_model, 'default_model_08.joblib')
