## Modeling

In [1]:
import pandas as pd
import numpy as np
import zipfile
# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
# import plotly.offline as po
# import plotly.graph_objs as go

# For models
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, 
    roc_auc_score, roc_curve, auc, precision_recall_curve,
    plot_confusion_matrix, plot_roc_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
loan = pd.read_csv('loan.csv')

loan.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,term,int_rate,sub_grade,home_ownership,annual_inc,verification_status,purpose,addr_state,...,pub_rec,revol_bal,revol_util,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,loan_status_flag,fico,earliest_cr_line_y
0,0,5000.0,36,10.65,B2,RENT,24000.0,Verified,credit_card,AZ,...,0.0,13648.0,83.7,f,Individual,2.0,0.0,0.0,737.0,36
1,1,2500.0,60,15.27,C4,RENT,30000.0,Source Verified,car,GA,...,0.0,1687.0,9.4,f,Individual,2.0,0.0,1.0,742.0,22
2,2,2400.0,36,15.96,C5,RENT,12252.0,Not Verified,small_business,IL,...,0.0,2956.0,98.5,f,Individual,2.0,0.0,0.0,737.0,20
3,3,10000.0,36,13.49,C1,RENT,49200.0,Source Verified,other,CA,...,0.0,5598.0,21.0,f,Individual,2.0,0.0,0.0,692.0,25
4,4,3000.0,60,12.69,B5,RENT,80000.0,Source Verified,other,OR,...,0.0,27783.0,53.9,f,Individual,2.0,0.0,0.0,697.0,25


In [3]:
loan.columns

Index(['Unnamed: 0', 'loan_amnt', 'term', 'int_rate', 'sub_grade',
       'home_ownership', 'annual_inc', 'verification_status', 'purpose',
       'addr_state', 'dti', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'initial_list_status', 'application_type', 'mort_acc',
       'pub_rec_bankruptcies', 'loan_status_flag', 'fico',
       'earliest_cr_line_y'],
      dtype='object')

In [4]:
# Drop the leftover CSV index column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the column is already
# gone no longer raises a KeyError.
loan = loan.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
loan.shape

(746121, 21)

## **Create One-hot encoding for categorical features**

In [6]:
#Find all the categorical variables
dummy = [column for column in loan.columns if loan[column].dtype == object]
dummy

['sub_grade',
 'home_ownership',
 'verification_status',
 'purpose',
 'addr_state',
 'initial_list_status',
 'application_type']

In [7]:
# One-hot encode every column listed in `dummy`. drop_first=True emits k-1 indicator
# columns for a feature with k levels, dropping the first level as the implicit baseline.
# NOTE: this rebinds `loan`; re-running the cell without reloading the data fails because
# the columns named in `dummy` no longer exist after encoding.
loan = pd.get_dummies(loan, columns=dummy, drop_first=True)

In [8]:
# plt.figure(figsize=(7,16))
# loan.corr().loan_status_flag.sort_values()[:-1].plot.barh()
# plt.title('Correlation of Charged Off with Features')

## **Train Test Split**

In [9]:
# Separate the binary target from the feature matrix.
y = loan['loan_status_flag']
X = loan.drop(columns='loan_status_flag')

In [10]:
y.mean()

0.19900257464942014

The ratio of negative to positive samples is about 4:1. After splitting the data, we will also build a downsampled version of the training set.

In [11]:
# Hold out 20% of the rows as the test set. stratify=y keeps the class ratio the same in
# both splits, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [12]:
type(X_train)

pandas.core.frame.DataFrame

In [13]:
Xy_train = pd.concat([X_train, y_train], axis = 1)

In [14]:
DOWN_SAMPLE_RATE=0.25

In [15]:
# Build a roughly balanced training set: keep every positive row (label 1) and a random
# DOWN_SAMPLE_RATE fraction of the negative rows (label 0).
# random_state is fixed (same seed as the train/test split) so the sampled rows, and
# therefore every metric reported for the downsampled models, are reproducible.
negatives_sampled = Xy_train[Xy_train['loan_status_flag'] == 0].sample(
    frac=DOWN_SAMPLE_RATE, random_state=42
)
positives = Xy_train[Xy_train['loan_status_flag'] == 1]
Xy_train_down = pd.concat([negatives_sampled, positives], axis=0)

In [16]:
Xy_train_down.loan_status_flag.mean()

0.49843902111517674

The label ratio is 1:1 now.

In [17]:
# Split the downsampled frame back into target and features.
y_train_down = Xy_train_down['loan_status_flag']
X_train_down = Xy_train_down.drop(columns='loan_status_flag')

In [18]:
# Report the shape of every split; labels are kept verbatim so the printed text is unchanged.
shape_report = [
    ("X_train.shape: ", X_train),
    ("y_train.shape: ", y_train),
    ("X_train_down.shape: ", X_train_down),
    ("y_train_down.shape: ", y_train_down),
    ("X_test.shape:  ", X_test),
    ("y_test.shape:  ", y_test),
]
for label, data in shape_report:
    print(label, data.shape)

X_train.shape:  (596896, 117)
y_train.shape:  (596896,)
X_train_down.shape:  (238312, 117)
y_train_down.shape:  (238312,)
X_test.shape:   (149225, 117)
y_test.shape:   (149225,)


# **Models Building**

First, define a helper that reports ROC-AUC and PR-AUC (on both the training and the validation data), together with the classification report and confusion matrix on the validation data.

In [19]:
def _pr_auc(y_true, pos_score):
    """Area under the precision-recall curve for positive-class scores."""
    precision, recall, _ = precision_recall_curve(y_true, pos_score)
    return auc(recall, precision)


def output(model, xtrain, ytrain, xvalid, yvalid):
    """Print and return ranking metrics for an already-fitted binary classifier.

    Prints ROC-AUC and PR-AUC for both data sets, followed by the classification
    report and confusion matrix computed from the hard predictions on the
    validation data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    xtrain, ytrain : features and labels the model is scored on as "train".
    xvalid, yvalid : features and labels the model is scored on as "validation".

    Returns
    -------
    tuple
        ``(train_roc_auc, valid_roc_auc, train_pr_auc, valid_pr_auc)``.
    """
    # Column 1 of predict_proba holds the positive-class probability.
    train_score = model.predict_proba(xtrain)[:, 1]
    valid_score = model.predict_proba(xvalid)[:, 1]
    # Hard labels are only needed for the validation report, so the (previously unused)
    # predict() pass over the training set is skipped.
    valid_pred = model.predict(xvalid)

    # calculate roc-auc and pr-auc
    train_roc_auc = roc_auc_score(ytrain, train_score, average='micro')
    valid_roc_auc = roc_auc_score(yvalid, valid_score, average='micro')
    train_pr_auc = _pr_auc(ytrain, train_score)
    valid_pr_auc = _pr_auc(yvalid, valid_score)

    # report
    print(f'ROC-AUC on train data {train_roc_auc:.4f} and on validation data {valid_roc_auc:.4f}')
    print(f'PR-AUC on train data {train_pr_auc:.4f} and on validation data {valid_pr_auc:.4f}')
    print('----------')
    print('Classification Report:\n',classification_report(yvalid, valid_pred))
    print('----------')
    print('Confusion Matrix:\n', confusion_matrix(yvalid, valid_pred))
    return (train_roc_auc,valid_roc_auc,train_pr_auc, valid_pr_auc)

In [20]:
# prediction_model_list = ['Logistic Regression', 'Random Forest Classifier','XGBoost Classifier']
# cols = pd.MultiIndex.from_product([['ROC Score','Accuracy Score'],['Train','Test']])
# score = pd.DataFrame(columns=cols, index=prediction_model_list)

## 1. Logistic Regression

### 1.1 Naive logistic regression

In [21]:
# lr_v0: no scaler
lr_v0 = LogisticRegression(max_iter=1000)
lr_v0.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [22]:
perf_lr_v0 = output(lr_v0, X_train, y_train, X_test, y_test) 

ROC-AUC on train data 0.6854 and on validation data 0.6878
PR-AUC on train data 0.3435 and on validation data 0.3452
----------
Classification Report:
               precision    recall  f1-score   support

         0.0       0.81      0.99      0.89    119529
         1.0       0.51      0.04      0.08     29696

    accuracy                           0.80    149225
   macro avg       0.66      0.52      0.49    149225
weighted avg       0.75      0.80      0.73    149225

----------
Confusion Matrix:
 [[118251   1278]
 [ 28366   1330]]


### 1.2 MinMaxScaler to normalize the features

Next, we try to use the `MinMaxScaler` to normalize the data before feeding them into the model

In [23]:
# we need to use normalized train data in Logistic Regression
scaler = MinMaxScaler()
X_train_n = scaler.fit_transform(X_train)
X_test_n = scaler.transform(X_test)

In [24]:
lr_v1 = LogisticRegression(max_iter=1000)
lr_v1.fit(X_train_n, y_train)

LogisticRegression(max_iter=1000)

In [25]:
perf_lr_v1 = output(lr_v1, X_train_n, y_train, X_test_n, y_test) 

ROC-AUC on train data 0.6985 and on validation data 0.7004
PR-AUC on train data 0.3582 and on validation data 0.3592
----------
Classification Report:
               precision    recall  f1-score   support

         0.0       0.81      0.99      0.89    119529
         1.0       0.56      0.03      0.06     29696

    accuracy                           0.80    149225
   macro avg       0.68      0.51      0.48    149225
weighted avg       0.76      0.80      0.73    149225

----------
Confusion Matrix:
 [[118735    794]
 [ 28704    992]]


The ROC-AUC and PR-AUC both saw improvement here.

In [26]:
# plot_confusion_matrix(
#     lr_v1, X_test_n, y_test, cmap='Oranges',normalize='true',
#     display_labels=['Fully Paid','Charged Off'])
# plt.title('Confusion Matrix of Logistic Regression')

### 1.3 Downsampling

In [27]:
lr_v2 = LogisticRegression(max_iter=1000)
lr_v2.fit(X_train_down, y_train_down)

LogisticRegression(max_iter=1000)

In [28]:
perf_lr_v2 = output(lr_v2, X_train_down, y_train_down, X_test, y_test) 

ROC-AUC on train data 0.6826 and on validation data 0.6853
PR-AUC on train data 0.6639 and on validation data 0.3422
----------
Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.66      0.75    119529
         1.0       0.31      0.61      0.41     29696

    accuracy                           0.65    149225
   macro avg       0.59      0.63      0.58    149225
weighted avg       0.76      0.65      0.68    149225

----------
Confusion Matrix:
 [[79135 40394]
 [11704 17992]]


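Downsampling doesn't improve ROC-AUC or PR-AUC on the test set; it mainly trades precision for recall on the positive class (recall rises from 0.04 to 0.61 while precision drops from 0.51 to 0.31). It may need more tuning on the downsampling rate.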

## 2. Random Forest Classifier
### 2.1 Naive RF

In [29]:
# Baseline random forest with default hyper-parameters on the full training set.
# random_state is fixed so the fitted forest, and the metrics reported for it, are
# reproducible across runs.
rf_v0 = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_v0.fit(X_train, y_train)

RandomForestClassifier(n_jobs=-1)

In [30]:
perf_rf_v0 = output(rf_v0, X_train, y_train, X_test, y_test)

ROC-AUC on train data 1.0000 and on validation data 0.7009
PR-AUC on train data 1.0000 and on validation data 0.3648
----------
Classification Report:
               precision    recall  f1-score   support

         0.0       0.81      0.99      0.89    119529
         1.0       0.56      0.05      0.10     29696

    accuracy                           0.80    149225
   macro avg       0.68      0.52      0.49    149225
weighted avg       0.76      0.80      0.73    149225

----------
Confusion Matrix:
 [[118290   1239]
 [ 28137   1559]]


### 2.2 Downsampling RF

In [31]:
# Same default random forest, trained on the downsampled training set.
# random_state is fixed so the fitted forest, and the metrics reported for it, are
# reproducible across runs.
rf_v1 = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_v1.fit(X_train_down, y_train_down)

RandomForestClassifier(n_jobs=-1)

In [32]:
rf_v1.n_estimators, rf_v1.min_samples_leaf

(100, 1)

In [33]:
perf_rf_v1 = output(rf_v1, X_train_down, y_train_down, X_test, y_test)

ROC-AUC on train data 1.0000 and on validation data 0.7033
PR-AUC on train data 1.0000 and on validation data 0.3602
----------
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.64      0.74    119529
         1.0       0.31      0.66      0.42     29696

    accuracy                           0.64    149225
   macro avg       0.60      0.65      0.58    149225
weighted avg       0.77      0.64      0.68    149225

----------
Confusion Matrix:
 [[76247 43282]
 [10070 19626]]


## 3. XGBoost 

### Naive run

In [34]:
xgb_v0= XGBClassifier(use_label_encoder=False)
xgb_v0.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [35]:
perf_xgb_v0 = output(xgb_v0, X_train, y_train, X_test, y_test)

ROC-AUC on train data 0.7452 and on validation data 0.7172
PR-AUC on train data 0.4325 and on validation data 0.3812
----------
Classification Report:
               precision    recall  f1-score   support

         0.0       0.81      0.98      0.89    119529
         1.0       0.55      0.08      0.14     29696

    accuracy                           0.80    149225
   macro avg       0.68      0.53      0.51    149225
weighted avg       0.76      0.80      0.74    149225

----------
Confusion Matrix:
 [[117579   1950]
 [ 27326   2370]]


### 3.1 downsampling

In [36]:
xgb_v1= XGBClassifier(use_label_encoder=False)
xgb_v1.fit(X_train_down, y_train_down)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [37]:
perf_xgb_v1 = output(xgb_v1, X_train_down, y_train_down, X_test, y_test)

ROC-AUC on train data 0.7563 and on validation data 0.7145
PR-AUC on train data 0.7469 and on validation data 0.3769
----------
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.64      0.74    119529
         1.0       0.32      0.67      0.43     29696

    accuracy                           0.65    149225
   macro avg       0.60      0.66      0.59    149225
weighted avg       0.77      0.65      0.68    149225

----------
Confusion Matrix:
 [[76664 42865]
 [ 9813 19883]]


### 3.2 Hyper-parameter tuning
Let's try some model tuning using randomized search cv

In [60]:
# from scipy import stats
# param_grid = dict(
#     n_estimators=stats.randint(10, 500),
#     max_depth=stats.randint(1, 10),
#     learning_rate=stats.uniform(0, 1),
#     subsample=stats.uniform(0.5, 0.5)  # uniform(loc, scale) samples [loc, loc + scale]; XGBoost requires subsample in (0, 1]
# )

# xgb_clf = XGBClassifier(use_label_encoder=False)
# xgb_cv = RandomizedSearchCV(
#     xgb_clf, param_grid, cv=3, n_iter=60, 
#     scoring='roc_auc', n_jobs=-1, verbose=1
# )
# xgb_cv.fit(X_train, y_train)

# best_params = xgb_cv.best_params_
# # best_params['tree_method'] = 'gpu_hist'
# # # best_params = {'n_estimators': 50, 'tree_method': 'gpu_hist'}
# print(f"Best Parameters: {best_params}")

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 69.7min finished


Best Parameters: {'learning_rate': 0.3511335924855855, 'max_depth': 3, 'n_estimators': 219, 'subsample': 0.8469015987333157}


In [38]:
# Best Parameters: {'learning_rate': 0.3511335924855855, 'max_depth': 3, 'n_estimators': 219, 'subsample': 0.8469015987333157}

In [42]:
# Refit on the full training set using the hyper-parameters recorded from the randomized search.
tuned_params = {
    'learning_rate': 0.3511335924855855,
    'max_depth': 3,
    'n_estimators': 219,
    'subsample': 0.8469015987333157,
}
xgb_v2 = XGBClassifier(**tuned_params)
xgb_v2.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3511335924855855, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=219, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.8469015987333157, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [43]:
perf_xgb_v2 = output(xgb_v2, X_train, y_train, X_test, y_test)

ROC-AUC on train data 0.7256 and on validation data 0.7179
PR-AUC on train data 0.3978 and on validation data 0.3838
----------
Classification Report:
               precision    recall  f1-score   support

         0.0       0.81      0.99      0.89    119529
         1.0       0.56      0.08      0.13     29696

    accuracy                           0.80    149225
   macro avg       0.68      0.53      0.51    149225
weighted avg       0.76      0.80      0.74    149225

----------
Confusion Matrix:
 [[117737   1792]
 [ 27440   2256]]


# Summary

In [44]:
perf = [perf_lr_v0, perf_lr_v1, perf_lr_v2, perf_rf_v0, perf_rf_v1, perf_xgb_v0, perf_xgb_v1, perf_xgb_v2]

In [47]:
# One row per fitted model (same order as `perf`), one column per metric returned by output().
metric_names = ['train_roc_auc','valid_roc_auc','train_pr_auc', 'valid_pr_auc']
model_names = ['lr_v0', 'lr_v1', 'lr_v2', 'rf_v0', 'rf_v1', 'xgb_v0', 'xgb_v1', 'xgb_v2']
df_summary = pd.DataFrame(perf, index=model_names, columns=metric_names)
df_summary

Unnamed: 0,train_roc_auc,valid_roc_auc,train_pr_auc,valid_pr_auc
lr_v0,0.685445,0.687843,0.343481,0.345185
lr_v1,0.698549,0.700377,0.358174,0.359181
lr_v2,0.682626,0.685333,0.663921,0.342192
rf_v0,1.0,0.700888,1.0,0.364798
rf_v1,1.0,0.703348,1.0,0.360214
xgb_v0,0.745182,0.717167,0.432497,0.381197
xgb_v1,0.756257,0.714499,0.746923,0.376853
xgb_v2,0.725642,0.717896,0.397794,0.383788


`xgb_v2` has the best validation ROC-AUC and PR-AUC, although its margin over the untuned `xgb_v0` is small (0.7179 vs. 0.7172 ROC-AUC, 0.3838 vs. 0.3812 PR-AUC).