# Credit Risk Ensemble Techniques

In [5]:
import warnings
# Silence every warning for the remainder of the notebook.
# NOTE(review): this blanket filter also hides any deprecation or copy warnings
# raised by later cells — consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

In [6]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [7]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [8]:
# Load the lending data from a path relative to the notebook's working directory
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(file_path)

# Preview the first rows as a quick sanity check of the columns and values
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


# Split the Data into Training and Testing

In [9]:
# Create our features: every column except the target
X = df.drop(columns=['loan_status'])

# Encode the categorical 'homeowner' column as integer codes.
# NOTE(review): the codes imply an ordering (mortgage < rent < own) — confirm
# that an ordinal encoding is intended for this feature.
home_owner_vals = {
    "own": 2,
    "rent": 1,
    "mortgage": 0
}
X["homeowner"] = X["homeowner"].map(home_owner_vals)
# .map() leaves NaN for any value missing from the dict; fail loudly instead
assert X["homeowner"].notna().all(), "unmapped value in 'homeowner'"

# Create our target. Take an explicit copy so the assignment below writes to an
# independent frame rather than to a selection of df (avoids pandas'
# SettingWithCopyWarning, which the global warnings filter would otherwise hide).
y = df[['loan_status']].copy()

# Encode 'loan_status' as a binary label: 0 = low_risk, 1 = high_risk
risk_vals = {
    "low_risk": 0,
    "high_risk": 1
}
y["loan_status"] = y["loan_status"].map(risk_vals)
assert y["loan_status"].notna().all(), "unmapped value in 'loan_status'"

In [10]:
# Summary statistics for the feature columns (including the encoded 'homeowner')
X.describe()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,0.901439,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,0.941637,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,0.0,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,0.0,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,1.0,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,2.0,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,2.0,105200.0,0.714829,16.0,3.0,75200.0


In [11]:
# Check the balance of our target values: row count per encoded class
# (0 = low_risk, 1 = high_risk, per the risk_vals mapping)
y['loan_status'].value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [12]:
# Partition the features and the encoded target into train and test subsets.
# random_state=1 fixes the shuffle so the split is reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y['loan_status'],
    random_state=1,
)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble Classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score.

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [20]:
# Fit a balanced random forest (100 trees, seed 1) on the training data,
# then predict labels for the held-out test features
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)
y_pred_rf = brf.predict(X_test)

In [21]:
# Balanced accuracy of the random forest predictions on the test set
balanced_accuracy_score(y_true=y_test, y_pred=y_pred_rf)

0.9926389529195168

In [15]:
# Confusion matrix for the random forest (rows: true class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

array([[18610,   155],
       [    4,   615]])

In [22]:
# Per-class imbalanced classification report for the random forest
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred_rf))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18765
          1       0.80      0.99      0.99      0.89      0.99      0.99       619

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [23]:
# Pair each importance score with its feature name, then rank highest first
importances_sorted = list(zip(brf.feature_importances_, X.columns))
importances_sorted.sort(reverse=True)
importances_sorted

[(0.2280047342967411, 'debt_to_income'),
 (0.19632641884659413, 'interest_rate'),
 (0.15872703541873587, 'borrower_income'),
 (0.13919243970577427, 'total_debt'),
 (0.1289276341870819, 'loan_size'),
 (0.11794256608223717, 'num_of_accounts'),
 (0.02867861606410628, 'derogatory_marks'),
 (0.00220055539872938, 'homeowner')]

### Easy Ensemble Classifier

In [24]:
# Fit the easy ensemble classifier (seed 1) on the training data,
# then predict labels for the held-out test features
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(random_state=1).fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)

In [25]:
# Balanced accuracy of the easy ensemble predictions on the test set
balanced_accuracy_score(y_true=y_test, y_pred=y_pred_eec)

0.993171859927244

In [26]:
# Confusion matrix for the easy ensemble (rows: true class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred_eec)

array([[18630,   135],
       [    4,   615]])

In [27]:
# Per-class imbalanced classification report for the easy ensemble
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred_eec))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18765
          1       0.82      0.99      0.99      0.90      0.99      0.99       619

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384

