# Ensemble Learning

## Initial Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the data
file_path = Path('Resources/LoanStats_2019Q1.csv')
df = pd.read_csv(file_path)

# Preview the data
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


## Split the Data into Training and Testing

In [76]:
# Create our features
X = df.loc[:,df.columns != 'loan_status']

# Create our target
y = df['loan_status']

X.head()


Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,n,27.24,0.0,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,n,20.23,0.0,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,n,24.26,0.0,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,n,31.44,0.0,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,n,18.76,0.0,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [77]:
X['issue_d'] = pd.to_datetime(X['issue_d'])
X['next_pymnt_d'] = pd.to_datetime(X['next_pymnt_d'])

In [78]:
X.dtypes

loan_amnt                     float64
int_rate                      float64
installment                   float64
home_ownership                 object
annual_inc                    float64
                               ...   
total_bal_ex_mort             float64
total_bc_limit                float64
total_il_high_credit_limit    float64
hardship_flag                  object
debt_settlement_flag           object
Length: 85, dtype: object

In [79]:
# Check the balance of our target values
# YOUR CODE HERE
y.value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [80]:
# Create X_train, X_test, y_train, y_test
# YOUR CODE HERE

# Import train_test_split
from sklearn.model_selection import train_test_split

# Split features (independent variables) and target (dependent variable or label)
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,stratify=y)


## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

In [81]:
df.dtypes[df.dtypes != 'int64'][df.dtypes != 'float64']

home_ownership          object
verification_status     object
issue_d                 object
loan_status             object
pymnt_plan              object
initial_list_status     object
next_pymnt_d            object
application_type        object
hardship_flag           object
debt_settlement_flag    object
dtype: object

In [39]:
# Encode categorical features - home_ownership, verification_status, pymnt_plan, initial_list_status, application_type, hardship_flag, debt_settlement_flag
X_train_binary_encoded_not_scaled = pd.get_dummies(X_train, columns=['home_ownership', 'verification_status','pymnt_plan', 'initial_list_status', 'application_type', 'hardship_flag', 'debt_settlement_flag'])
X_test_binary_encoded_not_scaled = pd.get_dummies(X_test, columns=['home_ownership', 'verification_status', 'pymnt_plan', 'initial_list_status', 'application_type', 'hardship_flag', 'debt_settlement_flag'])
X_train_binary_encoded_not_scaled.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,issue_d,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
50341,20150.0,0.234,784.21,52341.0,2019-01-01,13.44,2.0,0.0,9.0,0.0,...,1,0,0,1,0,1,0,1,1,1
30598,10000.0,0.118,331.19,25008.0,2019-02-01,25.86,0.0,1.0,5.0,0.0,...,1,0,0,1,0,1,1,0,1,1
57521,20000.0,0.1797,507.55,312000.0,2019-01-01,1.95,0.0,0.0,7.0,0.0,...,0,1,0,1,0,1,1,0,1,1
61253,25000.0,0.0881,792.79,155000.0,2019-01-01,12.37,0.0,0.0,17.0,0.0,...,1,0,0,1,0,1,1,0,1,1
49711,8000.0,0.1614,281.81,55000.0,2019-01-01,18.31,1.0,0.0,11.0,0.0,...,0,1,0,1,0,1,1,0,1,1


In [40]:
# Create the StandardScaler instance

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [49]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
X_train_binary_encoded_not_scaled.drop(columns=['issue_d','next_pymnt_d'],inplace=True)
X_test_binary_encoded_not_scaled.drop(columns=['issue_d','next_pymnt_d'],inplace=True)

scaler.fit(X_train_binary_encoded_not_scaled)

StandardScaler()

In [50]:
# Scale the training and testing data

X_train_binary_encoded_scaled = scaler.transform(X_train_binary_encoded_not_scaled)
X_train_binary_encoded_scaled[1]

array([-0.64915425, -0.20108794, -0.51844213, -0.59539713,  0.20717985,
       -0.30457062,  0.66143759, -1.26114326, -0.37237518, -0.4559953 ,
       -1.0295106 , -0.64781192, -0.64737075, -0.25906093, -0.25888438,
       -0.17159883, -0.44589623, -0.03124755,  0.        ,  0.        ,
       -0.22433418, -0.12082237,  0.        ,  0.        , -0.10539838,
       -0.60559976, -0.87190737, -0.77494985, -0.86941438, -1.2359299 ,
        1.12588831, -0.60944762, -0.100243  ,  0.51809581,  0.14884482,
       -0.66597332,  0.07225796, -0.69127454, -0.25048892, -0.59318152,
       -0.08003456, -0.26476695, -0.13964132, -0.48813882, -0.44547246,
       -0.07981652, -0.00795014,  0.07936986, -0.48703155, -0.30228522,
        0.28798193,  0.89390146, -0.48894798, -0.60860002, -0.33469681,
       -0.73209196, -0.76168213, -0.9680588 , -0.96128911, -0.59479957,
       -1.11221311, -1.18296393, -0.76274795, -1.25890046,  0.        ,
        0.        , -0.13504257, -0.11639641,  0.59253457, -0.90

## Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Display the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [67]:
# Resample the training data with the BalancedRandomForestClassifier

from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

brf.fit(X_train_binary_encoded_scaled, y_train)
y_pred_rf = brf.predict(X_test_binary_encoded_not_scaled)


In [68]:
# Calculated the balanced accuracy score

balanced_accuracy_score(y_test, y_pred_rf)

0.5187803924886487

In [69]:
# Display the confusion matrix

confusion_matrix(y_test, y_pred_rf)

array([[   83,     4],
       [15688,  1430]], dtype=int64)

In [70]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test, y_pred_rf))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.95      0.08      0.01      0.28      0.09        87
   low_risk       1.00      0.08      0.95      0.15      0.28      0.07     17118

avg / total       0.99      0.09      0.95      0.15      0.28      0.07     17205



In [71]:
# List the features sorted in descending order by feature importance

importances_df = pd.DataFrame(sorted(zip(brf.feature_importances_, X.columns), reverse=True))
importances_df.set_index(importances_df[1], inplace=True)
importances_df.drop(columns=1, inplace=True)
importances_df.rename(columns={0: 'Feature Importances'}, inplace=True)
importances_sorted = importances_df.sort_values(by='Feature Importances', ascending=False)
importances_sorted[:40]


Unnamed: 0_level_0,Feature Importances
1,Unnamed: 1_level_1
initial_list_status,0.081373
total_acc,0.070706
total_rec_prncp,0.060342
out_prncp,0.052668
revol_bal,0.050688
int_rate,0.027063
annual_inc,0.021755
num_tl_op_past_12m,0.019384
max_bal_bc,0.018434
open_il_12m,0.017766


### Easy Ensemble Classifier

In [72]:
# Train the Classifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=base_estimator)
eec.fit(X_train_binary_encoded_scaled, y_train)
y_pred_eec = eec.predict(X_test_binary_encoded_not_scaled)

In [73]:
# Calculated the balanced accuracy score

y_pred_eec = brf.predict(X_test_binary_encoded_not_scaled)
balanced_accuracy_score(y_test, y_pred_eec)

0.5187803924886487

In [74]:
# Display the confusion matrix

confusion_matrix(y_test, y_pred_eec)

array([[   83,     4],
       [15688,  1430]], dtype=int64)

In [75]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test, y_pred_eec))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.95      0.08      0.01      0.28      0.09        87
   low_risk       1.00      0.08      0.95      0.15      0.28      0.07     17118

avg / total       0.99      0.09      0.95      0.15      0.28      0.07     17205



### Final Questions

1. Which model had the best balanced accuracy score?

    YOUR ANSWER HERE.
    
    #### Both the models have same balanced accuracy score of 0.5187

2. Which model had the best recall score?

    YOUR ANSWER HERE.
    
    #### Both the models have same recall score of 0.09

3. Which model had the best geometric mean score?

    YOUR ANSWER HERE.
    
     #### Both the models have same geometric mean score of 0.28

4. What are the top three features?

    YOUR ANSWER HERE.
    
    #### initial_list_status, total_acc, total_rec_prncp are 3 best features