In [123]:
import warnings
warnings.filterwarnings('ignore')

In [124]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [125]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Read the CSV and Perform Basic Data Cleaning

In [126]:
# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-

columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [127]:
# Load the data
file_path = Path(Path('../Resources/LoanStats_2019Q1.csv'))
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [128]:
df2 = df.copy()
df2 = pd.get_dummies(df2)
print(pd.get_dummies(df2))

       loan_amnt  int_rate  installment  annual_inc    dti  delinq_2yrs  \
0        10500.0    0.1719       375.35     66000.0  27.24          0.0   
1        25000.0    0.2000       929.09    105000.0  20.23          0.0   
2        20000.0    0.2000       529.88     56000.0  24.26          0.0   
3        10000.0    0.1640       353.55     92000.0  31.44          0.0   
4        22000.0    0.1474       520.39     52000.0  18.76          0.0   
...          ...       ...          ...         ...    ...          ...   
68812    10000.0    0.1502       346.76     26000.0   9.60          0.0   
68813    12000.0    0.2727       368.37     63000.0  29.07          0.0   
68814     5000.0    0.1992       185.62     52000.0  14.86          0.0   
68815    40000.0    0.0646      1225.24    520000.0   9.96          0.0   
68816    16000.0    0.1131       350.36     72000.0   7.02          2.0   

       inq_last_6mths  open_acc  pub_rec  revol_bal  ...  \
0                 0.0       8.0      0.

In [129]:
# converting the string values into numerical ones using the get_dummies() method.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2 = df.copy()
df2['home_ownership'] = le.fit_transform(df2['home_ownership'])
df2['verification_status'] = le.fit_transform(df2['verification_status'])
df2['hardship_flag'] = le.fit_transform(df2['hardship_flag'])
df2['debt_settlement_flag'] = le.fit_transform(df2['debt_settlement_flag'])
df2['pymnt_plan'] = le.fit_transform(df2['pymnt_plan'])
df2['initial_list_status'] = le.fit_transform(df2['initial_list_status'])
df2['application_type'] = le.fit_transform(df2['application_type'])



# Split the Data into Training and Testing

In [130]:
# Create our features
X = df2.copy()
X = X.drop("loan_status", axis=1)
X = X.drop("issue_d", axis=1)
X = X.drop("next_pymnt_d", axis=1)

X.head()


# Create our target
y = df2['loan_status']

In [131]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,1.812779,88213.71,0.669994,0.0,21.778153,0.217766,0.497697,...,95.057627,30.626217,0.125972,0.0,210033.2,61338.43,29734.128558,55722.4,0.0,0.0
std,10277.34859,0.04813,288.062432,0.941313,115580.0,0.719105,0.0,20.199244,0.718367,0.758122,...,8.326426,33.631463,0.336732,0.0,192808.8,57387.98,26795.394232,50958.45,0.0,0.0
min,1000.0,0.06,30.89,0.0,40.0,0.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,3600.0,235.0,100.0,127.0,0.0,0.0
25%,9000.0,0.0881,265.73,1.0,50000.0,0.0,0.0,13.89,0.0,0.0,...,93.0,0.0,0.0,0.0,66977.0,26503.0,11600.0,22880.0,0.0,0.0
50%,15000.0,0.118,404.56,1.0,73000.0,1.0,0.0,19.76,0.0,0.0,...,100.0,20.0,0.0,0.0,146710.0,45357.0,22100.0,42000.0,0.0,0.0
75%,24000.0,0.1557,648.1,3.0,104000.0,1.0,0.0,26.66,0.0,1.0,...,100.0,50.0,0.0,0.0,303640.0,76570.0,39300.0,72499.0,0.0,0.0
max,40000.0,0.3084,1676.23,3.0,8797500.0,2.0,0.0,999.0,18.0,5.0,...,100.0,100.0,4.0,0.0,3292782.0,1295455.0,509400.0,1426964.0,0.0,0.0


In [132]:
# Check the balance of our target values
Counter(y)

Counter({'low_risk': 68470, 'high_risk': 347})

In [133]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [134]:
#Scaler
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [135]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(random_state=1)
X_resampled, y_resampled 
brf.fit(X_resampled, y_resampled)


BalancedRandomForestClassifier(random_state=1)

In [136]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
confusion_matrix(y_test, y_pred)
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.6483102209893581

In [137]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = brf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[   30,    71],
       [    7, 17097]])

In [138]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.81      0.30      1.00      0.43      0.54      0.28       101
   low_risk       1.00      1.00      0.30      1.00      0.54      0.32     17104

avg / total       0.99      1.00      0.30      0.99      0.54      0.32     17205



In [144]:
pd.set_option('display.max_rows', 500) 
# List the features sorted in descending order by feature importance
importance = brf.feature_importances_
importance_df = pd.DataFrame(importance, index=X_test.columns, 
                      columns=["Importance"])
print (importance_df.sort_values("Importance", ascending = False))


                            Importance
last_pymnt_amnt               0.079739
total_rec_prncp               0.066804
total_rec_int                 0.064345
total_pymnt                   0.057071
total_pymnt_inv               0.053044
int_rate                      0.034447
mths_since_recent_inq         0.019876
installment                   0.019392
out_prncp                     0.019255
dti                           0.019138
annual_inc                    0.019019
out_prncp_inv                 0.018898
max_bal_bc                    0.016572
mths_since_recent_bc          0.016221
total_bal_ex_mort             0.016140
total_bc_limit                0.016052
bc_util                       0.015933
avg_cur_bal                   0.015929
revol_bal                     0.015899
mo_sin_old_il_acct            0.015006
bc_open_to_buy                0.015000
total_rec_late_fee            0.014889
il_util                       0.014715
all_util                      0.014203
total_bal_il             

### Easy Ensemble AdaBoost Classifier

In [151]:
# Train the EasyEnsembleClassifier
# YOUR CODE HERE
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=1) 

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [155]:
# Calculated the balanced accuracy score
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy Score : 0.995640802092415
Classification Report
              precision    recall  f1-score   support

   high_risk       0.81      0.34      0.48       101
    low_risk       1.00      1.00      1.00     17104

    accuracy                           1.00     17205
   macro avg       0.90      0.67      0.74     17205
weighted avg       1.00      1.00      0.99     17205



In [154]:
# Display the confusion matrix
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,34,67
Actual 1,8,17096


In [156]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.81      0.30      1.00      0.43      0.54      0.28       101
   low_risk       1.00      1.00      0.30      1.00      0.54      0.32     17104

avg / total       0.99      1.00      0.30      0.99      0.54      0.32     17205

