# **Lenders Club ML Loan Processor**

# Imports
##### 1. Import Libraries
##### 2. Import Data

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.metrics as metrics

### Import Data

In [2]:
# Load the cleaned dataset written by the previous notebook
cleaned_data_path = './data/cleaned_df.csv'
df = pd.read_csv(cleaned_data_path)


# Model Building

In [3]:
# Baseline accuracy: the fraction of loans whose status is good (loan_status == 1)
good_loan_count = int((df['loan_status'] == 1).sum())
baseline = good_loan_count / len(df)
baseline

0.8722750321985419

In [4]:
# Separate the target (y) from the feature matrix (X)
y = df['loan_status']
X = df.drop('loan_status', axis=1)

In [5]:
# Hold out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=23,
)

# Production Model Fitting and Insights

In [6]:
# Configure the gradient boosting classifier from an explicit parameter set
gb_params = {'learning_rate': 0.125, 'max_depth': 2, 'n_estimators': 100}
gb = GradientBoostingClassifier(**gb_params)

In [7]:
# Fit the classifier on the training split only; the test split stays unseen
gb.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.125, max_depth=2)

In [8]:
# Mean 5-fold cross-validation score on the training split (the classifier's
# default scorer, i.e. accuracy).
# n_jobs=-1 fits the folds in parallel on all available cores; fitting them
# one after another is slow for this model.
cvs = cross_val_score(gb, X_train, y_train, cv=5, n_jobs=-1).mean()

KeyboardInterrupt: 

In [None]:
# GB feature importance, keyed by feature name (zero-importance features dropped).
# Names are taken from X_train -- the columns the model was actually fit on --
# rather than df.columns, which still contains the 'loan_status' target and
# would pair every feature after it with the wrong importance.
feat_imp = {
    name: importance
    for name, importance in zip(X_train.columns, gb.feature_importances_)
    if importance != 0
}

In [None]:
# Predicted class probabilities for the test set, one column per class;
# later cells read column 1 as the probability that a loan is good
pred_proba = gb.predict_proba(X_test)

In [None]:
# Predicted class labels for the test set
preds = gb.predict(X_test)

# Analyzing Model
##### 1. Analyze classification metrics for original model
##### 2. Create Resulting dataframe of test data for further analysis of model
##### 3. Create Dataframe of middle of the road loans that may have to be analyzed further by human underwriter
##### 4. Create Dataframe of "solid decision" loans to analyze whether they can be safely determined by the model
##### 5. Adjust model to decrease Type 1 errors

### Analyze classification metrics for original model
Using `sklearn.metrics`, compute the confusion matrix and the accuracy, recall, specificity, precision, and F1 scores, and plot several graphs to visualize the results.

In [None]:
# Histogram of the predicted probability that each test loan is good
good_proba = pred_proba[:, 1]
tick_positions = np.arange(0, 1.05, .05)

fig, ax = plt.subplots(figsize=(15, 10), facecolor='white')
ax.hist(good_proba, bins=20, color='#FEC683', edgecolor='#262F72')
ax.set_title('Distribution of Prediction Probabilities', loc='center', size=35, pad=20)
ax.set_xlabel('Predicted Probability Loan is Good', size=25)
ax.set_ylabel('Number of Loans', size=25)
ax.set_xticks(tick_positions)
fig.tight_layout();

In [None]:
# Fraction of test loans whose predicted probability of being good is >= .95
high_confidence = pred_proba[:, 1] >= .95
float(high_confidence.mean())

In [None]:
# Confusion matrix dataframe
def conf_mat(actual,pred):
    """Return the binary loan confusion matrix as a labeled DataFrame.

    Parameters
    ----------
    actual : array-like of true loan statuses (0 = bad, 1 = good).
    pred : array-like of predicted loan statuses (0 = bad, 1 = good).

    Returns
    -------
    pandas.DataFrame
        2x2 table with rows 'actual bad' / 'actual good' and columns
        'pred bad' / 'pred good'.
    """
    # Pin the label order so the matrix is always 2x2, even when a subset of
    # loans contains only one class in `actual` and `pred`.
    matrix = metrics.confusion_matrix(actual, pred, labels=[0, 1])
    return pd.DataFrame(
        matrix,
        columns=['pred bad', 'pred good'],
        index=['actual bad', 'actual good'],
    )
# Confusion matrix of the main model's predictions on the test set
conf_mat(y_test, preds)

In [None]:
# Create function to plot confusion matrix
def plot_cm(actual, pred):
    """Plot the binary loan confusion matrix as an annotated heatmap.

    Parameters
    ----------
    actual : array-like of true loan statuses (0 = bad, 1 = good).
    pred : array-like of predicted loan statuses (0 = bad, 1 = good).

    Draws a new matplotlib figure; returns nothing.
    """
    class_names = ['Bad', 'Good']
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent
    matrix = metrics.confusion_matrix(actual, pred, labels=[0, 1])

    fig = plt.figure(figsize=(15,10), facecolor='white')
    ax = fig.add_subplot(111)
    cax = ax.matshow(matrix, cmap='Blues')
    ax.set_title('Loan Confusion Matrix', size=35, pad=25)
    ax.set_xlabel('Predictions', size=25, labelpad=15)
    ax.set_ylabel('Actual', size=25, labelpad=15)

    # Fix the tick positions before labelling them, instead of padding the
    # label list with '' and relying on where the default locator puts ticks.
    tick_positions = list(range(len(class_names)))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(class_names, size=15)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(class_names, size=15)
    ax.xaxis.set_ticks_position('bottom')

    # matrix[row][col] is drawn at x=col, y=row. Cells holding at least half
    # of all predictions get the lighter text color.
    half_of_predictions = len(pred) / 2
    for row, counts in enumerate(matrix):
        for col, count in enumerate(counts):
            text_color = '#D87702' if count < half_of_predictions else '#FCC274'
            ax.text(col, row, count, color=text_color, size=35, va='center', ha='center')
    fig.colorbar(cax)
    
# Plot the confusion matrix of the main model's predictions on the test set
plot_cm(y_test,preds)

In [None]:
# ROC curve on the test set, with the diagonal drawn as the baseline
fpr, tpr, _ = metrics.roc_curve(y_test, pred_proba[:, 1])

roc_fig, roc_ax = plt.subplots(figsize=(15, 10), facecolor='white')
roc_ax.plot(fpr, tpr, color='#16447D', label='ROC Curve')
roc_ax.set_title('ROC Curve', size=35)
roc_ax.plot([0, 1], [0, 1], color='#D87702', label='Baseline')
roc_ax.set_xlabel('False Positive Rate', size=20)
roc_ax.set_ylabel('True Positive Rate', size=20)
roc_ax.legend()
plt.show()

In [None]:

def cl_metrics(actual,pred):
    """Print accuracy, recall, specificity, precision and F1 for binary loan predictions.

    Parameters
    ----------
    actual : array-like of true loan statuses (0 = bad, 1 = good).
    pred : array-like of predicted loan statuses (0 = bad, 1 = good).

    Every score is rounded to 3 decimals and printed; nothing is returned.
    """
    # labels=[0, 1] guarantees four cells to unpack even when a subset of
    # loans contains only one class.
    tn, fp, _fn, _tp = metrics.confusion_matrix(actual, pred, labels=[0, 1]).ravel()

    # Specificity = TN / (TN + FP); undefined (nan) when there are no actual
    # negatives, so skip the division rather than divide by zero.
    actual_negatives = tn + fp
    specificity = tn / actual_negatives if actual_negatives else float('nan')

    print('Accuracy Score: ', round(metrics.accuracy_score(actual, pred),3))
    print('Recall Score: ', round(metrics.recall_score(actual, pred),3))
    print('Specificity Score: ', round(specificity,3))
    print('Precision Score: ',round(metrics.precision_score(actual, pred),3))
    print('F1 Score: ', round(metrics.f1_score(actual, pred),3))
    
# Classification metrics of the main model on the test set
cl_metrics(y_test,preds)

### Create "Results" dataframe of test data
This combines the original X_test and y_test data into a new dataframe and adds 'model_preds' and 'pred_probability' columns from the GradientBoostingClassifier's predictions.

In [None]:
# Build the Results dataframe: a copy of the test features plus the actual
# loan status, the model's prediction and its predicted probability for class 1
Results = X_test.assign(
    loan_status=y_test,
    model_preds=preds,
    pred_probability=pred_proba[:, 1],
)

In [None]:
# Predicted probabilities split by the actual loan outcome (1 = good, 0 = bad)
is_actual_good = Results['loan_status'] == 1
is_actual_bad = Results['loan_status'] == 0
pred_1 = Results.loc[is_actual_good, ['pred_probability']]
pred_0 = Results.loc[is_actual_bad, ['pred_probability']]

In [None]:
# Overlay the predicted-probability histograms of actual bad and actual good loans.
# Both series share the same 20 bin edges over [0, 1] (width .05, matching the
# x ticks); with bins=20 each series is binned over its own min-max range, so
# the overlaid bars would not line up and could not be compared.
shared_bins = np.linspace(0, 1, 21)

plt.figure(figsize = (12, 5), facecolor='white')

plt.title('Distribution of Prediction Probabilities', size=35, pad=25)
plt.hist(pred_0, bins=shared_bins, alpha=.5, label='Actual Bad')
plt.hist(pred_1, bins=shared_bins, alpha=.5, label='Actual Good')
# Reference lines at the default (.5) and the stricter (.95) probability thresholds
plt.axvline(.5, color='red')
plt.axvline(.95, color='green')
plt.xlabel('Predicted Probability that loan is good', size=20)
plt.ylabel('Number of Loans', size=20)
plt.xticks(list(np.arange(0,1.05,.05)))
plt.legend();

In [None]:
# Same overlay as the full distribution plot, with the y-axis capped at 17,500
# loans so that the shorter bars are visible.
# Both series share the same 20 bin edges over [0, 1] (width .05, matching the
# x ticks); with bins=20 each series is binned over its own min-max range, so
# the overlaid bars would not line up and could not be compared.
shared_bins = np.linspace(0, 1, 21)

plt.figure(figsize = (12, 5), facecolor='white')

plt.ylim(0,17500)
plt.title('Zoomed in on Probability < .95', size=35, pad=25)
plt.hist(pred_0, bins=shared_bins, alpha=.5, label='Actual Bad')
plt.hist(pred_1, bins=shared_bins, alpha=.5, label='Actual Good')
# Reference lines at the default (.5) and the stricter (.95) probability thresholds
plt.axvline(.5, color='red')
plt.axvline(.95, color='green')
plt.xlabel('Predicted Probability that loan is good', size=20)
plt.ylabel('Number of Loans', size=20)
plt.xticks(list(np.arange(0,1.05,.05)))
plt.legend();

In [None]:
# Fraction of test loans the main model predicts as good (mean of the 0/1 predictions)
round(Results['model_preds'].mean(), 3)

In [None]:
# Preview the first rows of the Results dataframe
Results.head()

### Create Dataframe of mid-range probability loans
This dataframe comprises the loans that fall in a more questionable range of the 'pred_probability' column (.25 to .95). These loans may be worth having a human underwriter examine further. Run classification metrics on this data to confirm that it is in fact questionable.


In [None]:
# Loans whose predicted probability lies in the mid range .25 to .95 (both bounds included)
proba = Results['pred_probability']
in_mid_range = (proba >= .25) & (proba <= .95)
Questionable = Results[in_mid_range]

In [None]:
# Fraction of the test set (rows of Results) contained in "Questionable", rounded to 3 decimals
round(len(Questionable)/len(Results),3)

In [None]:
# Confusion matrix restricted to the "Questionable" (mid-range probability) loans
conf_mat(Questionable['loan_status'],Questionable['model_preds'])

In [None]:
# Plot the confusion matrix for the "Questionable" loans
plot_cm(Questionable['loan_status'],Questionable['model_preds'])

In [None]:
# Classification metrics for the "Questionable" loans
cl_metrics(Questionable['loan_status'],Questionable['model_preds'])

### Create Dataframe of "solid decision" loans
This dataframe should contain all loans that are in the Results dataframe but not in the "Questionable" dataframe. The model should predict these loans well enough that it would be cost-effective to forgo further analysis by human underwriters and rely on the model to approve or deny the loan.

In [None]:
# Extract data for loans with more certain probabilities, i.e. every row of
# Results whose predicted probability is outside the .25 to .95 range
in_mid_range = Results['pred_probability'].between(.25, .95)
Solid = Results[~in_mid_range]

In [None]:
# Fraction of the test set (rows of Results) contained in "Solid", rounded to 3 decimals
round(len(Solid)/len(Results),3)

In [None]:
# Confusion matrix restricted to the "Solid" (more certain probability) loans
conf_mat(Solid['loan_status'],Solid['model_preds'])

In [None]:
# Plot the confusion matrix for the "Solid" loans
plot_cm(Solid['loan_status'],Solid['model_preds'])


In [None]:
# Classification metrics for the "Solid" loans
cl_metrics(Solid['loan_status'],Solid['model_preds'])

### Adjust model to decrease Type 1 errors
The histogram of Results['pred_probability'] shows that the vast majority of loans score >= .95, and the priority is to avoid approving loans that go bad (Type 1 errors). Raising the decision threshold behind the binary 'model_preds' column from a pred_probability of .5 to .95 will therefore improve the Precision score.

In [None]:
# Type 1 error minimizing dataframe: an independent copy of Results, so the
# original predictions are not overwritten when the threshold is changed
bad_bias = Results.copy()

In [None]:
# Re-label the predictions: good (1) only when the predicted probability is >= .95, otherwise bad (0)
meets_threshold = bad_bias['pred_probability'] >= .95
bad_bias['model_preds'] = meets_threshold.astype('int64')

In [None]:
# Fraction of loans predicted "good" under bad_bias (mean of the 0/1 predictions)
round(bad_bias['model_preds'].mean(), 3)

In [None]:
# Confusion matrix under the .95 threshold (bad_bias predictions)
conf_mat(bad_bias['loan_status'],bad_bias['model_preds'])

In [None]:
# Plot the confusion matrix under the .95 threshold (bad_bias predictions)
plot_cm(bad_bias['loan_status'],bad_bias['model_preds'])

In [None]:
# Classification metrics under the .95 threshold (bad_bias predictions)
cl_metrics(bad_bias['loan_status'],bad_bias['model_preds'])

# Feature Importance

In [None]:
# Display the non-zero feature importances computed after fitting the model
feat_imp

In [None]:
# Save results dataframe to csv (the index is not written)
# NOTE(review): the output path has no '.csv' extension -- confirm that any
# downstream reader expects exactly this file name before changing it
Results.to_csv('./data/model_results_df', index=False)

# References:
1. Lenders Club 2007-2018 (by Nathan George) - https://www.kaggle.com/wordsforthewise/lending-club?select=accepted_2007_to_2018Q4.csv.gz
2. Data Dictionary - https://www.kaggle.com/wordsforthewise/lending-club/discussion/170691
3. Cleaning Tips - https://www.dataquest.io/blog/machine-learning-preparing-data/
4. What is a 'trade' - https://www.thepennyhoarder.com/investing/lending-club-note-trading/
5. Subgrade order - https://www.lendingclub.com/foliofn/rateDetail.action
6. Datetime Conversion 1 - https://stackoverflow.com/questions/2265357/parse-date-string-and-change-format
7. Datetime Conversion 2 - https://stackoverflow.com/questions/9504356/convert-string-into-date-type-on-python
8. initial_list_status column definitions - https://www.lendacademy.com/lending-club-whole-loan-program-one-year-later/
9. GradientBoostingClassifier - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
10. Plotting ROC curve - https://www.youtube.com/watch?v=uVJXPPrWRJ0
11. sns.distplot - https://git.generalassemb.ly/DSIR-412/lesson-classification-metrics-ii
12. Confusion Matrix - https://stackoverflow.com/questions/19233771/sklearn-plot-confusion-matrix-with-labels
13. Confusion Matrix - https://stackoverflow.com/questions/3529666/matplotlib-matshow-labels
14. Confusion Matrix - https://stackoverflow.com/questions/17022154/changing-matshow-xticklabel-position-from-top-to-bottom-of-the-figure
15. Dataframe Copy - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.copy.html
16. List Copy - https://www.w3schools.com/python/python_ref_list.asp
17. Confusion Matrix - https://stackoverflow.com/questions/21712047/matplotlib-imshow-matshow-display-values-on-plot