# Credit Risk Evaluator

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

## Retrieve the Data

The data is located in the Challenge Files Folder:

* `lending_data.csv`

Import the data using Pandas. Display the resulting dataframe to confirm the import was successful.

In [3]:
# Import the data
file_path = Path("../Starter_Code/Resources/lending_data.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700,7.672,52800,0.431818,5,1,22800,0
1,8400,6.692,43600,0.311927,3,0,13600,0
2,9000,6.963,46100,0.349241,3,0,16100,0
3,10700,7.664,52700,0.43074,5,1,22700,0
4,10800,7.698,53000,0.433962,5,1,23000,0


## Predict Model Performance

You will be creating and comparing two models on this data: a Logistic Regression, and a Random Forests Classifier. Before you create, fit, and score the models, make a prediction as to which model you think will perform better. You do not need to be correct! 

Write down your prediction in the designated cells in your Jupyter Notebook, and provide justification for your educated guess.

*Replace the text in this markdown cell with your predictions, and be sure to provide justification for your guess.*

## Split the Data into Training and Testing Sets

In [13]:
# Split the data into X_train, X_test, y_train, y_test
y=df["loan_status"].values
X=df.drop("loan_status", axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#Prediction

I don't have a strong opinion at this point which model will perform better: Logistic Regression or Random Forest. If they give similar predictions, I prefer logistic regression because it is a deterministic algorithm that provides interpretable coefficients (feature weights).  That is important in consumer lending due to federal regulations. 

Logistic Regression is a supervised classification algorithm based on statistics that is used to determine if an independent variable has an effect on a binary dependent variable.   

Random Forest however is ensemble-based learning algorithm, comprised of n collections of de-correlated decision trees. It can be used to make a classifications of customers. What improves the performance of a Random Forest model against a traditional decision tree model is that, by randomly selecting subsets of features, some trees of the forest can isolate more important features while increasing the overall accuracy of the result. 






In [14]:
# Train a Logistic Regression model and print the model score

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=10000)
classifier.fit(X_train, y_train)

print(f'Actual:\t\t{list(y_test[:10])}')
print(f'Predicted:\t{list(classifier.predict(X_test[:10]))}')
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

classifier.classes_
classifier.intercept_
classifier.coef_

classifier.predict_proba(X)

Actual:		[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
Predicted:	[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
Training Data Score: 0.9919177328380795
Testing Data Score: 0.9924680148576145


array([[9.97204658e-01, 2.79534213e-03],
       [9.99732748e-01, 2.67251790e-04],
       [9.99532615e-01, 4.67385415e-04],
       ...,
       [2.26736998e-01, 7.73263002e-01],
       [5.52098595e-01, 4.47901405e-01],
       [6.93515603e-01, 3.06484397e-01]])

In [6]:
# Train a Logistics Regression model with scaled data and print the model score
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler model and fit it to the training data
X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

classifier_scaled = LogisticRegression(max_iter=10000)
classifier_scaled.fit(X_train_scaled, y_train)

print(f'Actual:\t\t{list(y_test[:20])}')
print(f'Predicted:\t{list(classifier_scaled.predict(X_test_scaled[:20]))}')
print(f"Training Data Score: {classifier_scaled.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier_scaled.score(X_test_scaled, y_test)}")

from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = classifier.predict(X_test)
confusion_matrix(y_true, y_pred)



Actual:		[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
Predicted:	[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
Training Data Score: 0.9941188609162196
Testing Data Score: 0.9941704498555509


array([[18699,    93],
       [   53,   539]], dtype=int64)

In [8]:
from sklearn.metrics import classification_report
print(classification_report(y_test, classifier_scaled.predict(X_test_scaled)))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18792
           1       0.85      0.98      0.91       592

    accuracy                           0.99     19384
   macro avg       0.93      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



In [9]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Scale the data with a Standard Scaler
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = RandomForestClassifier(random_state=1, n_estimators=50).fit(X_train_scaled, y_train)
print(f'Actual:\t\t{list(y_test[:20])}')
print(f'Predicted:\t{list(classifier_scaled.predict(X_test_scaled[:20]))}')
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Actual:		[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]
Predicted:	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]
Training Score: 0.9974893382858715
Testing Score: 0.9910751134956666


In [10]:
print(classification_report(y_test, clf.predict(X_test_scaled)))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.88      0.86       619

    accuracy                           0.99     19384
   macro avg       0.92      0.94      0.93     19384
weighted avg       0.99      0.99      0.99     19384



## CONCLUSION
Logistic Regression performs slightly better than a Random Forest Classifier 
based on a nominally higher test score, however, both models do an excellent job in making 
predictions
    