In [84]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel


## Using Logistic Regression vs Random Forest Classifier model in determining credit risk

#### Prediction: 
Logistic regression will outperform the random forest model because the dataset is relatively simple and does not contain an overwhelming number of noise variables.

#### Results:
As predicted, logistic regression achieved a higher test accuracy *(0.9937 or 99.37%)* than the random forest classifier model, which yielded *0.9917 (99.17%)* with 1000 trees. Given the slim dataset, it is likely unwise to further reduce its complexity through methods such as feature selection. Doing so risks discarding informative features and lowering accuracy: in the scores printed by <code>test_model_fs()</code>, logistic regression's test accuracy dropped slightly *(0.9929)*, while the random forest's was essentially unchanged *(0.9917)*.

Increasing the number of estimators (trees) from 100 to 1000 in the random forest classifier model yielded a marginally higher test accuracy *(about +0.026 percentage points)*: *0.99144* and *0.99169* respectively. Because the forest itself is not seeded, these values vary slightly between runs.

#### Further analysis:
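A paired t-test on the two models' scores across repeated train/test splits (for example, different seeds or cross-validation folds) could be performed to quantify the statistical significance of the differences in performance metrics between the two models.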

In [50]:
# Import data: read the lending CSV (path is relative to the notebook's working
# directory) into `df`, then preview the first rows. The `loan_status` column is
# the label that later cells separate from the features.
df = pd.read_csv("Resources/lending_data.csv")
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Check for missing values: evaluates to True if any cell in `df` is null.
df.isnull().values.any()

False

In [118]:
# Split the frame into features (every column except `loan_status`) and labels.
# Converting both to NumPy arrays circumvents a user warning during fitting
# (presumably scikit-learn's feature-names warning — TODO confirm).
X = df.drop(['loan_status'], axis=1).to_numpy()
y = df['loan_status'].to_numpy()

In [119]:
# Display the feature array as a quick sanity check (NumPy abbreviates large arrays).
X

array([[1.0700e+04, 7.6720e+00, 5.2800e+04, ..., 5.0000e+00, 1.0000e+00,
        2.2800e+04],
       [8.4000e+03, 6.6920e+00, 4.3600e+04, ..., 3.0000e+00, 0.0000e+00,
        1.3600e+04],
       [9.0000e+03, 6.9630e+00, 4.6100e+04, ..., 3.0000e+00, 0.0000e+00,
        1.6100e+04],
       ...,
       [1.7600e+04, 1.0595e+01, 8.0300e+04, ..., 1.1000e+01, 2.0000e+00,
        5.0300e+04],
       [1.6300e+04, 1.0068e+01, 7.5300e+04, ..., 1.0000e+01, 2.0000e+00,
        4.5300e+04],
       [1.5600e+04, 9.7420e+00, 7.2300e+04, ..., 9.0000e+00, 2.0000e+00,
        4.2300e+04]])

In [120]:
def test_model(model, X, y, seed=5):
    """Fit ``model`` on a standardized train split and print train/test accuracy.

    The data is split with ``train_test_split`` using ``seed``, both splits are
    standardized with statistics learned from the training split, and the
    estimator's ``score`` is printed for each split.

    Args:
        model (estimator): sklearn estimator instance; it is fitted in place
        X (array-like): feature data without the label column
        y (array-like): label data
        seed (int): random_state passed to train_test_split; default=5
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, random_state=seed
    )

    # Learn the scaling on the training split only, then reuse it for the test split.
    standardizer = StandardScaler()
    train_std = standardizer.fit_transform(features_train)
    test_std = standardizer.transform(features_test)

    fitted = model.fit(train_std, labels_train)

    print(f"model: {type(fitted).__name__}, seed:{seed}")
    train_accuracy = fitted.score(train_std, labels_train)
    print(f"train score: {train_accuracy}")
    test_accuracy = fitted.score(test_std, labels_test)
    print(f"test score: {test_accuracy} \n")


In [115]:
def test_model_fs(model, X, y, seed=5):
    """Select features with ``model``, refit it on the reduced data, and print train/test accuracy.

    A ``SelectFromModel`` selector wrapping ``model`` is fitted on the standardized
    training split. ``model`` is then fitted on the selected columns only and its
    ``score`` is printed for the train and test splits.

    Args:
        model (estimator): sklearn estimator instance; it is fitted in place on the
            selected features
        X (array-like): feature data without the label column
        y (array-like): label data
        seed (int): random_state passed to train_test_split; default=5
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

    # Standardize with statistics learned from the training split only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Choose the feature subset from the training split only.
    sel = SelectFromModel(model)
    sel.fit(X_train_scaled, y_train)

    # Standardization is computed per column, so masking the already-scaled splits
    # yields the same data as re-splitting and re-scaling the selected columns,
    # without relying on a second split reproducing the first one.
    X_sel_train_scaled = sel.transform(X_train_scaled)
    X_sel_test_scaled = sel.transform(X_test_scaled)

    clf = model.fit(X_sel_train_scaled, y_train)

    print(f"model (fs): {type(clf).__name__}, seed:{seed}")
    print(f"train score: {clf.score(X_sel_train_scaled, y_train)}")
    print(f"test score: {clf.score(X_sel_test_scaled, y_test)} \n")

In [116]:
# Print scores for logistic regression: first on all features, then on the subset
# chosen by feature selection. Both calls use the same split seed, and each call
# receives its own estimator instance (the helpers fit the instance they are given).
test_model(LogisticRegression(max_iter=100000), X, y,seed=1)
test_model_fs(LogisticRegression(max_iter=100000), X, y,seed=1)


model: LogisticRegression, seed:1
train score: 0.9942908240473243
test score: 0.9936545604622369 

model (fs): LogisticRegression, seed:1
train score: 0.9931386710689228
test score: 0.9928807263722658 



In [117]:
# Print scores for the random forest classifier model: first on all features, then
# on the subset chosen by feature selection, using the same split seed.
# NOTE(review): no random_state is passed to RandomForestClassifier, so only the
# train/test split is seeded; the forest itself can differ between runs and the
# printed scores may vary slightly.
test_model(RandomForestClassifier(n_estimators=1000), X, y,seed=1)
test_model_fs(RandomForestClassifier(n_estimators=1000), X, y,seed=1)


model: RandomForestClassifier, seed:1
train score: 0.9975409272252029
test score: 0.9916941807676434 

model (fs): RandomForestClassifier, seed:1
train score: 0.9975409272252029
test score: 0.9917457697069748 



In [121]:
# Test with fewer trees (100 instead of 1000) on the same split seed to gauge the
# effect of forest size on the scores.
test_model(RandomForestClassifier(n_estimators=100), X, y,seed=1)


model: RandomForestClassifier, seed:1
train score: 0.9975409272252029
test score: 0.9914362360709864 

