In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel


## Using Logistic Regression vs Random Forest Classifier model in determining credit risk

#### Prediction: 
Logistic regression will out-perform the Random Forest model because the dataset is relatively simple and does not contain an overwhelming number of noise variables. 

#### Results:
 As predicted, logistic regression performed with a higher average accuracy score *[0.9937 or 99.37%]* compared to the random forest classifier model, which yielded *0.9915 (99.15% accuracy rate)*. 
 Given the slim dataset, it is likely unwise to further reduce its complexity through methods such as feature selection. Doing so may increase overfitting risk and ultimately reduce the overall accuracy of the models, as attested by the values returned by <code>test_model_fs()</code>.

 The accuracy of the logistic regression model was improved *[to 0.9943 or 99.43%]* by tuning its hyper parameters with <code>test_model_param()</code>, which uses the <code>RandomizedSearchCV</code> method from scikit-learn to perform a randomized search on input hyper parameter ranges.
 
Increasing the number of estimators(trees) from 100 to 1000 in the random forest classifier model yielded slightly higher nominal scores for accuracy *(+0.015%)*: *0.99153* and *0.99169* respectively. 

#### Further analysis:
A two-sample pairwise t-test could be performed to quantify the statistical significance of the differences in performance metrics between the two models.

In [2]:
# Import data
df = pd.read_csv("Resources/lending_data.csv")
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# check for missing values
df.isnull().values.any()

False

In [4]:
# define data without the target column and the target variable
# converting data to an array circumvents user warning during fitting.
X = df.drop(['loan_status'], axis=1).to_numpy()
y = df['loan_status'].to_numpy()

In [5]:
X

array([[1.0700e+04, 7.6720e+00, 5.2800e+04, ..., 5.0000e+00, 1.0000e+00,
        2.2800e+04],
       [8.4000e+03, 6.6920e+00, 4.3600e+04, ..., 3.0000e+00, 0.0000e+00,
        1.3600e+04],
       [9.0000e+03, 6.9630e+00, 4.6100e+04, ..., 3.0000e+00, 0.0000e+00,
        1.6100e+04],
       ...,
       [1.7600e+04, 1.0595e+01, 8.0300e+04, ..., 1.1000e+01, 2.0000e+00,
        5.0300e+04],
       [1.6300e+04, 1.0068e+01, 7.5300e+04, ..., 1.0000e+01, 2.0000e+00,
        4.5300e+04],
       [1.5600e+04, 9.7420e+00, 7.2300e+04, ..., 9.0000e+00, 2.0000e+00,
        4.2300e+04]])

In [6]:
def test_model(model, X, y, seed=5):
    """inputs model class and prints out train/test average accuracy scores

    Args:
        model (class): model class name
        X (dataframe): data without label column
        y (1D series): label data
        seed (int): random seed value; default=5
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf = model.fit(X_train_scaled, y_train)
    print(f"model: {type(clf).__name__}, seed:{seed}")
    print(f"train score: {clf.score(X_train_scaled, y_train)}")
    print(f"test score: {clf.score(X_test_scaled, y_test)} \n")

In [7]:
def test_model_param(param_grid, model, X, y, seed=5):
    """inputs model class and a dictionary of hyper parameter ranges and prints out the best hyper parameters and accuracy score

    Args:
        param_grid (dict): parameter object for the RandomizedSearchCV estimator
        model (class): model class name
        X (dataframe): data without label column
        y (1D series): label data
        seed (int): random seed value; default=5
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    random_clf = RandomizedSearchCV(model, param_grid, random_state=42, verbose=0)
    random_clf.fit(X_train_scaled, y_train)
    print(f"model: {type(model).__name__}, seed:{seed}")
    print(f"best parameters: {random_clf.best_params_}")
    print(f"test score: {random_clf.best_score_} \n")


In [8]:
def test_model_fs(model, X, y, seed=5):
    """uses feature selection to reduce data width and prints out train/test accuracy scores
    Args:
        model (class): sklearn model class name
        X (dataframe): data without label column
        y (1D series): label data
        seed (int): random seed value; default=5
    """
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    clf = model
    
    sel = SelectFromModel(clf)
    sel.fit(X_train_scaled, y_train)
    
    X_sel_train, X_sel_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=seed)
    scaler = StandardScaler().fit(X_sel_train)
    X_sel_train_scaled = scaler.transform(X_sel_train)
    X_sel_test_scaled = scaler.transform(X_sel_test)
    clf.fit(X_sel_train_scaled, y_train)
    
    print(f"model (fs): {type(clf).__name__}, seed:{seed}")
    print(f"train score: {clf.score(X_sel_train_scaled, y_train)}")
    print(f"test score: {clf.score(X_sel_test_scaled, y_test)} \n")

In [17]:
# ranges of hyper parameters that will be cross-tested in test_model_param
param_grid = {
    'C' : np.arange(0, 10, 0.01),
    'tol': np.arange(0, 0.001, 1e-5),
}
# param_grid

{'C': array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
        0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
        0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
        0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
        0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
        0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
        0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
        0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
        0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
        0.99, 1.  , 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09,
        1.1 , 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, 1.2 ,
        1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3 , 1.31,
        1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4 , 1.41, 1.42,
        1.43, 1.44, 1.45, 1.46, 1

In [13]:
# print scores for logistic regression
test_model(LogisticRegression(max_iter=100000), X, y,seed=1)
test_model_fs(LogisticRegression(max_iter=100000), X, y,seed=1)
test_model_param(param_grid,LogisticRegression(max_iter=100000),X,y,seed=1 )


model: LogisticRegression, seed:1
train score: 0.9942908240473243
test score: 0.9936545604622369 

model (fs): LogisticRegression, seed:1
train score: 0.9931386710689228
test score: 0.9928807263722658 

model: LogisticRegression, seed:1
best parameters: {'tol': 0.0008600000000000001, 'C': 8.23}
test score: 0.9942908272899839 



In [16]:
# print scores for random forest classifier model
test_model(RandomForestClassifier(n_estimators=1000), X, y,seed=1)
test_model_fs(RandomForestClassifier(n_estimators=1000), X, y,seed=1)

model: RandomForestClassifier, seed:1
train score: 0.9975409272252029
test score: 0.9917457697069748 

model (fs): RandomForestClassifier, seed:1
train score: 0.9974893382858715
test score: 0.9918489475856377 



In [12]:
# test with fewer trees
test_model(RandomForestClassifier(n_estimators=100), X, y,seed=1)


model: RandomForestClassifier, seed:1
train score: 0.9975409272252029
test score: 0.991642591828312 

