In [1]:
# Dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the two loan CSV files into dataframes: one to train on, one to test on
train_csv = Path('Resources/2019loans.csv')
test_csv = Path('Resources/2020Q1loans.csv')
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Display names passed to classification_report for the two encoded target classes
target_names = ["High Risk", "Low Risk"]

In [3]:
# Prepare the training data: separate the label from the features, then make both numeric
# The 'target' column is the label; every other column is a feature
y_train = train_df['target']
X_train = train_df.drop(columns='target')

# Expand the categorical feature columns into one-hot indicator columns
X_train_dummies = pd.get_dummies(X_train)

# Encode the target labels as integer codes (0 and 1 for a two-class target)
train_label_encoder = LabelEncoder()
y_train_label = train_label_encoder.fit_transform(y_train)

In [4]:
# Convert categorical data to numeric and separate target feature for testing data
# Drop the label to create the X_test data
X_test = test_df.drop(columns='target')
y_test = test_df['target']

# One-hot encoding the X_test data
X_test_dummies = pd.get_dummies(X_test)

# Converting target labels to 0 and 1.
# The encoder is fitted on the TRAINING labels and only applied to the testing labels, so
# both sets are guaranteed to share one label -> integer mapping. Fitting a second encoder
# on the testing labels would silently assign different integers if the testing set did not
# contain exactly the same classes; with this form an unseen label raises a ValueError.
y_test_label = LabelEncoder().fit(train_df['target']).transform(y_test)

In [5]:
# Align the testing dummy columns with the training dummy columns
X_train_header = X_train_dummies.columns
X_test_header = X_test_dummies.columns

# Columns present in the training set but absent from the testing set (kept for inspection)
missing_columns = [column for column in X_train_header if column not in X_test_header]

# Reindex the testing set onto the training columns:
#   - columns missing from the testing set are added and populated with 0s,
#   - columns that exist only in the testing set are dropped,
#   - the resulting column order is exactly the training column order.
# Rebinding to a new frame (instead of inserting in place) also makes this cell safe to re-run.
X_test_dummies = X_test_dummies.reindex(columns=X_train_header, fill_value=0)

In [6]:
# Split the encoded 2019 data into a 70% training portion and a held-out portion
# NOTE(review): the held-out X_test / y_test are not used by any later cell shown here --
# the models are scored on X_test_dummies / y_test_label instead. Confirm the split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_dummies,
    y_train_label,
    train_size=0.7,
    random_state=24,
)

In [7]:
## Prediction: My prediction is that the Random Forest will have a higher score than the Logistic Regression because the dataset is a mix of numeric and categorical entries. I feel that a linear model of the categorical information will be less accurate. In addition, the averaging that happens within the Random Forest Classifier adds value to the model's predictions because each prediction is supported by the samples behind the average.


In [8]:
# Logistic Regression on the unscaled data: train, then print the model scores
# Build the model and fit it on the training split (fit returns the fitted estimator)
lr_classifier = LogisticRegression(max_iter=12500).fit(X_train, y_train)

# Mean accuracy on the training split and on the testing set
lr_train_score = lr_classifier.score(X_train, y_train)
lr_test_score = lr_classifier.score(X_test_dummies, y_test_label)
print(f"Training Data Score: {lr_train_score}")
print(f"Testing Data Score: {lr_test_score}\n")

# Per-class precision / recall / f1 on the testing set
y_test_pred1 = lr_classifier.predict(X_test_dummies)
lr_report = classification_report(y_test_label, y_test_pred1, target_names=target_names)
print(lr_report)

Training Data Score: 0.7158104621158808
Testing Data Score: 0.5672054444917056

              precision    recall  f1-score   support

   High Risk       0.61      0.36      0.46      2351
    Low Risk       0.55      0.77      0.64      2351

    accuracy                           0.57      4702
   macro avg       0.58      0.57      0.55      4702
weighted avg       0.58      0.57      0.55      4702



In [9]:
# Random Forest Classifier on the unscaled data: train, then print the model scores

# Build the forest with a fixed seed, then fit it on the training split
rfc_classifier = RandomForestClassifier(n_estimators=50, random_state=24)
rfc_classifier.fit(X_train, y_train)

# Mean accuracy on the training split and on the testing set
rfc_train_score = rfc_classifier.score(X_train, y_train)
rfc_test_score = rfc_classifier.score(X_test_dummies, y_test_label)
print(f'Training Score: {rfc_train_score}')
print(f'Testing Score: {rfc_test_score}\n')

# Per-class precision / recall / f1 on the testing set
y_test_pred2 = rfc_classifier.predict(X_test_dummies)
rfc_report = classification_report(y_test_label, y_test_pred2, target_names=target_names)
print(rfc_report)

Training Score: 0.9998827117053718
Testing Score: 0.6501488728200766

              precision    recall  f1-score   support

   High Risk       0.61      0.83      0.70      2351
    Low Risk       0.73      0.47      0.57      2351

    accuracy                           0.65      4702
   macro avg       0.67      0.65      0.64      4702
weighted avg       0.67      0.65      0.64      4702



In [10]:
## Results: The Random Forest performed better than the Logistic Regression on both the training and testing data scores. This is aligned with my initial prediction. Not scaling the data probably affected the Logistic Regression, making it less accurate. I still believe that averaging the results from the multiple trees makes the Random Forest more robust and gives it a higher score.


In [11]:
# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Apply the training-fitted scaler to the testing set
X_test_dummies_scaled = scaler.transform(X_test_dummies)

In [12]:
## Predictions: I think the Logistic Regression will benefit from the scaling process because it will put all the column data on comparable scales, and I think this will help produce better regression results. I still think that the Random Forest Classifier will produce better scores.

In [13]:
# Logistic Regression on the scaled data: train, then print the model scores
# Build the logistic regression model and fit it on the scaled training split
scaled_lr_classifier = LogisticRegression(max_iter=12500).fit(X_train_scaled, y_train)

# Mean accuracy on the scaled training split and on the scaled testing set
scaled_lr_train_score = scaled_lr_classifier.score(X_train_scaled, y_train)
scaled_lr_test_score = scaled_lr_classifier.score(X_test_dummies_scaled, y_test_label)
print(f"Training Data Score: {scaled_lr_train_score}")
print(f"Testing Data Score: {scaled_lr_test_score}\n")

# Per-class precision / recall / f1 on the scaled testing set
y_test_pred3 = scaled_lr_classifier.predict(X_test_dummies_scaled)
scaled_lr_report = classification_report(y_test_label, y_test_pred3, target_names=target_names)
print(scaled_lr_report)

Training Data Score: 0.7247243725076238
Testing Data Score: 0.7662696724797958

              precision    recall  f1-score   support

   High Risk       0.77      0.75      0.76      2351
    Low Risk       0.76      0.78      0.77      2351

    accuracy                           0.77      4702
   macro avg       0.77      0.77      0.77      4702
weighted avg       0.77      0.77      0.77      4702



In [14]:
# Random Forest Classifier on the scaled data: train, then print the model scores
# Build the forest with a fixed seed, then fit it on the scaled training split
scaled_rfc_classifier = RandomForestClassifier(n_estimators=50, random_state=24)
scaled_rfc_classifier.fit(X_train_scaled, y_train)

# Mean accuracy on the scaled training split and on the scaled testing set
scaled_rfc_train_score = scaled_rfc_classifier.score(X_train_scaled, y_train)
scaled_rfc_test_score = scaled_rfc_classifier.score(X_test_dummies_scaled, y_test_label)
print(f'Training Score: {scaled_rfc_train_score}')
print(f'Testing Score: {scaled_rfc_test_score}')
print()

# Per-class precision / recall / f1 on the scaled testing set
y_test_pred4 = scaled_rfc_classifier.predict(X_test_dummies_scaled)
scaled_rfc_report = classification_report(y_test_label, y_test_pred4, target_names=target_names)
print(scaled_rfc_report)

Training Score: 0.9998827117053718
Testing Score: 0.65142492556359

              precision    recall  f1-score   support

   High Risk       0.61      0.83      0.70      2351
    Low Risk       0.74      0.47      0.58      2351

    accuracy                           0.65      4702
   macro avg       0.67      0.65      0.64      4702
weighted avg       0.67      0.65      0.64      4702



In [15]:
## Results: After scaling the data and training the models again, the Logistic Regression model scored higher when the 2020 data was used. The Random Forest Classifier still had a higher score when the training data was used for the scoring. The scaling benefited the Logistic Regression because it standardized every column to zero mean and unit variance (so most values fall in a small range around zero, roughly -2 to 2), thus removing any bias due to large values. The last result is the opposite of my initial prediction; it could be that Logistic Regression is better for certain analyses and Random Forest Classifier is better for others.