In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
# Convert categorical data to numeric and separate target feature for training data

dummies_2019_df = pd.get_dummies(train_df)

X_2019 = dummies_2019_df.drop(['loan_status_high_risk','debt_settlement_flag_Y'], axis=1)
y_2019 = dummies_2019_df['loan_status_high_risk']

In [4]:
#dummies_2019_df = dummies_2019_df.drop('debt_settlement_flag_Y', axis=1)

In [5]:
dummies_2019_df.isnull().sum().sum()

0

In [6]:
print(len(dummies_2019_df.columns))

96


In [7]:
col_list_2019 = dummies_2019_df.columns

In [8]:
# Convert categorical data to numeric and separate target feature for testing data

dummies_2020_df = pd.get_dummies(test_df)

X_2020 = dummies_2020_df.drop('loan_status_high_risk', axis=1)
y_2020 = dummies_2020_df['loan_status_high_risk']

In [9]:
dummies_2020_df.isna().sum().sum()

0

In [10]:
print(len(dummies_2020_df.columns))

95


In [11]:
col_list_2020 = dummies_2020_df.columns

In [12]:
def non_match_elements(list_a, list_b):
    non_match = []
    for i in list_a:
        if i not in list_b:
            non_match.append(i)
    return non_match

In [13]:
non_match_elements(col_list_2019, col_list_2020)

['debt_settlement_flag_Y']

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

In [15]:
X_train = X_2019
X_test = X_2020
y_train = y_2019
y_test = y_2020

print(len(X_train.columns))
print(len(X_test.columns))
print(len(y_train))
print(len(y_test))

94
94
12180
4702


# Prediction

Before running these 2 models I think that Random Forest will be a better model here because there are 94 columns to take a look at and I think the process of making decision trees and taking the average across those trees would lead to a better prediction.

In [27]:
# Train the Logistic Regression model on the unscaled data and print the model score

classifier = LogisticRegression(max_iter=15000)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

              precision    recall  f1-score   support

           0       0.54      0.93      0.68      2351
           1       0.73      0.20      0.31      2351

    accuracy                           0.56      4702
   macro avg       0.63      0.56      0.49      4702
weighted avg       0.63      0.56      0.49      4702

Training Data Score: 0.6998357963875205
Testing Data Score: 0.5623139089749043


In [28]:
# Train a Random Forest Classifier model and print the model score

rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)
confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f"Training Data Score: {rfc.score(X_train, y_train)}")
print(f"Testing Data Score: {rfc.score(X_test, y_test)}")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2351
           1       1.00      1.00      1.00      2351

    accuracy                           1.00      4702
   macro avg       1.00      1.00      1.00      4702
weighted avg       1.00      1.00      1.00      4702

Training Data Score: 1.0
Testing Data Score: 1.0


## Result

Based off these results, technically RFC got a better score with 1 but generally that means that it's overfit as it was explained in class.

In [29]:
# Scale the data

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Prediction

Before scaling the data I predict that RFC will still out perform Logistic Regression due to it already being overfit.

In [30]:
# Train the Logistic Regression model on the scaled data and print the model score

classifier = LogisticRegression(max_iter=15000)
classifier.fit(X_train_scaled, y_train)

y_pred = classifier.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2351
           1       1.00      1.00      1.00      2351

    accuracy                           1.00      4702
   macro avg       1.00      1.00      1.00      4702
weighted avg       1.00      1.00      1.00      4702

Training Data Score: 1.0
Testing Data Score: 0.9997873245427478


In [31]:
# Train a Random Forest Classifier model on the scaled data and print the model score

rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(X_train_scaled, y_train)

y_pred = rfc.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f"Training Data Score: {rfc.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rfc.score(X_test_scaled, y_test)}")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2351
           1       1.00      1.00      1.00      2351

    accuracy                           1.00      4702
   macro avg       1.00      1.00      1.00      4702
weighted avg       1.00      1.00      1.00      4702

Training Data Score: 1.0
Testing Data Score: 1.0


## Result

After running the modesl after scaling the data, Logistic Regression was brought way up and RFC remained the same.