In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Locate the two loan CSVs (2019 = training set, 2020 Q1 = testing set) and load each into a DataFrame
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Training data: pull out the target column, then one-hot encode every remaining categorical feature
y_train = train_df["loan_status"]
train_features = train_df.drop(columns="loan_status")
X_train = pd.get_dummies(train_features)
X_train.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,0,1,0,1,1,0,1,0,1,0


In [4]:
# Testing data: pull out the target column, then one-hot encode every remaining categorical feature
y_test = test_df["loan_status"]
test_features = test_df.drop(columns="loan_status")
X_test = pd.get_dummies(test_features)
X_test.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,0,1,0,1,1,0,1,0,1


In [5]:
# Align the testing features with the training features.
# get_dummies only creates a column for categories that actually occur in the frame it is
# given, so the two frames can end up with different column sets. Reindexing against the
# training columns:
#   * adds any dummy column missing from the testing set, filled with 0,
#   * drops any column that exists only in the testing set (the models were never fit on it),
#   * puts the columns in exactly the training order, which the fitted estimators expect.
# Unlike inserting columns one at a time, this is a single operation and is safe to re-run.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

### Prediction 1 :
I believe, from previous models observed in class, that the Logistic Regression will perform worse than the Random Forest Classifier model. Logistic regressions are used to model the relationship between the dependent and independent variables, and knowing this I know that outliers and features with very different spreads can cause them to be inaccurate. Random Forest Classifiers, on the other hand, combine many decision trees built with feature randomness, and this in turn makes more accurate predictions than a model that follows an individual 'tree'. This is why I predict that the Random Forest Classifier will produce the more accurate model in the first round.

In [6]:
# Fit a Logistic Regression on the unscaled features, then report its accuracy on the testing set
reg = LogisticRegression(max_iter=10000)
reg.fit(X_train, y_train)
reg_accuracy = reg.score(X_test, y_test)
print("Logistic Regression Score: ", reg_accuracy)

Logistic Regression Score:  0.5638026371756699


In [7]:
# Fit a Random Forest Classifier on the unscaled features (seeded for repeatability)
# and report its accuracy on the testing set
forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)
forest_accuracy = forest.score(X_test, y_test)
print("Random Forest Classifier Score: ", forest_accuracy)

Random Forest Classifier Score:  0.6305827307528711


### Prediction 2 :
Scaling the data is a very important step in the machine learning process; it can take a weaker model and make it better in some cases. I predict that scaling will make both models better, but I still believe the Random Forest Classifier will create a more accurate model in the end.

In [8]:
# Standardize the features: the scaler is fit on the training set only, and the same
# fitted transformation is then applied to the testing set
scale = StandardScaler()
scale_tr = scale.fit_transform(X_train)
scale_te = scale.transform(X_test)

In [9]:
# Fit a Logistic Regression on the scaled features, then report its accuracy on the scaled testing set
reg_scale = LogisticRegression(solver='lbfgs', max_iter=10000, random_state=42).fit(scale_tr, y_train)
reg_scale_accuracy = reg_scale.score(scale_te, y_test)
print("Scaled Logistic Regression Score: ", reg_scale_accuracy)

Scaled Logistic Regression Score:  0.7203317737133135


In [10]:
# Fit a Random Forest Classifier on the scaled features (same seed as the unscaled run)
# and report its accuracy on the scaled testing set
forest_scale = RandomForestClassifier(random_state=42).fit(scale_tr, y_train)
forest_scale_accuracy = forest_scale.score(scale_te, y_test)
print("Scaled Random Forest Classifier Score: ", forest_scale_accuracy)

Scaled Random Forest Classifier Score:  0.6297320289238622


### Unscaled Comparison :
Like I had predicted, the Random Forest Classifier ended up being more accurate than the Logistic Regression by about .067.

Logistic Regression Score: .563

Random Forest Classifier Score: .630

While the difference in accuracy is marginal, the Random Forest Classifier ended up being more accurate nonetheless. I believe this is mainly because the Random Forest Classifier is non-linear in nature and does not depend on the scale of the features.

### Scaled Comparison :
For the scaled prediction I ended up being wrong. Logistic Regression was more accurate by about .091.

Scaled Logistic Regression Score: .720

Scaled Random Forest Classifier Score: .629

The conclusion for this outcome is a bit divided. On one hand we can see how dependable a Random Forest Classifier can be, and this is evident in the score difference between the scaled and unscaled runs being only about .001. This is expected, because decision trees split on thresholds within each feature, so rescaling a feature does not change which splits are available. This would make me think that it is more reliable, and would make me want to move forward with that model. However, scaling is an important step when optimizing the pre-processing stage of machine learning. Upon closer inspection of the data I can see that the features vary widely in magnitude (for example, annual income versus interest rate), and I believe this to be one of the causes for scaling making such a difference for the Logistic Regression score, since it puts every feature on a comparable scale instead of letting the largest-valued features dominate the fit. There is also a large number of features in our data, and Logistic Regressions can in some cases handle that better than Random Forest Classifiers. I would have to defer to someone with more expertise in the field of machine learning to see which model would be most appropriate to move forward with.