In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Additional imports:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the two loan datasets: the 2019 file becomes the training frame and the
# 2020 Q1 file becomes the testing frame.
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

### Evaluate DFs to find the missing categories.

In [3]:
# Summarize both raw frames (columns, non-null counts, dtypes) so their
# categorical columns can be compared. DataFrame.info() prints its report and
# returns None, so the calls are made as separate statements rather than as a
# tuple expression, which would echo a stray `(None, None)` as the cell result.
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   12180 non-null  float64
 1   int_rate                    12180 non-null  float64
 2   installment                 12180 non-null  float64
 3   home_ownership              12180 non-null  object 
 4   annual_inc                  12180 non-null  float64
 5   verification_status         12180 non-null  object 
 6   pymnt_plan                  12180 non-null  object 
 7   dti                         12180 non-null  float64
 8   delinq_2yrs                 12180 non-null  float64
 9   inq_last_6mths              12180 non-null  float64
 10  open_acc                    12180 non-null  float64
 11  pub_rec                     12180 non-null  float64
 12  revol_bal                   12180 non-null  float64
 13  total_acc                   121

(None, None)

In [4]:
# Display the raw training frame; the rendered output elides the middle rows and columns.
train_df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,19975.0,0.2565,801.09,RENT,28000.0,Not Verified,n,28.42,0.0,0.0,...,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,N,N,high_risk
12176,15000.0,0.1774,540.34,RENT,50000.0,Verified,n,23.43,4.0,0.0,...,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,N,N,high_risk
12177,3600.0,0.1862,131.28,RENT,60000.0,Not Verified,n,28.80,0.0,1.0,...,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,N,N,high_risk
12178,15000.0,0.0881,475.68,MORTGAGE,62000.0,Source Verified,n,11.44,0.0,0.0,...,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,N,N,high_risk


In [5]:
# Show the training rows whose debt settlement flag is 'Y'.
train_df.loc[train_df['debt_settlement_flag'].eq('Y')]

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
6896,5300.0,0.1557,185.21,MORTGAGE,70000.0,Source Verified,n,26.8,0.0,1.0,...,75.0,1.0,0.0,55885.0,32539.0,11100.0,42885.0,N,Y,high_risk
6930,17500.0,0.1033,567.4,MORTGAGE,202000.0,Not Verified,n,27.88,0.0,1.0,...,42.9,0.0,0.0,1095484.0,268154.0,141400.0,197084.0,N,Y,high_risk
7243,10000.0,0.1474,236.54,RENT,28000.0,Not Verified,n,23.35,0.0,0.0,...,0.0,0.0,0.0,41011.0,26996.0,4000.0,30000.0,N,Y,high_risk
7730,10000.0,0.1474,345.39,RENT,100000.0,Source Verified,n,18.17,0.0,0.0,...,66.7,0.0,0.0,64610.0,57247.0,4500.0,55110.0,N,Y,high_risk
9018,8800.0,0.0819,276.54,MORTGAGE,40000.0,Not Verified,n,30.96,0.0,0.0,...,40.0,0.0,0.0,133291.0,22805.0,16100.0,15716.0,N,Y,high_risk


In [6]:
# Exploratory step: one-hot encode every categorical column of the training
# frame (the target column is encoded as well at this stage) and list the
# resulting columns.
train_df = pd.get_dummies(data=train_df)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 94 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            12180 non-null  float64
 1   int_rate                             12180 non-null  float64
 2   installment                          12180 non-null  float64
 3   annual_inc                           12180 non-null  float64
 4   dti                                  12180 non-null  float64
 5   delinq_2yrs                          12180 non-null  float64
 6   inq_last_6mths                       12180 non-null  float64
 7   open_acc                             12180 non-null  float64
 8   pub_rec                              12180 non-null  float64
 9   revol_bal                            12180 non-null  float64
 10  total_acc                            12180 non-null  float64
 11  out_prncp                   

In [7]:
# Show the encoded training rows where the `debt_settlement_flag_Y` dummy is set.
train_df.loc[train_df['debt_settlement_flag_Y'].eq(1)]

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y,target_high_risk,target_low_risk
6896,5300.0,0.1557,185.21,70000.0,26.8,0.0,1.0,14.0,1.0,9846.0,...,0,1,1,0,1,0,0,1,1,0
6930,17500.0,0.1033,567.4,202000.0,27.88,0.0,1.0,17.0,0.0,114404.0,...,0,1,1,0,1,0,0,1,1,0
7243,10000.0,0.1474,236.54,28000.0,23.35,0.0,0.0,5.0,0.0,1207.0,...,0,1,0,1,1,0,0,1,1,0
7730,10000.0,0.1474,345.39,100000.0,18.17,0.0,0.0,12.0,0.0,4026.0,...,0,1,1,0,1,0,0,1,1,0
9018,8800.0,0.0819,276.54,40000.0,30.96,0.0,0.0,13.0,0.0,8400.0,...,0,1,1,0,1,0,0,1,1,0


In [8]:
# Exploratory step: one-hot encode every categorical column of the testing
# frame (the target column is encoded as well at this stage) and list the
# resulting columns for comparison with the training frame.
test_df = pd.get_dummies(data=test_df)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 93 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            4702 non-null   float64
 1   int_rate                             4702 non-null   float64
 2   installment                          4702 non-null   float64
 3   annual_inc                           4702 non-null   float64
 4   dti                                  4702 non-null   float64
 5   delinq_2yrs                          4702 non-null   float64
 6   inq_last_6mths                       4702 non-null   float64
 7   open_acc                             4702 non-null   float64
 8   pub_rec                              4702 non-null   float64
 9   revol_bal                            4702 non-null   float64
 10  total_acc                            4702 non-null   float64
 11  out_prncp                     

In [9]:
# Preview the first rows of the one-hot encoded testing frame.
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,target_high_risk,target_low_risk
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,1,0,0,1,1,0,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,1,0,1,1,0,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,1,0,1,1,0,0,1,1,0,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,1,0,1,1,0,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,1,0,1,0,1,0,1


In [10]:
# Look for testing rows where `debt_settlement_flag_N` is 0 (i.e. the flag
# would be 'Y'). This is the category missing from the testing data.
test_df.loc[test_df['debt_settlement_flag_N'].eq(0)]

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,target_high_risk,target_low_risk


### `debt_settlement_flag_Y` does not exist in the testing data because no row in the testing set has a `debt_settlement_flag` of `Y`

# Preprocessing: Convert Categorical Data to Numeric

In [11]:
# Re-read both CSV files so the frames are back in their raw form; the
# exploratory cells above replaced them with one-hot encoded versions.
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
# train_df.head()
# test_df.head()

In [15]:
# Split the training frame into features and label: every column except
# 'target' is one-hot encoded into X_train, and the 'target' strings are
# mapped to integer class ids in y_train.
train_features = train_df.drop(columns=['target'])
X_train = pd.get_dummies(train_features)
target_encoder = LabelEncoder()
y_train = target_encoder.fit_transform(train_df['target'])
y_train

array([1, 1, 1, ..., 0, 0, 0])

In [16]:
# Preview the first rows of the one-hot encoded training features.
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0,1,1,0,1,0,1,0,1,0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,0,1,1,0,1,0,1,0
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,1,1,0,1,1,0,1,0,1,0
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,0,1,1,0,1,0
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,0,1,1,0,1,0,1,0


In [17]:
# Split the testing frame into features and label. The feature columns are
# one-hot encoded; the label strings are mapped to integers with an encoder
# fitted on the TRAINING target so both splits share one class-to-integer
# mapping. transform() raises ValueError if the testing data contains a label
# the training data never had, instead of silently giving it its own id.
X_test = test_df.drop(['target'], axis=1)
X_test = pd.get_dummies(X_test)
y_test = LabelEncoder().fit(train_df['target']).transform(test_df['target'])
y_test

array([1, 1, 1, ..., 0, 0, 0])

In [18]:
# Align the testing features with the training features. One-hot encoding each
# split separately only creates dummy columns for categories present in that
# split (here the testing data has no `debt_settlement_flag_Y`). Reindexing on
# the training columns adds every missing dummy filled with 0, drops any
# column the model was never trained on, and keeps the column order identical.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,0,1,0,1,1,0,0,1,1,0
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,1,0,1,1,0,1,0,1,0
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,1,0,1,1,0,0,1,1,0
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,1,1,0,1,1,0,1,0,1,0
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,0,1,0,1,1,0,1,0,1,0


# Consider the Models:

In comparing Logistic Regression and the Random Forest Classifier, I theorize that the Random Forest Classifier will perform better in terms of accuracy. This is because Random Forests give more weight to the features that are more important. Additionally, Logistic Regression accuracy tends to be negatively affected as the amount of categorical data and noise increases.

# Unscaled Data

In [19]:
# Fit logistic regression on the raw (unscaled) features and report its
# accuracy on both splits.
# max_iter is set far above the library default; presumably this is to give
# the solver room to converge on unscaled inputs.
classifier = LogisticRegression(max_iter=15000)
classifier.fit(X_train, y_train)
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_accuracy:.3f}")
print(f"Testing Data Score: {test_accuracy:.3f}")

Training Data Score: 0.697
Testing Data Score: 0.564


In [20]:
# Fit a 50-tree random forest (fixed seed for repeatable results) on the raw
# features and report its accuracy on both splits.
clf = RandomForestClassifier(n_estimators=50, random_state=1)
clf.fit(X_train, y_train)
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f'Training Score: {train_accuracy:.3f}')
print(f'Testing Score: {test_accuracy:.3f}')

Training Score: 1.000
Testing Score: 0.621


### Unscaled Data Conclusions: 

As expected, the Random Forest Classifier improved accuracy on the testing data set; however, an improvement from 0.564 to 0.621 still seems lacking.

# Scaled Data

## Reconsidering the Models with Scaled Data:

I expect that the Random Forest Classifier will perform better; however, I believe that scaling the data will improve accuracy for both models.

In [21]:
# Standardize the features: the scaler is fitted on the training features only
# and the same fitted transform is then applied to both splits.
# (MinMaxScaler().fit(X_train) was also tried; accuracy using that scaler was
# reduced compared to the StandardScaler.)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [22]:
# Fit logistic regression on the standardized features and report its accuracy
# on both splits (same settings as the unscaled run, for a like-for-like
# comparison).
classifier = LogisticRegression(max_iter=15000)
classifier.fit(X_train_scaled, y_train)
train_accuracy = classifier.score(X_train_scaled, y_train)
test_accuracy = classifier.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_accuracy:.3f}")
print(f"Testing Data Score: {test_accuracy:.3f}")

Training Data Score: 0.711
Testing Data Score: 0.760


In [23]:
# Fit the same 50-tree, fixed-seed random forest on the standardized features
# and report its accuracy on both splits.
clf = RandomForestClassifier(n_estimators=50, random_state=1)
clf.fit(X_train_scaled, y_train)
train_accuracy = clf.score(X_train_scaled, y_train)
test_accuracy = clf.score(X_test_scaled, y_test)
print(f'Training Score: {train_accuracy:.3f}')
print(f'Testing Score: {test_accuracy:.3f}')

Training Score: 1.000
Testing Score: 0.623


# Final Conclusions:

Scaling the data using StandardScaler improved the testing prediction accuracy of the Logistic Regression model by almost 20 percentage points (from 0.564 to 0.760); however, there was little noticeable improvement for the Random Forest Classifier (0.621 to 0.623) when scaling the data with either StandardScaler or MinMaxScaler. Considering that the training score of the Random Forest Classifier is 1.000, I think that there is an issue with overfitting. From what I have learned, I would try increasing the amount of training data. It is surprising, but Logistic Regression on data scaled with StandardScaler performed better than any of the other 3 models, at 0.760 accuracy. I see this as a potentially useful model.