# Data Cleaning

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
# Convert categorical data to numeric and separate target feature for training data
train_one_hot = pd.get_dummies(train_df)
X = train_one_hot.drop(['loan_status_high_risk','loan_status_low_risk'],axis=1)
y = train_one_hot['loan_status_high_risk']
X_train = X.drop("Unnamed: 0",axis=1)
y_train = y.values.ravel()

In [4]:
# Convert categorical data to numeric and separate target feature for testing data

#one-hot encoding creates one less column than the train dataset as "debt_settlement" contains only one value
# I will create a new colum and assign 0 values to compensate

test_one_hot = pd.get_dummies(test_df)

test_one_hot['debt_settlement_flag_Y'] = 0

X = test_one_hot.drop(['loan_status_high_risk','loan_status_low_risk'],axis=1)
y = test_one_hot['loan_status_high_risk']
X_test = X.drop("Unnamed: 0",axis=1)
y_test = y.values.ravel()


In [5]:
# Check the number of columns across the test and train datasets
# Check number of rows is the same across X and Y for each dataset

print("Shape: ", X_train.shape, y_train.shape)
print("Shape: ", X_test.shape, y_test.shape)

Shape:  (12180, 93) (12180,)
Shape:  (4702, 93) (4702,)


In [6]:
# Check data type
y_train

array([0, 0, 0, ..., 1, 1, 1], dtype=uint8)

In [7]:
# Check data type
y_test

array([0, 0, 0, ..., 1, 1, 1], dtype=uint8)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [9]:
# Order columns to match across test and train data

X_test = X_test[X_train.columns]

In [10]:
# Check column order

X_test.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0


In [11]:
# Check column order

X_train.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,...,0,1,0,1,1,0,1,0,1,0


# __Predictions__

Classifier: Logistic Regression Vs. Random Forest

I predict that the Random Forest Classifier will perform best pre-scaling due to the large number of features with an unknown relevance. Scaling of the data will improve the accuracy of the Logistic Regression model and improve the accuracy of the model.

Even after scaling, I predict that the Random Forest classifier will perform more accurately than logisitc regression

# Models

In [12]:
# Train the Logistic Regression model on the unscaled data and print the model score
# Create a logistic regression model
classifier = LogisticRegression()
classifier

classifier.fit(X_train, y_train)

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6572249589490968
Testing Data Score: 0.5214802211824755


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
from sklearn.metrics import classification_report

In [14]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=42, n_estimators=500)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
print(classification_report(y_test, y_predict))

print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Training Score: {clf.score(X_test, y_test)}')

              precision    recall  f1-score   support

           0       0.62      0.86      0.72      2351
           1       0.77      0.46      0.58      2351

    accuracy                           0.66      4702
   macro avg       0.69      0.66      0.65      4702
weighted avg       0.69      0.66      0.65      4702

Training Score: 1.0
Training Score: 0.662484049340706


In [15]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Train the Logistic Regression model on the scaled data and print the model score

classifier_scaled = LogisticRegression()

# Fit our model using the training data
classifier_scaled.fit(X_train_scaled, y_train)
print(f"Training Data Score: {classifier_scaled.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier_scaled.score(X_test_scaled, y_test)}")

Training Data Score: 0.7130541871921182
Testing Data Score: 0.7216078264568269


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf_scaled = RandomForestClassifier(random_state=42, n_estimators=500)
clf_scaled.fit(X_train_scaled, y_train)
#classification reports
y_predict = clf_scaled.predict(X_test_scaled)
print(classification_report(y_test, y_predict))

print(f"Training Data Score: {clf_scaled.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {clf_scaled.score(X_test_scaled, y_test)}")

              precision    recall  f1-score   support

           0       0.62      0.86      0.72      2351
           1       0.77      0.47      0.58      2351

    accuracy                           0.66      4702
   macro avg       0.69      0.66      0.65      4702
weighted avg       0.69      0.66      0.65      4702

Training Data Score: 1.0
Testing Data Score: 0.663334751169715


# __Results__

Logistic Regression

In [18]:
print(f"Training Unsacled Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Unscaled Data Score: {classifier.score(X_test, y_test)}")
print("****************************************************************")
print(f"Training Scaled Data Score: {classifier_scaled.score(X_train_scaled, y_train)}")
print(f"Testing Scaled Data Score: {classifier_scaled.score(X_test_scaled, y_test)}")

Training Unsacled Data Score: 0.6572249589490968
Testing Unscaled Data Score: 0.5214802211824755
****************************************************************
Training Scaled Data Score: 0.7130541871921182
Testing Scaled Data Score: 0.7216078264568269


Random Forest

In [19]:
print(f'Training Unscaled Data Score: {clf.score(X_train, y_train)}')
print(f'Testing Unscaled Data Score: {clf.score(X_test, y_test)}')
print("****************************************************************")
print(f"Training Scaled Data Score: {clf_scaled.score(X_train_scaled, y_train)}")
print(f"Testing Scaled Data Score: {clf_scaled.score(X_test_scaled, y_test)}")

Training Unscaled Data Score: 1.0
Testing Unscaled Data Score: 0.662484049340706
****************************************************************
Training Scaled Data Score: 1.0
Testing Scaled Data Score: 0.663334751169715


# Conclusion

As predicted, the Random Forest classifier performed best with the unscaled data. We can see that after scaling, the logistic regression model peformed better.

An interesting observation - the training score is at maximum performance with the Random Forest classifier, which suggests oversampling - the test score came out worse than with the Logistic Regression model using the scaled data. 

It is for this reason that I would select Logisitic Regression using scaled data as the model for this dataset.
