In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
# Convert categorical data to numeric and separate target feature for training data
train = train_df.drop(columns = ['loan_status', 'Unnamed: 0', 'index'])
X_train = pd.get_dummies(train, dtype = float)
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [4]:
X_train.shape

(12180, 92)

In [5]:
y_train = train_df['loan_status']
y_train

0         low_risk
1         low_risk
2         low_risk
3         low_risk
4         low_risk
           ...    
12175    high_risk
12176    high_risk
12177    high_risk
12178    high_risk
12179    high_risk
Name: loan_status, Length: 12180, dtype: object

In [6]:
# Convert categorical data to numeric and separate target feature for testing data
test = test_df.drop(columns = ['loan_status', 'Unnamed: 0', 'index'])
X_test = pd.get_dummies(test, dtype = float)
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0


In [7]:
# Comparing train and test datsets to identify the missing columns
set(list(X_train.columns))- set(list(X_test.columns))

{'debt_settlement_flag_Y'}

In [8]:
# Obtaining values from missing column
X_train['debt_settlement_flag_Y']

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
12175    0.0
12176    0.0
12177    0.0
12178    0.0
12179    0.0
Name: debt_settlement_flag_Y, Length: 12180, dtype: float64

In [9]:
# add missing dummy variables to testing set
X_test['debt_settlement_flag_Y'] = pd.Series([0 for x in range(len(X_test.index))])
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0


In [10]:
X_test.shape

(4702, 92)

In [11]:
y_test = test_df['loan_status']
y_test

0        low_risk
1        low_risk
2        low_risk
3        low_risk
4        low_risk
          ...    
4697    high_risk
4698    high_risk
4699    high_risk
4700    high_risk
4701    high_risk
Name: loan_status, Length: 4702, dtype: object

In [12]:
y_test.shape

(4702,)

In [13]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [14]:
# Fit the model to the data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [15]:
# Classifier scores
print(f'Training Score: {classifier.score(X_train, y_train)}')
print(f'Testing Score: {classifier.score(X_test, y_test)}')

Training Score: 0.6507389162561577
Testing Score: 0.5157379838366652


In [16]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
classifier = RandomForestClassifier(random_state=42, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {classifier.score(X_train, y_train)}')
print(f'Testing Score: {classifier.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.6446193109315185


In [17]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression().fit(X_train_scaled, y_train)
print(f'Training Score: {classifier.score(X_train_scaled, y_train)}')
print(f'Testing Score: {classifier.score(X_test_scaled, y_test)}')

Training Score: 0.7078817733990148
Testing Score: 0.767333049766057


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# Train a Random Forest Classifier model on the scaled data and print the model score
classifier = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {classifier.score(X_train_scaled, y_train)}')
print(f'Testing Score: {classifier.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6420672054444917


In [None]:
#looking at the models' score prior to scaling, Random forest has scored much better than logisic Regression. When we scaled
#the data the overall scores imporoved for both models signiificantly for Logistic Regression but not enough to surpass
#the performance of Random Forest model's performance. Unltimately, I would use Random Forest Classifier as a model.