In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
# Read the 2019 loans into train_df and the 2020 Q1 loans into test_df.
train_df, test_df = (
    pd.read_csv(Path(csv_name))
    for csv_name in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

In [3]:
# Training features: every column except 'target', with the categorical
# columns one-hot encoded as float dummy columns.
x_train = pd.get_dummies(train_df.drop(columns='target'), dtype=float)
x_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [4]:
# Training labels, displayed together with the count of each class.
y_train = train_df.loc[:, 'target']
(y_train, y_train.value_counts())

(0         low_risk
 1         low_risk
 2         low_risk
 3         low_risk
 4         low_risk
            ...    
 12175    high_risk
 12176    high_risk
 12177    high_risk
 12178    high_risk
 12179    high_risk
 Name: target, Length: 12180, dtype: object,
 high_risk    6090
 low_risk     6090
 Name: target, dtype: int64)

In [5]:
# Testing features: every column except 'target', with the categorical
# columns one-hot encoded as float dummy columns.
x_test = pd.get_dummies(test_df.drop(columns='target'), dtype=float)
x_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0


In [6]:
# Testing labels, displayed together with the count of each class.
y_test = test_df.loc[:, 'target']
(y_test, y_test.value_counts())

(0        low_risk
 1        low_risk
 2        low_risk
 3        low_risk
 4        low_risk
           ...    
 4697    high_risk
 4698    high_risk
 4699    high_risk
 4700    high_risk
 4701    high_risk
 Name: target, Length: 4702, dtype: object,
 low_risk     2351
 high_risk    2351
 Name: target, dtype: int64)

In [7]:
# Compare the encoded feature sets: print each frame's columns and shape, then
# show the dummy columns that exist in the training set but not in the testing set.
print(x_train.columns, x_train.shape)
print(x_test.columns, x_test.shape)
set(x_train.columns).difference(x_test.columns)

Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_in

{'debt_settlement_flag_Y'}

In [8]:
# Align the testing features with the training features.
# Any dummy column that only appeared in training (here 'debt_settlement_flag_Y')
# is added as an all-zero float column, matching the dtype of the other dummies,
# and the column order is forced to match x_train so the scaler and the models
# receive identically laid-out inputs. Re-running this cell is a no-op.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0.0)

## Prediction
I think the random forest model will do better, because being able to increase `n_estimators` (the number of trees) should help the model fit the training data.

In [9]:
# Logistic regression on the unscaled features; report the accuracy on the
# training and testing sets.
# NOTE(review): the recorded run stopped at the solver's iteration limit (see the
# warning in the output), so this fit did not converge — consider raising
# max_iter or scaling the data.
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
print(f"Training Data Score: {classifier.score(x_train, y_train)}")
print(f"Testing Data Score: {classifier.score(x_test, y_test)}")

Training Data Score: 0.6529556650246305
Testing Data Score: 0.5089323692045938


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Random forest (25 trees, fixed seed) on the unscaled features; report the
# accuracy on the training and testing sets.
RFclassifier = RandomForestClassifier(n_estimators=25, random_state=1)
RFclassifier.fit(x_train, y_train)
print(f'Training Score: {RFclassifier.score(x_train, y_train)}')
print(f'Testing Score: {RFclassifier.score(x_test, y_test)}')

Training Score: 0.9986863711001642
Testing Score: 0.6261165461505742


## Model Comparison
As expected, the random forest model did better on both the training and testing scores, though it seems to be overfitting: its training score (≈0.999) is far higher than its testing score (≈0.626).

In [11]:
# Standardize the features: fit the scaler on the training set only, then apply
# the same transformation to both the training and testing sets.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Prediction
Since the random forest did better in the initial test, I think it will again do better than the logistic regression model.

In [12]:
# Logistic regression on the standardized features; report the accuracy on the
# training and testing sets.
# NOTE(review): the recorded run still stopped at the solver's iteration limit
# (see the warning in the output) — consider raising max_iter so the fit converges.
scaled_classifier = LogisticRegression()
scaled_classifier.fit(x_train_scaled, y_train)
print(f'Training Score: {scaled_classifier.score(x_train_scaled, y_train)}')
print(f'Testing Score: {scaled_classifier.score(x_test_scaled, y_test)}')

Training Score: 0.710919540229885
Testing Score: 0.7598894087622289


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Random forest (25 trees, fixed seed) on the standardized features; report the
# accuracy on the training and testing sets.
scaled_RFclassifier = RandomForestClassifier(n_estimators=25, random_state=1)
scaled_RFclassifier.fit(x_train_scaled, y_train)
print(f'Training Score: {scaled_RFclassifier.score(x_train_scaled, y_train)}')
print(f'Testing Score: {scaled_RFclassifier.score(x_test_scaled, y_test)}')

Training Score: 0.9986863711001642
Testing Score: 0.6273925988940876


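## Model Comparison
While the training score for the random forest model is better, the logistic regression model is better at predicting on the scaled data: its testing score (≈0.760) is higher than the random forest's (≈0.627), and is actually higher than its own training score (≈0.711). The random forest still appears to be overfitting.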