In [33]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [34]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))
target_names = ["low_risk", "high_risk"]

In [35]:
# Look at data set train
train_df.head()
print(train_df[train_df['debt_settlement_flag'] == 'Y'])

      Unnamed: 0   index  loan_amnt  int_rate  installment home_ownership  \
6896       94972   94972     5300.0    0.1557       185.21       MORTGAGE   
6930       96648   96648    17500.0    0.1033       567.40       MORTGAGE   
7243      112570  112570    10000.0    0.1474       236.54           RENT   
7730      137200  137200    10000.0    0.1474       345.39           RENT   
9018      202248  202248     8800.0    0.0819       276.54       MORTGAGE   

      annual_inc verification_status loan_status pymnt_plan  ...  \
6896     70000.0     Source Verified   high_risk          n  ...   
6930    202000.0        Not Verified   high_risk          n  ...   
7243     28000.0        Not Verified   high_risk          n  ...   
7730    100000.0     Source Verified   high_risk          n  ...   
9018     40000.0        Not Verified   high_risk          n  ...   

      pct_tl_nvr_dlq  percent_bc_gt_75  pub_rec_bankruptcies  tax_liens  \
6896           100.0              75.0               

In [36]:
# Look at data set test
test_df.head()
print(test_df[test_df['debt_settlement_flag'] == 'Y'])

Empty DataFrame
Columns: [Unnamed: 0, index, loan_amnt, int_rate, installment, home_ownership, annual_inc, verification_status, loan_status, pymnt_plan, dti, delinq_2yrs, inq_last_6mths, open_acc, pub_rec, revol_bal, total_acc, initial_list_status, out_prncp, out_prncp_inv, total_pymnt, total_pymnt_inv, total_rec_prncp, total_rec_int, total_rec_late_fee, recoveries, collection_recovery_fee, last_pymnt_amnt, collections_12_mths_ex_med, policy_code, application_type, acc_now_delinq, tot_coll_amt, tot_cur_bal, open_acc_6m, open_act_il, open_il_12m, open_il_24m, mths_since_rcnt_il, total_bal_il, il_util, open_rv_12m, open_rv_24m, max_bal_bc, all_util, total_rev_hi_lim, inq_fi, total_cu_tl, inq_last_12m, acc_open_past_24mths, avg_cur_bal, bc_open_to_buy, bc_util, chargeoff_within_12_mths, delinq_amnt, mo_sin_old_il_acct, mo_sin_old_rev_tl_op, mo_sin_rcnt_rev_tl_op, mo_sin_rcnt_tl, mort_acc, mths_since_recent_bc, mths_since_recent_inq, num_accts_ever_120_pd, num_actv_bc_tl, num_actv_rev_tl, 

In [37]:
# Convert categorical data to numeric and separate target feature for training data
dum_train_df = pd.get_dummies(train_df, dummy_na=False)
# dum_train_df.columns
dum_train_df.head(1)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,1,1,0,1,1,0,1,0,1,0


In [38]:
# Convert categorical data to numeric and separate target feature for testing data
# Missing debt_settlement_flag_Y
# There are no Y in the column on the orginal DF
dum_test_df = pd.get_dummies(test_df, dummy_na=False)
dum_test_df.columns
# print(dum_test_df[dum_test_df['debt_settlement_flag_N'] == 1])

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc'

In [39]:
# add missing dummy variables to testing set
dum_test_df['debt_settlement_flag_Y'] = 0
dum_test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,...,1,1,0,1,1,0,1,0,1,0
1,25429,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,...,1,1,0,1,1,0,1,0,1,0
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,1,1,0,1,1,0,1,0,1,0
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,1,1,0,1,1,0,1,0,1,0
4,37505,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,...,1,1,0,1,1,0,1,0,1,0


In [40]:
# Get X and y
X = dum_train_df.drop(['loan_status_high_risk', 'loan_status_low_risk', 'Unnamed: 0'], axis =1)
y = dum_train_df['loan_status_high_risk']
X_tst = dum_test_df.drop(['loan_status_high_risk', 'loan_status_low_risk', 'Unnamed: 0'], axis =1)
y_tst = dum_test_df['loan_status_high_risk']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# Predictions

### I think since there are 96 features to consider that the RandomForest model will preform better.  From the lessons it tends to work better than the logistical progression.

In [46]:
# Train the Logistic Regression model on the unscaled data and print the model score
# This is testing data against the same set
classifier = LogisticRegression(max_iter=10000)
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_tst, y_tst)}")

Training Data Score: 0.6972085385878489
Testing Data Score: 0.5642279880901744


In [47]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred = clf.predict(X_tst)
print(classification_report(y_tst, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_tst, y_tst)}')

              precision    recall  f1-score   support

    low_risk       0.59      0.80      0.68      2351
   high_risk       0.69      0.44      0.54      2351

    accuracy                           0.62      4702
   macro avg       0.64      0.62      0.61      4702
weighted avg       0.64      0.62      0.61      4702

Training Score: 1.0
Testing Score: 0.620374308804764


In [49]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_tst)

# Predictions

### Scaling the data will help with the outliers on the data.  I think that it will improve the accuracy.  For instance for annual income alone  there is a variations of over 400,000 from some rows.  This will bring the values closer together.

In [50]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression(max_iter=10000)
classifier.fit(X_train_scaled, y_train)
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_tst)}")

Training Data Score: 0.7057471264367816
Testing Data Score: 0.7169289663972778


In [51]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_tst, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_tst)}')

              precision    recall  f1-score   support

    low_risk       0.59      0.80      0.68      2351
   high_risk       0.69      0.44      0.54      2351

    accuracy                           0.62      4702
   macro avg       0.64      0.62      0.61      4702
weighted avg       0.64      0.62      0.61      4702

Training Score: 1.0
Testing Score: 0.6197362824330073


# Conclusion

### The scaled data performed better for the logistical data but around the same of the the random forest.  Part of my prediction was correct.  I thought that the scaled data would vastly improve the score on both values but it only seemed to improve on the Logistical.  The RandomForest was better with the unscaled data but I am not sure why and also unsure of why it did not improve with the scaled data.