In [69]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier


# Setting Up Testing

In [61]:
# Load in data

columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership", "annual_inc", 
    "verification_status", "pymnt_plan", "dti", "delinq_2yrs", 
    "inq_last_6mths", "open_acc", "pub_rec", "revol_bal", "total_acc", 
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt", 
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", 
    "total_rec_late_fee", "recoveries", "collection_recovery_fee", 
    "last_pymnt_amnt", "collections_12_mths_ex_med", "policy_code", 
    "application_type", "acc_now_delinq", "tot_coll_amt", "tot_cur_bal", 
    "open_acc_6m", "open_act_il", "open_il_12m", "open_il_24m", 
    "mths_since_rcnt_il", "total_bal_il", "il_util", "open_rv_12m", 
    "open_rv_24m", "max_bal_bc", "all_util", "total_rev_hi_lim", "inq_fi", 
    "total_cu_tl", "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", 
    "bc_open_to_buy", "bc_util", "chargeoff_within_12_mths", "delinq_amnt", 
    "mo_sin_old_il_acct", "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", 
    "mo_sin_rcnt_tl", "mort_acc", "mths_since_recent_bc", 
    "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl", 
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0", "num_sats", 
    "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m", 
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", 
    "pub_rec_bankruptcies", "tax_liens", "tot_hi_cred_lim", 
    "total_bal_ex_mort", "total_bc_limit", "total_il_high_credit_limit", 
    "hardship_flag", "debt_settlement_flag",
    "loan_status"
]

df = pd.read_csv(R'2019_loans.csv', skiprows=1, header=None, names=columns, index_col=False)
df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,loan_status
0,12000.0,0.0702,370.64,MORTGAGE,99000.0,Source Verified,n,13.56,0.0,2.0,...,0.0,0.0,0.0,285336.0,45614.0,26300.0,39136.0,N,N,low_risk
1,16000.0,0.1557,559.13,RENT,45680.0,Source Verified,n,43.01,0.0,0.0,...,50.0,0.0,0.0,70133.0,54321.0,20400.0,41733.0,N,N,low_risk
2,24000.0,0.2055,643.23,MORTGAGE,90000.0,Source Verified,n,11.67,0.0,3.0,...,25.0,0.0,0.0,226232.0,63206.0,12600.0,48549.0,N,N,low_risk
3,26575.0,0.1308,605.76,RENT,99000.0,Not Verified,n,16.29,0.0,1.0,...,33.3,0.0,0.0,66809.0,52103.0,15700.0,41933.0,N,N,low_risk
4,20000.0,0.1695,712.56,RENT,65000.0,Not Verified,n,21.82,0.0,0.0,...,80.0,0.0,0.0,60088.0,48300.0,13800.0,37888.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10889,30000.0,0.2880,948.61,MORTGAGE,180000.0,Not Verified,n,12.74,0.0,0.0,...,75.0,0.0,0.0,665863.0,82055.0,30000.0,74843.0,N,N,high_risk
10890,16000.0,0.2055,599.12,RENT,110000.0,Verified,n,15.88,1.0,2.0,...,0.0,0.0,0.0,34204.0,30207.0,4000.0,28204.0,N,N,high_risk
10891,10000.0,0.1171,330.76,MORTGAGE,101853.0,Source Verified,n,13.76,0.0,1.0,...,25.0,0.0,0.0,75847.0,44777.0,10000.0,63347.0,N,N,high_risk
10892,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,n,15.74,0.0,0.0,...,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N,high_risk


In [62]:
df1 = df.drop("tot_coll_amt", axis=1)
X = pd.get_dummies(df1, dtype=float)

X

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y,loan_status_high_risk,loan_status_low_risk
0,12000.0,0.0702,370.64,99000.0,13.56,0.0,2.0,16.0,0.0,11249.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,16000.0,0.1557,559.13,45680.0,43.01,0.0,0.0,11.0,0.0,25588.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
2,24000.0,0.2055,643.23,90000.0,11.67,0.0,3.0,16.0,0.0,16741.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,26575.0,0.1308,605.76,99000.0,16.29,0.0,1.0,15.0,0.0,14255.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,20000.0,0.1695,712.56,65000.0,21.82,0.0,0.0,10.0,0.0,15401.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10889,30000.0,0.2880,948.61,180000.0,12.74,0.0,0.0,16.0,0.0,24589.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
10890,16000.0,0.2055,599.12,110000.0,15.88,1.0,2.0,6.0,0.0,4241.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
10891,10000.0,0.1171,330.76,101853.0,13.76,0.0,1.0,9.0,0.0,4482.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
10892,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,0.0,23215.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [63]:
y = LabelEncoder().fit_transform(df['tot_coll_amt'])
y

array([  0,   0,   0, ...,  42,   0, 856], dtype=int64)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Linear Regression

## Predictions
I believe Random Forest Classifier will be more accurate than Linear Regression with this dataset due to the size of the dataset and how many trees it can look at.

In [72]:
linear_df = LogisticRegression()
linear_df.fit(X_train, y_train)

training_score = linear_df.score(X_train, y_train)
testing_score = linear_df.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")


Training Score: 0.8511627906976744
Testing Score: 0.8656387665198237


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Random Forest Classifier

In [71]:
clf = RandomForestClassifier(max_depth=10, random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 0.8559363525091799
Testing Score: 0.8707782672540382


# Scaling Linear Regression & Random Forest Classifier

## Predictions

I expect to see improvement with Linear Regression as the data becomes normalized. However, I do not believe Random Forest Classifier will change much due to it's process of looking at the data.

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [76]:
linear_df.fit(X_train_scaled, y_train)
training_score = linear_df.score(X_train_scaled, y_train)
testing_score = linear_df.score(X_test_scaled, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.9241126070991432
Testing Score: 0.8623348017621145


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [75]:
clf = RandomForestClassifier(max_depth=10, random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 0.8559363525091799
Testing Score: 0.8707782672540382
