In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [6]:
train_df = pd.read_csv(Path('Resources/generator/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/generator/2020Q1loans.csv'))

In [7]:

train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   12180 non-null  float64
 1   int_rate                    12180 non-null  float64
 2   installment                 12180 non-null  float64
 3   home_ownership              12180 non-null  object 
 4   annual_inc                  12180 non-null  float64
 5   verification_status         12180 non-null  object 
 6   pymnt_plan                  12180 non-null  object 
 7   dti                         12180 non-null  float64
 8   delinq_2yrs                 12180 non-null  float64
 9   inq_last_6mths              12180 non-null  float64
 10  open_acc                    12180 non-null  float64
 11  pub_rec                     12180 non-null  float64
 12  revol_bal                   12180 non-null  float64
 13  total_acc                   121

In [11]:
X_train = train_df.drop("target", axis=1).copy()
y_train = train_df["target"].copy()

X_test = test_df.drop("target", axis=1).copy()
y_test = test_df["target"].copy()

In [13]:
# Convert categorical data to numeric and separate target feature for training data
X_train = pd.get_dummies(X_train)
X_train.head()


Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0,1,1,0,1,0,1,0,1,0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,0,1,1,0,1,0,1,0
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,1,1,0,1,1,0,1,0,1,0
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,0,1,1,0,1,0
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,0,1,1,0,1,0,1,0


In [14]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = pd.get_dummies(X_test)
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,0,1,0,1,1,0,0,1,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,0,1,1,0,1,0,1


In [16]:
# add missing dummy variables to testing set
cols_to_add = set(X_train.columns) ^ set(X_test.columns)
# Loop through each missing column and add it to X_test with all 1s
for col in cols_to_add:
    X_test[col] = 1
    
X_test = X_test[X_train.columns]
# Encode the target column with 1s and 0s (both train and test)
target_encoder = LabelEncoder().fit(y_train)
y_train = target_encoder.transform(y_train)
y_test = target_encoder.transform(y_test)

In [18]:
# Check the shapes to make sure X_train and X_test have the same number of columns and rows are same in both X_train and X_test

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(12180, 92)
(4702, 92)
(12180,)
(4702,)


In [19]:
# Train the Logistic Regression model on the unscaled data and print the model score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression()
lr.fit(X_train, y_train)
print(lr.score(X_test, y_test))
print(lr.score(X_train, y_train))

0.5091450446618461
0.653448275862069


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
# Train a Random Forest Classifier model and print the model score

rf = RandomForestClassifier(n_estimators=900, max_depth=3)
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))
print(rf.score(X_train, y_train))

0.6097405359421523
0.7296387520525451


In [27]:
# Scale the data

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [28]:
# Train the Logistic Regression model on the scaled data and print the model score
lr.fit(X_train_scaled, y_train)
print(lr.score(X_test_scaled, y_test))
print(lr.score(X_train_scaled, y_train))

0.5548702679710762
0.710919540229885


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rf.fit(X_train_scaled, y_train)
print(rf.score(X_test_scaled, y_test))
print(rf.score(X_train_scaled, y_train))

rf.feature_importances_

0.6214376860910251
0.7332512315270936


array([8.48879667e-03, 9.21181993e-02, 1.70444669e-02, 6.64671032e-04,
       5.42056553e-04, 5.44006780e-04, 3.55406501e-03, 2.49601821e-04,
       1.27918722e-05, 2.28078934e-03, 3.75771152e-04, 2.58643646e-02,
       2.69159695e-02, 9.94697812e-02, 1.00312700e-01, 7.14161743e-02,
       1.36330953e-01, 5.97325553e-02, 0.00000000e+00, 0.00000000e+00,
       2.46838207e-01, 1.03750140e-05, 0.00000000e+00, 0.00000000e+00,
       2.38196647e-04, 1.06856778e-03, 2.06395394e-03, 8.62423276e-05,
       1.33895083e-04, 5.75520136e-04, 5.77605225e-04, 3.34198803e-04,
       2.59856117e-03, 1.39344787e-03, 4.20783121e-03, 4.35498480e-03,
       2.81910090e-03, 6.29841271e-03, 8.14667803e-04, 1.15841916e-04,
       3.09991352e-03, 8.63830520e-03, 8.62997785e-04, 5.61872565e-03,
       9.23262340e-04, 1.01224317e-05, 0.00000000e+00, 7.71273076e-04,
       3.47932210e-03, 2.06671602e-03, 3.91538024e-03, 1.48757636e-03,
       1.66054717e-03, 4.10770027e-03, 3.26062242e-04, 3.78895871e-04,
      