In [24]:
%matplotlib inline
import numpy as np
import pandas as pd
from pathlib import Path
import random as rnd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Before beginning, my prediction is that the linear regression will be a better fit
# since most of the data's columns seem to be related. For example, missed payments,
# number of bills going to collections, delinquencies, etc all generally indicate that
# a higher value in those categories means a higher risk.

In [2]:
# The test and train data samples have already been split into two separate files
# Pull in the csv files
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))


In [10]:
# view the train_df columns
train_df.columns

Index(['loan_amnt', 'int_rate', 'installment', 'home_ownership', 'annual_inc',
       'verification_status', 'pymnt_plan', 'dti', 'delinq_2yrs',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
       'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
       'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',
       'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
       'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt',
       'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'm

In [53]:
# identify the number of columns
len(train_df.columns)

84

In [54]:
# compare the train_df columns with the test_df columns and see how many are in common
common = (train_df.columns == test_df.columns)
len(common)

84

In [20]:
# split up the X columns from the y column, isolating the target
# Removed Unnamed and index because they are irrelevant
X_train = train_df.drop('target', axis=1).copy()
X_test = test_df.drop('target', axis=1).copy()
y_train = train_df['target'].copy()
y_test = test_df['target'].copy()

In [57]:
# this identifies columns that do not match
cols_to_add = set(X_train.columns) ^ set(X_test.columns)
for col in cols_to_add:
    X_test[col] = 0
X_test = X_test[X_train.columns]

In [58]:
# this switches the non-numberic columns into several numberic columns
target_encoder = LabelEncoder().fit(y_train)
y_train = target_encoder.transform(y_train)
y_test = target_encoder.transform(y_test)
y_test 

array([1, 1, 1, ..., 0, 0, 0])

In [59]:
# Train the Logistic Regression Model 
# this trains and scores the model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5091450446618461

In [None]:
# Train a Random Forest Classifier model and print the model score

In [52]:
# This trains and scores the random forest model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 150)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

1.0

In [60]:
# This scales the data down significantly, typically between 0 and 1
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [41]:
# Train the Logistic Regression model on the scaled data and print the model score
# The scores below show a good model to use. It is not over-fitted and shows reliability
lr.fit(X_train_scaled, y_train)
print(lr.score(X_test_scaled, y_test))
print(lr.score(X_train_scaled, y_train))

0.7598894087622289
0.710919540229885


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
# Train a Random Forest Classifier model on the scaled data and print the model score
# this is an over-fit, since it's 100%
rf.fit(X_train_scaled, y_train)
print(rf.score(X_test_scaled, y_test))
print(rf.score(X_train_scaled, y_train))

0.6509995746490855
1.0


In [44]:
rf.feature_importances_

array([1.71592604e-02, 3.14331278e-02, 3.04747517e-02, 1.41740314e-02,
       1.54956635e-02, 2.58530991e-03, 4.18670155e-03, 8.46111603e-03,
       1.22234115e-03, 1.51668233e-02, 1.16199932e-02, 2.90506782e-02,
       2.87885386e-02, 4.34494211e-02, 4.56721343e-02, 5.28972293e-02,
       4.85726105e-02, 1.79203918e-02, 0.00000000e+00, 0.00000000e+00,
       1.01468463e-01, 5.99583734e-04, 0.00000000e+00, 0.00000000e+00,
       4.02184552e-03, 1.45226804e-02, 4.49778109e-03, 6.74043289e-03,
       3.76408508e-03, 6.55625949e-03, 1.19530388e-02, 1.30482097e-02,
       1.44377385e-02, 5.20809924e-03, 7.96856529e-03, 1.61847282e-02,
       1.30606916e-02, 1.49773990e-02, 6.33873223e-03, 6.71147432e-03,
       7.57513567e-03, 9.72818281e-03, 1.49380043e-02, 1.58019325e-02,
       1.42213460e-02, 2.80931626e-04, 0.00000000e+00, 1.55251495e-02,
       1.65339839e-02, 1.18820584e-02, 9.98721087e-03, 5.81183161e-03,
       1.29155000e-02, 1.17402783e-02, 3.30649962e-03, 7.25521640e-03,
      

In [None]:
#How do the model scores compare to each other, and to the previous results on unscaled data? 
# The unscaled showed low scores, compared to the scaled data. I believe this is because
# it allows various columns to be compared in the same range of quantities.

#How does this compare to your prediction? Write down your results and thoughts.
# My predicition seems to coincide with the results: the linear regression faired better.
# I am not certain if this is because the columns seemed to have a similar theme or if it was due
# to other reasons.