In [1]:
import pandas as pd
from pandas.core import datetools
import numpy as np
import matplotlib.pyplot as plt
import os
import re

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.model_selection import cross_val_score
#from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold

%matplotlib inline


  


# Classifier Final Model
This is the final model for the classification methods.

# Data Acquisition
The following code imports and validates the LendingClub data.

In [2]:
converters = dict(
    id=str,
    desc=str,
    hardship_type=str,
    hardship_reason=str,
    hardship_status=str,
    hardship_loan_status=str,
    verification_status_joint=str
)
dates = [
    'next_pymnt_d',
    'hardship_start_date',
    'hardship_end_date',
    'payment_plan_start_date',
    'earliest_cr_line',
    'issue_d'
]

In [3]:
# Imports loan data

df = pd.DataFrame()
basepath='./Source Data/Loan Data/'
files = os.listdir(basepath)
csvs = []

for file in files:
    if re.match('.*csv$',file):
        csvs += [file]

if 0:
    #ignore this - was trying to pickle the data into
    #formats like feather, hdf5, native python pickling, etc
    # but found issues on python 3.7
    df=pd.read_pickle(basepath+'df.pkl')
else:
    cols = df.dtypes
    for csv in csvs:
        path = basepath + csv
        print("Reading",path)
        tdf = pd.read_csv(path,header=1,low_memory=False)
        df=df.append(tdf)
    
df.reset_index(inplace=True) # This will help with joining back data if necessary.

print(df.shape)

Reading ./Source Data/Loan Data/LoanStats3a_securev1.csv
Reading ./Source Data/Loan Data/LoanStats3b_securev1.csv
Reading ./Source Data/Loan Data/LoanStats3c_securev1.csv
Reading ./Source Data/Loan Data/LoanStats3d_securev1.csv
Reading ./Source Data/Loan Data/LoanStats_securev1_2016Q1.csv
Reading ./Source Data/Loan Data/LoanStats_securev1_2016Q2.csv
Reading ./Source Data/Loan Data/LoanStats_securev1_2016Q3.csv
Reading ./Source Data/Loan Data/LoanStats_securev1_2016Q4.csv
Reading ./Source Data/Loan Data/LoanStats_securev1_2017Q1.csv
Reading ./Source Data/Loan Data/LoanStats_securev1_2017Q2.csv
Reading ./Source Data/Loan Data/LoanStats_securev1_2017Q3.csv
Reading ./Source Data/Loan Data/LoanStats_securev1_2017Q4.csv
Reading ./Source Data/Loan Data/LoanStats_securev1_2018Q1.csv
(1873317, 152)


In [4]:
has_data = {}
for column in df.columns:
    has_data[column] = len(df[column].dropna())
#print(len(has_data))
#has_data

order_has_data=sorted(has_data, key=lambda dict_key: has_data[dict_key])

top_sparse=50

In [5]:
# Convert dates to datetime
df['issue_d'] = pd.to_datetime(df['issue_d'])
df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'])

#determine age of credit line prior to loan issue and convert to integer
# days of credit history
df['earliest_cr_line'] = (df['issue_d']-df['earliest_cr_line']).dt.days

# convert issue_d to a year to consider economic conditions
#SHOULD WE GO TO QUARTERS?
df['issue_d'] = df['issue_d'].dt.year

print(df.shape)

(1873317, 152)


In [6]:
# Limit to loans that are paid in full or written off. Uses dates so that 
# loans that are delinquent are not disproportionaltely dropped from data

mature_filter = (df['loan_status']=='Fully Paid')|(df['loan_status']=='Charged Off')
#mature_filter = (df['loan_status']=='Fully Paid')
#latest_mature = df[~mature_filter]['issue_d'].min()
#latest_mature
#reduced_df = df[df['issue_d']<=latest_mature]
reduced_df = df[mature_filter] # Pulls only loans that are charged off or paid in full.
#
## Use my documentation to filter to only 
data_dict = pd.read_excel('./Source Data/LCDataDictionary.xlsx',sheet_name='LoanStats')
features = list(data_dict[data_dict['Useful Predictor']=='Yes']['LoanStatNew'].values)
kaggle_features=["addr_state", "annual_inc", "delinq_2yrs", "desc", "dti", "earliest_cr_line", "emp_length",
                 "emp_title", "grade", "home_ownership", "id", "inq_fi", "inq_last_6mths", "installment",
                 "int_rate", "loan_amnt", "loan_status", "mths_since_last_delinq", "mths_since_last_major_derog",
                 "mths_since_last_record", "open_acc", "pub_rec", "purpose", "pymnt_plan", "revol_bal",
                 "revol_util", "sub_grade", "term", "title", "total_acc", "url", "verification_status", "zip_code"]
#reduced_df=reduced_df[features]
non_kaggle_features=['application_type','fico_range_low','fico_range_high',"total_pymnt"]
reduced_df=reduced_df[kaggle_features+non_kaggle_features]

# Combines fields when necessary
reduced_df['fico_est'] = (reduced_df['fico_range_low']+reduced_df['fico_range_high'])/2

reduced_df.drop(columns=['fico_range_low','fico_range_high'],inplace=True)

print(reduced_df.shape)

(1020552, 36)


In [7]:
# backup our df
backup_df = reduced_df.copy()

In [8]:
#restore our df
reduced_df = backup_df.copy()

In [9]:
# Convert strings to numbers emp_length, int_rate, revol_util
emp_length_map={'10+ years':10, '< 1 year':0, '1 year':1, '3 years':3, '8 years':8, '9 years':9,
                '4 years':4, '5 years':5, '6 years':6, '2 years':2, '7 years':7}

reduced_df['emp_length']=reduced_df['emp_length'].replace(pd.Series(emp_length_map))

grade_map={"A":1,"B":2,"C":3,"D":4,"E":5,"F":6,"G":7}
reduced_df['grade']=reduced_df['grade'].replace(pd.Series(grade_map))

reduced_df['int_rate']=reduced_df['int_rate'].apply(lambda x: float(x[:-1]))
reduced_df['revol_util']=reduced_df['revol_util'].apply(lambda x:
                                                        x[:-1] if isinstance(x, str) else np.nan).astype(float)

reduced_df['earliest_cr_line']=reduced_df['earliest_cr_line'].apply(lambda x:
                                                        0.0 if np.isnan(x) else x)

reduced_df.drop(columns=['title','emp_title','desc','url','id','sub_grade','addr_state','zip_code'],inplace=True)

print(reduced_df.shape)

(1020552, 28)


In [10]:
seta=set(reduced_df.columns)

reduced_df=pd.get_dummies(data=reduced_df,columns=['pymnt_plan','loan_status','application_type','term',
                                                   'verification_status',
                                                   'home_ownership','purpose'],
                          drop_first=True)


setb=set(reduced_df.columns)
print(setb-seta)
print(reduced_df.shape)

{'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'purpose_renewable_energy', 'application_type_Joint App', 'purpose_house', 'term_ 60 months', 'purpose_moving', 'purpose_educational', 'purpose_other', 'verification_status_Verified', 'purpose_medical', 'home_ownership_RENT', 'home_ownership_OWN', 'purpose_debt_consolidation', 'purpose_wedding', 'purpose_credit_card', 'verification_status_Source Verified', 'purpose_major_purchase', 'purpose_small_business', 'loan_status_Fully Paid', 'purpose_home_improvement', 'purpose_vacation', 'home_ownership_NONE'}
(1020552, 44)


# How to treat NaN?

For now, let's remove majority NaN columns...

In [11]:
has_data = {}
for column in reduced_df.columns:
    has_data[column] = len(reduced_df[column].dropna())
has_data

order_has_data=sorted(has_data, key=lambda dict_key: has_data[dict_key])

top_sparse=25
for i,j in zip(range(top_sparse),order_has_data[0:top_sparse]):
    print(i,j, has_data[j])

0 mths_since_last_record 172185
1 mths_since_last_major_derog 264308
2 inq_fi 296093
3 mths_since_last_delinq 503506
4 emp_length 965386
5 revol_util 1019945
6 dti 1020435
7 inq_last_6mths 1020551
8 annual_inc 1020552
9 delinq_2yrs 1020552
10 earliest_cr_line 1020552
11 grade 1020552
12 installment 1020552
13 int_rate 1020552
14 loan_amnt 1020552
15 open_acc 1020552
16 pub_rec 1020552
17 revol_bal 1020552
18 total_acc 1020552
19 total_pymnt 1020552
20 fico_est 1020552
21 loan_status_Fully Paid 1020552
22 application_type_Joint App 1020552
23 term_ 60 months 1020552
24 verification_status_Source Verified 1020552


In [12]:
nonnan_df=reduced_df.fillna(0)
#nonnan_df=reduced_df.drop(columns=order_has_data[0:4])
print(nonnan_df.shape)
print(len(reduced_df.dropna()),len(nonnan_df.dropna()))

(1020552, 44)
14988 1020552


In [13]:
nonnan_df=nonnan_df.dropna()
print(nonnan_df.shape)

n_options = {}
for column in nonnan_df.columns:
    n_options[column] = len(nonnan_df[column].unique())
#n_options
order_n_options=sorted(n_options, key=lambda dict_key: n_options[dict_key])

for i in order_n_options[0:49]:
    print(i, n_options[i],nonnan_df[i].unique())

(1020552, 44)
loan_status_Fully Paid 2 [1 0]
application_type_Joint App 2 [0 1]
term_ 60 months 2 [0 1]
verification_status_Source Verified 2 [0 1]
verification_status_Verified 2 [1 0]
home_ownership_MORTGAGE 2 [0 1]
home_ownership_NONE 2 [0 1]
home_ownership_OTHER 2 [0 1]
home_ownership_OWN 2 [0 1]
home_ownership_RENT 2 [1 0]
purpose_credit_card 2 [1 0]
purpose_debt_consolidation 2 [0 1]
purpose_educational 2 [0 1]
purpose_home_improvement 2 [0 1]
purpose_house 2 [0 1]
purpose_major_purchase 2 [0 1]
purpose_medical 2 [0 1]
purpose_moving 2 [0 1]
purpose_other 2 [0 1]
purpose_renewable_energy 2 [0 1]
purpose_small_business 2 [0 1]
purpose_vacation 2 [0 1]
purpose_wedding 2 [0 1]
grade 7 [2 3 1 5 6 4 7]
inq_last_6mths 9 [1. 5. 2. 0. 3. 4. 6. 7. 8.]
emp_length 11 [10.  0.  1.  3.  8.  9.  4.  5.  6.  2.  7.]
inq_fi 29 [ 0.  2.  1.  5.  3.  4.  6. 16.  7.  9. 10. 11. 13. 17.  8. 12. 14. 15.
 24. 20. 21. 18. 19. 23. 28. 22. 25. 27. 32.]
delinq_2yrs 30 [ 0.  2.  3.  1.  4.  6.  5.  8.  7.  

total_pymnt 968429 [ 5863.1551867   1014.53        3005.66684414 ...  7428.09944457
 12483.1542331  14662.94701135]


In [14]:
nonnan_df['percent_of_income'] = nonnan_df['installment']*12/nonnan_df['annual_inc']
nonnan_df['percent_of_income'].describe()

count    1.020552e+06
mean              inf
std               NaN
min      1.231699e-04
25%      4.688933e-02
50%      7.284552e-02
75%      1.057535e-01
max               inf
Name: percent_of_income, dtype: float64

In [15]:
nonnan_df = nonnan_df[nonnan_df['application_type_Joint App']==0]

In [16]:
# Additional factors to remove
to_remove=[]
if 'installment' not in to_remove:
    #to_remove += ['installment']
    to_remove += ['total_pymnt','loan_amnt','grade','loan_status_Fully Paid']
features = list(set(nonnan_df.columns) - set(to_remove))

# Let's work with training and test sets

In [17]:
print(df.shape,nonnan_df.shape)

# need to look at interest rate as well!!!
#y=nonnan_df['total_pymnt']/(nonnan_df['installment']*36 + nonnan_df['installment']*nonnan_df['term_ 60 months']*24)
y=nonnan_df['loan_status_Fully Paid']
#y=nonnan_df['total_pymnt']/nonnan_df['loan_amnt']
print(features)
traintest_df=nonnan_df[features]
print(traintest_df.columns)

Xscaler = StandardScaler()
Xscaler.fit_transform(traintest_df)

X_traintune, X_test, y_traintune, y_test = train_test_split(
    traintest_df,y,test_size=0.2)#,random_state=42)#,stratify=nonnan_df[['loan_status']])

X_train, X_tune, y_train, y_tune = train_test_split(
    X_traintune,y_traintune,test_size=0.2)#,random_state=42)#,stratify=nonnan_df[['loan_status']])

print(X_train.shape,X_tune.shape,X_test.shape)

(1873317, 152) (1011023, 45)
['delinq_2yrs', 'home_ownership_MORTGAGE', 'dti', 'home_ownership_OTHER', 'application_type_Joint App', 'purpose_renewable_energy', 'purpose_house', 'term_ 60 months', 'inq_last_6mths', 'purpose_moving', 'earliest_cr_line', 'fico_est', 'home_ownership_NONE', 'purpose_educational', 'purpose_other', 'inq_fi', 'verification_status_Verified', 'purpose_medical', 'total_acc', 'emp_length', 'home_ownership_RENT', 'installment', 'home_ownership_OWN', 'purpose_debt_consolidation', 'revol_bal', 'mths_since_last_record', 'purpose_wedding', 'purpose_credit_card', 'verification_status_Source Verified', 'purpose_major_purchase', 'mths_since_last_major_derog', 'pub_rec', 'revol_util', 'purpose_small_business', 'open_acc', 'percent_of_income', 'mths_since_last_delinq', 'purpose_home_improvement', 'annual_inc', 'purpose_vacation', 'int_rate']
Index(['delinq_2yrs', 'home_ownership_MORTGAGE', 'dti', 'home_ownership_OTHER',
       'application_type_Joint App', 'purpose_renewab

In [18]:
print(y.mean(),y.std())

0.7955644925980913 0.40328871988163834


In [19]:
# your code here
#kfold=KFold(5, shuffle=True)

parameters = {'max_depth':range(1,10)}
clf = GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=4,cv=5)
clf.fit(X=X_train, y=y_train)
dtclf = clf.best_estimator_

results=clf.cv_results_

print(dtclf)

print("\nBest decision tree score: {} occurs at depth: {}".format(
    clf.best_score_, clf.best_params_['max_depth']))

print("Decision Tree test prediction accuracy: %f" % accuracy_score(dtclf.predict(X_test),y_test))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Best decision tree score: 0.7971452150825125 occurs at depth: 7
Decision Tree test prediction accuracy: 0.796973


In [20]:
# your code here
rfclf = RandomForestClassifier(n_estimators=25,max_depth=10)
rfclf.fit(X_train, y_train)

print("Random Forest training prediction accuracy: %f" % accuracy_score(rfclf.predict(X_train),y_train))

print("Random Forest test prediction accuracy: %f" % accuracy_score(rfclf.predict(X_test),y_test))

Random Forest training prediction accuracy: 0.798372
Random Forest test prediction accuracy: 0.797097


In [21]:
# your code here
basic=DecisionTreeClassifier(max_depth=3)
abclf = AdaBoostClassifier(n_estimators=50,
                         learning_rate=0.05,
                         base_estimator=basic)
# Train model
abclf.fit(X_train, y_train)
print("Adaboost test prediction accuracy: %f" % accuracy_score(abclf.predict(X_test),y_test))

Adaboost test prediction accuracy: 0.796959


In [22]:
# your code here
fitted_logreg = LogisticRegressionCV().fit(X_train, y_train)
print("Logistic model accuracy: {}".format(fitted_logreg.score(X_test, y_test)))

#print("\nLogistic coefficients weights:")
#print(*X_train, sep=' ')
#print(*fitted_logreg.coef_[0], sep=' ')


Logistic model accuracy: 0.7963601295714745


In [23]:
# pickle the models
ensemble_dict={}
ensemble_dict["DT"]=dtclf
ensemble_dict["RF"]=rfclf
ensemble_dict["Log"]=fitted_logreg
ensemble_dict["Ada"]=abclf

import pickle
with open("ensemble.pkl", 'wb') as outfile:
    pickle.dump(ensemble_dict,outfile)

# Ensemble Tuning

In [24]:
with open("ensemble.pkl", 'rb') as infile:
    model_dict = pickle.load(infile)
    
ensemble_tune=pd.DataFrame()
ensemble_test=pd.DataFrame()
for k,v in model_dict.items():
    ensemble_tune[k]=v.predict_proba(X_tune)[:,1]
    ensemble_test[k]=v.predict_proba(X_test)[:,1]

print(ensemble_tune.shape,ensemble_test.shape,X_tune.shape,X_test.shape)
eX_tune=X_tune.reset_index(drop=True)
eX_test=X_test.reset_index(drop=True)
ey_tune=y_tune.reset_index(drop=True)
ey_test=y_test.reset_index(drop=True)

# your code here
augmented_tune=pd.concat((ensemble_tune,eX_tune),axis=1)
augmented_test=pd.concat((ensemble_test,eX_test),axis=1)

print(augmented_tune.shape,augmented_test.shape)

(161764, 4) (202205, 4) (161764, 41) (202205, 41)
(161764, 45) (202205, 45)


In [25]:
ensemble_clf=DecisionTreeClassifier(max_depth=10).fit(augmented_tune,ey_tune)
print(ensemble_clf.score(augmented_test,ey_test))

0.7949457233995203


In [26]:
ensemble_rfclf = RandomForestClassifier(n_estimators=25,max_depth=10).fit(augmented_tune,ey_tune)
print(ensemble_rfclf.score(augmented_test,ey_test))

0.798808140253703


In [27]:
# your code here
ensemble_logreg = LogisticRegressionCV().fit(augmented_tune, ey_tune)
print("Logistic model accuracy: {}".format(ensemble_logreg.score(augmented_test, ey_test)))

Logistic model accuracy: 0.7963799114759773


In [28]:
# your code here
basic=DecisionTreeClassifier(max_depth=3)
ensemble_abclf = AdaBoostClassifier(n_estimators=50,
                         learning_rate=0.05,
                         base_estimator=basic)
# Train model
ensemble_abclf.fit(augmented_tune, ey_tune)
print("Adaboost test prediction accuracy: %f" % accuracy_score(ensemble_abclf.predict(augmented_test),ey_test))

Adaboost test prediction accuracy: 0.798783


In [29]:
data_for_final = pd.DataFrame(index=traintest_df.index)

for key in ensemble_dict.keys():
    data_for_final[key] = ensemble_dict[key].predict(traintest_df)

In [30]:
pickle.dump(data_for_final, open("pred_for_final.pkl", 'wb'))