In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re

# Final Project

# Data Acquisition
The following code imports and validates the LendingClub data.

In [40]:
# Column name -> callable map; every listed column is mapped to `str`.
# NOTE(review): presumably meant for `pd.read_csv(converters=...)`, but no
# cell visible here passes it along -- confirm before relying on it.
converters = dict.fromkeys(
    [
        'id',
        'desc',
        'hardship_type',
        'hardship_reason',
        'hardship_status',
        'hardship_loan_status',
        'verification_status_joint',
    ],
    str,
)

# Names of the date-like columns.
# NOTE(review): presumably meant for `parse_dates=`; not referenced by the
# cells visible here.
dates = [
    'next_pymnt_d', 'hardship_start_date', 'hardship_end_date',
    'payment_plan_start_date', 'earliest_cr_line', 'issue_d',
]

In [58]:
# Imports loan data
#
# Reads every CSV export found in the loan-data folder and stacks them into a
# single frame. Each file is read with header=1, so the second line of the
# file supplies the column names.

loan_dir = './Source Data/Loan Data'
# os.listdir returns entries in arbitrary order; sort so the combined row
# order is reproducible from run to run.
files = sorted(os.listdir(loan_dir))
csvs = [name for name in files if re.match('.*csv$', name)]
if not csvs:
    # Fail here with a clear message rather than later on a missing column.
    raise FileNotFoundError('No CSV files found in %r' % loan_dir)

# Read each file once and concatenate in a single call. DataFrame.append was
# removed in pandas 2.0, and calling it in a loop re-copied the accumulated
# frame on every iteration.
frames = [
    pd.read_csv(os.path.join(loan_dir, name), header=1, low_memory=False)
    for name in csvs
]
df = pd.concat(frames)

# Give every row a unique label; the previous per-file row number is kept in
# a new 'index' column.
df = df.reset_index() # This will help with joining back data if necessary.

In [59]:
# Raise the row cap so the dtype listing is not truncated while printing,
# then set the cap to 20 for the rest of the notebook.
pd.options.display.max_rows = 151
print(df.dtypes)
pd.options.display.max_rows = 20

id                                             object
member_id                                     float64
loan_amnt                                     float64
funded_amnt                                   float64
funded_amnt_inv                               float64
term                                           object
int_rate                                       object
installment                                   float64
grade                                          object
sub_grade                                      object
emp_title                                      object
emp_length                                     object
home_ownership                                 object
annual_inc                                    float64
verification_status                            object
issue_d                                        object
loan_status                                    object
pymnt_plan                                     object
url                         

In [60]:
# Duplicate check on the combined data: tally the rows recorded for each id
# and report how many ids appear more than once.
df['count'] = 1  # helper column of ones used to count occurrences
counts = df.groupby('id')['count'].count()
repeated_ids = counts[counts > 1]
print('Data are duplicated %i times.' % len(repeated_ids))

Data are duplicated 0 times.


Some data are not useful as predictors. Because we are going to use these data as a training set, we need to remove data that are not useful for predictions. There are two categories of such data. First, we need information about loans that have fully run their course. That is to say, we need to see loans that have either been paid in full or written off. It is not useful to see loans that are currently delinquent, or that are current on payments but still early in the loan. Second, we need to remove predictors that are not important for predicting the outcome of the loan. For example, the URL has no impact on the borrower's ability to repay the loan. If we need these data in the future, we can still pair them back with the original dataframe.

In [82]:
# Limit to loans that are paid in full or written off. Uses dates so that
# loans that are delinquent are not disproportionately dropped from data:
# every loan issued on or before the issue date of the most recently issued
# matured loan is kept, whatever its current status.
# Convert the loan issue date to a datetime
df['issue_d'] = pd.to_datetime(df['issue_d'])
mature_filter = df['loan_status'].isin(['Fully Paid', 'Charged Off'])
# Use the mask defined on the line above. The cell previously referenced an
# undefined name (`loan_over_filter`) and raised NameError on a fresh kernel.
latest_mature = df.loc[mature_filter, 'issue_d'].max()
reduced_df = df[df['issue_d'] <= latest_mature]

# Use my documentation to filter to only the columns that the data dictionary
# marks with 'Yes' in its 'Useful Predictor' column.
data_dict = pd.read_excel('./Source Data/LCDataDictionary.xlsx', sheet_name='LoanStats')
features = list(data_dict[data_dict['Useful Predictor'] == 'Yes']['LoanStatNew'].values)
reduced_df = reduced_df[features]

The following code looks at every column and returns the number of unique values. This will give us insights on which variables might need dummy variables.

In [10]:
# Map each column of the reduced frame to its number of distinct values.
n_options = {
    name: len(reduced_df[name].unique())
    for name in reduced_df.columns
}

In [11]:
# Display the per-column unique-value counts gathered in the previous cell.
n_options

{'acc_now_delinq': 10,
 'acc_open_past_24mths': 58,
 'addr_state': 52,
 'all_util': 185,
 'annual_inc': 80238,
 'annual_inc_joint': 11233,
 'application_type': 3,
 'avg_cur_bal': 83899,
 'bc_open_to_buy': 84096,
 'bc_util': 1481,
 'chargeoff_within_12_mths': 12,
 'collection_recovery_fee': 110547,
 'collections_12_mths_ex_med': 17,
 'debt_settlement_flag': 3,
 'debt_settlement_flag_date': 75,
 'deferral_term': 2,
 'delinq_2yrs': 35,
 'delinq_amnt': 2570,
 'desc': 124502,
 'disbursement_method': 3,
 'dti': 9010,
 'dti_joint': 3921,
 'earliest_cr_line': 742,
 'emp_length': 12,
 'emp_title': 462870,
 'fico_range_high': 49,
 'fico_range_low': 49,
 'funded_amnt': 1573,
 'funded_amnt_inv': 10054,
 'grade': 8,
 'hardship_amount': 6366,
 'hardship_dpd': 35,
 'hardship_end_date': 20,
 'hardship_flag': 3,
 'hardship_last_payment_amount': 6094,
 'hardship_length': 2,
 'hardship_loan_status': 6,
 'hardship_payoff_balance_amount': 7274,
 'hardship_reason': 10,
 'hardship_start_date': 19,
 'hardship