# Import Statements

In [1]:
# regular imports
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

%matplotlib inline

In [2]:
# display related imports
from IPython.display import display, Image
from IPython.display import HTML
from IPython.display import IFrame

# Widgets
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
!jupyter nbextension enable --py widgetsnbextension

# to save dataframe as an image
import dataframe_image as dfi

# hide warnings
import warnings
warnings.filterwarnings('ignore')

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


# Functions for display and data


In [3]:
# interact allows makes the function interactive without a needed call to it
#@interact
def w_(n_columns=(5, 30, 5), n_rows=(10,60, 10)):
    ''' sliders to select max rows and columns, not really needed here, but usefull in other notebooks'''
    if n_columns>25:
        pd.set_option('display.max_columns', None)
        c = 'ALL'
    else:
        pd.set_option('display.max_columns', n_columns)
        c = str(n_columns)
    if n_rows > 50:
        pd.set_option('display.max_rows', None)
        r = 'ALL'
    else:
        pd.set_option('display.max_rows', n_rows)
        r = str(n_rows)
    print('Number of columns to display ->', c)
    print('Number of rows to display ->', r)

In [4]:
# EDA Functions

In [5]:
def view_describe(df, ret_percent):
    ''' does a describe on data and returns a smaller dataframe to look at based on percent'''
    print('SHAPE')
    print(df.shape)
    print('*' * 100)
    print('HEAD')
    print(df.head(10))
    print('*' * 100)
    print('DESCRIBE')
    print(df.describe().round())
    if ret_percent < 1:
        return df.sample(frac = ret_percent, random_state = 22)
    return df

In [6]:
# Change Col Names

In [7]:
def change_col_space_lower(df):
    '''Function to setup the column names
    Inputs: dataframe
    Outputs: dataframe with changed column names
     - All lower Case
     - no spaces, change to _
     - no dashes, change to _
     - will add a dict input that will change key words into other word
     (i.e. employment to emp)
    '''
    df.columns = df.columns.str.replace((' ' and '-'), '_').str.lower()

In [8]:
# Look at Missing Data

In [9]:
def missing_data_df(df_):
    ''' function to transpose the data use the column names to describe the data set
    input: dataframe
    output: transposed dataframe with info on missing count,
        missing percent,
        count of data,
        unigue values
    '''
    df = pd.DataFrame(df_.head(1).T).reset_index()
    df['na_count']   = [df_[col_].isnull().sum() for col_ in df['index']]
    df['na_percent'] = [df_[col_].isnull().sum()/df_.shape[0] for col_ in df['index']]
    df['count']      = [df_[col_].count() for col_ in df['index']]
    df['unique_count'] = [len(df_[col_].unique()) for col_ in df['index']]
    df['dtype_'] = [df_[col_].dtypes for col_ in df['index']]
    df['unique_data'] = ['None' if df_[col_].dtypes != np.object else ','.join(map(str, df_[col_].unique())) for col_ in df['index']]
    df['Drop_'] = 0

    df.drop(0, axis=1, inplace=True)
    df.columns = df.columns.str.replace('index', 'col_name')
    return df



# Lending Club Info
Lending Club: https://drive.google.com/file/d/10XHxLtu0Jcuf4hXxB8F3oPdlUlMKYV9g/view?usp=sharing

https://www.kaggle.com/wordsforthewise/lending-club

https://www.lendingclub.com

# Display Setup

In [15]:
print('sliders to change number of rows and columns')
interact(w_)
print('be careful, full dataset can go slow when set to max')

sliders to change number of rows and columns
be careful, full dataset can go slow when set to max


interactive(children=(IntSlider(value=15, description='n_columns', max=30, min=5, step=5), IntSlider(value=30,…

In [10]:
# load data, set that you want pick one, both sets will slow down computer - also skip and load smaller pickle
# set if that is all you need

LOAD = 'Accept_pickle' # 'None' 'Rejected' 'Both' 'Accept_pickle' 'Accepted'
if LOAD == 'Accepted':
    accepted = pd.read_csv('data/lending_data/accepted_2007_to_2018Q4.csv')
    print('You Loaded the accepted file, this is the full file!')
elif LOAD == 'Rejected':
    rejected = pd.read_csv('data/lending_data/rejected_2007_to_2018Q4.csv')
    print('You loaded the Rejected File, this is the full file')
elif LOAD == 'Both':
    accepted = pd.read_csv('data/lending_data/accepted_2007_to_2018Q4.csv')
    rejected = pd.read_csv('data/lending_data/rejected_2007_to_2018Q4.csv')
elif LOAD == 'Accept_pickle':
    accepted = pd.read_pickle('data/accepted_rem_na.pkl')
    print('Loaded the pickle file - all changes below are reflected in this file')
else:
    print('Nothing Has Been Loaded!')

In [None]:
%%script false
# makes a small version of the data set and shows describe on full set of data
# %%script false <- make false if you just want to load pickle version
accepted_sm = view_describe(accepted, .01)
rejected_sm = view_describe(rejected, .01)

In [None]:
#drop stuff from memory




# To Pickle

In [None]:
%%script false # pickle is out in data, else remove save pickle info

col_accep = missing_data_df(accepted)
col_accep.to_pickle('data/col_info_accept.pkl')
col_rejected = missing_data_df(rejected)
col_rejected.to_pickle('data/col_info_rejected.pkl')
accepted_sm.to_pickle('data/accepted_sm.pkl')
rejected_sm.to_pickle('data/rejected_sm.pkl')

# Load Pickle

In [None]:
# %%script false set to load from pickle

col_accepted = pd.read_pickle('data/col_info_accept.pkl')
col_rejected = pd.read_pickle('data/col_info_rejected.pkl')
accepted_sm = pd.read_pickle('data/accepted_sm.pkl')

In [None]:
col_accepted

In [None]:
# ******* setting up an xlxs for looking at data columns *****
#!pip install openpyxl - need to install to write excel file
#!pip install xlrd - need to install to read excel file
#sending to csv so I can update column info in excel
#accepted = pd.read_csv('data/lending_data/accepted_2007_to_2018Q4.csv')
#col_accep = missing_data_df(accepted)
#col_accep.to_excel('data/column_accep_info.xlsx')

In [None]:
col_accep_excel = pd.read_excel('data/column_accep_info.xls')

In [None]:
col_accep_excel

# Notes

In [None]:
# drop url, member id, id?,
# keep hardship flag drop all hardship? not known at start

In [None]:
# predict expected return from start?
# recreate their risk score?
# can I create a warning system when a loan is become in danger, create
# a system to mediate before it hits the danger area?
# figure out the loss amounts and see how to reduce?

breaking all the accepted loans into deals (bonds)
Have different investment classes for the deals (groups of loans)
You have the Very Safe Deal
the Medium Risk deal (some risk some return)
the High Risk High Reward Class
The Junk bond status let's roll the dice!

What discount rate would be required to make these marketable?

Problems:
not sure this data gives the date that problems start? might not be useable that way
but can use the start data and make projections about the data from the start

Can also look at their interest rate to calculate based on their starting rates...

High of 30+ percent, pretty sure that is capped in some states, might need to see which states cap their rates for personal loans... usury laws

In [None]:
# and all good things come to an end, and so does this!

https://help.lendingclub.com/hc/en-us/articles/360050574891-Important-Updates-to-the-LendingClub-Notes-Platform

They are ending the notes program Dec 28th of this year!
So, it will no longer be available as a program.
Might want to copy their website so that future students have this as a resource.

Below is their loan grades and how they classify them into 12 deals... with their percents as of june 30, 2019...
2,311,305 loans open at that time...




In [None]:
Image('lending_club_deals.png')


In [None]:
# Split app date to month year
# State 52? is that na and DC?
# Risk score, High Bad?
# Loan Title - parse interesting words out? Can figure from columns at start

# From here down I'm just playing around with the data getting used to it...
# Finding columns to eliminate

In [None]:
col_accepted['col_name']

# Column info

# credit report info
delinq_2yrs
earliest_cr_line date
fico_range_high
fico_range_high
inq_last_6mnths
mths_since_last_delinq

loan_amnt - float *
funded_amnt - drop
funded_amnt_inv - drop

term - cat (36, 60) *
int_rate - float *
installment (float) *
grade - cat (a-e) *
sub_grade - cat *
emp_title - string keep for now? *
emp_length - cat 0-10 *
home_ownership - cat (rent, own, Mortgage) *
annual_inc float *
verification_status cat (change to bool) *
issue_d date (month year) *

loan_status cat - keep
pymnt_plan - I think this is used for defaults ?
desc - used to describe pymnt_plan ?

purpose cat consol *
title cat consol *
zip_code cat *
abbr_state cat *
dti * debt-to-income same as in rejected

In [None]:
rej_set = set(rejected_sm.columns)

In [None]:
rej_not_in_accepted = rej_set - set(accepted_sm.columns)

In [None]:
in_both = rej_set - rej_not_in_accepted

In [None]:
in_both

In [None]:
change_col_space_lower(rejected_sm)

In [None]:
sns.pairplot(rejected_sm)

# Reducing NA's

In [32]:
accepted = pd.read_csv('data/lending_data/accepted_2007_to_2018Q4.csv')

In [33]:
# getting rid of the 33 na

In [11]:
accepted_rem_na = accepted[accepted['loan_amnt'].notna()].drop('member_id', axis=1)
accepted_rem_na.drop(['funded_amnt',
                      'funded_amnt_inv'], axis=1, inplace=True)
accepted_rem_na = accepted_rem_na[accepted_rem_na['dti'].notna()]
accepted_rem_na = accepted_rem_na[accepted_rem_na['total_acc'].notna()]
accepted_rem_na = accepted_rem_na[accepted_rem_na['collections_12_mths_ex_med'].notna()]
accepted_rem_na = accepted_rem_na[accepted_rem_na['last_credit_pull_d'].notna()]
accepted_rem_na = accepted_rem_na[accepted_rem_na['zip_code'].notna()]
accepted_rem_na = accepted_rem_na[accepted_rem_na['inq_last_6mths'].notna()]
hardship_col = [col for col in accepted_rem_na if col.startswith('hard')]
accepted_rem_na.drop(hardship_col[1:], axis=1, inplace=True)
settle_col = [col for col in accepted_rem_na if col.startswith('settle')]
accepted_rem_na.drop(settle_col[1:], axis=1, inplace=True)
accepted_rem_na.drop(['debt_settlement_flag_date',
                      'settlement_status',
                      'emp_title',
                      'url',
                      'desc',
                      'title'], axis=1, inplace=True)

In [13]:
del accepted
del hardship_col
del settle_col
col_accep = missing_data_df(accepted_rem_na)
for col_ in col_accep['col_name'][1:]:
     #['unique_count']])#if accepted_rem_na[col_][['unique_count']<20]:
     if (len(accepted_rem_na[col_].unique()) < 55):
         print(col_)
         accepted_rem_na[col_] = accepted_rem_na[col_].astype('category')


term
grade
sub_grade
emp_length
home_ownership
verification_status
loan_status
pymnt_plan
purpose
addr_state
delinq_2yrs
fico_range_low
fico_range_high
inq_last_6mths
pub_rec
initial_list_status
collections_12_mths_ex_med
policy_code
application_type
verification_status_joint
acc_now_delinq
open_acc_6m
open_il_12m
open_il_24m
open_rv_12m
open_rv_24m
inq_fi
inq_last_12m
chargeoff_within_12_mths
mort_acc
mths_since_recent_inq
num_accts_ever_120_pd
num_actv_bc_tl
num_rev_tl_bal_gt_0
num_tl_120dpd_2m
num_tl_30dpd
num_tl_90g_dpd_24m
num_tl_op_past_12m
pub_rec_bankruptcies
tax_liens
sec_app_inq_last_6mths
sec_app_mort_acc
sec_app_open_act_il
sec_app_chargeoff_within_12_mths
sec_app_collections_12_mths_ex_med
hardship_flag
deferral_term
payment_plan_start_date
disbursement_method
debt_settlement_flag


In [None]:
col_accep = missing_data_df(accepted_rem_na)

In [16]:
col_accep

Unnamed: 0,col_name,na_count,na_percent,count,unique_count,dtype_,unique_data,Drop_
0,id,0,0.0,2258740,2258740,object,"68407277,68355089,68341763,66310712,68476807,6...",0
1,loan_amnt,0,0.0,2258740,1572,float64,,0
2,term,0,0.0,2258740,2,object,"36 months, 60 months",0
3,int_rate,0,0.0,2258740,673,float64,,0
4,installment,0,0.0,2258740,93255,float64,,0
5,grade,0,0.0,2258740,7,object,"C,B,F,A,E,D,G",0
6,sub_grade,0,0.0,2258740,35,object,"C4,C1,B4,C5,F1,C3,B2,B1,A2,B5,C2,E2,A4,E3,A1,D...",0
7,emp_title,165321,0.073192,2093419,512582,object,"leadman,Engineer,truck driver,Information Syst...",0
8,emp_length,145305,0.06433,2113435,12,object,"10+ years,3 years,4 years,6 years,1 year,7 yea...",0
9,home_ownership,0,0.0,2258740,6,object,"MORTGAGE,RENT,OWN,ANY,NONE,OTHER",0


In [18]:
# saving current state to pickle
accepted_rem_na.to_pickle('data/accepted_rem_na.pkl')


In [None]:
dfi.export(col_accep, 'col_accep_table.png', max_rows=150)  # where df is your data frame

In [None]:
Image('col_accep_table.png')

In [42]:
pd.set_option.display.max_rows = 160

AttributeError: 'CallableDynamicDoc' object has no attribute 'display'