In [16]:
# regular imports
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

%matplotlib inline

In [17]:
# display related imports
from IPython.display import display, Image, clear_output, HTML, IFrame

# Widgets
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

# to save dataframe as an image
import dataframe_image as dfi

# hide warnings
import warnings
warnings.filterwarnings('ignore')

# Forest
from sklearn.tree import DecisionTreeClassifier

In [18]:
# Functions

def percent_good(df_, col):
    print('_' * 100) # printing header to seperate the column info
    print('- ' * 20 , col, ' -' * 20)
    # Column Header for Data that follows in the for loop
    print('{:30s}  {:>8s}  {:>8s}  {:>8s}  {:>14s}  {:>8s}'.format('Category', 'Good %', 'Bad %', 'Col %', 'Tot Count', 'Int %'))

    for item_ in df_[col].unique():                               # Item is the unique category item from the column (col)
        tot_ = df_[col].count()                                   # total loans with data for this column
        t_ = df_[df[col]==item_][col].count()                     # Count of loans matching the category for this column
        c_ = df_[(df_[col]==item_) & (df_['good'])][col].count()  # Count of Good Loans 
        b_ = t_ - c_                                              # Count of bad loans (total - good)

        print('{:30s}  {:8.1%}  {:8.1%}  {:8.1%}  {:14,.0f}  {:8.1%}'
              .format(str(item_), c_/t_, b_/t_, t_/tot_, t_, 
                      df_[df[col]==item_]['int_rate'].mean()/100))


def mort_acc_bin(df_):
    '''Changes mort_acc to 0 for none and 1 for true'''
    df_['mort_acc'] = [0 if x== 0.0 else 1 for x in df_['mort_acc']]
    df_['mort_acc'] = df_['mort_acc'].astype('category')
    return df_


def m_bin(item):                                                  # Used to bin larger cat data into smaller amounts of bins
    if item == 0.0:
        return 0
    elif item < 5.0:
        return 1
    elif item < 10:
        return 2
    else:
        return 3
    
def to_true_false(df_, col, item=0):
    df_[col] = [0 if x == item else 1 for x in df_[col]]
    return df_

In [19]:
df = pd.read_pickle('data/df_accep.pkl')

In [20]:
df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,...,settlement_percentage,settlement_term,issue_year,issue_month,fico,term_60,years_since_earliest_credit,last_pay_d,months_of_pay,loss_amnt
0,3600.0,3600.0,3600.0,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,...,,,2015,12,3,0,12,2019-01-01,36,0.0
1,24700.0,24700.0,24700.0,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,...,,,2015,12,3,0,16,2016-06-01,5,0.0
2,20000.0,20000.0,20000.0,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,...,,,2015,12,3,1,15,2017-06-01,17,0.0
3,35000.0,35000.0,35000.0,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,...,,,2015,12,4,1,7,2019-02-01,37,0.0
4,10400.0,10400.0,10400.0,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,...,,,2015,12,3,1,17,2016-07-01,5,0.0


# The Good Field!
The good field has only one purpose in life, to show the good loans! 
What are good loans? Loans that pay off, or are current in at this time!

True for Good, False for BAD!

This is the field that all other fields will be judged by!
That's right, we are a bit judgy here...

In [6]:
df['good']=[True if x in ['Current', 'Fully Paid'] else False for x in df['loan_status']]

In [7]:
# fixing some fields here, more to be added shortly
df = mort_acc_bin(df)
df['earliest_credit_10_more'] = [0 if x<10 else 1 for x in df['years_since_earliest_credit']]

In [8]:
# test of above and the percent_good function above
percent_good(df, 'mort_acc')

____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  mort_acc  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
1                                  87.7%     12.3%     58.9%       1,331,062     12.7%
0                                  84.9%     15.1%     41.1%         929,606     13.6%


In [9]:
# Fields that are dropped, more to come!
to_drop = [
    'pymnt_plan',
    'url',
    'desc',
    'emp_title',
    'pymnt_plan',
    'title',
    'next_pymnt_d',
    'deferral_term',
    'years_since_earliest_credit',
    'debt_settlement_flag', 
    'orig_projected_additional_accrued_interest', 
    'payment_plan_start_date', 
    'sec_app_fico_range_high'
]
hard_ = [col for col in df if col.startswith('hard')]
settle_ = [col for col in df if col.startswith('settle')]
df.drop(to_drop, axis=1, inplace=True)
df.drop(hard_, axis=1, inplace=True)
df.drop(settle_, axis=1, inplace=True)

In [10]:
# save data for filter viewing in another notebook
df.to_csv('data/df_for_filter.csv')

KeyboardInterrupt: 

In [11]:
df = to_true_false(df, 'tax_liens')
df = to_true_false(df, 'disbursement_method', 'cash')
df = to_true_false(df, 'emp_length', '10+ years')
df.rename(columns={'disbursement_method': 'disb_direct', 'emp_length': 'emp_len_under_10'}, inplace=True)
#df['delinq_2yrs'] = [m_bin(x) for x in df['delinq_2yrs']]

In [12]:
col_ = ['inq_last_6mths', 
       'open_acc',
        'collections_12_mths_ex_med',
        'open_acc_6m',
        'collections_12_mths_ex_med',
        'delinq_2yrs',
        'pub_rec',
        'acc_now_delinq',
        'open_il_12m',
        'open_il_24m',
        'num_tl_90g_dpd_24m',
        'num_tl_op_past_12m'
        
       ]

for col in col_:
    df[col] = [m_bin(x) for x in df[col]]
df['delinq_2yrs'].unique


<bound method Series.unique of 0          0
1          1
2          0
3          0
4          1
          ..
2260694    0
2260695    0
2260696    2
2260697    0
2260698    0
Name: delinq_2yrs, Length: 2260668, dtype: int64>

In [13]:
# months since

m = 36 # using 36 as cutoff - might change to 50 or use this as a controlled variable later
for col_ in df.columns:
    if col_.startswith('mths'):
        df[col_] = [0 if x < m else 1 for x in df[col_]]

In [14]:
percent_good(df, 'emp_len_under_10')

____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  emp_len_under_10  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
0                                  87.5%     12.5%     33.1%         748,005     13.0%
1                                  86.1%     13.9%     66.9%       1,512,663     13.1%


In [15]:
# This is the magic! This little bit of lovely code will print all the columns that it can, 
# based on unique values in the columns with percent GOOD, BAD, Percent of Column, 
# Number of Loans in, Category and Interest Rate
for col in df.columns:
    if len(df[col].unique()) < 55:
        percent_good(df, col)
    else:
        if col != 'int_rate':
            print('-' * 100)
            print('- '* 20 , col, ' -' * 20)
            print('Unique Items = ', len(df[col].unique()), type(col))
            try:
                df[col]=pd.qcut(df[col], q=4)
                percent_good(df, col)
            except:
                print('Column : ', col, " can't be converted")

----------------------------------------------------------------------------------------------------
- - - - - - - - - - - - - - - - - - - -  loan_amnt  - - - - - - - - - - - - - - - - - - - -
Unique Items =  1572 <class 'str'>
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  loan_amnt  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
(499.999, 5000.0]                  89.2%     10.8%     13.2%         299,371     12.8%
(23025.0, 30000.0]                 85.4%     14.6%     12.4%         281,125     13.8%
(19350.0, 23025.0]                 85.3%     14.7%     10.0%         225,997     13.5%
(30000.0, 40000.0]                 86.1%     13.9%      7.6%         170,833     13.9%
(10325.0, 12900.0]                 85.6%     14.4%     10.0%         225,965     13.3%
(9450.0, 10325.0]                  87.9%     12.1%     10

In [15]:
# find the loss on the bad
# can we do better than the 86.6%?

think of the returns as time value of money... you recieve the payments each month (p & i) minus 1% and any late or no payments
You must reinvest or accept that you recieve the lower amount of interest each month until you do reinvest the money


Some strategies (get out of higher risk loans before they start hitting danger zone - what is the penalty to selling? do other investers know this
and therefore pay a reduced price for this loan?



In [16]:
# credit usually only uses the low fico not both

In [17]:
loan_amnts = ['funded_amnt_inv', 'funded_amnt', 'loan_amnt']

In [18]:
# reduce or do something with

In [None]:
sec_app_mths_since_last_major_derog, sec_app_open_act_il, sec_app_mort_acc

In [51]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 40)

In [72]:
issue_year_ = 2013
grade_ = ['D', 'E', 'F']
fico_ = 1 # above this
min_int_rate_ = 14.5
emp_length_ = '<1 year'
annual_inc_ = 70000
earliest_credit_10_more_ = 1
loan_amnt_ = 25001
term_60_ = 0
chargeoff_within_12_mths_ = 0
purpose_ = ['credit_card', 'debt_consolidation']
total_acc_ = 25 # less than this amount
installment_ = 800
issue_month_ = [1,3]

In [78]:
df_filtered = df[(df['issue_year']==issue_year_) & 
   (df['grade'].isin(grade_)) & 
   (df['fico']>fico_) & 
   (df['int_rate']>min_int_rate_) & 
   (df['home_ownership']!='RENT') & 
   (df['emp_length']!= emp_length_) &
   (df['annual_inc'] > annual_inc_) &
   (df['earliest_credit_10_more'] == earliest_credit_10_more_) &
   (df['loan_amnt'] < loan_amnt_) &
   (df['term_60']==term_60_) &
   (df['purpose'].isin(purpose_)) &
   (df['total_acc'] < total_acc_) &
   (df['installment'] < installment_) &
   (df['issue_month'].isin(issue_month_))
  ].iloc[:,[117,116, 13, 118, 15, 9, 5, 6, 7, 8, 0,3,4, 119, 124, 12, 121, 122, 123, 28, 29, 30, 31, 32, 33]].sort_values(['issue_month', 'int_rate'])
print(df_filtered.groupby('good')['loss_amnt'].count())
loss = round(df_filtered[df_filtered['good']!=True]['loss_amnt'].sum(),-1)
loan = round(df_filtered[df_filtered['good']!=True]['loan_amnt'].sum(),-1)
print('Loss amount :', loss)
print('Loan amount :', loan)
print('Loss / Loan :', round(loss/loan, 3))
# total_pymnt_inv loan_amnt
rec = df_filtered['total_pymnt_inv'].sum()
paid_out = df_filtered['loan_amnt'].sum()
gain = rec - paid_out
print('Total rec   :', round(rec, -1))
print('Total paid  :', round(paid_out, -1))
print('Total gain  :', round(gain, -1))
print('Percent     :', round(gain/paid_out * 100, 1))


df_filtered

good
False    3
True     9
Name: loss_amnt, dtype: int64
Loss amount : 1960.0
Loan amount : 35100.0
Loss / Loan : 0.056
Total rec   : 165600.0
Total paid  : 148420.0
Total gain  : 17180.0
Percent     : 11.6


Unnamed: 0,issue_month,issue_year,purpose,fico,dti,annual_inc,grade,sub_grade,emp_length,home_ownership,loan_amnt,int_rate,installment,term_60,earliest_credit_10_more,loan_status,months_of_pay,loss_amnt,good,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries
1882504,1,2013,debt_consolidation,3,15.83,78000.0,D,D1,10+ years,MORTGAGE,4000.0,17.77,144.15,0,1,Fully Paid,12,0.0,True,4707.775234,4707.78,4000.0,707.78,0.0,0.0
1883125,1,2013,debt_consolidation,2,16.78,116000.0,E,E3,10+ years,MORTGAGE,16000.0,21.49,606.84,0,1,Fully Paid,1,0.0,True,16843.609586,16843.61,16000.0,843.61,0.0,0.0
1863642,3,2013,debt_consolidation,2,21.66,75000.0,D,D1,6 years,OWN,19500.0,17.77,702.73,0,1,Fully Paid,15,0.0,True,23144.569781,23144.57,19500.0,3644.57,0.0,0.0
1864136,3,2013,debt_consolidation,3,21.03,73000.0,D,D1,10+ years,MORTGAGE,20000.0,17.77,720.75,0,1,Fully Paid,6,0.0,True,21681.204973,21681.2,20000.0,1681.2,0.0,0.0
1868199,3,2013,debt_consolidation,3,27.13,77000.0,D,D1,10+ years,OWN,10000.0,17.77,360.38,0,1,Fully Paid,35,0.0,True,12973.265902,12973.27,10000.0,2973.27,0.0,0.0
1869520,3,2013,debt_consolidation,2,6.41,105000.0,D,D1,3 years,MORTGAGE,21325.0,17.77,768.5,0,1,Fully Paid,21,0.0,True,26640.489962,26515.56,21325.0,5315.49,0.0,0.0
1870914,3,2013,debt_consolidation,3,12.26,74000.0,D,D1,10+ years,OWN,6000.0,17.77,216.23,0,1,Fully Paid,35,0.0,True,7783.937329,7783.94,6000.0,1783.94,0.0,0.0
1865075,3,2013,credit_card,2,6.2,112000.0,D,D2,< 1 year,MORTGAGE,9000.0,18.49,327.59,0,1,Fully Paid,35,0.0,True,11793.149695,11727.63,9000.0,2793.15,0.0,0.0
1867839,3,2013,debt_consolidation,3,11.83,72000.0,D,D2,10+ years,MORTGAGE,7500.0,18.49,273.0,0,1,Fully Paid,22,0.0,True,9529.43,9529.43,7500.0,2029.43,0.0,0.0
1870803,3,2013,debt_consolidation,2,18.59,89000.0,D,D2,10+ years,MORTGAGE,9600.0,18.49,349.43,0,1,Charged Off,10,4227.0,False,4269.58,4269.58,1692.4,1461.54,0.0,1115.64


In [137]:
for i,v in enumerate(df.columns):
    print(i,"'"+v+"'")

0 'loan_amnt'
1 'funded_amnt'
2 'funded_amnt_inv'
3 'int_rate'
4 'installment'
5 'grade'
6 'sub_grade'
7 'emp_length'
8 'home_ownership'
9 'annual_inc'
10 'verification_status'
11 'issue_d'
12 'loan_status'
13 'purpose'
14 'addr_state'
15 'dti'
16 'delinq_2yrs'
17 'inq_last_6mths'
18 'mths_since_last_delinq'
19 'mths_since_last_record'
20 'open_acc'
21 'pub_rec'
22 'revol_bal'
23 'revol_util'
24 'total_acc'
25 'initial_list_status'
26 'out_prncp'
27 'out_prncp_inv'
28 'total_pymnt'
29 'total_pymnt_inv'
30 'total_rec_prncp'
31 'total_rec_int'
32 'total_rec_late_fee'
33 'recoveries'
34 'collection_recovery_fee'
35 'last_pymnt_d'
36 'last_pymnt_amnt'
37 'last_credit_pull_d'
38 'last_fico_range_high'
39 'last_fico_range_low'
40 'collections_12_mths_ex_med'
41 'mths_since_last_major_derog'
42 'policy_code'
43 'application_type'
44 'annual_inc_joint'
45 'dti_joint'
46 'verification_status_joint'
47 'acc_now_delinq'
48 'tot_coll_amt'
49 'tot_cur_bal'
50 'open_acc_6m'
51 'open_act_il'
52 'open

What goes into FICO scores? A popular FICO score chart describes the main factors that affect score are 35% payment history, 
30% debt owed, 15% age of credit history, 10% new credit, and 10% types of credit.





https://www.econ.berkeley.edu/sites/default/files/KyleJacksonHonorsThesis.pdf




df.groupby(['loan_status', 'issue_year'])['funded_amnt_inv'].count().reset_index()
df[df['funded_amnt_inv']!=df['funded_amnt']]
df[df['loan_amnt']!=df['funded_amnt']]