In [34]:
# regular imports
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

%matplotlib inline

# display related imports
from IPython.display import display, Image
from IPython.display import HTML
from IPython.display import IFrame

# Widgets
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
#!jupyter nbextension enable --py widgetsnbextension

# to save dataframe as an image
import dataframe_image as dfi

# hide warnings
import warnings
warnings.filterwarnings('ignore')

In [35]:
def missing_data_df(df_):
    ''' function to transpose the data use the column names to describe the data set
    input: dataframe
    output: transposed dataframe with info on missing count,
        missing percent,
        count of data,
        unigue values
    '''
    df = pd.DataFrame(df_.head(1).T).reset_index()
    df['na_count']   = [df_[col_].isnull().sum() for col_ in df['index']]
    df['na_percent'] = [df_[col_].isnull().sum()/df_.shape[0] for col_ in df['index']]
    df['count']      = [df_[col_].count() for col_ in df['index']]
    df['unique_count'] = [len(df_[col_].unique()) for col_ in df['index']]
    df['dtype_'] = [df_[col_].dtypes for col_ in df['index']]
    df['unique_data'] = ['None' if df_[col_].dtypes != np.object else ','.join(map(str, df_[col_].unique())) for col_ in df['index']]
    df['Drop_'] = 0

    df.drop(0, axis=1, inplace=True)
    df.columns = df.columns.str.replace('index', 'col_name')
    return df




In [36]:
# Create New Field With Binned FICO Score

def fico(ret_):
    
    if ret_ < 580:
        return 1
    if ret_ < 670:
        return 2
    if ret_ <740:
        return 3
    if ret_ < 800:
        return 4
    if ret_ < 851:
        return 5
    else:
        return 0
# based on scores from https://corporatefinanceinstitute.com/resources/knowledge/credit/credit-score-analysis/     

In [37]:
# Functions

def percent_good(df_, col):
    print('_' * 100) # printing header to seperate the column info
    print('- ' * 20 , col, ' -' * 20)
    # Column Header for Data that follows in the for loop
    print('{:30s}  {:>8s}  {:>8s}  {:>8s}  {:>14s}  {:>8s}'.format('Category', 'Good %', 'Bad %', 'Col %', 'Tot Count', 'Int %'))

    for item_ in df_[col].unique():                               # Item is the unique category item from the column (col)
        tot_ = df_[col].count()                                   # total loans with data for this column
        t_ = df_[df[col]==item_][col].count()                     # Count of loans matching the category for this column
        c_ = df_[(df_[col]==item_) & (df_['good'])][col].count()  # Count of Good Loans 
        b_ = t_ - c_                                              # Count of bad loans (total - good)

        print('{:30s}  {:8.1%}  {:8.1%}  {:8.1%}  {:14,.0f}  {:8.1%}'
              .format(str(item_), c_/t_, b_/t_, t_/tot_, t_, 
                      df_[df[col]==item_]['int_rate'].mean()/100))


def mort_acc_bin(df_):
    '''Changes mort_acc to 0 for none and 1 for true'''
    df_['mort_acc'] = [0 if x== 0.0 else 1 for x in df_['mort_acc']]
    df_['mort_acc'] = df_['mort_acc'].astype('category')
    return df_


def m_bin(item):                                                  # Used to bin larger cat data into smaller amounts of bins
    if item == 0.0:
        return 0
    elif item < 5.0:
        return 1
    elif item < 10:
        return 2
    else:
        return 3
    
def to_true_false(df_, col, item=0):
    df_[col] = [0 if x == item else 1 for x in df_[col]]
    return df_

In [38]:
# Load Data
accepted = pd.read_csv('data/lending_data/accepted_2007_to_2018Q4.csv')

In [39]:
# fill NA
accepted['funded_amnt'] = accepted['funded_amnt'].fillna(0)

# get rid of NaN in loan_amnt - 33 rows that are missing numerous columns
accepted_rem_na = accepted[accepted['loan_amnt'].notna()]
# dropping rows with data not usefull for our project
accepted_rem_na.drop(['zip_code', 'member_id', 'id'], axis=1, inplace=True)
accepted.drop(['url', 'emp_title', 'desc', 'title'], axis=1, inplace=True)
# replacing missing dti with max dti - these were created by devide by zero errors
accepted_rem_na['dti'].fillna(accepted.dti.max(), inplace=True)
# Getting rid of Hardship other than flag and settle columns
hardship_col = [col for col in accepted_rem_na if col.startswith('hard')]
accepted_rem_na.drop(hardship_col[1:], axis=1, inplace=True)
settle_col = [col for col in accepted_rem_na if col.startswith('settle')]
accepted_rem_na.drop(settle_col[1:], axis=1, inplace=True)
# changing to date and adding month and year
accepted_rem_na['issue_d'] = pd.to_datetime(accepted_rem_na['issue_d'])
accepted_rem_na['issue_year'] = pd.DatetimeIndex(accepted_rem_na['issue_d']).year
accepted_rem_na['issue_month'] = pd.DatetimeIndex(accepted_rem_na['issue_d']).month
accepted_rem_na['earliest_cr_line'] = pd.to_datetime(accepted_rem_na['earliest_cr_line'])
# Fico score and term
accepted_rem_na['fico'] = [val for val in ((accepted_rem_na['fico_range_low'] + accepted_rem_na['fico_range_high'])/2)]
accepted_rem_na['fico'] = [fico(val) for val in accepted_rem_na['fico']]
accepted_rem_na['term_60'] = [1 if val == ' 60 months' else 0 for val in accepted_rem_na['term']]
accepted_rem_na['years_since_earliest_credit'] = [dt_ for dt_ in ((accepted_rem_na['issue_d'] - accepted_rem_na['earliest_cr_line'])/np.timedelta64(1,'Y'))]
accepted_rem_na['years_since_earliest_credit'].fillna(0, inplace=True)
accepted_rem_na['years_since_earliest_credit'] = (accepted_rem_na['years_since_earliest_credit']).astype('int64')
accepted_rem_na.drop(['fico_range_low', 'fico_range_high', 'term', 'earliest_cr_line'], axis=1, inplace=True)





In [40]:
accepted_rem_na['last_pay_d'] = pd.to_datetime(accepted_rem_na['last_pymnt_d'])

accepted_rem_na['months_of_pay'] = [dt_ for dt_ in ((accepted_rem_na['last_pay_d'] - accepted_rem_na['issue_d'])/np.timedelta64(1,'M'))]
accepted_rem_na['months_of_pay'].fillna(0, inplace=True)
accepted_rem_na['months_of_pay'] = (accepted_rem_na['months_of_pay']).astype('int64') - 1

In [41]:
for col in accepted_rem_na.columns:
    if accepted_rem_na[col].dtype == 'object':
        accepted_rem_na[col].fillna('na', inplace=True)
    else:
        accepted_rem_na[col].fillna(0, inplace=True)
                                    
                                    
col_accep = missing_data_df(accepted_rem_na)

In [42]:
# changing columns to cat data
cat_data = []
# clearing data from memory
del accepted
del hardship_col
del settle_col

accepted_rem_na = accepted_rem_na.fillna('na')
for col_ in col_accep['col_name'][1:]:
     #['unique_count']])#if accepted_rem_na[col_][['unique_count']<20]:
     if (len(accepted_rem_na[col_].unique()) < 55) and (col_ not in ['months_of_pay', 'issue_year', 'issue_month', 'issue_d']):
         print(col_)
         cat_data.append(col_)
         #accepted_rem_na(col_).fillna('na')
         accepted_rem_na[col_] = accepted_rem_na[col_].astype('category')



grade
sub_grade
emp_length
home_ownership
verification_status
loan_status
pymnt_plan
purpose
addr_state
delinq_2yrs
inq_last_6mths
pub_rec
initial_list_status
collections_12_mths_ex_med
policy_code
application_type
verification_status_joint
acc_now_delinq
open_acc_6m
open_act_il
open_il_12m
open_il_24m
open_rv_12m
open_rv_24m
inq_fi
inq_last_12m
chargeoff_within_12_mths
mort_acc
mths_since_recent_inq
num_accts_ever_120_pd
num_actv_bc_tl
num_rev_tl_bal_gt_0
num_tl_120dpd_2m
num_tl_30dpd
num_tl_90g_dpd_24m
num_tl_op_past_12m
pub_rec_bankruptcies
tax_liens
sec_app_inq_last_6mths
sec_app_mort_acc
sec_app_open_act_il
sec_app_chargeoff_within_12_mths
sec_app_collections_12_mths_ex_med
hardship_flag
deferral_term
payment_plan_start_date
disbursement_method
debt_settlement_flag
settlement_status
fico
term_60


In [43]:
df = accepted_rem_na

In [44]:
# change status to either charge off or Fully paid
df.loc[(df.loan_status == 'Does not meet the credit policy. Status:Fully Paid'),'loan_status'] = 'Fully Paid'
df.loc[(df.loan_status == 'Does not meet the credit policy. Status:Charged Off'),'loan_status'] = 'Charged Off' 

In [45]:
df['loan_status'] = (['Not Current' if x in
                      ['In Grace Period', 'Late (16-30 days)', 'Late (31-120 days)', 'Default'] else x
                     for x in df['loan_status']])

In [46]:
df['good']=[True if x in ['Current', 'Fully Paid'] else False for x in df['loan_status']]

In [47]:
# fixing some fields here, more to be added shortly
df = mort_acc_bin(df)
df['earliest_credit_10_more'] = [0 if x<10 else 1 for x in df['years_since_earliest_credit']]

In [48]:
# Fields that are dropped, more to come!
to_drop = [
    'pymnt_plan',
    'url',
    'desc',
    'emp_title',
    'pymnt_plan',
    'title',
    'next_pymnt_d',
    'deferral_term',
    'years_since_earliest_credit',
    'debt_settlement_flag', 
    'orig_projected_additional_accrued_interest', 
    'payment_plan_start_date', 
    'sec_app_fico_range_high',
    'addr_state',
    'sec_app_inq_last_6mths',
    'sec_app_mort_acc', 
    'sec_app_open_act_il' ,
    'sec_app_chargeoff_within_12_mths', 
    'sec_app_collections_12_mths_ex_med'
]
hard_ = [col for col in df if col.startswith('hard')]
settle_ = [col for col in df if col.startswith('settle')]
df.drop(to_drop, axis=1, inplace=True)
df.drop(hard_, axis=1, inplace=True)
df.drop(settle_, axis=1, inplace=True)

In [49]:
df = to_true_false(df, 'tax_liens')
df = to_true_false(df, 'disbursement_method', 'cash')

df.rename(columns={'disbursement_method': 'disb_direct'}, inplace=True)

In [50]:
col_ = ['inq_last_6mths', 
       'open_acc',
        'collections_12_mths_ex_med',
        'open_acc_6m',
        'collections_12_mths_ex_med',
        'delinq_2yrs',
        'pub_rec',
        'acc_now_delinq',
        'open_il_12m',
        'open_il_24m',
        'num_tl_90g_dpd_24m',
        'num_tl_op_past_12m',
        'policy_code'
        
       ]

for col in col_:
    df[col] = [m_bin(x) for x in df[col]]
df['delinq_2yrs'].unique

<bound method Series.unique of 0          0
1          1
2          0
3          0
4          1
          ..
2260694    0
2260695    0
2260696    2
2260697    0
2260698    0
Name: delinq_2yrs, Length: 2260668, dtype: int64>

In [51]:
# months since making it less than 36 or greater than Can Change number to fit what we want

m = 36 # using 36 as cutoff - might change to 50 or use this as a controlled variable later
for col_ in df.columns:
    if col_.startswith('mths'):
        df[col_] = [0 if x < m else 1 for x in df[col_]]

In [52]:
df['sec_fico'] = df['sec_app_fico_range_low']
df['sec_fico'].fillna(0, inplace=True)
df['sec_fico'] = [fico(val) for val in df['sec_fico']]

df.drop('sec_app_fico_range_low', axis=1, inplace=True)

In [53]:

finance_columns = [
'loan_amnt',
'funded_amnt', 
'funded_amnt_inv',
'int_rate',
'installment',
'annual_inc',
'total_pymnt',
'total_pymnt_inv',
'total_rec_prncp',
'total_rec_int',
'total_rec_late_fee',
'recoveries',
'collection_recovery_fee',
'last_pymnt_amnt',
'last_pymnt_d',
'annual_inc_joint',
'tot_coll_amt',
  'issue_d',
    'issue_year',
    'issue_month',
    'months_of_pay']  



In [54]:
f_ = ['funded_amnt','total_rec_prncp','total_rec_int','recoveries','collection_recovery_fee']
for f in f_:
    df[f].fillna(0, inplace=True)

In [55]:
df[finance_columns].head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,last_pymnt_d,annual_inc_joint,tot_coll_amt,issue_d,issue_year,issue_month,months_of_pay
0,3600.0,3600.0,3600.0,13.99,123.03,55000.0,4421.723917,4421.72,3600.0,821.72,0.0,0.0,0.0,122.67,Jan-2019,0.0,722.0,2015-12-01,2015,12,36
1,24700.0,24700.0,24700.0,11.99,820.28,65000.0,25679.66,25679.66,24700.0,979.66,0.0,0.0,0.0,926.35,Jun-2016,0.0,0.0,2015-12-01,2015,12,5
2,20000.0,20000.0,20000.0,10.78,432.66,63000.0,22705.924294,22705.92,20000.0,2705.92,0.0,0.0,0.0,15813.3,Jun-2017,71000.0,0.0,2015-12-01,2015,12,17
3,35000.0,35000.0,35000.0,14.85,829.9,110000.0,31464.01,31464.01,19102.35,12361.66,0.0,0.0,0.0,829.9,Feb-2019,0.0,0.0,2015-12-01,2015,12,37
4,10400.0,10400.0,10400.0,22.45,289.91,104433.0,11740.5,11740.5,10400.0,1340.5,0.0,0.0,0.0,10128.96,Jul-2016,0.0,0.0,2015-12-01,2015,12,5


In [56]:
def calc_balance(row):
    return (row['funded_amnt'] 
            - row['total_rec_prncp'] 
            - row['total_rec_int'] 
            - row['recoveries'] 
            + row['collection_recovery_fee'])
    
df['balance'] = df.apply(calc_balance, axis=1)

In [57]:
df.shape

(2260668, 119)

In [58]:
finance_columns.append('balance')

In [59]:
finance_columns.append('good')

In [60]:
pd.set_option('display.max_columns', None)

df[df['issue_year']<2015].sample(10)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_open_acc,sec_app_revol_util,sec_app_num_rev_accts,sec_app_mths_since_last_major_derog,disb_direct,debt_settlement_flag_date,issue_year,issue_month,fico,term_60,last_pay_d,months_of_pay,good,earliest_credit_10_more,sec_fico,balance
1798999,9600.0,9600.0,9600.0,15.61,335.67,C,C3,7 years,RENT,45000.0,Source Verified,2013-09-01,Fully Paid,credit_card,8.91,0,0,0,0,3,0,7247.0,49.6,14.0,w,0.0,0.0,12163.73,12163.73,9600.0,2548.73,15.0,0.0,0.0,Jun-2016,1407.24,Mar-2019,609.0,605.0,0,0,1,Individual,0.0,0.0,na,0,0.0,8409.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,14600.0,0.0,0.0,0.0,6.0,934.0,4927.0,54.0,0.0,0.0,340.0,112.0,3.0,3.0,0,0,0,0,0,0.0,4.0,8.0,4.0,5.0,1.0,9.0,13.0,8.0,10.0,0.0,0.0,0,2,92.0,25.0,0.0,0,17100.0,8409.0,10700.0,2500.0,0.0,na,0.0,0.0,0.0,0.0,1,na,2013,9,3,0,2016-06-01 00:00:00,31,True,1,1,-2548.73
1851608,15000.0,15000.0,15000.0,13.11,506.21,B,B4,7 years,MORTGAGE,105000.0,Not Verified,2013-05-01,Fully Paid,debt_consolidation,3.62,0,1,0,0,3,0,10915.0,37.8,33.0,f,0.0,0.0,18223.314148,18223.31,15000.0,3223.31,0.0,0.0,0.0,May-2016,505.96,Feb-2018,679.0,675.0,0,0,1,Individual,0.0,0.0,na,0,0.0,138627.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,28900.0,0.0,0.0,0.0,4.0,15403.0,1460.0,84.0,0.0,0.0,123.0,404.0,0.0,0.0,1,0,0,0,0,0.0,2.0,5.0,6.0,11.0,4.0,13.0,26.0,5.0,15.0,0.0,0.0,0,1,97.0,100.0,0.0,0,156612.0,10915.0,9100.0,0.0,0.0,na,0.0,0.0,0.0,0.0,1,na,2013,5,3,0,2016-05-01 00:00:00,35,True,1,1,-3223.31
1637771,3600.0,3600.0,3600.0,7.51,112.0,A,A4,2 years,RENT,55000.0,Not Verified,2010-09-01,Fully Paid,debt_consolidation,18.7,0,0,0,0,2,0,1953.0,17.0,18.0,f,0.0,0.0,3622.8,3622.8,3600.0,22.8,0.0,0.0,0.0,Oct-2010,3623.19,Oct-2010,769.0,765.0,0,0,1,Individual,0.0,0.0,na,0,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,na,0.0,0.0,0.0,0.0,1,na,2010,9,3,0,2010-10-01 00:00:00,-1,True,1,1,-22.8
1900603,19300.0,19300.0,19300.0,18.49,495.26,D,D2,10+ years,RENT,55000.0,Verified,2012-10-01,Charged Off,debt_consolidation,22.95,0,0,0,0,3,0,9858.0,80.8,25.0,w,0.0,0.0,9701.05,9701.05,3310.73,4118.17,0.0,2272.15,22.7215,Feb-2014,495.26,Oct-2016,599.0,595.0,0,0,1,Individual,0.0,0.0,na,0,0.0,27110.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,12200.0,0.0,0.0,0.0,4.0,3012.0,688.0,93.4,0.0,0.0,128.0,243.0,8.0,8.0,0,0,0,0,0,0.0,4.0,5.0,5.0,13.0,10.0,7.0,15.0,5.0,10.0,0.0,0.0,0,1,100.0,75.0,0.0,0,35791.0,27110.0,10500.0,23591.0,0.0,na,0.0,0.0,0.0,0.0,1,na,2012,10,3,1,2014-02-01 00:00:00,15,False,1,1,9621.6715
1246326,2500.0,2500.0,2500.0,19.52,92.3,E,E2,1 year,OWN,25000.0,Source Verified,2014-07-01,Charged Off,small_business,5.28,1,0,0,0,2,0,2032.0,27.8,11.0,w,0.0,0.0,528.18,528.18,104.11,80.49,0.0,343.58,3.4358,Sep-2014,92.3,Oct-2016,554.0,550.0,0,0,1,Individual,0.0,0.0,na,0,0.0,2032.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,7300.0,0.0,0.0,0.0,2.0,339.0,4603.0,23.3,0.0,0.0,141.0,106.0,19.0,19.0,0,0,0,0,0,0.0,2.0,4.0,3.0,3.0,5.0,6.0,6.0,4.0,6.0,0.0,0.0,0,0,81.8,66.7,0.0,0,7300.0,2032.0,6000.0,0.0,0.0,na,0.0,0.0,0.0,0.0,1,na,2014,7,3,0,2014-09-01 00:00:00,1,False,1,1,1975.2558
1330639,10000.0,10000.0,10000.0,19.97,371.49,D,D5,na,MORTGAGE,56000.0,Verified,2014-02-01,Fully Paid,debt_consolidation,21.82,0,1,1,0,3,0,12552.0,76.5,30.0,w,0.0,0.0,13373.298316,13373.3,10000.0,3373.3,0.0,0.0,0.0,Feb-2017,371.15,Jul-2017,669.0,665.0,0,1,1,Individual,0.0,0.0,na,0,1332.0,135569.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,16400.0,0.0,0.0,0.0,8.0,10428.0,469.0,94.0,0.0,0.0,99.0,100.0,3.0,1.0,1,1,1,0,1,5.0,4.0,9.0,4.0,5.0,11.0,9.0,14.0,9.0,13.0,0.0,0.0,0,2,83.3,75.0,0.0,0,155754.0,21733.0,7800.0,16337.0,0.0,na,0.0,0.0,0.0,0.0,1,na,2014,2,2,0,2017-02-01 00:00:00,35,True,1,1,-3373.3
1851012,19000.0,19000.0,19000.0,16.29,670.71,C,C4,8 years,RENT,48500.0,Source Verified,2013-05-01,Fully Paid,debt_consolidation,23.31,0,1,0,0,2,0,1404.0,7.9,24.0,f,0.0,0.0,19509.215207,19509.22,19000.0,509.22,0.0,0.0,0.0,Jul-2013,18740.33,Feb-2019,579.0,575.0,0,0,1,Individual,0.0,0.0,na,0,0.0,46714.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,17700.0,0.0,0.0,0.0,7.0,6673.0,6596.0,17.5,0.0,0.0,127.0,98.0,2.0,2.0,0,0,0,0,0,0.0,1.0,1.0,3.0,5.0,16.0,5.0,8.0,1.0,7.0,0.0,0.0,0,2,100.0,0.0,0.0,0,66770.0,46714.0,8000.0,49070.0,0.0,na,0.0,0.0,0.0,0.0,1,na,2013,5,4,0,2013-07-01 00:00:00,1,True,1,1,-509.22
1939120,28000.0,28000.0,28000.0,7.51,871.11,A,A3,5 years,MORTGAGE,113200.0,Source Verified,2012-01-01,Fully Paid,debt_consolidation,24.36,0,1,0,0,3,0,44111.0,24.3,49.0,f,0.0,0.0,29553.064211,29553.06,28000.0,1553.06,0.0,0.0,0.0,Nov-2012,21716.82,Apr-2014,789.0,785.0,0,0,1,Individual,0.0,0.0,na,0,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,na,0.0,0.0,0.0,0.0,1,na,2012,1,4,0,2012-11-01 00:00:00,9,True,1,1,-1553.06
1835519,10000.0,10000.0,9975.0,15.8,350.59,C,C3,na,RENT,32400.0,Verified,2013-06-01,Fully Paid,debt_consolidation,6.96,0,1,1,1,2,1,7003.0,63.7,17.0,w,0.0,0.0,12579.57,12548.12,10000.0,2579.57,0.0,0.0,0.0,Feb-2016,1711.28,Sep-2018,754.0,750.0,0,0,1,Individual,0.0,0.0,na,0,0.0,7003.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,11000.0,0.0,0.0,0.0,4.0,1167.0,3197.0,68.7,0.0,0.0,138.0,163.0,6.0,6.0,1,0,0,0,0,0.0,5.0,5.0,5.0,7.0,5.0,6.0,10.0,5.0,6.0,0.0,0.0,0,1,100.0,20.0,1.0,1,11000.0,7003.0,10200.0,0.0,0.0,na,0.0,0.0,0.0,0.0,1,na,2013,6,2,0,2016-02-01 00:00:00,31,True,1,1,-2579.57
1934096,9000.0,9000.0,8950.0,15.31,313.36,C,C3,5 years,MORTGAGE,53050.38,Not Verified,2012-03-01,Fully Paid,debt_consolidation,11.7,0,1,1,0,3,0,9096.0,67.9,32.0,f,0.0,0.0,11305.560006,11242.75,9000.0,2289.89,15.67,0.0,0.0,Mar-2015,638.16,Mar-2019,699.0,695.0,0,0,1,Individual,0.0,0.0,na,0,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,na,0.0,0.0,0.0,0.0,1,na,2012,3,3,0,2015-03-01 00:00:00,34,True,1,1,-2289.89


In [61]:
# This is the magic! This little bit of lovely code will print all the columns that it can, 
# based on unique values in the columns with percent GOOD, BAD, Percent of Column, 
# Number of Loans in, Category and Interest Rate
cant_be_converted = []
for col in df.columns:
    if (len(df[col].unique()) < 10) or (col == 'sub_grade'): 
        percent_good(df, col)
    elif col in finance_columns:
        col_ = col + '_bin'
        df[col_]=df[col]
        df[col_].fillna(0, inplace=True)
        try:
            df[col_]=pd.qcut(df[col_], q=4)
            percent_good(df, col_)
        except:
            cant_be_converted.append(col)
            print('Column : ', col, " can't be converted")
    else:
        if col != 'int_rate':
            print('-' * 100)
            print('- '* 20 , col, ' -' * 20)
            print('Unique Items = ', len(df[col].unique()), type(col))
            try:
                df[col]=pd.qcut(df[col], q=4)
                percent_good(df, col)
            except:
                cant_be_converted.append(col)
                print('Column : ', col, " can't be converted")

____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  loan_amnt_bin  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
(499.999, 8000.0]                  88.7%     11.3%     26.9%         607,200     12.4%
(20000.0, 40000.0]                 85.3%     14.7%     23.8%         537,096     13.9%
(12900.0, 20000.0]                 85.4%     14.6%     26.2%         593,192     13.3%
(8000.0, 12900.0]                  86.7%     13.3%     23.1%         523,180     12.8%
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  funded_amnt_bin  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
(499.999, 8000.0]                  88.7%     11.3%     26.9%         607,461     1

____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  annual_inc_bin  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
(46000.0, 65000.0]                 85.6%     14.4%     25.7%         581,472     13.3%
(93000.0, 110000000.0]             89.3%     10.7%     24.9%         563,201     12.3%
(-0.001, 46000.0]                  84.4%     15.6%     25.2%         569,041     13.8%
(65000.0, 93000.0]                 87.1%     12.9%     24.2%         546,954     12.9%
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  verification_status  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
Not Verified                       90.7%      9.3%     32.9%         744,806 

(-0.001, 31.5]                     89.8%     10.2%     25.1%         567,189     11.3%
(50.2, 69.3]                       85.3%     14.7%     25.0%         565,113     13.7%
(69.3, 892.3]                      84.2%     15.8%     25.0%         564,896     14.7%
(31.5, 50.2]                       87.0%     13.0%     24.9%         563,470     12.7%
----------------------------------------------------------------------------------------------------
- - - - - - - - - - - - - - - - - - - -  total_acc  - - - - - - - - - - - - - - - - - - - -
Unique Items =  153 <class 'str'>
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  total_acc  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
(-0.001, 15.0]                     87.0%     13.0%     25.4%         573,404     13.5%
(31.0, 176.0]                      86.0%     14.0%     23.

0                                  86.8%     13.2%     83.8%       1,894,519     13.0%
1                                  85.2%     14.8%     16.2%         366,149     13.7%
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  policy_code  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
1                                  86.6%     13.4%    100.0%       2,260,668     13.1%
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  application_type  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
Individual                         86.3%     13.7%     94.7%       2,139,958     13.1%
Joint App                          91.6%      8.4%      5.3%         120,710     13

(-0.001, 13700.0]                  84.2%     15.8%     25.1%         566,988     14.2%
(42300.0, 9999999.0]               90.2%      9.8%     25.0%         565,084     11.6%
(13700.0, 24600.0]                 85.1%     14.9%     24.9%         563,821     13.6%
(24600.0, 42300.0]                 86.8%     13.2%     25.0%         564,775     13.0%
----------------------------------------------------------------------------------------------------
- - - - - - - - - - - - - - - - - - - -  inq_fi  - - - - - - - - - - - - - - - - - - - -
Unique Items =  33 <class 'str'>
Column :  inq_fi  can't be converted
----------------------------------------------------------------------------------------------------
- - - - - - - - - - - - - - - - - - - -  total_cu_tl  - - - - - - - - - - - - - - - - - - - -
Unique Items =  62 <class 'str'>
Column :  total_cu_tl  can't be converted
----------------------------------------------------------------------------------------------------
- - - - - - - - - - -

(-0.001, 3.0]                      84.1%     15.9%     33.2%         749,553     13.8%
(10.0, 382.0]                      90.0%     10.0%     24.4%         551,065     12.1%
(3.0, 6.0]                         86.0%     14.0%     23.5%         530,757     13.3%
(6.0, 10.0]                        87.2%     12.8%     19.0%         429,293     12.9%
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  mort_acc  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
1                                  87.8%     12.2%     56.7%       1,281,032     12.7%
0                                  84.9%     15.1%     43.3%         979,636     13.5%
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  mths_since_recent_bc  - - - - - - - - - - - - - - - - - -

____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  num_rev_accts  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
(8.0, 12.0]                        86.9%     13.1%     22.9%         517,669     13.1%
(18.0, 151.0]                      85.7%     14.3%     22.8%         514,983     12.8%
(-0.001, 8.0]                      87.1%     12.9%     29.1%         658,130     13.4%
(12.0, 18.0]                       86.4%     13.6%     25.2%         569,886     13.0%
----------------------------------------------------------------------------------------------------
- - - - - - - - - - - - - - - - - - - -  num_rev_tl_bal_gt_0  - - - - - - - - - - - - - - - - - - - -
Unique Items =  50 <class 'str'>
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - -

(19797.0, 37001.0]                 86.1%     13.9%     25.0%         565,149     13.1%
----------------------------------------------------------------------------------------------------
- - - - - - - - - - - - - - - - - - - -  total_bc_limit  - - - - - - - - - - - - - - - - - - - -
Unique Items =  20309 <class 'str'>
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  total_bc_limit  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
(-0.001, 7800.0]                   83.5%     16.5%     25.1%         567,596     14.4%
(29900.0, 1569000.0]               90.6%      9.4%     24.9%         563,146     11.4%
(15900.0, 29900.0]                 87.2%     12.8%     24.9%         562,548     12.9%
(7800.0, 15900.0]                  84.9%     15.1%     25.1%         567,378     13.7%
----------------------------------------------

1                                  86.9%     13.1%     82.8%       1,872,416     12.9%
0                                  85.0%     15.0%     17.2%         388,252     13.9%
____________________________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - -  sec_fico  - - - - - - - - - - - - - - - - - - - -
Category                          Good %     Bad %     Col %       Tot Count     Int %
1                                  86.3%     13.7%     95.4%       2,155,607     13.1%
2                                  90.3%      9.7%      2.2%          48,884     15.2%
3                                  94.7%      5.3%      2.2%          49,182     12.6%
4                                  98.1%      1.9%      0.3%           6,378      8.7%
5                                  99.5%      0.5%      0.0%             617      7.2%
____________________________________________________________________________________________________
- - - - - -

In [62]:
for col in df.columns:
    if len(df[col].unique()) > 4:
        print(col, len(df[col].unique()))

loan_amnt 1572
funded_amnt 1572
funded_amnt_inv 10057
int_rate 673
installment 93301
grade 7
sub_grade 35
emp_length 12
home_ownership 6
annual_inc 89368
issue_d 139
purpose 14
out_prncp 356141
out_prncp_inv 368481
total_pymnt 1633857
total_pymnt_inv 1311099
total_rec_prncp 486463
total_rec_int 635921
total_rec_late_fee 18375
recoveries 132777
collection_recovery_fee 146222
last_pymnt_d 137
last_pymnt_amnt 704467
last_credit_pull_d 142
annual_inc_joint 17634
dti_joint 4018
tot_coll_amt 15574
open_act_il 54
total_bal_il 162249
il_util 280
open_rv_12m 29
open_rv_24m 50
max_bal_bc 33726
all_util 188
inq_fi 33
total_cu_tl 62
inq_last_12m 48
chargeoff_within_12_mths 11
delinq_amnt 2617
num_accts_ever_120_pd 44
num_tl_120dpd_2m 7
num_tl_30dpd 5
pct_tl_nvr_dlq 690
percent_bc_gt_75 284
pub_rec_bankruptcies 12
revol_bal_joint 56875
sec_app_earliest_cr_line 664
sec_app_open_acc 67
sec_app_revol_util 1216
sec_app_num_rev_accts 86
sec_app_mths_since_last_major_derog 140
debt_settlement_flag_date 8

In [63]:
df.to_pickle('data/lending_club_binned_data_1.pkl')
#df.to_csv('data/lending_club_binned_data.csv')

In [64]:
print(cant_be_converted)

['emp_length', 'purpose', 'out_prncp', 'out_prncp_inv', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_credit_pull_d', 'annual_inc_joint', 'dti_joint', 'tot_coll_amt', 'open_act_il', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'chargeoff_within_12_mths', 'delinq_amnt', 'num_accts_ever_120_pd', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'revol_bal_joint', 'sec_app_earliest_cr_line', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_num_rev_accts', 'sec_app_mths_since_last_major_derog', 'debt_settlement_flag_date', 'last_pay_d']
