<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preprocessing</a></span><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Preliminary-feature-selection" data-toc-modified-id="Preliminary-feature-selection-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Preliminary feature selection</a></span></li></ul></li></ul></div>

# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
%matplotlib inline
# Apply the seaborn theme first. sns.set() rewrites matplotlib's rcParams, and
# some seaborn releases include a figure size in what the theme resets, so the
# explicit size override must come afterwards to be sure it takes effect.
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (9, 6)

# Let pandas render up to 500 columns / rows before truncating the display;
# the loan table previewed below has 152 columns.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# NOTE(review): this silences every warning for the whole session, including
# pandas dtype and deprecation warnings. Consider narrowing it to specific
# categories once the noisy sources are known.
warnings.filterwarnings(action="ignore")

## Load Data

In [2]:
# Read the accepted-loans export into memory. The path is relative to the
# notebook's working directory.
accepted = pd.read_csv(
    filepath_or_buffer='../data/accepted_2007_to_2018Q4.csv',
)

In [3]:
# Parse the issue date (values look like 'Mar-2018') and keep only the year.
accepted['year'] = pd.to_datetime(accepted['issue_d']).dt.year

# Bin the late / grace-period statuses into a single 'Late' group.
# One vectorized replace covers all three labels in a single pass instead of
# running a Python-level lambda over every row once per label.
late_statuses = ['Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)']
accepted['loan_status'] = accepted['loan_status'].replace(late_statuses, 'Late')

# Reduce the size of the dataset: keep loans issued in 2016-2018 whose status
# is one of the three groups of interest.
recent_years = [2016, 2017, 2018]
kept_statuses = ['Fully Paid', 'Charged Off', 'Late']
accepted = accepted[accepted['year'].isin(recent_years)
                    & accepted['loan_status'].isin(kept_statuses)]

In [4]:
# Report (rows, columns) of the filtered frame, then preview its first rows.
print((len(accepted.index), len(accepted.columns)))
accepted.head(n=5)

(550097, 152)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,h
ardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,year
421101,130956066,,3000.0,3000.0,3000.0,36 months,7.34,93.1,A,A4,Scale Technician,9 years,RENT,52000.0,Source Verified,Mar-2018,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,major_purchase,Major purchase,988xx,WA,0.58,0.0,Jan-1998,760.0,764.0,0.0,26.0,,7.0,0.0,141.0,0.5,30.0,w,0.0,0.0,3011.577285,3011.58,3000.0,11.58,0.0,0.0,0.0,May-2018,614.03,,Nov-2018,764.0,760.0,0.0,,1.0,Individual,,,,0.0,0.0,150592.0,0.0,0.0,1.0,2.0,7.0,0.0,,0.0,1.0,141.0,1.0,31000.0,1.0,2.0,2.0,3.0,25099.0,30359.0,0.5,0.0,0.0,132.0,242.0,18.0,7.0,4.0,18.0,,7.0,,0.0,1.0,1.0,4.0,15.0,7.0,6.0,19.0,1.0,7.0,0.0,0.0,0.0,1.0,96.7,0.0,0.0,0.0,191216.0,141.0,30500.0,0.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018.0
421113,130968727,,5000.0,5000.0,5000.0,36 months,11.98,166.03,B,B5,SDO Supervisor,10+ years,OWN,55000.0,Not Verified,Mar-2018,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,other,Other,300xx,GA,14.18,0.0,Aug-2001,675.0,679.0,0.0,74.0,82.0,14.0,1.0,11449.0,33.9,24.0,w,0.0,0.0,5013.306667,5013.31,5000.0,13.31,0.0,0.0,0.0,Apr-2018,5019.97,,Aug-2018,679.0,675.0,0.0,74.0,1.0,Individual,,,,0.0,0.0,28880.0,1.0,1.0,0.0,0.0,33.0,17431.0,63.0,2.0,2.0,4829.0,47.0,33800.0,0.0,1.0,1.0,2.0,2222.0,10551.0,52.0,0.0,0.0,77.0,199.0,3.0,3.0,0.0,3.0,,12.0,,1.0,3.0,3.0,4.0,7.0,6.0,13.0,18.0,3.0,14.0,0.0,0.0,0.0,2.0,95.7,33.3,1.0,0.0,61551.0,28880.0,22000.0,27751.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018.0
421120,130910225,,7000.0,7000.0,7000.0,36 months,11.98,232.44,B,B5,Parole,< 1 year,MORTGAGE,40000.0,Verified,Mar-2018,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,home_improvement,Home improvement,797xx,TX,20.25,0.0,Mar-2007,695.0,699.0,0.0,60.0,,13.0,0.0,5004.0,36.0,29.0,w,0.0,0.0,7693.314943,7693.31,7000.0,693.31,0.0,0.0,0.0,Mar-2019,5364.25,,Mar-2019,644.0,640.0,0.0,60.0,1.0,Individual,,,,0.0,0.0,131726.0,1.0,6.0,0.0,2.0,16.0,126722.0,102.0,2.0,2.0,3944.0,90.0,13900.0,2.0,1.0,4.0,4.0,10977.0,4996.0,50.0,0.0,0.0,122.0,132.0,1.0,1.0,0.0,10.0,64.0,5.0,60.0,3.0,2.0,2.0,3.0,4.0,19.0,7.0,10.0,2.0,13.0,0.0,0.0,0.0,2.0,89.7,33.3,0.0,0.0,132817.0,131726.0,10000.0,118917.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018.0
421121,130962380,,20300.0,20300.0,20300.0,60 months,17.47,509.66,D,D1,Administrative Assistant,< 1 year,RENT,55000.0,Verified,Mar-2018,Late,n,https://lendingclub.com/browse/loanDetail.acti...,,credit_card,Credit card refinancing,112xx,NY,26.63,0.0,Nov-2007,705.0,709.0,0.0,,,10.0,0.0,17506.0,70.9,26.0,w,18497.12,18497.12,4037.88,4037.88,1802.88,2235.0,0.0,0.0,0.0,Dec-2018,509.66,Apr-2019,Mar-2019,704.0,700.0,0.0,,1.0,Individual,,,,0.0,0.0,60594.0,0.0,3.0,0.0,1.0,24.0,43088.0,74.0,0.0,1.0,8215.0,69.0,24700.0,0.0,1.0,0.0,3.0,6733.0,7194.0,70.9,0.0,0.0,124.0,111.0,22.0,22.0,0.0,22.0,,,,0.0,3.0,3.0,6.0,8.0,14.0,6.0,11.0,3.0,10.0,0.0,0.0,0.0,0.0,100.0,60.0,0.0,0.0,87959.0,60594.0,24700.0,58404.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018.0
421135,130966492,,30000.0,30000.0,30000.0,36 months,21.85,1143.39,D,D5,teacher,10+ years,OWN,57000.0,Verified,Mar-2018,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,341xx,FL,27.58,0.0,Apr-2000,680.0,684.0,1.0,68.0,,11.0,0.0,29222.0,53.2,26.0,w,0.0,0.0,34389.550341,34389.55,30000.0,4389.55,0.0,0.0,0.0,Dec-2018,26458.65,,Dec-2018,699.0,695.0,0.0,68.0,1.0,Individual,,,,0.0,0.0,157566.0,1.0,1.0,2.0,2.0,6.0,33030.0,94.0,2.0,2.0,6236.0,69.0,55500.0,2.0,8.0,5.0,5.0,14324.0,19752.0,33.5,0.0,0.0,195.0,215.0,11.0,6.0,2.0,11.0,,0.0,,1.0,3.0,4.0,6.0,7.0,10.0,9.0,14.0,4.0,11.0,0.0,0.0,0.0,5.0,96.0,33.3,0.0,0.0,188780.0,62252.0,29400.0,35000.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018.0


## Preliminary feature selection

In [7]:
# Count loans for each (loan_status, pymnt_plan) combination.
accepted.groupby(by=['loan_status', 'pymnt_plan']).size()

loan_status  pymnt_plan
Charged Off  n             116257
Fully Paid   n             402449
Late         n              30818
             y                573
dtype: int64

In [12]:
# Number of distinct non-null values in the free-text `title` column.
accepted['title'].nunique()

13

In [None]:
# Candidate columns for the preliminary feature selection.
# They are bound to a name so later cells can use the selection; bare string
# tuples spread over several lines are separate expression statements and
# keep nothing. 'loan_status' is listed once (it was repeated at the end).
prelim_features = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
    'installment', 'grade', 'sub_grade', 'emp_length',
    'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'year',
    'loan_status', 'purpose', 'addr_state', 'dti',
    'earliest_cr_line',
    'inq_last_6mths',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'revol_util',
    'total_acc',
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
]