## Lending Club Case Study


The purpose of this case study is to apply exploratory data analysis (EDA) methods to identify the attributes that influence the likelihood of loan defaults.

In [3]:
#Import the necessary libraries for our DataAnalysis
import numpy as np, pandas as pd                #for dataframes,operations    
import matplotlib.pyplot as plt, seaborn as sns #for plots
from datetime import datetime                   #for time related operations/anlysis


### 1. DataSourcing

In [70]:
# The dataset is supplied with the case study; read it into a DataFrame.
# low_memory=False: the file is large and has mixed dtypes, so let pandas parse it in one pass.
# NOTE(review): this is an absolute, machine-specific path - adjust it when running elsewhere.
loan_csv_path = r'C:\Users\Sumanth Sarva\Documents\IIITB\LendingClub\loan\loan.csv'
loan_data = pd.read_csv(loan_csv_path, low_memory=False)

In [71]:
# Dimensions of the freshly loaded DataFrame, as (rows, columns)
loan_data.shape

(39717, 111)

### 2. DataCleaning

In [72]:
# Show every column name as a plain Python list
loan_data.columns.tolist()

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'application_type',
 'annual_inc_joint',
 'dti_joint',
 'verification_status_joint',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_

In [73]:
# With this many columns, first check which ones contain no data at all (every value is NaN)
all_null_mask = loan_data.isna().all(axis=0)
loan_data.columns[all_null_mask]

Index(['mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint',
       'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal',
       'open_acc_6m', 'open_il_6m', 'open_il_12m', 'open_il_24m',
       'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',
       'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
       'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
       'mths_since_recent_inq', 'mths_since_recent_revol_delinq',
       'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
       'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
       'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m',
       'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m',
       'pct_tl_nvr_dl

#### 2.1 Dropping empty columns

In [74]:
# Remove every column whose values are all missing; loan_data is modified in place
loan_data.dropna(axis='columns', how='all', inplace=True)

# Dimensions that remain after the drop
loan_data.shape


(39717, 57)

In [75]:
# Percentage of missing values per remaining column, rounded to whole percent;
# sorted in descending order and limited to the ten worst columns
(
    loan_data.isnull().sum()
    .div(len(loan_data))
    .mul(100)
    .sort_values(ascending=False)
    .round(0)
    .head(10)
)


next_pymnt_d                  97.0
mths_since_last_record        93.0
mths_since_last_delinq        65.0
desc                          33.0
emp_title                      6.0
emp_length                     3.0
pub_rec_bankruptcies           2.0
last_pymnt_d                   0.0
chargeoff_within_12_mths       0.0
collections_12_mths_ex_med     0.0
dtype: float64

#### 2.2 Dropping columns with a high percentage of missing values

In [76]:
# Drop the three columns with the highest share of missing values (about 97%, 93% and 65%);
# they are too sparse to be useful for the analysis.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError on already-removed columns.
loan_data = loan_data.drop(
    columns=["next_pymnt_d", "mths_since_last_record", "mths_since_last_delinq"],
    errors="ignore",
)

In [77]:
# List the column names that remain after the drops above
loan_data.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'open_acc', 'pub_rec',
       'revol_bal', 'revol_util', 'total_acc', 'initial_list_status',
       'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
       'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'last_credit_pull_d', 'collections_12_mths_ex_med', 'policy_code',
       'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths',
       'delinq_amnt', 'pub_rec_bankruptcies', 'tax_liens'],
      dtype='object')

#### 2.3 Dropping columns which have only one unique value

In [78]:
# Number of distinct values in each column, followed by the current (rows, columns) of the frame
print(loan_data.nunique(), loan_data.shape, sep='\n')

id                            39717
member_id                     39717
loan_amnt                       885
funded_amnt                    1041
funded_amnt_inv                8205
term                              2
int_rate                        371
installment                   15383
grade                             7
sub_grade                        35
emp_title                     28820
emp_length                       11
home_ownership                    5
annual_inc                     5318
verification_status               3
issue_d                          55
loan_status                       3
pymnt_plan                        1
url                           39717
desc                          26527
purpose                          14
title                         19615
zip_code                        823
addr_state                       50
dti                            2868
delinq_2yrs                      11
earliest_cr_line                526
inq_last_6mths              

In [81]:
# Drop columns that hold only one distinct value: a constant column cannot help
# distinguish defaulted loans from the rest.
# Note: nunique() ignores NaN by default, so a column with a single value plus
# missing entries is treated as single-valued and dropped as well.
unique_counts = loan_data.nunique()                 # distinct-value count per column
to_drop = unique_counts[unique_counts == 1].index   # labels of the single-valued columns
loan_data = loan_data.drop(columns=to_drop)         # reassign rather than mutate in place

# Display the columns that were removed (on a re-run this is empty, as nothing is left to drop)
to_drop


Index(['pymnt_plan', 'initial_list_status', 'collections_12_mths_ex_med',
       'policy_code', 'application_type', 'acc_now_delinq',
       'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens'],
      dtype='object')

In [86]:
# Dimensions of the frame after removing the single-valued columns
loan_data.shape

(39717, 45)

#### 2.4 Dropping columns that are `text/descriptions`