# Lending Club Case Study - Data Cleansing, EDA & Extracting Insights

#### Import Necessary Libraries

In [64]:
#Numerical and data analysis
import pandas as pd
import numpy as np

#Data visualization
from matplotlib import pyplot as plt
import seaborn as sb

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

#### Load the dataset and prepare dataframe

In [65]:
df = pd.read_csv('/Users/nikhilnaveen/Desktop/Learning/Assignments/LendingClubCaseStudy/loan.csv')
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,0.0,0.0,,,,
1,1077430,1314167,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,0.0,0.0,,,,
2,1077175,1313524,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,0.0,0.0,,,,
3,1076863,1277178,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,0.0,0.0,,,,
4,1075358,1311748,3000,3000,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,0.0,0.0,,,,


#### Problem Statement:
- Based on the dataset given, perform EDA and derive some metrics & conclusions that can help the company to make decisions like loan approval and rejection.

#### Print the no. of records present in the dataframe before doing any changes (excluding headers)

In [66]:
df.shape[0]-1

39716

#### Remove "Current" customers as the analysis is on fully paid & charged-off customers

In [67]:
df_filtered = df[~df['loan_status'].str.contains('Current')]
df_filtered.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,0.0,0.0,,,,
1,1077430,1314167,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,0.0,0.0,,,,
2,1077175,1313524,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,0.0,0.0,,,,
3,1076863,1277178,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,0.0,0.0,,,,
5,1075269,1311441,5000,5000,5000.0,36 months,7.90%,156.46,A,A4,...,,,,,0.0,0.0,,,,


#### Print the no. of records present in the dataframe after dropping current customers (excluding headers)

In [68]:
df_filtered.shape[0]-1

38576

#### Check for missing values (null or NaN) in the filtered dataframe

In [69]:
100*df_filtered.isnull().mean()

id                              0.000000
member_id                       0.000000
loan_amnt                       0.000000
funded_amnt                     0.000000
funded_amnt_inv                 0.000000
                                 ...    
tax_liens                       0.101097
tot_hi_cred_lim               100.000000
total_bal_ex_mort             100.000000
total_bc_limit                100.000000
total_il_high_credit_limit    100.000000
Length: 111, dtype: float64

#### Display columns with missing data only

In [70]:
null_value_mean = 100*df_filtered.isnull().mean()
columns_to_display = null_value_mean[null_value_mean >0].index
for column in columns_to_display:
    print(column, null_value_mean[column])

emp_title 6.185032532337922
emp_length 2.6777613603960906
desc 32.47790134017679
title 0.028514399771884805
mths_since_last_delinq 64.55919330170828
mths_since_last_record 92.8973222386396
revol_util 0.12961090805402184
last_pymnt_d 0.184047489436711
next_pymnt_d 100.0
last_credit_pull_d 0.005184436322160873
collections_12_mths_ex_med 0.14516421702050444
mths_since_last_major_derog 100.0
annual_inc_joint 100.0
dti_joint 100.0
verification_status_joint 100.0
tot_coll_amt 100.0
tot_cur_bal 100.0
open_acc_6m 100.0
open_il_6m 100.0
open_il_12m 100.0
open_il_24m 100.0
mths_since_rcnt_il 100.0
total_bal_il 100.0
il_util 100.0
open_rv_12m 100.0
open_rv_24m 100.0
max_bal_bc 100.0
all_util 100.0
total_rev_hi_lim 100.0
inq_fi 100.0
total_cu_tl 100.0
inq_last_12m 100.0
acc_open_past_24mths 100.0
avg_cur_bal 100.0
bc_open_to_buy 100.0
bc_util 100.0
chargeoff_within_12_mths 0.14516421702050444
mo_sin_old_il_acct 100.0
mo_sin_old_rev_tl_op 100.0
mo_sin_rcnt_rev_tl_op 100.0
mo_sin_rcnt_tl 100.0
mort_

#### Identify the null value columns whose average missing percentage is greater than 50%

In [71]:
columns_to_discard = null_value_mean[null_value_mean >50].index
for column in columns_to_discard:
    print(column, null_value_mean[column])

mths_since_last_delinq 64.55919330170828
mths_since_last_record 92.8973222386396
next_pymnt_d 100.0
mths_since_last_major_derog 100.0
annual_inc_joint 100.0
dti_joint 100.0
verification_status_joint 100.0
tot_coll_amt 100.0
tot_cur_bal 100.0
open_acc_6m 100.0
open_il_6m 100.0
open_il_12m 100.0
open_il_24m 100.0
mths_since_rcnt_il 100.0
total_bal_il 100.0
il_util 100.0
open_rv_12m 100.0
open_rv_24m 100.0
max_bal_bc 100.0
all_util 100.0
total_rev_hi_lim 100.0
inq_fi 100.0
total_cu_tl 100.0
inq_last_12m 100.0
acc_open_past_24mths 100.0
avg_cur_bal 100.0
bc_open_to_buy 100.0
bc_util 100.0
mo_sin_old_il_acct 100.0
mo_sin_old_rev_tl_op 100.0
mo_sin_rcnt_rev_tl_op 100.0
mo_sin_rcnt_tl 100.0
mort_acc 100.0
mths_since_recent_bc 100.0
mths_since_recent_bc_dlq 100.0
mths_since_recent_inq 100.0
mths_since_recent_revol_delinq 100.0
num_accts_ever_120_pd 100.0
num_actv_bc_tl 100.0
num_actv_rev_tl 100.0
num_bc_sats 100.0
num_bc_tl 100.0
num_il_tl 100.0
num_op_rev_tl 100.0
num_rev_accts 100.0
num_rev_

#### Drop the missing value columns whose average missing percentage is greater than 50%

In [72]:
df_new = df_filtered.drop(columns=columns_to_discard, axis=1) #axis=1 represents dropping columns

#### Print the remaining columns after dropping null value columns from the dataframe

In [73]:
df_new.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,...,171.62,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
1,1077430,1314167,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,...,119.66,Sep-13,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
2,1077175,1313524,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,...,649.91,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
3,1076863,1277178,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,...,357.48,Apr-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
5,1075269,1311441,5000,5000,5000.0,36 months,7.90%,156.46,A,A4,...,161.03,Jan-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0


#### Print the no. of records present after changes done so far (excluding headers)

In [74]:
df_new.shape[0]-1

38576

#### Identify the columns whose missing value percentage is under acceptable range

In [None]:
#Numerical Columns