In [41]:
import numpy as np
import pandas as pd

#### Load Dataframe

In [42]:
# Load the raw loan records into a DataFrame. low_memory=False asks pandas to
# parse the file in a single pass instead of in chunks, so each column's dtype
# is inferred from the whole column.
loan_master = pd.read_csv('loan.csv',low_memory=False)

In [43]:
# Sanity check: (rows, columns) of the freshly loaded frame
loan_master.shape

(39717, 111)

#### User Defined Functions 

In [44]:
#function to check for nulls and only display the columns having nulls 
def check_null(dataframe, percentage=True):
    """Summarise missing values, listing only the columns that contain nulls.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    percentage : bool, default True
        When equal to True, report the share of null rows per column as a
        percentage rounded to 3 decimals; otherwise report raw null counts.

    Returns
    -------
    pandas.Series
        Indexed by column name; columns without any null are omitted.
    """
    # Build the null mask once and derive either summary from it
    null_flags = dataframe.isnull()
    if percentage == True:  # noqa: E712 - equality (not truthiness) is the existing contract
        null_share = null_flags.mean()
        return round(null_share[null_share > 0] * 100, 3)
    null_counts = null_flags.sum()
    return null_counts[null_counts > 0]

In [45]:
#function to drop columns based on NULLs percentage
def drop_columns_null_perc(dataframe, percentage):
    """Drop, in place, every column whose share of nulls exceeds ``percentage``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean; it is modified in place.
    percentage : int or float
        Maximum tolerated percentage (0-100) of null rows in a column.
        Columns with a higher null percentage are removed.

    Returns
    -------
    None
    """
    import math

    # Size the threshold from the frame that was passed in, not from a
    # notebook-level global, so the helper works for any DataFrame.
    row_count = len(dataframe)
    # Minimum number of non-null values a column needs in order to be kept.
    # Multiplying before dividing avoids the float error of 1 - percentage/100
    # (e.g. 10 rows at 90% used to truncate to 0 and keep all-null columns);
    # ceil makes "null share > percentage" the exact drop condition.
    n_thresh = math.ceil(row_count * (100 - percentage) / 100)
    dataframe.dropna(axis=1, thresh=n_thresh, inplace=True)

In [46]:
#function to drop irrelevant columns
def drop_irrelevant_columns(dataframe, column_list):
    """Remove the named columns from ``dataframe`` in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    column_list : list
        Column labels to remove. Labels that are not present are skipped
        silently (``errors='ignore'``).
    """
    dataframe.drop(
        columns=column_list,
        errors='ignore',
        inplace=True,
    )

## Data Cleaning

In [47]:
# Percentage of null rows per column, restricted to columns that have any nulls
check_null(loan_master)

emp_title                       6.191
emp_length                      2.707
desc                           32.581
title                           0.028
mths_since_last_delinq         64.662
                               ...   
tax_liens                       0.098
tot_hi_cred_lim               100.000
total_bal_ex_mort             100.000
total_bc_limit                100.000
total_il_high_credit_limit    100.000
Length: 68, dtype: float64

##### Drop columns having ALL nulls

In [48]:
# Remove every column that carries no data at all (all values null)
loan_master.dropna(axis='columns', how='all', inplace=True)
# Re-check the shape: 54 all-null columns were removed
loan_master.shape

(39717, 57)

In [49]:
# Remaining columns with nulls after removing the all-null columns
check_null(loan_master)

emp_title                      6.191
emp_length                     2.707
desc                          32.581
title                          0.028
mths_since_last_delinq        64.662
mths_since_last_record        92.985
revol_util                     0.126
last_pymnt_d                   0.179
next_pymnt_d                  97.130
last_credit_pull_d             0.005
collections_12_mths_ex_med     0.141
chargeoff_within_12_mths       0.141
pub_rec_bankruptcies           1.755
tax_liens                      0.098
dtype: float64

##### Drop columns with >90% null values
Columns: next_pymnt_d and mths_since_last_record

In [50]:
# 90 is the maximum tolerated percentage of nulls per column; the frame is modified in place
drop_columns_null_perc(loan_master, 90)

In [51]:
# Confirm that next_pymnt_d and mths_since_last_record are no longer listed
check_null(loan_master)

emp_title                      6.191
emp_length                     2.707
desc                          32.581
title                          0.028
mths_since_last_delinq        64.662
revol_util                     0.126
last_pymnt_d                   0.179
last_credit_pull_d             0.005
collections_12_mths_ex_med     0.141
chargeoff_within_12_mths       0.141
pub_rec_bankruptcies           1.755
tax_liens                      0.098
dtype: float64

##### Drop rows that have nulls in columns with <7% nulls
For columns: 
emp_title, emp_length, title, revol_util, last_pymnt_d, last_credit_pull_d,
collections_12_mths_ex_med, chargeoff_within_12_mths, pub_rec_bankruptcies, tax_liens

In [52]:
# TODO: find a way to drop these rows in one call instead of writing one
# filter line per column as in the next cell.
# Abandoned attempts, kept for reference (not executed):
#int(len(loan_master)*0.07)
#df = loan_master.dropna(thresh=5)
#df.shape

In [53]:
# NOTE: this whole cell is commented out, so no rows are dropped here.
# Drafted per-column filters, one per low-null column
# (first column presumably emp_title, following the list above - TODO confirm):
# loan_master = loan_master.loc[loan_master['emp_title'].notnull(),:]
# loan_master = loan_master.loc[loan_master['emp_length'].notnull(),:]
# loan_master = loan_master.loc[loan_master['title'].notnull(),:]
# loan_master = loan_master.loc[loan_master['revol_util'].notnull(),:]
# loan_master = loan_master.loc[loan_master['last_pymnt_d'].notnull(),:]
# loan_master = loan_master.loc[loan_master['last_credit_pull_d'].notnull(),:]
# loan_master = loan_master.loc[loan_master['collections_12_mths_ex_med'].notnull(),:]
# loan_master = loan_master.loc[loan_master['chargeoff_within_12_mths'].notnull(),:]
# loan_master = loan_master.loc[loan_master['pub_rec_bankruptcies'].notnull(),:]
# loan_master = loan_master.loc[loan_master['tax_liens'].notnull(),:]

##### Drop irrelevant columns

In [54]:
# Columns judged irrelevant to this analysis; they are removed in the next step
list_ir_col = ['desc','zip_code','url']

In [55]:
# Remove the irrelevant columns in place (labels not present are ignored)
drop_irrelevant_columns(loan_master,list_ir_col)

In [56]:
# Column inventory (non-null counts and dtypes) after the column drops
loan_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39717 entries, 0 to 39716
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39717 non-null  int64  
 1   member_id                   39717 non-null  int64  
 2   loan_amnt                   39717 non-null  int64  
 3   funded_amnt                 39717 non-null  int64  
 4   funded_amnt_inv             39717 non-null  float64
 5   term                        39717 non-null  object 
 6   int_rate                    39717 non-null  object 
 7   installment                 39717 non-null  float64
 8   grade                       39717 non-null  object 
 9   sub_grade                   39717 non-null  object 
 10  emp_title                   37258 non-null  object 
 11  emp_length                  38642 non-null  object 
 12  home_ownership              39717 non-null  object 
 13  annual_inc                  397

In [23]:
# TODO: decide whether to drop the 'title' column - check first whether the
# analysis needs it. Disabled for now:
#loan_master.drop(['title'],axis=1, inplace=True)

In [24]:
# TODO: consider removing mths_since_last_delinq (64.662% null) - could the
# "delinq 2 years" information be used instead?

#### Data Cleaning - Handle Data Types

In [26]:
# List dtypes to spot 'object' columns that should be numeric (e.g. term, int_rate)
loan_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39717 entries, 0 to 39716
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39717 non-null  int64  
 1   member_id                   39717 non-null  int64  
 2   loan_amnt                   39717 non-null  int64  
 3   funded_amnt                 39717 non-null  int64  
 4   funded_amnt_inv             39717 non-null  float64
 5   term                        39717 non-null  object 
 6   int_rate                    39717 non-null  object 
 7   installment                 39717 non-null  float64
 8   grade                       39717 non-null  object 
 9   sub_grade                   39717 non-null  object 
 10  emp_title                   37258 non-null  object 
 11  emp_length                  38642 non-null  object 
 12  home_ownership              39717 non-null  object 
 13  annual_inc                  397

In [27]:
# Strip the 'months' suffix from 'term', store it as int64 and rename the
# column so that its unit is part of the name
loan_master['term'] = (
    loan_master['term']
    .map(lambda raw_term: str(raw_term).replace('months','').strip())
    .astype('int64')
)
loan_master = loan_master.rename(columns={'term':'term_months'})

In [28]:
# Strip the '%' sign from 'int_rate' and store the rate as float64
loan_master['int_rate'] = (
    loan_master['int_rate']
    .map(lambda raw_rate: str(raw_rate).replace('%','').strip())
    .astype('float64')
)

In [29]:
# Inspect the distinct emp_length values before deciding how to convert the
# column (this cell only displays counts; no conversion happens here)
loan_master['emp_length'].value_counts()

10+ years    8879
< 1 year     4583
2 years      4388
3 years      4095
4 years      3436
5 years      3282
1 year       3240
6 years      2229
7 years      1773
8 years      1479
9 years      1258
Name: emp_length, dtype: int64

In [30]:
# Preview the frame after the term / int_rate conversions
loan_master.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term_months,int_rate,installment,grade,sub_grade,...,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599,5000,5000,4975.0,36,10.65,162.87,B,B2,...,171.62,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
1,1077430,1314167,2500,2500,2500.0,60,15.27,59.83,C,C4,...,119.66,Sep-13,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
2,1077175,1313524,2400,2400,2400.0,36,15.96,84.33,C,C5,...,649.91,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
3,1076863,1277178,10000,10000,10000.0,36,13.49,339.31,C,C1,...,357.48,Apr-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
4,1075358,1311748,3000,3000,3000.0,60,12.69,67.79,B,B5,...,67.79,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0


In [25]:
# Check uniqueness of id and member_id by counting distinct values
print('unique id: ',loan_master['id'].nunique())
print('unique member id: ',loan_master['member_id'].nunique())
# Both counts equal the number of rows, so each id / member_id occurs exactly once

unique id:  39717
unique member id:  39717


### Analysis

In [31]:
# One frame per loan status, each a row subset of loan_master
charged_off_loans = loan_master.loc[loan_master['loan_status'].eq('Charged Off')]
current_loans = loan_master.loc[loan_master['loan_status'].eq('Current')]
fully_paid_loans = loan_master.loc[loan_master['loan_status'].eq('Fully Paid')]

In [32]:
# Median number of open accounts per loan status.
# The string 'median' selects pandas' built-in group median; passing the
# np.median callable is deprecated for this purpose and emits a FutureWarning.
loan_custattr = pd.pivot_table(loan_master,index='loan_status',values='open_acc',aggfunc='median')
loan_custattr
# The medians are nearly identical across statuses, so open_acc does not look
# like a useful differentiator.

Unnamed: 0_level_0,open_acc
loan_status,Unnamed: 1_level_1
Charged Off,8
Current,9
Fully Paid,9


In [33]:
# Median number of total accounts per loan status.
# The string 'median' selects pandas' built-in group median; passing the
# np.median callable is deprecated for this purpose and emits a FutureWarning.
loan_custattr = pd.pivot_table(loan_master,index='loan_status',values='total_acc',aggfunc='median')
loan_custattr
# The medians are nearly identical across statuses, so total_acc does not look
# like a useful differentiator.

Unnamed: 0_level_0,total_acc
loan_status,Unnamed: 1_level_1
Charged Off,20
Current,22
Fully Paid,20


In [34]:
# Verification-status counts among charged-off loans (raw counts; compare as
# shares of the group, since the status groups differ greatly in size)
charged_off_loans['verification_status'].value_counts()

Not Verified       2142
Verified           2051
Source Verified    1434
Name: verification_status, dtype: int64

In [35]:
# Verification-status counts among fully paid loans (raw counts; compare as
# shares of the group, since the status groups differ greatly in size)
fully_paid_loans['verification_status'].value_counts()

Not Verified       14552
Verified           10155
Source Verified     8243
Name: verification_status, dtype: int64

In [36]:
# Disabled experiment (would rebind loan_master to a GroupBy object):
#loan_master = loan_master.groupby(by='loan_status')
# Preview the first rows of the frame
loan_master.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term_months,int_rate,installment,grade,sub_grade,...,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599,5000,5000,4975.0,36,10.65,162.87,B,B2,...,171.62,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
1,1077430,1314167,2500,2500,2500.0,60,15.27,59.83,C,C4,...,119.66,Sep-13,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
2,1077175,1313524,2400,2400,2400.0,36,15.96,84.33,C,C5,...,649.91,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
3,1076863,1277178,10000,10000,10000.0,36,13.49,339.31,C,C1,...,357.48,Apr-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
4,1075358,1311748,3000,3000,3000.0,60,12.69,67.79,B,B5,...,67.79,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0


In [37]:
# Number of loans in each status
loan_master['loan_status'].value_counts()

Fully Paid     32950
Charged Off     5627
Current         1140
Name: loan_status, dtype: int64

In [38]:
# Non-null count of funded_amnt
loan_master['funded_amnt'].count()

39717

In [39]:
# Disabled duplicate: charged_off_loans is already created in the "create separate data frames" cell
#charged_off_loans = loan_master[loan_master['loan_status']=='Charged Off']

In [40]:
# Loan counts per status (same check as earlier; candidate for removal)
loan_master['loan_status'].value_counts()

Fully Paid     32950
Charged Off     5627
Current         1140
Name: loan_status, dtype: int64

#loan attributes 
funded amount inv
int rate
term
installment
grade
total pymt


#customer attributes
verification
annual income
emp length
purpose
addr state
total acc