In [1]:
import numpy as np
import pandas as pd

#### Load Dataframe

In [7]:
#load data in dataframe
loan_master = pd.read_csv('loan.csv',low_memory=False)

In [8]:
#check number of rows and columns
loan_master.shape

(39717, 111)

#### User Defined Functions 

In [19]:
#function to check for nulls and only display the columns having nulls 
def check_null(d):
    return round((d.isnull().sum()[d.isnull().sum()>0]/d.shape[0])*100,1)

#function to list colums with obj data types
def info_obj(df):
    df.select_dtypes(include=['object','int64']).info()

## Data Cleaning

#### Data Cleaning - Handle Nulls

In [22]:
#check columns having nulls
check_null(loan_master)

emp_title                       6.2
emp_length                      2.7
desc                           32.6
title                           0.0
mths_since_last_delinq         64.7
                              ...  
tax_liens                       0.1
tot_hi_cred_lim               100.0
total_bal_ex_mort             100.0
total_bc_limit                100.0
total_il_high_credit_limit    100.0
Length: 68, dtype: float64

In [23]:
#drop columns having all nulls
loan_master.dropna(axis=1, how='all', inplace=True)
#check shape again
loan_master.shape
#54 such columns removed

(39717, 57)

In [26]:
check_null(loan_master)

emp_title                      6.2
emp_length                     2.7
desc                          32.6
title                          0.0
mths_since_last_delinq        64.7
mths_since_last_record        93.0
revol_util                     0.1
last_pymnt_d                   0.2
next_pymnt_d                  97.1
last_credit_pull_d             0.0
collections_12_mths_ex_med     0.1
chargeoff_within_12_mths       0.1
pub_rec_bankruptcies           1.8
tax_liens                      0.1
dtype: float64

In [27]:
#drop next_pymnt_d as having 97% nulls
loan_master.drop(['next_pymnt_d'],axis=1, inplace=True)

In [28]:
#drop mths_since_last_record as having 93% nulls
loan_master.drop(['mths_since_last_record'],axis=1, inplace=True)

In [29]:
#drop 2 rows with null last_credit_pull_d
loan_master = loan_master.loc[loan_master['last_credit_pull_d'].notnull(),:]

In [31]:
#drop rows with null emp_length - 2.7% rows
loan_master = loan_master.loc[loan_master['emp_length'].notnull(),:]

#### Data Cleaning - Drop irrelevant columns

In [33]:
#Drop desc column
loan_master.drop(['desc'],axis=1, inplace=True)

KeyError: "['desc'] not found in axis"

In [285]:
#Drop title column
#loan_master.drop(['title'],axis=1, inplace=True)
#chcek for analysis

In [286]:
#mths_since_last_delinq-remove?can use delinq 2 years instead?

In [287]:
#loan_master['next_pymnt_d'][~loan_master['next_pymnt_d'].isnull()].value_counts()
#keeping this column for now as those 1140 entries are for Current loan status
#removed!!! check later

In [35]:
#Drop zip code
loan_master.drop(['zip_code'],axis=1, inplace=True)

In [34]:
#Drop url as not needed for our analysis
#Loan id is a part of url, but is already available in id column
loan_master.drop(['url'],axis=1, inplace=True)

In [36]:
#check columns having nulls
check_null(loan_master)

emp_title                      3.7
title                          0.0
mths_since_last_delinq        64.6
revol_util                     0.1
last_pymnt_d                   0.2
collections_12_mths_ex_med     0.1
chargeoff_within_12_mths       0.1
pub_rec_bankruptcies           1.8
tax_liens                      0.1
dtype: float64

In [254]:
#check uniqueness of id and member id
print('unique id: ',loan_master['id'].nunique())
print('unique member id: ',loan_master['member_id'].nunique())
#both are unique

unique id:  39715
unique member id:  39715


#### Data Cleaning - Handle Data Types

In [255]:
#check for object dtypes 
info_obj(loan_master)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39715 entries, 0 to 39716
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   term                 39715 non-null  object
 1   int_rate             39715 non-null  object
 2   grade                39715 non-null  object
 3   sub_grade            39715 non-null  object
 4   emp_title            37258 non-null  object
 5   emp_length           38640 non-null  object
 6   home_ownership       39715 non-null  object
 7   verification_status  39715 non-null  object
 8   issue_d              39715 non-null  object
 9   loan_status          39715 non-null  object
 10  pymnt_plan           39715 non-null  object
 11  purpose              39715 non-null  object
 12  addr_state           39715 non-null  object
 13  earliest_cr_line     39715 non-null  object
 14  revol_util           39665 non-null  object
 15  initial_list_status  39715 non-null  object
 16  last

In [256]:
#fix data type for 'term' as int64
loan_master['term'] = loan_master['term'].apply(lambda x: str(x).replace('months','').strip()).astype('int64')
loan_master = loan_master.rename(columns={'term':'term_months'})

In [257]:
#fix data type for 'int_rate' as 'float64' 
loan_master['int_rate'] = loan_master['int_rate'].apply(lambda x: str(x).replace('%','').strip()).astype('float64')

In [294]:
#fix data type emp_length
loan_master['emp_length'].value_counts()

10+ years    8879
< 1 year     4582
2 years      4388
3 years      4095
4 years      3436
5 years      3281
1 year       3240
6 years      2229
7 years      1773
8 years      1479
9 years      1258
Name: emp_length, dtype: int64

In [258]:
loan_master.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term_months,int_rate,installment,grade,sub_grade,...,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599,5000,5000,4975.0,36,10.65,162.87,B,B2,...,171.62,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
1,1077430,1314167,2500,2500,2500.0,60,15.27,59.83,C,C4,...,119.66,Sep-13,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
2,1077175,1313524,2400,2400,2400.0,36,15.96,84.33,C,C5,...,649.91,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
3,1076863,1277178,10000,10000,10000.0,36,13.49,339.31,C,C1,...,357.48,Apr-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
4,1075358,1311748,3000,3000,3000.0,60,12.69,67.79,B,B5,...,67.79,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0


### Analysis

In [259]:
loan_master = loan_master.groupby(by='loan_status')
loan_master.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term_months,int_rate,installment,grade,sub_grade,...,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599,5000,5000,4975.0,36,10.65,162.87,B,B2,...,171.62,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
1,1077430,1314167,2500,2500,2500.0,60,15.27,59.83,C,C4,...,119.66,Sep-13,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
2,1077175,1313524,2400,2400,2400.0,36,15.96,84.33,C,C5,...,649.91,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
3,1076863,1277178,10000,10000,10000.0,36,13.49,339.31,C,C1,...,357.48,Apr-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
4,1075358,1311748,3000,3000,3000.0,60,12.69,67.79,B,B5,...,67.79,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
5,1075269,1311441,5000,5000,5000.0,36,7.9,156.46,A,A4,...,161.03,Jan-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
6,1069639,1304742,7000,7000,7000.0,60,15.96,170.08,C,C5,...,1313.76,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
8,1071795,1306957,5600,5600,5600.0,60,21.28,152.39,F,F2,...,152.39,Aug-12,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
9,1071570,1306721,5375,5375,5350.0,60,12.69,121.45,B,B5,...,121.45,Mar-13,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
12,1064687,1298717,9000,9000,9000.0,36,13.49,305.38,C,C1,...,305.38,Nov-12,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0


In [38]:
loan_master['loan_status'].value_counts()

Fully Paid     32144
Charged Off     5398
Current         1098
Name: loan_status, dtype: int64

In [37]:
loan_master['funded_amnt'].count()

38640

In [261]:
#charged_off_loans = loan_master[loan_master['loan_status']=='Charged Off']

KeyError: 'Column not found: False'

In [None]:
loan_master['loan_status'].value_counts()

#loan attributes 
funded amount inv
int rate
term
installment
grade
total pymt


#customer attributes
verification
annual income
emp length
purpose
addr state
total acc