# Exploratory Data Analysis on lending club case study

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### loading loan dataset
Load the loan dataset and explore its basic properties

In [None]:
# Read the raw loan records from loan.csv in the working directory.
# low_memory=False turns off chunked parsing so each column's dtype is
# inferred from the whole file instead of chunk by chunk.
loan_df = pd.read_csv("loan.csv", low_memory=False)
# checking the shape(dimensions) of dataset: (number of rows, number of columns)
loan_df.shape

In [None]:
# since dataset has lots of columns, lift the display limit so head() shows
# every column instead of eliding the middle ones
pd.set_option('display.max_columns', None)
# Preview the first rows of the raw frame.
loan_df.head()

In [None]:
# Print a summary of the raw frame (index, column dtypes, memory usage).
loan_df.info()

In [None]:
# List the distinct raw values of emp_length to see how they are encoded.
loan_df.emp_length.unique()

## Understanding the Dataset
Need to understand the loan dataset column variables and pick the required ones

In [None]:
# The raw frame is too wide to inspect in one go, so view it as column slices:
# four slices of 25 columns each, plus a final slice for columns 100-110.
df1, df2, df3, df4 = (
    loan_df.iloc[:, first:first + 25] for first in range(0, 100, 25)
)
df5 = loan_df.iloc[:, 100:111]

In [None]:
# Summarise the first column slice (columns 0-24 of loan_df).
df1.info()

**The following variables can be used for univariate, segmented univariate and bivariate analysis.**
1. <font color = blue>loan_amnt</font> - The amount of loan requested by applicant - int64
2. <font color = blue>funded_amnt</font> - The amount of loan approved for applicant - int64
3. <font color = blue>funded_amnt_inv</font> - The amount of loan sanctioned for applicant - float64
4. <font color = blue>term</font> - The term of loan - string object
5. <font color = blue>int_rate</font> - The interest charged for the loan - string object (needs to be converted to numerical variable float64)
6. <font color = blue>installment</font> - The installment amount decided for the loan - float64
7. <font color = blue>grade</font> - LC assigned id for loan - string object
8. <font color = blue>sub_grade</font> - LC assigned sub grade id for loan - string object
9. <font color = blue>emp_length</font> - employment length of applicant - string object
10. <font color = blue>home_ownership</font> - home ownership status of applicant - string object
11. <font color = blue>annual_inc</font> - annual income of applicant - float64
12. <font color = blue>issue_d</font> - loan issued date - date object (additional information can be derived from this like year information)
13. <font color = blue>loan_status</font> - applicant loan status - string object
14. <font color = blue>purpose</font> - purpose of loan - object
15. <font color = blue>addr_state</font> - address information of user - string object
16. <font color = blue>dti</font> - Debt to Income ratio of applicant

The following variables can be ignored since they may not offer much insight for the given problem statement.
1. id
2. member_id
3. emp_title
4. verification_status
5. pymnt_plan
6. url
7. desc
8. title
9. zip_code

<font color = red>Note:</font> For now we won't select any column variables from df2, df3, df4 and df5, as performing EDA on the above 16 variables can already give us lots of insights. In case we later need to calculate business-derived metrics such as the CIBIL/credit score of an applicant, we can pick a few more columns then.

In [None]:
# Print the column summary of each remaining slice, in order.
for column_slice in (df2, df3, df4, df5):
    column_slice.info()

## Cleaning Manipulating Master Dataset
1. First step is to create master data set from above identified columns in loan DF
2. Cleaning the dataset
    1. Changing column names
    2. Removing missing data rows
    3. Converting column data types
    4. Deriving new column data from existing columns

### Create Master dataset

In [None]:
# Build the master dataset: an independent copy of loan_df restricted to the
# columns chosen for the analysis.
master_columns = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment',
    'grade', 'sub_grade', 'loan_status', 'annual_inc', 'purpose',
    'home_ownership', 'dti', 'emp_length', 'issue_d', 'addr_state',
]
loan_master_df = loan_df[master_columns].copy()
loan_master_df.head()

### Cleaning the dataset
#### Step 1: Changing column names

In [None]:
# Map the terse raw column names to more descriptive ones; columns that are
# not listed here keep their original name.
column_renames = {
    "loan_amnt": "applied_amount",
    "funded_amnt": "approved_amount",
    "funded_amnt_inv": "sanctioned_amount",
    "int_rate": "interest_rate",
    "annual_inc": "annual_income",
    "grade": "loan_grade",
    "sub_grade": "loan_sub_grade",
    "emp_length": "employment_length",
    "issue_d": "issue_date",
    "addr_state": "address_state",
}
loan_master_df = loan_master_df.rename(columns=column_renames)
loan_master_df.columns

In [None]:
# Checking the shape of master DF: (number of rows, number of columns)
loan_master_df.shape

In [None]:
# Percentage of missing values per column, rounded to two decimals.
missing_counts = loan_master_df.isnull().sum()
missing_pct = 100 * (missing_counts / len(loan_master_df.index))
print(round(missing_pct, 2))

#### Step 2: Removing missing data rows

The column **"employment_length"** has **2.71%** missing values.
Since this percentage is not significant, we delete the rows that have missing values in the **employment_length** column.

In [None]:
# Drop every row that has a missing value in any of the selected columns.
# (The earlier bare `loan_master_df.isna` only referenced the method without
# calling it and had no effect, so it has been removed.)
loan_master_df = loan_master_df.dropna(how='any')

In [None]:
# Re-check after dropping rows: missing-value percentage per column,
# followed by the column summary and the new shape.
remaining_missing = loan_master_df.isnull().sum()
remaining_missing_pct = 100 * (remaining_missing / len(loan_master_df.index))
print(round(remaining_missing_pct, 2))
loan_master_df.info()
loan_master_df.shape

We lost **39717 - 38642 = 1075 rows**

#### Step 3: Converting column data types

We need to convert datatype of column **"interest_rate"** to numeric type

In [None]:
# Strip the '%' sign from both ends of each rate string, then parse the
# remainder as a number.
rate_text = loan_master_df["interest_rate"].str.strip('%')
loan_master_df["interest_rate"] = pd.to_numeric(rate_text)

In [None]:
# Summary statistics of the now-numeric interest_rate column.
loan_master_df.interest_rate.describe()

We need to convert the values of **employment_length** to numeric values

In [None]:
# Keep only the number of years from each employment_length string and store
# it as float. A single digit extraction replaces the previous chain of
# str.replace calls, whose meaning depended on the pandas-version-specific
# `regex` default ('+' on its own is not a valid regular expression).
# As before, any '<' or '+' qualifier is discarded, so a "less than 1 year"
# value and a "1 year" value both become 1.0.
loan_master_df['employment_length'] = (
    loan_master_df['employment_length']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

Removing leading empty space from term column values

In [None]:
# Remove leading and trailing whitespace from every term value.
loan_master_df['term'] = loan_master_df['term'].str.strip()

In [None]:
# Store the income and sanctioned amount as 64-bit integers.
# Note: astype('int64') on float data discards any fractional part.
for amount_column in ('annual_income', 'sanctioned_amount'):
    loan_master_df[amount_column] = loan_master_df[amount_column].astype('int64')

#### Step 4: Deriving new column data from existing columns

We can derive a new column **"loan_issued_year"** information from **"issue_date"** column

In [None]:
# Parse issue_date with the '%b-%y' format (abbreviated month name, two-digit
# year) and keep only the calendar year.
issue_dates = pd.to_datetime(loan_master_df['issue_date'], format='%b-%y')
loan_master_df['loan_issued_year'] = issue_dates.dt.year
print(loan_master_df.info())

In [None]:
# List the distinct sub-grade values before they are shortened.
loan_master_df.loan_sub_grade.unique()

In [None]:
# Drop the first character (the grade letter) of every sub-grade, keeping
# only what follows it.
loan_master_df['loan_sub_grade'] = loan_master_df['loan_sub_grade'].str.slice(start=1)

In [None]:
# Display the cleaned master frame.
loan_master_df

## Analysing the data from masterdata

1. Creating sub data sets as per business requirement for analysis
    1. Creating a dataframe for loan defaulters(Bad Loans)
    2. Creating a dataframe for loan non-defaulters(Good Loans)
    3. Plotting Defaulters vs Non-Defaulters
    
2. Univariate Analysis
    1. Determining the variable types
    2. Analyzing individual variables
    3. Creating Distribution Plots
    4. Summary Metrics
    
3. Segmented Univariate Analysis
    1. Analysing groups
    2. Visualizing Groups via Plotting 
    3. Summary Metric
    
4. Bivariate Analysis
    1. Analysis
    2. Plots
    3. Summary
    

### Creating sub data sets for Good Loans and Bad Loans

Three sub dataframes can be created by grouping rows based on the values present in **"loan_status"** column

In [None]:
# List the distinct loan_status values that the split below is based on.
loan_master_df.loan_status.unique()

There are 3 unique values in **loan_status** column : **"Fully Paid", "Charged Off", "Current"**

1. The rows with **Fully Paid** value indicates loans that turned out good for the company, i.e company made profit
2. The rows with **Charged Off** value indicates loans that turned out bad for the company, i.e company may have made loss
3. The rows with **Current** value indicate loans that are still ongoing; they can turn out either good or bad, hence we can ignore those rows

#### Creating Defaulter dataframe

In [None]:
# Defaulters: rows whose status is 'Charged Off'. The status column (constant
# within this subset) and the raw issue date are dropped.
is_charged_off = loan_master_df['loan_status'] == 'Charged Off'
bad_loans_df = loan_master_df[is_charged_off].drop(columns=['loan_status', 'issue_date'])
bad_loans_df.shape

#### Creating Non defaulter dataframe

In [None]:
# Non-defaulters: rows whose status is 'Fully Paid'. The status column
# (constant within this subset) and the raw issue date are dropped.
is_fully_paid = loan_master_df['loan_status'] == 'Fully Paid'
good_loans_df = loan_master_df[is_fully_paid].drop(columns=['loan_status', 'issue_date'])
good_loans_df.shape

In [None]:
# Number of loans per status, over all statuses.
loan_status_df = loan_master_df.groupby('loan_status').sanctioned_amount.count().to_frame()
loan_status_df.head()

# The same count restricted to the two statuses used for the good/bad split.
is_good_or_bad = loan_master_df.loan_status.isin(['Charged Off', 'Fully Paid'])
good_bad_loans = loan_master_df.loc[is_good_or_bad]
good_bad_loans_status = good_bad_loans.groupby('loan_status').sanctioned_amount.count().to_frame()

In [None]:
# Pie chart of loan counts per status; autopct labels each wedge with its
# share as a percentage with two decimals.
plot = loan_status_df.plot.pie(subplots=True, figsize=(6, 6),autopct='%1.2f%%')

Distribution of all loans based on **loan_status** values

In [None]:
# Pie chart of loan counts for 'Charged Off' versus 'Fully Paid' only;
# autopct labels each wedge with its share as a percentage with two decimals.
plot = good_bad_loans_status.plot.pie(subplots=True, figsize=(6, 6),autopct='%1.2f%%')

Distribution of good and bad loans

### Univariate Analysis

#### Step 1: Determining the variable types

In [None]:
# Column dtypes of the bad-loans frame, used to sort the variables into
# numerical and categorical ones.
bad_loans_df.info()

In [None]:
# Distinct loan terms among the bad loans.
bad_loans_df.term.unique()

In [None]:
# Distinct loan grades among the bad loans.
bad_loans_df.loan_grade.unique()

In [None]:
# Distinct sub-grade values (grade letter already removed) among the bad loans.
bad_loans_df.loan_sub_grade.unique()

In [None]:
# Distinct loan purposes among the bad loans.
bad_loans_df.purpose.unique()

In [None]:
# Distinct home-ownership values among the bad loans.
bad_loans_df.home_ownership.unique()

In [None]:
# Distinct address states among the bad loans.
bad_loans_df.address_state.unique()

In [None]:
# Distinct issue years among the bad loans.
bad_loans_df.loan_issued_year.unique()

In [None]:
# Distinct (numeric) employment lengths among the bad loans.
bad_loans_df.employment_length.unique()

# We can divide variables across 2 broad categories

**1. Numerical**
    1. applied_amount
    2. approved_amount
    3. sanctioned_amount
    4. interest_rate
    5. installment
    6. annual_income
    7. dti
    8. employment_length - [ 1.,  4.,  3., 10.,  9.,  2.,  8.,  7.,  5.,  6.]
    
**2. Categorical**
    1. term (unordered) - ['60 months', '36 months']
    2. loan_grade (ordered) - ['C', 'F', 'B', 'D', 'A', 'E', 'G']
    3. loan_sub_grade (ordered) - ['1', '2', '3', '4', '5']
    4. purpose(unordered) - ['car', 'small_business', 'other', 'debt_consolidation',
       'major_purchase', 'credit_card', 'home_improvement', 'moving',
       'vacation', 'house', 'medical', 'wedding', 'renewable_energy',
       'educational']
    5. home_ownership (unordered) - ['RENT', 'OWN', 'MORTGAGE', 'OTHER']
    6. address_state (unordered) - ['GA', 'CA', 'TX', 'VA', 'FL', 'NY', 'PA', 'OH', 'IL', 'WA', 'MN',
       'AK', 'OR', 'AZ', 'WI', 'NC', 'CO', 'MO', 'WV', 'NV', 'SC', 'RI',
       'MA', 'KY', 'HI', 'NJ', 'MT', 'MI', 'SD', 'DC', 'OK', 'VT', 'NM',
       'MD', 'AL', 'KS', 'UT', 'LA', 'AR', 'CT', 'NH', 'DE', 'WY', 'MS',
       'TN', 'ID', 'NE']
    7. loan_issued_year (ordered) - [2011, 2010, 2009, 2008, 2007]

#### Step 2 & 3: Analyzing individual variables and Creating Distribution Plots

#### Analysing and Plotting BAD Loans

In [None]:
# Distribution and spread of the three loan-amount columns for the bad loans.
amount_columns = ['applied_amount', 'approved_amount', 'sanctioned_amount']

# distribution plots, one panel per column
plt.figure(figsize=(20, 5))
for panel, column in enumerate(amount_columns, start=1):
    plt.subplot(1, 3, panel)
    plt.title(column)
    # NOTE(review): distplot is deprecated in newer seaborn releases; kept
    # because the seaborn version this notebook targets is not known.
    sns.distplot(bad_loans_df[column])
plt.show()

# boxplots
plt.figure(figsize=(20, 5))
for panel, column in enumerate(amount_columns, start=1):
    plt.subplot(1, 3, panel)
    plt.title(column)
    # Pass the series as x= explicitly: the first positional parameter of
    # boxplot meant `x` in older seaborn but means `data` from v0.12 on, so a
    # positional call changes meaning between versions.
    sns.boxplot(x=bad_loans_df[column])
plt.show()

In [None]:
# Summary statistics of the loan-amount columns for the bad loans.
# NOTE(review): the last requested percentile is 0.10 (the 10th percentile),
# not 100% -- confirm that is intended; describe() reports min/max regardless.
for column in ('applied_amount', 'approved_amount', 'sanctioned_amount'):
    print(bad_loans_df[column].describe(percentiles=[0.25, 0.5, 0.75, 0.10]))

In [None]:
# Distribution and spread of interest rate, installment and dti for the bad loans.
rate_columns = ['interest_rate', 'installment', 'dti']

# distribution plots, one panel per column
plt.figure(figsize=(20, 5))
for panel, column in enumerate(rate_columns, start=1):
    plt.subplot(1, 3, panel)
    plt.title(column)
    # NOTE(review): distplot is deprecated in newer seaborn releases; kept
    # because the seaborn version this notebook targets is not known.
    sns.distplot(bad_loans_df[column])
plt.show()

# boxplots
plt.figure(figsize=(20, 5))
for panel, column in enumerate(rate_columns, start=1):
    plt.subplot(1, 3, panel)
    plt.title(column)
    # Pass the series as x= explicitly: the first positional parameter of
    # boxplot meant `x` in older seaborn but means `data` from v0.12 on, so a
    # positional call changes meaning between versions.
    sns.boxplot(x=bad_loans_df[column])
plt.show()

In [None]:
# Summary statistics of interest rate, installment and dti for the bad loans.
# NOTE(review): the last requested percentile is 0.10 (the 10th percentile),
# not 100% -- confirm that is intended; describe() reports min/max regardless.
for column in ('interest_rate', 'installment', 'dti'):
    print(bad_loans_df[column].describe(percentiles=[0.25, 0.5, 0.75, 0.10]))

In [None]:
# Distribution and spread of annual income and employment length for the bad loans.
applicant_columns = ['annual_income', 'employment_length']

# distribution plots, one panel per column
plt.figure(figsize=(20, 5))
for panel, column in enumerate(applicant_columns, start=1):
    plt.subplot(1, 2, panel)
    plt.title(column)
    # NOTE(review): distplot is deprecated in newer seaborn releases; kept
    # because the seaborn version this notebook targets is not known.
    sns.distplot(bad_loans_df[column])
plt.show()

# boxplots
plt.figure(figsize=(20, 5))
for panel, column in enumerate(applicant_columns, start=1):
    plt.subplot(1, 2, panel)
    plt.title(column)
    # Pass the series as x= explicitly: the first positional parameter of
    # boxplot meant `x` in older seaborn but means `data` from v0.12 on, so a
    # positional call changes meaning between versions.
    sns.boxplot(x=bad_loans_df[column])
plt.show()

In [None]:
# Summary statistics of annual income and employment length for the bad loans.
# NOTE(review): the last requested percentile is 0.10 (the 10th percentile),
# not 100% -- confirm that is intended; describe() reports min/max regardless.
for column in ('annual_income', 'employment_length'):
    print(bad_loans_df[column].describe(percentiles=[0.25, 0.5, 0.75, 0.10]))

#### Analysing and Plotting GOOD Loans

In [None]:
# Distribution and spread of the three loan-amount columns for the good loans.
amount_columns = ['applied_amount', 'approved_amount', 'sanctioned_amount']

# distribution plots, one panel per column
plt.figure(figsize=(20, 5))
for panel, column in enumerate(amount_columns, start=1):
    plt.subplot(1, 3, panel)
    plt.title(column)
    # NOTE(review): distplot is deprecated in newer seaborn releases; kept
    # because the seaborn version this notebook targets is not known.
    sns.distplot(good_loans_df[column])
plt.show()

# boxplots
plt.figure(figsize=(20, 5))
for panel, column in enumerate(amount_columns, start=1):
    plt.subplot(1, 3, panel)
    plt.title(column)
    # Pass the series as x= explicitly: the first positional parameter of
    # boxplot meant `x` in older seaborn but means `data` from v0.12 on, so a
    # positional call changes meaning between versions.
    sns.boxplot(x=good_loans_df[column])
plt.show()

In [None]:
# Summary statistics of the loan-amount columns for the good loans.
# NOTE(review): the last requested percentile is 0.10 (the 10th percentile),
# not 100% -- confirm that is intended; describe() reports min/max regardless.
for column in ('applied_amount', 'approved_amount', 'sanctioned_amount'):
    print(good_loans_df[column].describe(percentiles=[0.25, 0.5, 0.75, 0.10]))

In [None]:
# Distribution and spread of interest rate, installment and dti for the good loans.
rate_columns = ['interest_rate', 'installment', 'dti']

# distribution plots, one panel per column
plt.figure(figsize=(20, 5))
for panel, column in enumerate(rate_columns, start=1):
    plt.subplot(1, 3, panel)
    plt.title(column)
    # NOTE(review): distplot is deprecated in newer seaborn releases; kept
    # because the seaborn version this notebook targets is not known.
    sns.distplot(good_loans_df[column])
plt.show()

# boxplots
plt.figure(figsize=(20, 5))
for panel, column in enumerate(rate_columns, start=1):
    plt.subplot(1, 3, panel)
    plt.title(column)
    # Pass the series as x= explicitly: the first positional parameter of
    # boxplot meant `x` in older seaborn but means `data` from v0.12 on, so a
    # positional call changes meaning between versions.
    sns.boxplot(x=good_loans_df[column])
plt.show()

In [None]:
# Summary statistics of interest rate, installment and dti for the good loans.
# NOTE(review): the last requested percentile is 0.10 (the 10th percentile),
# not 100% -- confirm that is intended; describe() reports min/max regardless.
for column in ('interest_rate', 'installment', 'dti'):
    print(good_loans_df[column].describe(percentiles=[0.25, 0.5, 0.75, 0.10]))

In [None]:
# Distribution and spread of annual income and employment length for the good loans.
applicant_columns = ['annual_income', 'employment_length']

# distribution plots, one panel per column
plt.figure(figsize=(20, 5))
for panel, column in enumerate(applicant_columns, start=1):
    plt.subplot(1, 2, panel)
    plt.title(column)
    # NOTE(review): distplot is deprecated in newer seaborn releases; kept
    # because the seaborn version this notebook targets is not known.
    sns.distplot(good_loans_df[column])
plt.show()

# boxplots
plt.figure(figsize=(20, 5))
for panel, column in enumerate(applicant_columns, start=1):
    plt.subplot(1, 2, panel)
    plt.title(column)
    # Pass the series as x= explicitly: the first positional parameter of
    # boxplot meant `x` in older seaborn but means `data` from v0.12 on, so a
    # positional call changes meaning between versions.
    sns.boxplot(x=good_loans_df[column])
plt.show()

In [None]:
# Summary statistics of annual income and employment length for the good loans.
# NOTE(review): the last requested percentile is 0.10 (the 10th percentile),
# not 100% -- confirm that is intended; describe() reports min/max regardless.
for column in ('annual_income', 'employment_length'):
    print(good_loans_df[column].describe(percentiles=[0.25, 0.5, 0.75, 0.10]))

#### Step 4: Summary metrics for GOOD loans and BAD loans

**Bad Loan Metrics**

1. median applied_amount is **10000**
2. median approved_amount is **10000**
3. median sanctioned_amount is **9575**
4. mean interest_rate is **13.88**
5. median installment is **297.53**
6. mean annual_income is **0.054 Million**
7. median dti is **13.97**
8. median employment_length is **5.19**


**Good Loan Metrics**

1. median applied_amount is **9600**
2. median approved_amount is **9450**
3. median sanctioned_amount is **8741**
4. mean interest_rate is **11.64**
5. median installment is **277.7**
6. mean annual_income is **0.07 Million**
7. median dti is **13.21**
8. median employment_length is **4**

### Segmented Univariate Analysis

#### Plotting Year wise mean sanctioned amount for bad/good loans

In [None]:
# Mean sanctioned amount per issue year: bad loans on the left panel,
# good loans on the right.
year_panels = (
    ('Year wise mean sanctioned amount for bad loans', bad_loans_df),
    ('Year wise mean sanctioned amount for good loans', good_loans_df),
)

plt.figure(figsize=(20, 5))
for slot, (panel_title, frame) in enumerate(year_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='loan_issued_year', y='sanctioned_amount', data=frame, estimator=np.mean)
plt.show()

#### Plotting Employment length wise mean sanctioned amount for bad/good loans

In [None]:
# Mean sanctioned amount per employment length: bad loans on the left panel,
# good loans on the right.
employment_panels = (
    ('Employment length wise mean sanctioned amount for bad loans', bad_loans_df),
    ('Employment length wise mean sanctioned amount for good loans', good_loans_df),
)

plt.figure(figsize=(20, 5))
for slot, (panel_title, frame) in enumerate(employment_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='employment_length', y='sanctioned_amount', data=frame, estimator=np.mean)
plt.show()

#### Plotting Term wise mean sanctioned amount for bad/good loans

In [None]:
# Mean sanctioned amount per loan term: bad loans on the left panel,
# good loans on the right.
term_panels = (
    ('Term wise mean sanctioned amount for bad loans', bad_loans_df),
    ('Term wise mean sanctioned amount for good loans', good_loans_df),
)

plt.figure(figsize=(20, 5))
for slot, (panel_title, frame) in enumerate(term_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='term', y='sanctioned_amount', data=frame, estimator=np.mean)
plt.show()

#### Plotting Home Ownership wise mean sanctioned amount for bad/good loans

In [None]:
# Mean sanctioned amount per home-ownership status: bad loans on the left
# panel, good loans on the right.
ownership_panels = (
    ('Home Ownership wise mean sanctioned amount for bad loans', bad_loans_df),
    ('Home Ownership wise mean sanctioned amount for good loans', good_loans_df),
)

plt.figure(figsize=(20, 5))
for slot, (panel_title, frame) in enumerate(ownership_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='home_ownership', y='sanctioned_amount', data=frame, estimator=np.mean)
plt.show()

#### Plotting Grade wise mean sanctioned amount for bad/good loans

In [None]:
# Mean sanctioned amount per loan grade: bad loans on the left panel,
# good loans on the right.
grade_panels = (
    ('Loan grade wise mean sanctioned amount for bad loans', bad_loans_df),
    ('Loan grade wise mean sanctioned amount for good loans', good_loans_df),
)

plt.figure(figsize=(20, 5))
for slot, (panel_title, frame) in enumerate(grade_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='loan_grade', y='sanctioned_amount', data=frame, estimator=np.mean)
# Render explicitly, as the other plotting cells do, instead of relying on the
# notebook to display the figure (and the Axes repr) implicitly.
plt.show()

#### Plotting Purpose wise mean sanctioned amount for bad/good loans

In [None]:
# Mean sanctioned amount per loan purpose: bad loans on the top panel,
# good loans below it.
purpose_panels = (
    ('Loan Purpose wise mean sanctioned amount for bad loans', bad_loans_df),
    ('Loan Purpose wise mean sanctioned amount for good loans', good_loans_df),
)

plt.figure(figsize=(40, 10))
for slot, (panel_title, frame) in enumerate(purpose_panels, start=1):
    # Two rows, one column: each panel spans the full figure width. The
    # earlier 2x2 positions 1 and 3 filled only the left half of the figure.
    plt.subplot(2, 1, slot)
    plt.title(panel_title)
    sns.barplot(x='purpose', y='sanctioned_amount', data=frame, estimator=np.mean)
# Keep the lower title clear of the upper panel's tick labels.
plt.tight_layout()
plt.show()

#### Plotting Address State wise mean sanctioned amount for bad/good loans

In [None]:
# Mean sanctioned amount per address state: bad loans on the top panel,
# good loans below it.
state_panels = (
    ('Address State wise mean sanctioned amount for bad loans', bad_loans_df),
    ('Address State wise mean sanctioned amount for good loans', good_loans_df),
)

plt.figure(figsize=(40, 10))
for slot, (panel_title, frame) in enumerate(state_panels, start=1):
    # Two rows, one column: each panel spans the full figure width. The
    # earlier 2x2 positions 1 and 3 filled only the left half of the figure.
    plt.subplot(2, 1, slot)
    plt.title(panel_title)
    sns.barplot(x='address_state', y='sanctioned_amount', data=frame, estimator=np.mean)
# Keep the lower title clear of the upper panel's tick labels.
plt.tight_layout()
plt.show()

#### Distribution of sub grades inside Loan grade for bad/good loans

In [None]:
# Count loans per (grade, sub-grade) pair for each outcome.
bad_loan_tab = pd.crosstab(bad_loans_df.loan_grade, bad_loans_df.loan_sub_grade)
good_loan_tab = pd.crosstab(good_loans_df.loan_grade, good_loans_df.loan_sub_grade)

sub_grade_panels = (
    ('Distribution of sub grades inside Loan grade for bad loans', bad_loan_tab),
    ('Distribution of sub grades inside Loan grade for good loans', good_loan_tab),
)

plt.figure(figsize=(20, 5))
for slot, (panel_title, grade_tab) in enumerate(sub_grade_panels, start=1):
    plt.subplot(1, 2, slot)
    # Reshape the grade x sub-grade table to long form (one row per pair, with
    # the count in 'value') so the sub-grade can be used as the bar hue.
    stacked = grade_tab.stack().reset_index().rename(columns={0: 'value'})
    plt.title(panel_title)
    sns.barplot(x=stacked['loan_grade'], y=stacked['value'], hue=stacked['loan_sub_grade'])
plt.show()

#### Plotting Income Category wise mean sanctioned amount for bad/good loans

For this we need to derive a new categorical variable called **income_category**

In [None]:
# Income brackets: each upper edge is double the previous one. pd.cut makes
# the intervals right-inclusive, so (0, 60000] is "A", (60000, 120000] is "B",
# and so on; incomes outside (0, 1920000] get no category (NaN).
bins = [0, 60000, 120000, 240000, 480000, 960000, 1920000]
labels = ["A", "B", "C", "D", "E", "F"]

for loans_frame in (bad_loans_df, good_loans_df):
    loans_frame['income_category'] = pd.cut(loans_frame['annual_income'], bins=bins, labels=labels)

In [None]:
# Mean sanctioned amount per income category: bad loans on the left panel,
# good loans on the right.
income_panels = (
    ('income_category wise mean sanctioned amount for bad loans', bad_loans_df),
    ('income category wise mean sanctioned amount for good loans', good_loans_df),
)

plt.figure(figsize=(20, 5))
for slot, (panel_title, frame) in enumerate(income_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='income_category', y='sanctioned_amount', data=frame, estimator=np.mean)
# Render explicitly, as the other plotting cells do, instead of relying on the
# notebook to display the figure (and the Axes repr) implicitly.
plt.show()

#### Calculating the percentage of defaulters in each income category

In [None]:
# Group both frames by income category. observed=False is passed explicitly
# so every category appears in the result even when it has no rows; the
# per-category counts of the two frames then always line up one-to-one,
# regardless of the pandas default for categorical groupers.
bad_loan_income_groups = bad_loans_df.groupby('income_category', observed=False)
good_loan_income_groups = good_loans_df.groupby('income_category', observed=False)
# Number of bad loans in each income category.
bad_loan_income_groups.sanctioned_amount.count()

In [None]:
# Number of good loans in each income category.
good_loan_income_groups.sanctioned_amount.count()

In [None]:
# Loans per income category for each outcome; both results are indexed by the
# category label, in category order.
defaulter_counts = bad_loan_income_groups.sanctioned_amount.count()
non_defaulter_counts = good_loan_income_groups.sanctioned_amount.count()

number_of_defaulters_in_category = np.asarray(defaulter_counts)
number_of_non_defaulters_in_category = np.asarray(non_defaulter_counts)
total_applicants_in_each_category = (
    number_of_defaulters_in_category + number_of_non_defaulters_in_category
)

# Share of defaulters among all good + bad loans of the category, in percent.
percentage_defaulters_in_each_categories = np.around(
    (number_of_defaulters_in_category / total_applicants_in_each_category) * 100,
    decimals=2,
)
print(percentage_defaulters_in_each_categories)

# Take the bar labels from the grouped index so each bar carries the label of
# the category its percentage was computed for. unique() on the column returns
# labels in order of first appearance (and may include NaN), which does not
# match the order of the grouped counts.
inc_categories = np.array(defaulter_counts.index.astype(str).tolist())
print(inc_categories)

plt.bar(inc_categories, percentage_defaulters_in_each_categories)
plt.ylabel('Percentage of Defaulters')
plt.xlabel('Income Category')
plt.title('Distribution of Defaulters in all Income Categories')
plt.show()


1. Defaulter Percentage in income category **A is 16.32%**
2. Defaulter Percentage in income category **B is 12.38%**
3. Defaulter Percentage in income category **C is 11.16%**
4. Defaulter Percentage in income category **D is 8.3%**
5. Defaulter Percentage in income category **E is 19.3%**
6. Defaulter Percentage in income category **F is 9.09%**

#### Plotting DTI wise mean sanctioned amount for bad/good loans

For this we need to derive a new categorical variable called **dti_category**

In [None]:
# Six fixed-width dti brackets shared by both frames: [0, 5] is "A",
# (5, 10] is "B", ... (25, 30] is "F". Explicit edges make a given label
# mean the same dti range for bad and good loans; letting pd.cut derive six
# equal-width bins per frame gives each frame its own edges.
# NOTE(review): assumes dti lies within 0-30; values outside get NaN -- confirm.
bins = [0, 5, 10, 15, 20, 25, 30]
labels = ["A", "B", "C", "D", "E", "F"]

for loans_frame in (bad_loans_df, good_loans_df):
    # include_lowest=True keeps dti == 0 inside the first bracket.
    loans_frame['dti_category'] = pd.cut(
        loans_frame['dti'], bins=bins, labels=labels, include_lowest=True
    )

In [None]:
# Mean sanctioned amount per dti category: bad loans on the left panel,
# good loans on the right.
dti_panels = (
    ('dti_category wise mean sanctioned amount for bad loans', bad_loans_df),
    ('dti category wise mean sanctioned amount for good loans', good_loans_df),
)

plt.figure(figsize=(20, 5))
for slot, (panel_title, frame) in enumerate(dti_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.title(panel_title)
    # Group by dti_category, as the titles state; the x axis previously used
    # income_category, which simply repeated the income-category chart.
    sns.barplot(x='dti_category', y='sanctioned_amount', data=frame, estimator=np.mean)
plt.show()

### Bivariate Analysis

In [None]:
# Pairwise correlation between the numeric columns of the bad loans.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# string/categorical columns in pandas 2.x.
corr = bad_loans_df.select_dtypes(include='number').corr()
# Styler.set_precision was removed in pandas 2.0; a format string shows the
# same two decimals on every pandas version.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

In [None]:
# Pairwise correlation between the numeric columns of the good loans.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# string/categorical columns in pandas 2.x.
corr = good_loans_df.select_dtypes(include='number').corr()
# Styler.set_precision was removed in pandas 2.0; a format string shows the
# same two decimals on every pandas version.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')