# Credit Risk Analysis
## BY SEYED JAVIDH 

### IMPORTING LIBRARIES

In [None]:
# Importing all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Raise pandas' display limits so that wide / long frames are printed without truncation

display_settings = {
    'display.max_columns': 200,
    'display.max_rows': 200,
    'display.width': 1000,
}

for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

### READING DATAFRAMES

In [None]:
# Reading both the files and creating 2 dataframes:
#   new_app_df  - the application data
#   prev_app_df - the previous applications
# Both paths are relative ('../input/...'), so they are resolved against the
# current working directory of the notebook.

new_app_df = pd.read_csv('../input/credit-application-dataset/application_data - Copy.csv')
prev_app_df = pd.read_csv('../input/risk-analytics/previous_application.csv')

### DATAFRAME INSPECTION
#### Inspecting new application dataframe:

In [None]:
new_app_df.head()

In [None]:
# Checking the number of rows and columns

new_app_df.shape

In [None]:
#Checking datatypes of each column

new_app_df.info(verbose=True)

In [None]:
# Checking the statistical overview of the numerical columns

new_app_df.describe()

In [None]:
# Checking Null values

new_app_df.isnull().sum().sort_values(ascending = False)

In [None]:
# Percentage of Null values per column (2 decimals), largest first

null_pct = new_app_df.isnull().sum() / new_app_df.shape[0] * 100
null_pct.round(2).sort_values(ascending=False)

#### Inspecting previous application dataframe:

In [None]:
prev_app_df.head()

In [None]:
# Checking the number of rows and columns

prev_app_df.shape

In [None]:
#Checking datatypes of each column

prev_app_df.info()

In [None]:
prev_app_df.describe()

### DATA CLEANING AND MANIPULATION

#### Dropping columns with more than 45% Null values:

In [None]:
# KEEPING ONLY THE COLUMNS WHOSE NULL FRACTION IS AT MOST 45%

null_fraction = new_app_df.isnull().mean()
keep_mask = null_fraction <= 0.45
new_app_df = new_app_df.loc[:, keep_mask]

# the shape shows how many columns are left after the filter

new_app_df.shape

#### Dropping columns which are not required for our analysis:

In [None]:
# Columns that are not needed for the analysis, built group by group:
# document flags 21..2, social-circle counters, and credit-bureau request counters.

document_flags = [f'FLAG_DOCUMENT_{number}' for number in range(21, 1, -1)]
social_circle = [f'{kind}_{days}_CNT_SOCIAL_CIRCLE' for days in (30, 60) for kind in ('OBS', 'DEF')]
bureau_requests = [f'AMT_REQ_CREDIT_BUREAU_{period}'
                   for period in ('YEAR', 'MON', 'WEEK', 'DAY', 'HOUR', 'QRT')]

NOT_REQ = document_flags + social_circle + bureau_requests

new_app_df.drop(columns=NOT_REQ, inplace=True)

In [None]:
# verifying by checking the shape if the columns have been dropped

new_app_df.shape

In [None]:
# REMAINING NULL PERCENTAGES

(new_app_df.isnull().sum()/new_app_df.shape[0]*100).sort_values(ascending = False)

#### Inspecting and Imputing values for the OCCUPATION TYPE column:

In [None]:
# OCCUPATION TYPE FREQUENCIES AS A BAR CHART

occupation_counts = new_app_df.OCCUPATION_TYPE.value_counts()

plt.figure(figsize=[20,5])

occupation_ax = sns.barplot(x=occupation_counts.index, y=occupation_counts.values)
occupation_ax.set_title("OCCUPATION TYPE COUNTS", fontsize=30, color='Green', pad = 20)

plt.xlabel('OCCUPATION TYPE', fontsize= 20, color='Brown')
plt.xticks(rotation=45)

plt.show()

##### **`INFERENCE`** - 

- Labourers and Sales Staff constitute the majority whereas IT Staff and HR staff are on the lower side

In [None]:
# Null values 

new_app_df['OCCUPATION_TYPE'].isnull().sum()

##### As more than 30% of the values in the `OCCUPATION_TYPE` column are missing, we create a new category 'Unknown' for them

In [None]:
# Replacing NaN values with the explicit category 'Unknown'.
# The filled column is assigned back to the dataframe: calling
# replace(..., inplace=True) on `new_app_df[...]` acts on a selected column
# object and is not guaranteed to update the dataframe (it does not under
# pandas Copy-on-Write). It also relied on `np.NaN`, which NumPy 2.x removed.

new_app_df['OCCUPATION_TYPE'] = new_app_df['OCCUPATION_TYPE'].fillna('Unknown')

In [None]:
# OCCUPATION TYPE VALUE COUNTS

new_app_df.OCCUPATION_TYPE.value_counts()

#### CHECKING AND IMPUTING `EXT_SOURCE_2` AND `EXT_SOURCE_3` COLUMNS

In [None]:
# Box plots (top row) and distributions (bottom row) of EXT_SOURCE_2 / EXT_SOURCE_3.
# The series are passed with x=... because newer seaborn releases treat the only
# positional argument as `data` (which changes the orientation of the box plot),
# and histplot(kde=True) replaces the deprecated distplot. bins=50 keeps the
# histogram about as coarse as the old distplot output.

plt.figure(figsize=[20,15])

sns.set_style('darkgrid')

for position, column in enumerate(['EXT_SOURCE_2', 'EXT_SOURCE_3'], start=1):
    plt.subplot(2,2,position)
    sns.boxplot(x=new_app_df[column]).set_title(column, fontsize=20, color='Green', pad=20)

    plt.subplot(2,2,position + 2)
    sns.histplot(x=new_app_df[column], kde=True, stat='density', bins=50, color='g')

plt.show()

#### From the above graphs, we can conclude that:
- There are no outliers
- There is a small amount of skewness 

#### **`Median`** can be used to replace the missing values here because of skewness 

In [None]:
# REPLACING MISSING VALUES OF THESE 2 COLUMNS WITH ITS CORRESPONDING MEDIAN.
# The result is assigned back to the column; fillna(..., inplace=True) on
# `new_app_df[column]` modifies a selected column object and is not guaranteed
# to reach the dataframe (it does not under pandas Copy-on-Write).

for column in ['EXT_SOURCE_2', 'EXT_SOURCE_3']:
    new_app_df[column] = new_app_df[column].fillna(new_app_df[column].median())

#### Checking and Imputing `AMT_GOODS_PRICE`  column:

In [None]:
# Checking the correlation between the loan amount demanded vs the goods price.
# x / y are passed by keyword together with data=...: seaborn >= 0.12 no longer
# accepts the two vectors positionally. The title and axis labels are set through
# the returned JointGrid, so they do not depend on which axes is "current" and the
# title no longer needs trailing newlines to clear the marginal plot.

grid = sns.jointplot(x='AMT_CREDIT', y='AMT_GOODS_PRICE', data=new_app_df, kind='reg',
                     joint_kws = {'scatter_kws':dict(alpha=0.5)} , height=10)

grid.fig.suptitle("Correlation between the Loan amount and the price of goods for which loan was given",
                  fontsize=25, fontweight=5, color='Green', y=1.02)
grid.set_axis_labels("Loan Amount", "Price of Goods", fontsize=20, fontweight=5, color='Brown')

plt.show()

**`INFERENCE`** 
#### Since there is a strong, positive linear correlation between the loan amount and the goods price, we can assume that in most cases the loan amount requested by the customer is equal to, or slightly higher than, the price of the article he/she wishes to purchase.

#### For the AMT_GOODS_PRICE we can impute the same value of AMT_CREDIT for missing values (keeping in mind loan amount is usually same as good's price)


In [None]:
# Filling each missing goods price with the loan amount (AMT_CREDIT) of the same row

goods_price = new_app_df['AMT_GOODS_PRICE']
new_app_df['AMT_GOODS_PRICE'] = goods_price.where(goods_price.notnull(), new_app_df['AMT_CREDIT'])

In [None]:
#Null values

new_app_df['AMT_GOODS_PRICE'].isnull().sum()

#### CHECKING AND IMPUTING `NAME_TYPE_SUITE`  COLUMN

In [None]:
# Share of each NAME_TYPE_SUITE category as a donut chart
# (the first slice stays in place, the next six are pulled out slightly)

suite_counts = new_app_df.NAME_TYPE_SUITE.value_counts()

suite_pie = go.Pie(labels=suite_counts.index, values=suite_counts.values, hole=.6,
                   title = 'NAME_TYPE_SUITE VALUE COUNTS',
                   pull=[0] + [0.1] * 6)

go.Figure(data=[suite_pie])

##### Replacing missing values with **`MODE`** :

In [None]:
# Fill missing NAME_TYPE_SUITE values with the most frequent category (mode).
# The filled column is assigned back; fillna(..., inplace=True) on a selected
# column is not guaranteed to update the dataframe (it does not under pandas
# Copy-on-Write).
suite_mode = new_app_df['NAME_TYPE_SUITE'].mode()[0]
new_app_df['NAME_TYPE_SUITE'] = new_app_df['NAME_TYPE_SUITE'].fillna(suite_mode)

##### For columns with negligible Null values - Replacing Null values with its corresponding `MEDIAN` :

In [None]:
# REMAINING COLUMNS with negligible null values (LESS THAN 1%): fill with the column median.
# The result is assigned back to the column; fillna(..., inplace=True) on
# `new_app_df[column]` is not guaranteed to update the dataframe (it does not
# under pandas Copy-on-Write).

NULL_COL = ['CNT_FAM_MEMBERS' , 'AMT_ANNUITY' , 'DAYS_LAST_PHONE_CHANGE']

for column in NULL_COL:
    new_app_df[column] = new_app_df[column].fillna(new_app_df[column].median())

In [None]:
# REMAINING NULL PERCENTAGES

round(new_app_df.isnull().sum()/new_app_df.shape[0]*100,2)

### CHECKING DATATYPES OF EACH COLUMN

In [None]:
new_app_df.info()

##### CHANGING `DAYS` COLUMNS AND `COUNT` COLUMNS TO INTEGER


In [None]:
# Cast the day / count columns to int64.
# The converted columns are assigned with new_app_df[cols] = ..., which replaces
# the columns. The previous `new_app_df.loc[:, cols] = ...` writes the values into
# the existing float columns and (on pandas >= 2.0) keeps their float dtype, so the
# cast was silently lost. errors='ignore' is kept so a column that cannot be cast
# is left as it is, as before.

dayandcount = ['CNT_FAM_MEMBERS' , 'DAYS_REGISTRATION' , 'DAYS_LAST_PHONE_CHANGE' ]

new_app_df[dayandcount] = new_app_df[dayandcount].apply(lambda x: x.astype('int64',errors='ignore'))

##### CHANGING ALL VALUES OF COLUMNS WITH DTYPE OBJECT  TO STRING

In [None]:
# COLLECTING THE OBJECT-DTYPE COLUMNS AND CONVERTING THEIR VALUES TO STRINGS

obj_col = new_app_df.select_dtypes(include='object').columns.tolist()


def _as_text(series):
    # best-effort cast of one column to str; the column is returned unchanged on failure
    return series.astype('str', errors = 'ignore')


new_app_df.loc[:,obj_col] = new_app_df.loc[:,obj_col].apply(_as_text)


### CHECKING VALUES OF OTHER CATEGORICAL COLUMNS 
##### CHECKING `GENDER CODE` COLUMN

In [None]:
# VALUE COUNTS OF GENDER CODE

new_app_df.CODE_GENDER.value_counts()

In [None]:
# Bar plot of TARGET against the gender code

plt.figure(figsize=[10,5])

gender_ax = sns.barplot(x=new_app_df.CODE_GENDER, y=new_app_df.TARGET)
gender_ax.set_title("Gender vs Target", fontsize=20, color='Green', pad=20)

plt.show()

In [None]:
# MAPPING THE PLACEHOLDER 'XNA' IN CODE_GENDER TO 'F'

gender_codes = new_app_df['CODE_GENDER']
new_app_df['CODE_GENDER'] = gender_codes.str.replace('XNA','F')

In [None]:
# Generating the Gender vs Target barplot again, after 'XNA' was replaced with 'F'
# (the disabled plotly pie-chart snippet that was kept in a string literal has been removed)

plt.figure(figsize=[10,5])

sns.barplot(x=new_app_df.CODE_GENDER, y=new_app_df.TARGET).set_title("Gender vs Target", fontsize=20, color='Green', pad=20)

plt.show()

##### Checking `ORGANIZATION TYPE` column:

In [None]:
round(new_app_df.ORGANIZATION_TYPE.value_counts()/new_app_df.shape[0]*100,2)

#### XNA is nearly 18% of the total data. we shall replace it with `Unknown`

In [None]:
# Rename the placeholder 'XNA' in ORGANIZATION_TYPE to 'Unknown'
organization_type = new_app_df['ORGANIZATION_TYPE']
new_app_df['ORGANIZATION_TYPE'] = organization_type.str.replace('XNA','Unknown')

#### Also we notice that there are several sub-categories within Industry, Trade, Business and Transport.

In [None]:
# Therefore, we replace every sub-category with its overall category.
# The parent categories are applied one after another, in this fixed order.

for parent in ('Industry', 'Trade', 'Transport', 'Business'):
    new_app_df['ORGANIZATION_TYPE'] = new_app_df['ORGANIZATION_TYPE'].apply(
        lambda label, parent=parent: parent if parent in label else label)

In [None]:
# Number of applications per (collapsed) organization type

organization_counts = new_app_df.ORGANIZATION_TYPE.value_counts()

plt.figure(figsize=[20,5])

organization_ax = sns.barplot(x=organization_counts.index, y=organization_counts.values)
organization_ax.set_title("Distribution within ORGANIZATION_TYPE", fontsize=20, color='Green', pad=20)

# long category names: rotate the tick labels so they stay readable
plt.xticks(rotation = 90)

plt.show()

##### **`INFERENCE`** - 

- People who work in the business field applied for loans in greater numbers than people from other fields.

##### Checking `NAME_CONTRACT_TYPE` column

In [None]:
new_app_df.NAME_CONTRACT_TYPE.value_counts()

##### CHECKING `FLAG OWN CAR` COLUMN

In [None]:
new_app_df.FLAG_OWN_CAR.value_counts()

##### CHECKING `FLAG OWN REALTY` COLUMN

In [None]:
new_app_df.FLAG_OWN_REALTY.value_counts()

##### CHECKING `NAME TYPE SUITE` COLUMN

In [None]:
new_app_df.NAME_TYPE_SUITE.value_counts()

##### CHECKING `NAME EDUCATION TYPE` COLUMN

In [None]:
new_app_df.NAME_EDUCATION_TYPE.value_counts()

##### CHECKING `NAME INCOME TYPE` COLUMN

In [None]:
new_app_df.NAME_INCOME_TYPE.value_counts()

##### CHECKING `NAME FAMILY STATUS` COLUMN

In [None]:
new_app_df.NAME_FAMILY_STATUS.value_counts()


##### CHECKING `NAME HOUSING TYPE` COLUMN

In [None]:
new_app_df.NAME_HOUSING_TYPE.value_counts()


##### CHECKING `WEEKDAY_APPR_PROCESS_START` COLUMN

In [None]:
new_app_df.WEEKDAY_APPR_PROCESS_START.value_counts()

#### CHECKING VALUES OF OTHER NUMERICAL COLUMNS (Days)


In [None]:
#CHECKING `DAYS_BIRTH` COLUMN

new_app_df.DAYS_BIRTH.unique()

In [None]:
#CHECKING `DAYS_EMPLOYED` COLUMN

new_app_df.DAYS_EMPLOYED.unique()

In [None]:
#CHECKING `DAYS_REGISTRATION` COLUMN

new_app_df.DAYS_REGISTRATION.unique()

In [None]:
#CHECKING `DAYS_ID_PUBLISH` COLUMN

new_app_df.DAYS_ID_PUBLISH.unique()

In [None]:
#CHECKING `DAYS_LAST_PHONE_CHANGE` COLUMN

new_app_df.DAYS_LAST_PHONE_CHANGE.unique()

In [None]:
# In the DAYS_EMPLOYED column, we can see that there is a value '365243' which corresponds to retired people or people who are not working.

new_app_df.DAYS_EMPLOYED.value_counts()

##### Number of days should be in positive integer  

In [None]:
# Replacing every DAYS_* value with its absolute value, all columns at once

num_days = ['DAYS_BIRTH' , 'DAYS_EMPLOYED', 'DAYS_REGISTRATION' , 'DAYS_ID_PUBLISH' , 'DAYS_LAST_PHONE_CHANGE']

new_app_df[num_days] = new_app_df[num_days].abs()
    

In [None]:
new_app_df.AMT_INCOME_TOTAL.quantile([0,0.1,0.15,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.75,0.8,0.9,0.95,0.99,0.999,1])

### Binning (continuous) numerical column values for analysis 

##### BINNING OF `AMT_INCOME_TOTAL` COLUMN

In [None]:
# Quantile-based income slabs: 0-20%, 20-50%, 50-75%, 75-95% and 95-100%
income_quantiles = [0,0.2,0.5,0.75,0.95,1]
income_labels = ['VeryLow','Low','Medium','High','VeryHigh']

new_app_df['INCOME_SLAB'] = pd.qcut(new_app_df['AMT_INCOME_TOTAL'], q=income_quantiles, labels=income_labels)

In [None]:
new_app_df['INCOME_SLAB'].value_counts()

##### BINNING OF `AMT_CREDIT` COLUMN

In [None]:
# Quantile-based credit slabs: 0-20%, 20-50%, 50-75%, 75-95% and 95-100%
credit_quantiles = [0,0.2,0.5,0.75,0.95,1]
credit_labels = ['VeryLow','Low','Medium','High','VeryHigh']

new_app_df['AMT_CREDIT_slab'] = pd.qcut(new_app_df['AMT_CREDIT'], q=credit_quantiles, labels=credit_labels)

In [None]:
new_app_df['AMT_CREDIT_slab'].value_counts()

##### BINNING OF `DAYS_BIRTH` / AGE COLUMN

In [None]:
# AGE IN WHOLE YEARS: DAYS_BIRTH FLOOR-DIVIDED BY 365

whole_years = new_app_df['DAYS_BIRTH'].floordiv(365)
new_app_df['AGE'] = whole_years.astype('int64',errors='ignore')

In [None]:
#MAX & MIN VALUES

print(new_app_df['AGE'].max())
print(new_app_df['AGE'].min())

In [None]:
#CREATING 10 FIVE-YEAR BINS: [20, 25), [25, 30), ..., [65, 70)
# right=False closes every interval on the left. With the default right-closed
# intervals the first bin is (20, 25], so applicants aged exactly 20 fell outside
# every bin and received NaN.

new_app_df['AGE_BINS'] = pd.cut(new_app_df['AGE'],bins=np.arange(20,71,5),right=False)

In [None]:
new_app_df['AGE_BINS'].value_counts()

### OUTLIER ANALYSIS 
##### CHECKING `AMT_INCOME_TOTAL` COLUMN

In [None]:
new_app_df.AMT_INCOME_TOTAL.quantile([0.9991,0.9992,0.9993])

As the 99.9th percentile is around 10 lakh, we can cap the outliers at this value.

In [None]:
# Handling outliers for AMT_INCOME_TOTAL: incomes above 10 lakh are set to 10 lakh,
# all other values are left as they are

income_cap = 1000000
new_app_df['AMT_INCOME_TOTAL'] = new_app_df['AMT_INCOME_TOTAL'].clip(upper=income_cap)

In [None]:
new_app_df['AMT_INCOME_TOTAL'].value_counts()

In [None]:
# Box plot and distribution of AMT_INCOME_TOTAL.
# The series is passed with x=... (newer seaborn treats the only positional
# argument as `data`), and histplot(kde=True) replaces the deprecated distplot;
# bins=50 keeps the histogram about as coarse as the old distplot output.

plt.figure(figsize=[20,5])

plt.subplot(1,2,1)
sns.boxplot(x=new_app_df['AMT_INCOME_TOTAL']).set_title("AMT_INCOME_TOTAL - BOX PLOT", fontsize=20, color='indigo', pad=20)

plt.subplot(1,2,2)
income_hist = sns.histplot(x=new_app_df['AMT_INCOME_TOTAL'], kde=True, stat='density', bins=50, color='green')
income_hist.set_title("AMT_INCOME_TOTAL - DISTRIBUTION", fontsize=20, color='indigo', pad=20)

plt.show()

##### **`INFERENCE`** - 

- It can be inferred that most of the people earn around 1-2 lakh annually.
- There are of course people who earn a lot more, but they are few in number, up to the 10 lakh cap.
- The larger part of the population applying for loans is concentrated in the 20 thousand to 4 lakh bucket.

##### Checking `AMT_ANNUITY` column

In [None]:
# Box plot and distribution of AMT_ANNUITY.
# The series is passed with x=... (newer seaborn treats the only positional
# argument as `data`), and histplot(kde=True) replaces the deprecated distplot;
# bins=50 keeps the histogram about as coarse as the old distplot output.

plt.figure(figsize=[20,5])

plt.subplot(1,2,1)
sns.boxplot(x=new_app_df['AMT_ANNUITY']).set_title("AMT_ANNUITY - BOXPLOT", fontsize=20, color='indigo', pad=20)

plt.subplot(1,2,2)
annuity_hist = sns.histplot(x=new_app_df['AMT_ANNUITY'], kde=True, stat='density', bins=50, color='g')
annuity_hist.set_title("AMT_ANNUITY - DISTRIBUTION", fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- The loan annuity amount is mostly concentrated around the 25,000 mark.


##### Checking `AGE` Column

In [None]:
# Box plot and distribution of AGE.
# The series is passed with x=... (newer seaborn treats the only positional
# argument as `data`), and histplot(kde=True) replaces the deprecated distplot;
# bins=50 keeps the histogram about as coarse as the old distplot output.

plt.figure(figsize=[20,5])

plt.subplot(1,2,1)
sns.boxplot(x=new_app_df['AGE']).set_title("AGE - BOXPLOT", fontsize=20, color='indigo')

plt.subplot(1,2,2)
age_hist = sns.histplot(x=new_app_df['AGE'], kde=True, stat='density', bins=50, color='g')
age_hist.set_title("AGE - DISTRIBUTION", fontsize=20, color='indigo')

plt.show()

**`INFERENCE`** - 

- Usually people who are coming for loan are 20 years and above, till the age of 69.
- Most of the people are in the range of 28 years to 45 years of age.
- there are no outliers

##### CHECKING `DAYS_EMPLOYED` COLUMN

In [None]:
# Box plot and distribution of DAYS_EMPLOYED.
# The series is passed with x=... (newer seaborn treats the only positional
# argument as `data`), and histplot(kde=True) replaces the deprecated distplot;
# bins=50 keeps the histogram about as coarse as the old distplot output.

plt.figure(figsize=[20,5])

plt.subplot(1,2,1)
sns.boxplot(x=new_app_df['DAYS_EMPLOYED']).set_title("DAYS_EMPLOYED - BOXPLOT", fontsize=20, color='indigo', pad=20)

plt.subplot(1,2,2)
employed_hist = sns.histplot(x=new_app_df['DAYS_EMPLOYED'], kde=True, stat='density', bins=50, color='g')
employed_hist.set_title("DAYS_EMPLOYED - DISTRIBUTION", fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 
- Most of the applicants are near value zero 
- the value above 350000 is surely an outlier or NA value as the value comes around 1000 years

##### CHECKING `DAYS_REGISTRATION` COLUMN

In [None]:
# Box plot and distribution of DAYS_REGISTRATION.
# The series is passed with x=... (newer seaborn treats the only positional
# argument as `data`), and histplot(kde=True) replaces the deprecated distplot;
# bins=50 keeps the histogram about as coarse as the old distplot output.

plt.figure(figsize=[20,5])

plt.subplot(1,2,1)
sns.boxplot(x=new_app_df['DAYS_REGISTRATION']).set_title("DAYS_REGISTRATION - BOXPLOT", fontsize=20, color='indigo', pad=20)

plt.subplot(1,2,2)
registration_hist = sns.histplot(x=new_app_df['DAYS_REGISTRATION'], kde=True, stat='density', bins=50, color='g')
registration_hist.set_title("DAYS_REGISTRATION - DISTRIBUTION", fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** -

- It can be observed that most of the customers changed their registration at most 15000 days before the application.
- Most of the people are in the range of 2500 to 7500 days.
- But the largest number of people changed their registration just days before the application.

####  CREATING ADDITIONAL COLUMNS FOR ANALYSIS

In [None]:
# CREDIT-TO-INCOME RATIO, FLOOR-DIVIDED AND STORED AS int64

credit_over_income = new_app_df['AMT_CREDIT'] // new_app_df['AMT_INCOME_TOTAL']
new_app_df['CREDIT_RATIO'] = credit_over_income.astype('int64')

### Checking target imbalance

In [None]:
# Donut chart of the normalised TARGET value counts
target_share = new_app_df.TARGET.value_counts(normalize=True)

target_pie = go.Pie(labels=target_share.index, values=target_share.values, hole=.6,
                    title = 'Defaulter VS Non-Defaulters')

go.Figure(data=[target_pie])

**`INFERENCE`** - 

- There is a huge data imbalance: almost 92% of the data corresponds to the Non-defaulters (TARGET = 0), whereas only about 8% belongs to the Defaulters (TARGET = 1).

### Checking Gender imbalance

In [None]:
# Donut chart of the normalised CODE_GENDER value counts
gender_share = new_app_df.CODE_GENDER.value_counts(normalize=True)

gender_pie = go.Pie(labels=gender_share.index, values=gender_share.values, hole=.5,
                    title = 'Male Vs Female')

go.Figure(data=[gender_pie])

In [None]:
new_app_df.info()

### Correlation between variables

In [None]:
# Annotated correlation heatmap of the amount, age, employment and region-rating columns

amount_columns = ['AMT_CREDIT','AMT_GOODS_PRICE','AMT_ANNUITY','AMT_INCOME_TOTAL' , 'AGE' , 'DAYS_BIRTH' ,
                  'DAYS_EMPLOYED' , 'REGION_RATING_CLIENT' , 'REGION_RATING_CLIENT_W_CITY']

plt.figure(figsize=[12,12])

amount_corr = new_app_df[amount_columns].corr()
f = sns.heatmap(amount_corr, cmap = "PRGn" , annot=True).set_title('Correlation between variables',
                                                                   fontsize = 25, color='Teal')

plt.show()

**`INFERENCE`** - 
  
- Here, we can see a very strong correlation between the amount of goods price and the loan amount. From this, we can conclude like previously that the loan amount disbursed is mostly equal or slightly higher than the cost of article the client wishes to purchase.
- There is also a good correlation between the annuity amount and the loan amount as well as the good's price.
- Here there is a negative correlation between the client's region and the money he earns. This means that if a client is from a place with a higher rating, he or she will more likely earn less money. 

In [None]:
# Annotated correlation heatmap of age, phone flags, registration days and region / city flags

flag_columns = ['AGE' , 'FLAG_EMP_PHONE' , 'FLAG_WORK_PHONE' , 'DAYS_REGISTRATION' , 'DAYS_ID_PUBLISH' ,
                'REGION_RATING_CLIENT' , 'REGION_RATING_CLIENT_W_CITY' , 'REG_CITY_NOT_LIVE_CITY' ,
                'REG_CITY_NOT_WORK_CITY' , 'LIVE_CITY_NOT_WORK_CITY' ]

plt.figure(figsize=[12,12])

flag_corr = new_app_df[flag_columns].corr()
f = sns.heatmap(flag_corr, cmap = "PiYG" , annot=True).set_title('Correlation between variables',
                                                                 fontsize = 25, color='NAvy')

plt.show()

**`INFERENCE`** - 

- Here we see a strong negative correlation between employee phone number and age.
- There is a positive correlation between the number of days before which client changed his registration with respect to age. This goes to show that elderly people are less likely to make changes to their registration prior to applying for loan.
- Clients that do not provide their phone numbers are also less likely to provide incorrect permanent and work address.


In [None]:
# AMT_CREDIT Vs AMT_GOODS_PRICE Vs AMT_ANNUITY Vs AMT_INCOME_TOTAL
# Pairwise plots of the four amount columns (AGE is not part of this plot).
# NOTE(review): the trailing newlines in the title presumably push it above the grid - confirm.

f = sns.pairplot(new_app_df[['AMT_CREDIT','AMT_GOODS_PRICE','AMT_ANNUITY','AMT_INCOME_TOTAL']] , height = 3)
f.fig.suptitle('AMT_CREDIT Vs AMT_GOODS_PRICE Vs AMT_ANNUITY Vs AMT_INCOME_TOTAL \n\n\n\n\n', fontsize = 25, color='Green')

plt.show()

**`INFERENCE`** - 

- From the above pairplots we can find that there are good positive correlations between the Credit Amount, Good's price and Amount annuity.

### Splitting the datasets into two, based on whether the person is defaulter or not(based on Target variable).

In [None]:
# checking the head of the dataset before splitting...

new_app_df.head()

In [None]:
# Splitting the rows by TARGET: target1 holds TARGET == 1 (bad), target0 holds TARGET == 0 (good)

is_bad = new_app_df['TARGET'] == 1
is_good = new_app_df['TARGET'] == 0

target1 = new_app_df[is_bad]
target0 = new_app_df[is_good]

print(target1.shape, target0.shape, new_app_df.shape)

#### Here we notice that the data is imbalanced.

In [None]:
# Share of the rows that falls into each TARGET subset, printed as a percentage

total_rows = new_app_df.shape[0]

for target_value, subset in ((1, target1), (0, target0)):
    share = subset.shape[0] / total_rows
    print(f"The dataset with Target value {target_value} has :{share:.2%} data.")

### Segmented Univariate analysis (Categorical columns)

#### Performing some Univariate analysis on some categorical data and comparing the characteristics of Defaulters vs non-Defaulters

In [None]:
# Gender share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[15,5])

gender_panels = [(target1, "Gender(Defaulter) %"),
                 (target0, "Gender(Non-Defaulter) %")]

for slot, (frame, heading) in enumerate(gender_panels, start=1):
    plt.subplot(1,2,slot)
    gender_share = frame['CODE_GENDER'].value_counts(normalize=True)
    sns.barplot(x = gender_share.index, y = gender_share.values,
                palette="rocket").set_title(heading , fontsize=30, color='Green')

plt.show()

**`INFERENCE`** - 

- Here we see that the male % has increased almost by 10% from non-defaulter to defaulter.
- In-case of female, we can see that there is also a similar 10% decrease from defaulter to non-defaulter.
- We can imply that, men are more likely to default a loan than women.

In [None]:
# NAME_CONTRACT_TYPE share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[15,5])

panels = [(target1, 'firebrick', "NAME_CONTRACT_TYPE(Defaulter) %"),
          (target0, 'green', "NAME_CONTRACT_TYPE(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['NAME_CONTRACT_TYPE'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- Here we do not see much of a difference between the two graphs, so it is safe to assume that the type of loan alone is not enough to tell whether a person will default or not.

In [None]:
# FLAG_OWN_CAR share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[15,5])

panels = [(target1, 'firebrick', "FLAG_OWN_CAR(Defaulter) %"),
          (target0, 'green', "FLAG_OWN_CAR(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['FLAG_OWN_CAR'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- Here we can see that people who own a car are less likely to default on a loan.
- From this we can also infer that a person who is able to afford a car is more likely to have the money to pay the loan back.

In [None]:
# FLAG_OWN_REALTY share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[15,5])

panels = [(target1, 'firebrick', "FLAG_OWN_REALTY(Defaulter) %"),
          (target0, 'green', "FLAG_OWN_REALTY(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['FLAG_OWN_REALTY'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- Owning a realty estate doesn't show much of an impact on whether a person is going to be a defaulter or not.
- There is a very minor observation that, people who don't have house, might end up defaulting.

In [None]:
# NAME_TYPE_SUITE share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[15,5])

panels = [(target1, 'firebrick', "NAME_TYPE_SUITE(Defaulter) %"),
          (target0, 'green', "NAME_TYPE_SUITE(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['NAME_TYPE_SUITE'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- Over here, from this graph, we do not see much difference. 
- It can be understood that, person accompanying the client doesn't have an impact on deciding if he/she will default the loan.

In [None]:
# NAME_INCOME_TYPE share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[22,10])

panels = [(target1, 'firebrick', "NAME_INCOME_TYPE(Defaulter) %"),
          (target0, 'green', "NAME_INCOME_TYPE(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['NAME_INCOME_TYPE'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- We can notice that "Students" do not appear among the defaulters, as they do not have to pay while they study. So they are a very good client group to target.
- Businessmen, like students, also do not default much.
- Also, we see that the share of the "Working" category is more than 10% higher among the people who default on loans.

In [None]:
# NAME_EDUCATION_TYPE share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[20,10])

panels = [(target1, 'firebrick', "NAME_EDUCATION_TYPE(Defaulter) %"),
          (target0, 'green', "NAME_EDUCATION_TYPE(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['NAME_EDUCATION_TYPE'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- From the above graphs we can make out that people who pursue "Higher Education" are less likely to default on loans.
- Clients who have attained only "Secondary education" are more likely to default.

In [None]:
# NAME_HOUSING_TYPE share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[20,10])

panels = [(target1, 'firebrick', "NAME_HOUSING_TYPE(Defaulter) %"),
          (target0, 'green', "NAME_HOUSING_TYPE(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['NAME_HOUSING_TYPE'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()


**`INFERENCE`** - 

- Here we see only a minor change, of a few percentage points, in the "With Parents" category.
- We can infer that clients who live with their parents might not be well established and might end up having difficulty in paying a loan back.
- People who have a house/apartment tend to apply for more loans.

In [None]:
# NAME_FAMILY_STATUS share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[20,10])

panels = [(target1, 'firebrick', "NAME_FAMILY_STATUS(Defaulter) %"),
          (target0, 'green', "NAME_FAMILY_STATUS(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['NAME_FAMILY_STATUS'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- We can observe that single people and people in a civil marriage are more likely to default.
- Married people are less likely to default.

In [None]:
# OCCUPATION_TYPE share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[20,10])

panels = [(target1, 'firebrick', "OCCUPATION_TYPE(Defaulter) %"),
          (target0, 'green', "OCCUPATION_TYPE(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['OCCUPATION_TYPE'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- From the above graph, we can understand that, Labourers, Sales staff, drivers, cleaning staff, low-skill labours are more likely to default a payment of the loan.
- The best clients to target in this case would be Managers, core staff, high skill tech staff.

In [None]:
# WEEKDAY_APPR_PROCESS_START share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[20,10])

panels = [(target1, 'firebrick', "WEEKDAY_APPR_PROCESS_START(Defaulter) %"),
          (target0, 'green', "WEEKDAY_APPR_PROCESS_START(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['WEEKDAY_APPR_PROCESS_START'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- Over here, we do not see any indicator of the day of the week on which the application was started having an impact on the loan default analysis.

In [None]:
# ORGANIZATION_TYPE share among defaulters (left) and non-defaulters (right)

plt.figure(figsize=[20,10])

panels = [(target1, 'firebrick', "ORGANIZATION_TYPE(Defaulter) %"),
          (target0, 'green', "ORGANIZATION_TYPE(Non-Defaulter) %")]

for slot, (frame, bar_color, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,slot)
    proportions = frame['ORGANIZATION_TYPE'].value_counts(normalize=True)
    proportions.plot.barh(color=bar_color).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- We can see that self employed, trade, construction, security, transport people and business people are more likely to have difficulty in paying the loan.

In [None]:
# INCOME_SLAB: share of each income slab among defaulters (left)
# and non-defaulters (right). Bars are normalised fractions of each group.

plt.figure(figsize=[20,10])

# One entry per panel: (dataframe, bar colour, panel title)
income_panels = [
    (target1, 'firebrick', "INCOME_SLAB(Defaulter) %"),
    (target0, 'green', "INCOME_SLAB(Non-Defaulter) %"),
]

for slot, (frame, bar_colour, heading) in enumerate(income_panels, start=1):
    plt.subplot(1, 2, slot)
    income_share = frame.INCOME_SLAB.value_counts(normalize=True)
    income_share.plot.barh(color=bar_colour).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- Here people belonging to very high, high income slab do not face much difficulty with loan repayment.
- However, people with low income struggle to make payment and are likely to default.

In [None]:
# AMT_CREDIT_slab: share of each credit-amount slab among defaulters (left)
# and non-defaulters (right). Bars are normalised fractions of each group.

plt.figure(figsize=[20,10])

# One entry per panel: (dataframe, bar colour, panel title)
credit_panels = [
    (target1, 'firebrick', "AMT_CREDIT_slab(Defaulter) %"),
    (target0, 'green', "AMT_CREDIT_slab(Non-Defaulter) %"),
]

for slot, (frame, bar_colour, heading) in enumerate(credit_panels, start=1):
    plt.subplot(1, 2, slot)
    credit_share = frame.AMT_CREDIT_slab.value_counts(normalize=True)
    credit_share.plot.barh(color=bar_colour).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`INFERENCE`** - 

- Here we can see that if the amount of credit taken as a loan is high, people are less likely to default. This is due to the fact that only a rich or established client will take a loan of very high amount.
- On the other hand if the loan amount is low, we can see that people struggle to pay it back.

In [None]:
# AGE_BINS: share of each age bin among defaulters (left)
# and non-defaulters (right). Bars are normalised fractions of each group.

plt.figure(figsize=[20,10])

# One entry per panel: (dataframe, bar colour, panel title)
age_panels = [
    (target1, 'firebrick', "AGE_BINS(Defaulter) %"),
    (target0, 'green', "AGE_BINS(Non-Defaulter) %"),
]

for slot, (frame, bar_colour, heading) in enumerate(age_panels, start=1):
    plt.subplot(1, 2, slot)
    age_share = frame.AGE_BINS.value_counts(normalize=True)
    age_share.plot.barh(color=bar_colour).set_title(heading, fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- From the above graph we can infer that the age bin from 25 to 40 are more likely to default a loan payment.
- People above 45 are less likely to default.
- With increasing age group, people tend to default less

### Segmented Univariate analysis (Continuous Numeric columns)

In [None]:
# Column dtypes and non-null counts of target0 (plotted as the Non-Defaulter group below)
target0.info()

In [None]:
# Preview the first rows of target0 before plotting its numeric columns
target0.head()

In [None]:
# Total income vs Defaulting
# 2x2 grid: top row = non-defaulters (target0), bottom row = defaulters (target1);
# left column = boxplot, right column = density histogram with KDE overlay.
# The series is passed as x= explicitly: seaborn >= 0.12 treats a bare positional
# argument as `data`, which changes the box orientation.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.

plt.figure(figsize=[20,15])

plt.suptitle('AMT_INCOME_TOTAL for Defaulters and Non-Defaulters ',fontsize = 35, color='Teal')

plt.subplot(2,2,1)
sns.boxplot(x=target0['AMT_INCOME_TOTAL']).set_title("INCOME_TOTAL - BOXPLOT(Non-Defaulter)", 
                                                     fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,2)
sns.histplot(x=target0['AMT_INCOME_TOTAL'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("AMT_INCOME_TOTAL - DISTRIBUTION(Non-Defaulter)", 
                                  fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,3)
sns.boxplot(x=target1['AMT_INCOME_TOTAL']).set_title("AMT_INCOME_TOTAL - BOXPLOT(Defaulter)", 
                                                     fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,4)
sns.histplot(x=target1['AMT_INCOME_TOTAL'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("AMT_INCOME_TOTAL - DISTRIBUTION(Defaulter)", 
                                  fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -
- There is not a big difference between the two groups; still, as income increases, clients are less likely to default.

In [None]:
# AMT_CREDIT (loan amount) vs Defaulting
# 2x2 grid: top row = non-defaulters (target0), bottom row = defaulters (target1);
# left column = boxplot, right column = density histogram with KDE overlay.
# The series is passed as x= explicitly: seaborn >= 0.12 treats a bare positional
# argument as `data`, which changes the box orientation.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.

plt.figure(figsize=[20,15])

plt.suptitle('AMT_CREDIT for Defaulters and Non-Defaulters ',fontsize = 35, color='Teal')

plt.subplot(2,2,1)
sns.boxplot(x=target0['AMT_CREDIT']).set_title("AMT_CREDIT - BOXPLOT(Non-Defaulter)", fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,2)
sns.histplot(x=target0['AMT_CREDIT'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("AMT_CREDIT - DISTRIBUTION(Non-Defaulter)", fontsize=20, 
                                  color='indigo', pad=20)

plt.subplot(2,2,3)
sns.boxplot(x=target1['AMT_CREDIT']).set_title("AMT_CREDIT - BOXPLOT(Defaulter)", fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,4)
sns.histplot(x=target1['AMT_CREDIT'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("AMT_CREDIT - DISTRIBUTION(Defaulter)", fontsize=20, 
                                  color='indigo', pad=20)

plt.show()

**`Inference`** -

- From the above histograms, we can observe that there is a sharp increase in the size of a bin at around 50,000.
- So we can assume that clients whose loan amount is around 50,000 are likely to default on their loan payment.

In [None]:
# AMT_ANNUITY vs Defaulting
# 2x2 grid: top row = non-defaulters (target0), bottom row = defaulters (target1);
# left column = boxplot, right column = density histogram with KDE overlay.
# The series is passed as x= explicitly: seaborn >= 0.12 treats a bare positional
# argument as `data`, which changes the box orientation.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.

plt.figure(figsize=[20,15])

plt.suptitle('AMT_ANNUITY for Defaulters and Non-Defaulters ',fontsize = 35, color='Teal')

plt.subplot(2,2,1)
sns.boxplot(x=target0['AMT_ANNUITY']).set_title("AMT_ANNUITY - BOXPLOT(Non-Defaulter)", fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,2)
sns.histplot(x=target0['AMT_ANNUITY'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("AMT_ANNUITY - DISTRIBUTION(Non-Defaulter)", fontsize=20, 
                                  color='indigo', pad=20)

plt.subplot(2,2,3)
sns.boxplot(x=target1['AMT_ANNUITY']).set_title("AMT_ANNUITY - BOXPLOT(Defaulter)", fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,4)
sns.histplot(x=target1['AMT_ANNUITY'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("AMT_ANNUITY - DISTRIBUTION(Defaulter)", fontsize=20, 
                                  color='indigo', pad=20)

plt.show()

**`Inference`** -

- From the above histograms and boxplots, we can understand that there are no impacts on loan defaulting due to Annuity amount.
- From box plot we can say better return of loans for the higher amount annuity.

In [None]:
# AGE vs Defaulting
# 2x2 grid: top row = non-defaulters (target0), bottom row = defaulters (target1);
# left column = boxplot, right column = density histogram with KDE overlay.
# The series is passed as x= explicitly: seaborn >= 0.12 treats a bare positional
# argument as `data`, which changes the box orientation.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.

plt.figure(figsize=[20,15])

plt.suptitle('AGE for Defaulters and Non-Defaulters ',fontsize = 35, color='Teal')

plt.subplot(2,2,1)
sns.boxplot(x=target0['AGE']).set_title("AGE - BOXPLOT(Non-Defaulter)", fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,2)
sns.histplot(x=target0['AGE'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("AGE - DISTRIBUTION(Non-Defaulter)", fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,3)
sns.boxplot(x=target1['AGE']).set_title("AGE - BOXPLOT(Defaulter)", fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,4)
sns.histplot(x=target1['AGE'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("AGE - DISTRIBUTION(Defaulter)", fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- From the age 20 to 40 years, clients falling in this category of age range are more likely to default a payment.
- Clients above 50 are less likely to default any payments.

In [None]:
# REGION_POPULATION_RELATIVE vs Defaulting
# 2x2 grid: top row = non-defaulters (target0), bottom row = defaulters (target1);
# left column = boxplot, right column = density histogram with KDE overlay.
# The series is passed as x= explicitly: seaborn >= 0.12 treats a bare positional
# argument as `data`, which changes the box orientation.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.

plt.figure(figsize=[20,15])

plt.suptitle('REGION_POPULATION_RELATIVE for Defaulters and Non-Defaulters ',fontsize = 35, color='Teal')

plt.subplot(2,2,1)
sns.boxplot(x=target0['REGION_POPULATION_RELATIVE']).set_title("REGION_POPULATION_RELATIVE - BOXPLOT(Non-Defaulter)", 
                                                               fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,2)
sns.histplot(x=target0['REGION_POPULATION_RELATIVE'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("REGION_POPULATION_RELATIVE - DISTRIBUTION(Non-Defaulter)", 
                                  fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,3)
sns.boxplot(x=target1['REGION_POPULATION_RELATIVE']).set_title("REGION_POPULATION_RELATIVE - BOXPLOT(Defaulter)", 
                                                               fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,4)
sns.histplot(x=target1['REGION_POPULATION_RELATIVE'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("REGION_POPULATION_RELATIVE - DISTRIBUTION(Defaulter)",
                                  fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- From the above histograms, we can see that people who live in a place which is not so populated, like village or small towns, have difficulty in repaying loan amount.
- We can also see that people who live in cities which are more populated, do not face much difficulty with loan payments.

In [None]:
# DAYS_REGISTRATION vs Defaulting
# 2x2 grid: top row = non-defaulters (target0), bottom row = defaulters (target1);
# left column = boxplot, right column = density histogram with KDE overlay.
# The series is passed as x= explicitly: seaborn >= 0.12 treats a bare positional
# argument as `data`, which changes the box orientation.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.

plt.figure(figsize=[20,15])

plt.suptitle('DAYS_REGISTRATION for Defaulters and Non-Defaulters ',fontsize = 35, color='Teal')

plt.subplot(2,2,1)
sns.boxplot(x=target0['DAYS_REGISTRATION']).set_title("DAYS_REGISTRATION - BOXPLOT(Non-Defaulter)", 
                                                      fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,2)
sns.histplot(x=target0['DAYS_REGISTRATION'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("DAYS_REGISTRATION - DISTRIBUTION(Non-Defaulter)", 
                                  fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,3)
sns.boxplot(x=target1['DAYS_REGISTRATION']).set_title("DAYS_REGISTRATION - BOXPLOT(Defaulter)", 
                                                      fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,4)
sns.histplot(x=target1['DAYS_REGISTRATION'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("DAYS_REGISTRATION - DISTRIBUTION(Defaulter)", 
                                  fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- Here we can see that clients who are more likely to default are also more likely to have changed their registration only a few days prior to applying for the loan.

In [None]:
# EXT_SOURCE_2 vs Defaulting
# 2x2 grid: top row = non-defaulters (target0), bottom row = defaulters (target1);
# left column = boxplot, right column = density histogram with KDE overlay.
# The series is passed as x= explicitly: seaborn >= 0.12 treats a bare positional
# argument as `data`, which changes the box orientation.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.

plt.figure(figsize=[20,15])

plt.suptitle('EXT_SOURCE_2 for Defaulters and Non-Defaulters ',fontsize = 35, color='Teal')

plt.subplot(2,2,1)
sns.boxplot(x=target0['EXT_SOURCE_2']).set_title("EXT_SOURCE_2 - BOXPLOT(Non-Defaulter)", 
                                                 fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,2)
sns.histplot(x=target0['EXT_SOURCE_2'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("EXT_SOURCE_2 - DISTRIBUTION(Non-Defaulter)", 
                                  fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,3)
sns.boxplot(x=target1['EXT_SOURCE_2']).set_title("EXT_SOURCE_2 - BOXPLOT(Defaulter)", 
                                                 fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,4)
sns.histplot(x=target1['EXT_SOURCE_2'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("EXT_SOURCE_2 - DISTRIBUTION(Defaulter)", 
                                  fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- We can see that people who face difficulty in paying a loan back are the ones, whose ext source score are below 2.0.
- On the other hand, people with score above 2.0 are less likely to default on a loan payment.

In [None]:
# DAYS_LAST_PHONE_CHANGE vs Defaulting
# 2x2 grid: top row = non-defaulters (target0), bottom row = defaulters (target1);
# left column = boxplot, right column = density histogram with KDE overlay.
# The series is passed as x= explicitly: seaborn >= 0.12 treats a bare positional
# argument as `data`, which changes the box orientation.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.

plt.figure(figsize=[20,15])

plt.suptitle('DAYS_LAST_PHONE_CHANGE for Defaulters and Non-Defaulters ',fontsize = 35, color='Teal')

plt.subplot(2,2,1)
sns.boxplot(x=target0['DAYS_LAST_PHONE_CHANGE']).set_title("DAYS_LAST_PHONE_CHANGE - BOXPLOT(Non-Defaulter)", 
                                                           fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,2)
sns.histplot(x=target0['DAYS_LAST_PHONE_CHANGE'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("DAYS_LAST_PHONE_CHANGE - DISTRIBUTION(Non-Defaulter)", 
                                  fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,3)
sns.boxplot(x=target1['DAYS_LAST_PHONE_CHANGE']).set_title("DAYS_LAST_PHONE_CHANGE - BOXPLOT(Defaulter)", 
                                                           fontsize=20, color='indigo', pad=20)

plt.subplot(2,2,4)
sns.histplot(x=target1['DAYS_LAST_PHONE_CHANGE'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("DAYS_LAST_PHONE_CHANGE - DISTRIBUTION(Defaulter)", 
                                  fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- Here it can be seen that people who might default do change their number just prior to registration.

In [None]:
# CNT_FAM_MEMBERS: normalised frequency of each family size for
# non-defaulters (left panel) and defaulters (right panel).

plt.figure(figsize=[20,8])

plt.suptitle('CNT_FAM_MEMBERS for Defaulters and Non-Defaulters \n ',fontsize = 30, color='Teal')

# Left panel: target0; the share table is computed once and reused for x and y
plt.subplot(1,2,1)
fam_share_target0 = target0.CNT_FAM_MEMBERS.value_counts(normalize=True)
fam_ax_target0 = sns.barplot(x=fam_share_target0.index, y=fam_share_target0.values, palette="magma")
fam_ax_target0.set_title("CNT_FAM_MEMBERS(Non-Defaulter) " , fontsize=20, color='Green')

# Right panel: target1
plt.subplot(1,2,2)
fam_share_target1 = target1.CNT_FAM_MEMBERS.value_counts(normalize=True)
fam_ax_target1 = sns.barplot(x=fam_share_target1.index, y=fam_share_target1.values, palette="rocket")
fam_ax_target1.set_title("CNT_FAM_MEMBERS(Defaulter) " , fontsize=20, color='Green')

plt.show()

**`Inference`** -

- We do not observe any significant impact of the number of family members of a client on defaulting.
- We do however, see a very small trend that, clients who default might have more than 4 family members.

### Working on the Previous_application dataset:

In [None]:
# Preview the first rows of the previous-application dataframe
prev_app_df.head()

In [None]:
# Checking the shape (rows, columns) of the previous-application dataframe

prev_app_df.shape

In [None]:
# Number of null values in each column of the previous-application dataframe

prev_app_df.isnull().sum()

#### Keeping only the necessary columns for merge and analysis:

In [None]:
# Columns kept from the previous-application data: the SK_ID_CURR merge key
# plus the attributes explored in the cells below.

cols_n = [
    'SK_ID_CURR',
    'AMT_APPLICATION',
    'NAME_CASH_LOAN_PURPOSE',
    'NAME_CONTRACT_STATUS',
    'DAYS_DECISION',
    'NAME_PAYMENT_TYPE',
    'CODE_REJECT_REASON',
    'NAME_CLIENT_TYPE',
    'NAME_GOODS_CATEGORY',
    'NAME_PORTFOLIO',
    'NAME_PRODUCT_TYPE',
    'CHANNEL_TYPE',
    'NAME_YIELD_GROUP',
]

# Column selection by label list; raises KeyError if any listed column is absent
prev_app_df = prev_app_df[cols_n]

In [None]:
# Checking that the column subset took effect (column count should now equal len(cols_n))

prev_app_df.shape

In [None]:
# Checking for any missing values in the retained columns (null count per column)

prev_app_df.isnull().sum()

In [None]:
# Dtypes and non-null counts of the retained previous-application columns
prev_app_df.info()

In [None]:
# Checking the distinct values of DAYS_DECISION (inspected before taking absolute values below)

prev_app_df.DAYS_DECISION.unique()

In [None]:
# Replace DAYS_DECISION with its absolute value so the column is non-negative

days_decision = prev_app_df['DAYS_DECISION']
prev_app_df['DAYS_DECISION'] = days_decision.abs()

In [None]:
# Frequency of each NAME_PAYMENT_TYPE value (checked before relabelling 'XNA')
prev_app_df.NAME_PAYMENT_TYPE.value_counts()

In [None]:
# Frequency of each NAME_CLIENT_TYPE value (checked before relabelling 'XNA')
prev_app_df.NAME_CLIENT_TYPE.value_counts()

In [None]:
# Frequency of each NAME_PORTFOLIO value (checked before relabelling 'XNA')
prev_app_df.NAME_PORTFOLIO.value_counts()

In [None]:
# Changing XNA to Unknown
# DataFrame.replace swaps cells whose whole value is exactly 'XNA'. The previous
# .str.replace call did substring replacement (regex-based by default on older
# pandas), so it would also have altered any label that merely contains 'XNA'.

XNA_col = ['NAME_PAYMENT_TYPE' , 'NAME_CLIENT_TYPE', 'NAME_PORTFOLIO' ]

prev_app_df[XNA_col] = prev_app_df[XNA_col].replace('XNA', 'Unknown')



### Univariate analysis on the previous_application columns

In [None]:
# Plotting AMT_APPLICATION: boxplot (left) and density histogram with KDE overlay (right).
# The series is passed as x= explicitly: seaborn >= 0.12 treats a bare positional
# argument as `data`, which changes the box orientation.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.

plt.figure(figsize=[20,6])

plt.subplot(1,2,1)
sns.boxplot(x=prev_app_df['AMT_APPLICATION']).set_title("AMT_APPLICATION - BOXPLOT", fontsize=20, color='indigo', pad=20)

plt.subplot(1,2,2)
sns.histplot(x=prev_app_df['AMT_APPLICATION'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("AMT_APPLICATION - DISTRIBUTION", 
                                  fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- From the box plot and the histogram we can see that most of the clients have asked for credit worth 10 lakh and less.
- Most of the credit being near about 1-3 lakh marker.

In [None]:
# Plotting DAYS_DECISION: boxplot (left) and density histogram with KDE overlay (right).
# The series is passed as x= explicitly: seaborn >= 0.12 treats a bare positional
# argument as `data`, which changes the box orientation.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.

plt.figure(figsize=[20,6])

plt.subplot(1,2,1)
sns.boxplot(x=prev_app_df['DAYS_DECISION']).set_title("DAYS_DECISION - BOXPLOT", fontsize=20, color='indigo', pad=20)

plt.subplot(1,2,2)
sns.histplot(x=prev_app_df['DAYS_DECISION'], kde=True, stat="density", kde_kws=dict(cut=3),
             color='g').set_title("DAYS_DECISION - DISTRIBUTION", 
                                  fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- Here we can see that most customers who took a loan previously came back to the bank for another loan within 2 years.

In [None]:
# Normalised frequency of each NAME_CASH_LOAN_PURPOSE value (stated reason for the loan)

plt.figure(figsize=[20,10])

purpose_share = prev_app_df.NAME_CASH_LOAN_PURPOSE.value_counts(normalize=True)
purpose_ax = purpose_share.plot.barh(color='firebrick')
purpose_ax.set_title("NAME_CASH_LOAN_PURPOSE", fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- Here most of the reason for why loan was taken is missing.

In [None]:
# NAME_CASH_LOAN_PURPOSE is removed in place: per the plot above, most of its
# values are missing, so it is not used in the rest of the analysis.

prev_app_df.drop('NAME_CASH_LOAN_PURPOSE', axis=1, inplace=True)

In [None]:
# Confirm the column count after dropping NAME_CASH_LOAN_PURPOSE
prev_app_df.shape

In [None]:
# Normalised frequency of each NAME_PAYMENT_TYPE value (how the loan is repaid)

plt.figure(figsize=[20,10])

payment_share = prev_app_df.NAME_PAYMENT_TYPE.value_counts(normalize=True)
payment_ax = payment_share.plot.barh(color='firebrick')
payment_ax.set_title("Payment method for loan", fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- About 62% customers prefer to pay the loan back using cash through the bank.


In [None]:
# Normalised frequency of each NAME_CONTRACT_STATUS value (outcome of the previous application)

plt.figure(figsize=[20,10])

status_share = prev_app_df.NAME_CONTRACT_STATUS.value_counts(normalize=True)
status_bar_ax = status_share.plot.barh(color='firebrick')
status_bar_ax.set_title("Loan Status", fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- About 62% of the loans were approved by the bank.
- 19% were cancelled and 17% were refused.

In [None]:
# Normalised frequency of each CODE_REJECT_REASON value

plt.figure(figsize=[20,10])

reject_share = prev_app_df.CODE_REJECT_REASON.value_counts(normalize=True)
reject_ax = reject_share.plot.barh(color='firebrick')
reject_ax.set_title("Reason loan was rejected", fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- Reason why loan was rejected, column doesn't give us much info, so it can be dropped too.

In [None]:
# CODE_REJECT_REASON is removed in place because it was judged uninformative
# above; the resulting shape is shown as a check.

prev_app_df.drop('CODE_REJECT_REASON', axis=1, inplace=True)
prev_app_df.shape

In [None]:
# Normalised frequency of each NAME_CLIENT_TYPE value
# (whether the client was new or returning at the previous application)

plt.figure(figsize=[20,10])

client_share = prev_app_df.NAME_CLIENT_TYPE.value_counts(normalize=True)
client_ax = client_share.plot.barh(color='firebrick')
client_ax.set_title("Was the client old or new", fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- From the plot above we can clearly see that most of the customers are repeaters.
- Only about 19% of the customers are new.

In [None]:
# Normalised frequency of each NAME_GOODS_CATEGORY value
# (goods the client applied for in the previous application)

plt.figure(figsize=[20,10])

goods_share = prev_app_df.NAME_GOODS_CATEGORY.value_counts(normalize=True)
goods_ax = goods_share.plot.barh(color='firebrick')
goods_ax.set_title("Kind of goods the client applied for in the previous application",
                   fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -

- Since most of the data about the customer are not disclosed, as evident by the XNA value, this column won't help in analysis.
- So we need to drop this column

In [None]:
# NAME_GOODS_CATEGORY is removed in place because most of its values are
# undisclosed (XNA); the resulting shape is shown as a check.

prev_app_df.drop('NAME_GOODS_CATEGORY', axis=1, inplace=True)
prev_app_df.shape

In [None]:
# Normalised frequency of each NAME_PORTFOLIO value
# (whether the previous application was for CASH, POS, CAR, ...)

plt.figure(figsize=[20,10])

portfolio_share = prev_app_df.NAME_PORTFOLIO.value_counts(normalize=True)
portfolio_ax = portfolio_share.plot.barh(color='firebrick')
portfolio_ax.set_title("Was the previous application for CASH, POS, CAR, …",
                       fontsize=20, color='indigo', pad=20)

plt.show()

**`Inference`** -
- Among these categories, POS are the majority

In [None]:
# Normalised frequency of each NAME_PRODUCT_TYPE value
# (whether the previous application was x-sell or walk-in)

plt.figure(figsize=[20,10])

product_share = prev_app_df.NAME_PRODUCT_TYPE.value_counts(normalize=True)
product_ax = product_share.plot.barh(color='firebrick')
product_ax.set_title("Was the previous application x-sell o walk-in",
                     fontsize=20, color='indigo', pad=20)

plt.show()

In [None]:
# NAME_PRODUCT_TYPE is not needed for the remaining analysis, so it is removed
# in place; the resulting shape is shown as a check.

prev_app_df.drop('NAME_PRODUCT_TYPE', axis=1, inplace=True)
prev_app_df.shape

In [None]:
# Normalised frequency of each CHANNEL_TYPE value
# (acquisition channel of the client on the previous application)

plt.figure(figsize=[20,10])

channel_share = prev_app_df.CHANNEL_TYPE.value_counts(normalize=True)
channel_ax = channel_share.plot.barh(color='firebrick')
channel_ax.set_title("Through which channel the client was acquired on the previous application",
                     fontsize=20, color='indigo', pad=20)

plt.show()

In [None]:
# Normalised frequency of each NAME_YIELD_GROUP value
# (interest rate of the previous application grouped into small, medium and high)

plt.figure(figsize=[20,10])

yield_share = prev_app_df.NAME_YIELD_GROUP.value_counts(normalize=True)
yield_ax = yield_share.plot.barh(color='firebrick')
yield_ax.set_title("Interest group", fontsize=20, color='indigo', pad=20)

plt.show()

In [None]:
# NAME_YIELD_GROUP adds no value to the remaining analysis, so it is removed
# in place; the resulting shape is shown as a check.

prev_app_df.drop('NAME_YIELD_GROUP', axis=1, inplace=True)
prev_app_df.shape

In [None]:
# Final dtypes and non-null counts of the previous-application columns that remain before the merge
prev_app_df.info()

### Merging both new and old dataframes

In [None]:
# Inner join of current applications with their previous applications on SK_ID_CURR.
# A client with several previous applications yields several rows in new_df.
# `suffixes` must be a 2-item sequence (left suffix, right suffix); the bare string
# '_x' is rejected by recent pandas and was unpacked as ('_', 'x') by older versions.
new_df = pd.merge(left=new_app_df, right=prev_app_df, how='inner', on='SK_ID_CURR',
                  suffixes=('_x', '_y'))

In [None]:
# Dtypes and non-null counts of the merged dataframe
new_df.info()

### BIVARIATE / MULTIVARIATE ANALYSIS

### % of Loan Payment Difficulties

In [None]:
# Mean of TARGET for every combination of NAME_CLIENT_TYPE (rows) and
# NAME_CONTRACT_STATUS (columns). The mean is a fraction, not a percentage.

table = new_df.pivot_table(
    values='TARGET',
    index=['NAME_CLIENT_TYPE'],
    columns=['NAME_CONTRACT_STATUS'],
    aggfunc=np.mean,
)

# Shade the cells with a light-to-dark green gradient; the styled table is the cell output
cm = sns.light_palette("green", as_cmap=True)
table.style.background_gradient(cmap=cm)

In [None]:
# Grouped bar chart of the pivot table: one bar group per contract status
# (the table is transposed so statuses sit on the x-axis)
difficulty_ax = table.T.plot(kind='bar')
difficulty_ax.set_ylabel('% of Loan-Payment Difficulties')

plt.title('% of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CLIENT_TYPE', fontdict={'fontsize':18}, pad=20)

plt.show()

**`Inference`** -

- From the above data we can infer that new clients are more likely to cancel loans.
- Also, new clients are more likely to get their loan amount refused.
- Repeater clients are more likely to get a loan refused.

In [None]:
# % of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CONTRACT_TYPE
# (the previous header named NAME_CLIENT_TYPE, which is not the column pivoted here).
# Each cell is the mean of TARGET for that contract type (rows) and
# previous contract status (columns); the mean is a fraction, not a percentage.

table = pd.pivot_table(new_df, values='TARGET', index=['NAME_CONTRACT_TYPE'],
                       columns=['NAME_CONTRACT_STATUS'], aggfunc=np.mean)

# Shade the cells with a light-to-dark green gradient; the styled table is the cell output
cm = sns.light_palette("green", as_cmap=True)
table.style.background_gradient(cmap=cm)

In [None]:
# Grouped bar chart of the pivot table: one bar group per contract status
# (the table is transposed so statuses sit on the x-axis)
difficulty_ax = table.T.plot(kind='bar')
difficulty_ax.set_ylabel('% of Loan-Payment Difficulties')

plt.title('% of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CONTRACT_TYPE', fontdict={'fontsize':18}, pad=20)

plt.show()

**`Inference`** -

- Cash loans are more likely to get cancelled or refused with a bigger margin of that of revolving loans.

### Bivariate and multivariate analysis

In [None]:
# Row counts per NAME_HOUSING_TYPE, split by previous NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

# Categories ordered from most to least frequent
housing_order = new_df['NAME_HOUSING_TYPE'].value_counts().index
housing_ax = sns.countplot(data=new_df, y='NAME_HOUSING_TYPE', order=housing_order,
                           hue='NAME_CONTRACT_STATUS')
housing_ax.set_title("NAME_HOUSING_TYPE Vs NAME_CONTRACT_STATUS " , fontsize=30, color='Green', pad=20)

plt.ylabel("NAME_HOUSING_TYPE", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- Clients living in a house or apartment are more likely to get their loans approved.

In [None]:
# Row counts per CODE_GENDER, split by previous NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

# Categories ordered from most to least frequent
gender_order = new_df['CODE_GENDER'].value_counts().index
gender_ax = sns.countplot(data=new_df, y='CODE_GENDER', order=gender_order,
                          hue='NAME_CONTRACT_STATUS', palette='magma')
gender_ax.set_title("CODE_GENDER Vs NAME_CONTRACT_STATUS " , fontsize=30, color='Green', pad=20)

plt.ylabel("CODE_GENDER", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- It is observed that Female clients are more successful in terms of having their loans approved.
- However male clients are not so successful and do see an increase in the number of times their loans get refused.

In [None]:
# Row counts per NAME_CLIENT_TYPE, split by previous NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

# Categories ordered from most to least frequent
client_order = new_df['NAME_CLIENT_TYPE'].value_counts().index
client_count_ax = sns.countplot(data=new_df, y='NAME_CLIENT_TYPE', order=client_order,
                                hue='NAME_CONTRACT_STATUS', palette='mako')
client_count_ax.set_title("NAME_CLIENT_TYPE Vs NAME_CONTRACT_STATUS " , fontsize=30, color='Green', pad=20)

plt.ylabel("NAME_CLIENT_TYPE", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- New clients are more successful in getting their loan approved.
- Repeater clients cancel or have their loan amount refused more than the new clients.

In [None]:
# Row counts per NAME_EDUCATION_TYPE, split by previous NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

# Categories ordered from most to least frequent
education_order = new_df['NAME_EDUCATION_TYPE'].value_counts().index
education_ax = sns.countplot(data=new_df, y='NAME_EDUCATION_TYPE', order=education_order,
                             hue='NAME_CONTRACT_STATUS', palette='crest')
education_ax.set_title("NAME_EDUCATION_TYPE Vs NAME_CONTRACT_STATUS " , fontsize=30, color='Green', pad=20)

plt.ylabel("NAME_EDUCATION_TYPE", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- Here we see usual trend in all the education type, so we cannot make any inferences here.

In [None]:
# Row counts per AMT_CREDIT_slab, split by previous NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

# Categories ordered from most to least frequent
credit_slab_order = new_df['AMT_CREDIT_slab'].value_counts().index
credit_slab_ax = sns.countplot(data=new_df, y='AMT_CREDIT_slab', order=credit_slab_order,
                               hue='NAME_CONTRACT_STATUS', palette='icefire')
credit_slab_ax.set_title("AMT_CREDIT_slab Vs NAME_CONTRACT_STATUS " , fontsize=30, color='Green', pad=20)

plt.ylabel("AMT_CREDIT_slab", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- Here we do not see much trend in the loan amount and it getting accepted or rejected.

In [None]:
# Row counts per AGE_BINS, split by previous NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

# Categories ordered from most to least frequent
age_bin_order = new_df['AGE_BINS'].value_counts().index
age_bin_ax = sns.countplot(data=new_df, y='AGE_BINS', order=age_bin_order,
                           hue='NAME_CONTRACT_STATUS', palette='cubehelix')
age_bin_ax.set_title("AGE_BINS Vs NAME_CONTRACT_STATUS " , fontsize=30, color='Green', pad=20)

plt.ylabel("AGE_BINS", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- In the age category of 20 to 35 years of age, we see a lot of rejection of loan. These group of people are also more likely to default as per our previous inferences and conclusions.
- Age group of people above 40 are less likely to default and also they see less rejection and cancellation of loan amounts.

In [None]:
# Row counts per INCOME_SLAB, split by previous NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

# Categories ordered from most to least frequent
income_slab_order = new_df['INCOME_SLAB'].value_counts().index
income_slab_ax = sns.countplot(data=new_df, y='INCOME_SLAB', order=income_slab_order,
                               hue='NAME_CONTRACT_STATUS', palette='Paired')
income_slab_ax.set_title("INCOME_SLAB Vs NAME_CONTRACT_STATUS " , fontsize=30, color='Green', pad=20)

plt.ylabel("INCOME_SLAB", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- We can see that customers with very low income or very high income show more signs of getting their loan amount refused, by a very minor margin.

In [None]:
# NAME_CONTRACT_STATUS Vs AMT_CREDIT
# Distribution of AMT_CREDIT for each previous contract status, split by TARGET.
# The title previously read "Contract type", but the x-axis is NAME_CONTRACT_STATUS
# (the outcome of the previous application), so the title now matches the axis.

plt.figure(figsize=[20,10])

sns.boxenplot(x = new_df['NAME_CONTRACT_STATUS'] , y = new_df['AMT_CREDIT'] , hue = new_df['TARGET'] , 
           palette = 'viridis').set_title("NAME_CONTRACT_STATUS Vs AMT_CREDIT " , fontsize=30, color='Green', pad=20)
plt.xlabel("NAME_CONTRACT_STATUS", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("AMT_CREDIT", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

In [None]:
# Distribution of AMT_CREDIT for each NAME_EDUCATION_TYPE, split by TARGET

plt.figure(figsize=[20,10])

education_levels = new_df['NAME_EDUCATION_TYPE']
credit_amounts = new_df['AMT_CREDIT']
target_flag = new_df['TARGET']

education_boxen_ax = sns.boxenplot(x=education_levels, y=credit_amounts, hue=target_flag)
education_boxen_ax.set_title("NAME_EDUCATION_TYPE Vs AMT_CREDIT " , fontsize=30, color='Green', pad=20)

plt.xlabel("NAME_EDUCATION_TYPE", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("AMT_CREDIT", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- People with just an academic degree are more likely to default.

In [None]:
# Distribution of AMT_CREDIT for each INCOME_SLAB, split by TARGET

plt.figure(figsize=[20,10])

income_slabs = new_df['INCOME_SLAB']
credit_amounts = new_df['AMT_CREDIT']
target_flag = new_df['TARGET']

income_boxen_ax = sns.boxenplot(x=income_slabs, y=credit_amounts, hue=target_flag, palette='Set2')
income_boxen_ax.set_title("INCOME_SLAB Vs AMT_CREDIT " , fontsize=30, color='Green', pad=20)

plt.xlabel("INCOME_SLAB", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("AMT_CREDIT", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- We can see that as the income slab increases, the credit amount of the loan also increases.
- We can also see that the people in the low and very low income slabs are likely to default more.

In [None]:
# Distribution of AMT_CREDIT for each NAME_INCOME_TYPE, split by TARGET

plt.figure(figsize=[20,10])

income_types = new_df['NAME_INCOME_TYPE']
credit_amounts = new_df['AMT_CREDIT']
target_flag = new_df['TARGET']

income_type_ax = sns.boxenplot(x=income_types, y=credit_amounts, hue=target_flag, palette='flare')
income_type_ax.set_title("NAME_INCOME_TYPE Vs AMT_CREDIT " , fontsize=30, color='Green', pad=20)

plt.xlabel("NAME_INCOME_TYPE", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("AMT_CREDIT", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- People involved in business and unemployed people ask for more loan and repay better.

**`Inference`** -

- Clients in the age range 30 years to 50 years ask for more credit amount and people above the age of 60 are likely to default on their payment.

In [None]:
# Distribution of AMT_CREDIT for each NAME_CLIENT_TYPE, split by TARGET

plt.figure(figsize=[20,10])

client_types = new_df['NAME_CLIENT_TYPE']
credit_amounts = new_df['AMT_CREDIT']
target_flag = new_df['TARGET']

client_boxen_ax = sns.boxenplot(x=client_types, y=credit_amounts, hue=target_flag, palette='light:#5A9')
client_boxen_ax.set_title("NAME_CLIENT_TYPE Vs AMT_CREDIT " , fontsize=30, color='Green', pad=20)

plt.xlabel("NAME_CLIENT_TYPE", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("AMT_CREDIT", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Inference`** -

- Repeaters and Refreshed clients show an increase in the number of default.

### Final Conclusion:

#### The application dataset and the previous_application dataset were analyzed, cleaned and inferences/correlations were drawn. We have thoroughly observed these datasets and here are our observations and comments about the same:

- Banks can give loans to students, pensioners and people with higher education degrees, as they are much less likely to default on loan payments.

- We understood that, Labourers, Sales staff, drivers, cleaning staff, low-skill labours are more likely to default a payment of the loan.

- The best clients to target in this case would be Managers, core staff, high skill tech staff.

- People in the age group of 20 to 30 are more likely to default. People above the age group of 45 do not default on their payments as much.

- It was also observed that people belonging to the low and very low income slabs showed strong signs of defaulting.

- We also observed that people who live in a place which is not so populated, like village or small towns, have difficulty in repaying loan amount.

- Clients who are more likely to default are also more likely to have changed their registration only a few days prior to applying for the loan.

##### Keeping these points in mind, if a customer can be evaluated based on the above parameters, the bank would see less default payments.