In [1]:
# Importing required libraries for overall analysis
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

from datetime import date, time, datetime
import plotly.graph_objs as go
import plotly.offline as pyo
import plotly.figure_factory as ff
import plotly.express as px
from plotly import tools
from plotly.subplots import make_subplots
from plotly.offline import iplot

In [3]:
## Loading application_data.csv into a dataframe named bankloan.
# The folder defaults to the original download location; set the
# BANKLOAN_DATA_DIR environment variable to run the notebook on another machine.
data_dir = os.environ.get("BANKLOAN_DATA_DIR", r"C:\Users\ASUS\Downloads")
bankloan = pd.read_csv(os.path.join(data_dir, "application_data.csv"))

In [None]:
## checking how the dataframe looks
bankloan.head()

In [None]:
#checking null value percentage of 1-122 columns
bankloan.iloc[:,0:122].isnull().sum()/len(bankloan)*100

In [None]:
## getting the information about the dataframe created above
# `show_counts` replaces the `null_counts` keyword, which newer pandas releases no longer accept.
bankloan.info(verbose=True, show_counts=True)

In [None]:
## describing the dataframe
bankloan.describe()

In [None]:
# checking the data types of all columns
bankloan.dtypes.value_counts()

In [None]:
# Number of columns whose null count exceeds 30% of the row count
null_counts_per_column = bankloan.isnull().sum()
null_count_limit = 0.3*len(bankloan)
emptycolumn = null_counts_per_column[null_counts_per_column.values > null_count_limit]
len(emptycolumn)

In [None]:
# Dropping the columns with more than 30% null values, then checking the null
# value percentage of every remaining column.
# The null share is computed here directly (fraction of rows that are null), so
# the 0.3 threshold is compared with a fraction rather than with raw counts and
# the cell can be re-run: on a second run no column exceeds the threshold.
null_share = bankloan.isnull().mean()
emptycolumn = list(null_share[null_share > 0.3].index)
bankloan = bankloan.drop(columns=emptycolumn)
bankloan.isnull().sum()/len(bankloan)*100

In [None]:
# Getting all the names of columns after dropping the columns with null values more than 30%
bankloan.columns

In [None]:
# Fill the missing AMT_ANNUITY entries with the column median
annuity_median = bankloan['AMT_ANNUITY'].median()
bankloan['AMT_ANNUITY'] = bankloan['AMT_ANNUITY'].fillna(annuity_median)

In [None]:
# Null count of every remaining column
bankloan.isnull().sum()

In [None]:
# Displaying the rows in which CNT_FAM_MEMBERS is null, to inspect them manually
bankloan[bankloan.CNT_FAM_MEMBERS.isnull()]

In [None]:
# Columns treated as not relevant for the analysis: contact/region flags
# followed by the document flags FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21.
nonrelevant = [
    'FLAG_MOBIL',
    'FLAG_EMP_PHONE',
    'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE',
    'FLAG_PHONE',
    'FLAG_EMAIL',
    'REGION_RATING_CLIENT',
    'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT_W_CITY',
    'DAYS_LAST_PHONE_CHANGE',
] + ['FLAG_DOCUMENT_' + str(document_number) for document_number in range(2, 22)]

In [None]:
#Checking the number of non-relevant columns
len(nonrelevant)

In [None]:
# Remove the non-relevant columns from the main dataframe (in place) and
# print the resulting column summary
bankloan.drop(columns=nonrelevant, inplace=True)
bankloan.info()

In [None]:
#Checking how the dataframe looks after performing all above mentioned operations
bankloan.head()

In [None]:
# Column names remaining in the dataframe
bankloan.columns

In [None]:
#Checking for undefined values in column named CODE_GENDER
bankloan.CODE_GENDER.value_counts()

In [None]:
#Imputing the undefined value 'XNA' of the column CODE_GENDER with F, as very few rows are affected
# The column label is required in the .loc selector: with a row mask only,
# the assignment overwrites every column of the matching rows with 'F'.
bankloan.loc[bankloan['CODE_GENDER']=='XNA', 'CODE_GENDER'] = 'F'
bankloan.CODE_GENDER.value_counts()

In [None]:
# Frequency of each distinct value in NAME_CONTRACT_TYPE
bankloan.NAME_CONTRACT_TYPE.value_counts()

In [None]:
# Frequency of each distinct value in NAME_FAMILY_STATUS
bankloan.NAME_FAMILY_STATUS.value_counts()

In [None]:
# Replace '/' with 'or' in the family-status labels. The vectorised string
# accessor does a literal (non-regex) replacement and leaves missing values
# untouched instead of raising on them.
bankloan['NAME_FAMILY_STATUS'] = bankloan['NAME_FAMILY_STATUS'].str.replace('/', 'or', regex=False)

In [None]:
# Frequency of each NAME_FAMILY_STATUS value after the '/' replacement
bankloan.NAME_FAMILY_STATUS.value_counts()

In [None]:
# Frequency of each distinct value in NAME_HOUSING_TYPE
bankloan.NAME_HOUSING_TYPE.value_counts()

In [None]:
# Replace '/' with 'or' in the housing-type labels. The vectorised string
# accessor does a literal (non-regex) replacement and leaves missing values
# untouched instead of raising on them.
bankloan['NAME_HOUSING_TYPE'] = bankloan['NAME_HOUSING_TYPE'].str.replace('/', 'or', regex=False)

In [None]:
# Frequency of each distinct value in NAME_INCOME_TYPE
bankloan.NAME_INCOME_TYPE.value_counts()

In [None]:
# Frequency of each distinct value in ORGANIZATION_TYPE
bankloan.ORGANIZATION_TYPE.value_counts()

In [None]:
# Keep only the rows whose ORGANIZATION_TYPE is not the placeholder 'XNA'.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice of the unfiltered dataframe.
bankloan = bankloan[bankloan.ORGANIZATION_TYPE != 'XNA'].copy()

In [None]:
# Frequency of each ORGANIZATION_TYPE value after removing the 'XNA' rows
bankloan.ORGANIZATION_TYPE.value_counts()

In [None]:
# Frequency of each distinct value in REG_CITY_NOT_LIVE_CITY
bankloan.REG_CITY_NOT_LIVE_CITY.value_counts()

In [None]:
# Rows whose REG_CITY_NOT_LIVE_CITY holds the string 'F' are not valid flag values.
# They are dropped rather than assigned 0 through a row-only .loc selector,
# because that assignment overwrote every column of those rows (identifiers,
# TARGET, categorical labels) with 0 and left all-zero records in the data.
bankloan = bankloan[bankloan.REG_CITY_NOT_LIVE_CITY != 'F'].copy()

In [None]:
# Frequency of each REG_CITY_NOT_LIVE_CITY value after the clean-up above
bankloan.REG_CITY_NOT_LIVE_CITY.value_counts()

In [None]:
# Coerce the listed columns to numeric dtypes, one column at a time
ncs = [
    'TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
    'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'HOUR_APPR_PROCESS_START',
    'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
]
numeric_columns = bankloan[ncs].apply(lambda column: pd.to_numeric(column))
bankloan[ncs] = numeric_columns
bankloan.head()

In [None]:
# Replace negative values in the day-count columns with their absolute value
for day_column in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH'):
    bankloan[day_column] = bankloan[day_column].abs()

In [None]:
# Converting DAYS_BIRTH from days into years (divides by 365); only DAYS_BIRTH is converted in this cell.
# NOTE(review): the column keeps its name, and re-running this cell divides the values again.
bankloan.DAYS_BIRTH=bankloan.DAYS_BIRTH.values/365


In [None]:
# Bucket AMT_INCOME_TOTAL into labelled income ranges.
# The last edge is open-ended so the '1275000 and above' label really covers
# every income above 1,275,000 (with a 1,525,000 cap those rows became NaN), and
# include_lowest keeps an income of exactly 25,000 in the first bucket.
incomebins=[25000,275000,525000,775000,1025000,1275000,np.inf]
incomeslots = ['25000-275000','275000-525000','525000-775000','775000-1025000','1025000-1275000','1275000 and above']
bankloan['Income_range']=pd.cut(bankloan['AMT_INCOME_TOTAL'],bins=incomebins,labels=incomeslots,include_lowest=True)

In [None]:
# Bucket AMT_CREDIT into labelled credit ranges.
# include_lowest=True closes the first interval on the left, so a credit of
# exactly 45,000 lands in '45000-1045000' instead of becoming NaN.
creditbins = [45000,1045000,2045000,3045000,4045000,5045000]
creditslots = ['45000-1045000','1045000-2045000','2045000-3045000','3045000-4045000','4045000-5045000']

bankloan['CREDIT_RANGE'] = pd.cut(bankloan.AMT_CREDIT,bins=creditbins,labels=creditslots,include_lowest=True)

In [None]:
# Number of rows in each CREDIT_RANGE bucket
bankloan.CREDIT_RANGE.value_counts()

In [None]:
#Checking value counts for TARGET variable
bankloan.TARGET.value_counts()

In [None]:
# Box plot of AMT_CREDIT, one box per TARGET value
credit_trace = go.Box(x=bankloan['TARGET'], y=bankloan['AMT_CREDIT'], name='Amount Credit')
credit_title = dict(text="AMOUNT CREDIT", x=0.5, y=0.9, xanchor='center', yanchor='top')

box = go.Figure(credit_trace)
box.update_layout(title=credit_title, width=800, height=450, template='plotly')
iplot(box)

In [None]:
# Box plot of AMT_INCOME_TOTAL, one box per TARGET value
income_trace = go.Box(x=bankloan['TARGET'], y=bankloan['AMT_INCOME_TOTAL'], name='Amount Income')
income_title = dict(text="AMOUNT INCOME", x=0.5, y=0.9, xanchor='center', yanchor='top')

box = go.Figure(income_trace)
box.update_layout(title=income_title, width=800, height=450, template='plotly')
iplot(box)

In [None]:
# Box plot of AMT_ANNUITY, one box per TARGET value
annuity_trace = go.Box(x=bankloan['TARGET'], y=bankloan['AMT_ANNUITY'], name='Amount Annuity')
annuity_title = dict(text="AMOUNT ANNUITY", x=0.5, y=0.9, xanchor='center', yanchor='top')

box = go.Figure(annuity_trace)
box.update_layout(title=annuity_title, width=800, height=450, template='plotly')
iplot(box)

In [None]:
import seaborn as sns

In [None]:
# Bar chart of contract types, most frequent first
contract_order = bankloan['NAME_CONTRACT_TYPE'].value_counts().index
sns.countplot(x='NAME_CONTRACT_TYPE', data=bankloan, order=contract_order)
plt.title('Contract Type \n', fontdict=dict(fontsize=15, fontweight=7, color='Blue'))
plt.show()

In [None]:
# Bar chart of CODE_GENDER values, most frequent first
gender_order = bankloan['CODE_GENDER'].value_counts().index
sns.countplot(x='CODE_GENDER', data=bankloan, order=gender_order)
plt.title('Gender \n', fontdict=dict(fontsize=15, fontweight=7, color='Blue'))
plt.show()

In [None]:
# Bar chart of education types, most frequent first, with slanted tick labels
education_order = bankloan['NAME_EDUCATION_TYPE'].value_counts().index
ax = sns.countplot(x='NAME_EDUCATION_TYPE', data=bankloan, order=education_order)
education_tick_labels = ax.get_xticklabels()
ax.set_xticklabels(education_tick_labels, rotation=40, ha="right")
plt.title('Education Type \n', fontdict=dict(fontsize=15, fontweight=7, color='Blue'))
plt.show()

In [None]:
# Frequency of each distinct value in CNT_CHILDREN
bankloan.CNT_CHILDREN.value_counts()

In [None]:
# Bar chart of CNT_CHILDREN values, most frequent first
children_order = bankloan['CNT_CHILDREN'].value_counts().index
sns.countplot(x='CNT_CHILDREN', data=bankloan, order=children_order)
plt.title('COUNT Children \n', fontdict=dict(fontsize=15, fontweight=7, color='Blue'))
plt.show()

In [None]:
#Checking which income range of people are mostly applying for loans
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='Income_range')
plt.yscale('linear')
plt.ylabel("Count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Income Range Values", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('AMOUNT INCOME\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
#Checking which credit range most loan applications fall into
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='CREDIT_RANGE')
plt.yscale('linear')
plt.ylabel("Count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("credit Range Values", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('AMOUNT CREDIT\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Displaying the CREDIT_RANGE column (one bucket label per row)
bankloan.CREDIT_RANGE

In [None]:
# DAYS_BIRTH rounded to the nearest whole number, kept as a separate Series named AGE
# NOTE(review): assumes DAYS_BIRTH has already been converted to years by an earlier cell - confirm run order
AGE = bankloan.DAYS_BIRTH.round()
AGE

In [None]:
# Bucket the rounded ages into labelled ranges.
# The second label is '35-50' so that it matches its bin edges (35, 50];
# it previously read '30-50', overlapping the first bucket's label.
agebins = [20,35,50,65,80,]
ageslots = ['20-35','35-50','50-65','65-80']

bankloan['AGE_RANGE'] = pd.cut(AGE,bins=agebins,labels=ageslots)

In [None]:
# Number of rows in each AGE_RANGE bucket
bankloan.AGE_RANGE.value_counts()

In [None]:
#Checking which age range of people are mostly applying for loans
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='AGE_RANGE')
plt.yscale('linear')
plt.ylabel("Count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Age Range", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=40)
plt.title('AGE\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Divide DAYS_EMPLOYED by 30 (days -> approximate months); the column keeps its name.
# NOTE(review): re-running this cell divides the values again.
bankloan.DAYS_EMPLOYED = bankloan.DAYS_EMPLOYED/30

In [None]:
# DAYS_EMPLOYED rounded to the nearest whole number, kept as a separate Series
employed = bankloan.DAYS_EMPLOYED.round()

In [None]:
# Displaying the rounded employment durations
employed

In [None]:
# Bucket the rounded employment durations into labelled ranges.
# include_lowest=True closes the first interval on the left, so a duration
# that rounds to 0 lands in '0-100' instead of becoming NaN.
employedbins = [0,100,200,300,400,500,600]
employedslots = ['0-100','100-200','200-300','300-400','400-500','500-600']

bankloan['EMPLOYED_RANGE'] = pd.cut(employed,bins=employedbins,labels=employedslots,include_lowest=True)

In [None]:
#Checking which employment-duration range (in months) most applicants fall into
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='EMPLOYED_RANGE')
plt.yscale('linear')
plt.ylabel("Count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Months Range", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=40)
plt.title('Months Employed\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Count of applications per housing type.
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='NAME_HOUSING_TYPE')
plt.yscale('linear')
plt.ylabel("Count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("housing type", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=40)
plt.title('Housing Type\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Count of applications per family status.
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='NAME_FAMILY_STATUS')
plt.yscale('linear')
plt.ylabel("Count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Family Status", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('FAMILY STATUS\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Count of applications per income type.
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='NAME_INCOME_TYPE')
plt.yscale('linear')
plt.ylabel("Count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Income Type", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('INCOME TYPE\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Frequency of each distinct value in NAME_TYPE_SUITE
bankloan.NAME_TYPE_SUITE.value_counts()

In [None]:
# Count of applications per NAME_TYPE_SUITE value.
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='NAME_TYPE_SUITE')
plt.yscale('linear')
plt.ylabel("Count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Name Type suite", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('NAME TYPE SUITE\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Count of applications per income range, split by TARGET (log-scaled counts).
# Columns are passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='Income_range', hue='TARGET')
plt.yscale('log')
plt.ylabel("Count in log", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Income range", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('AMOUNT INCOME VS TARGET\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Count of applications per age range, split by TARGET.
# Columns are passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='AGE_RANGE', hue='TARGET')
plt.yscale('linear')
plt.ylabel("Count ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Age range", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('AGE VS TARGET\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Count of applications per gender, split by TARGET.
# Columns are passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='CODE_GENDER', hue='TARGET')
plt.yscale('linear')
plt.ylabel("Count ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Gender", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('GENDER VS TARGET\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Count of applications per income type, split by TARGET (log-scaled counts).
# Columns are passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=bankloan, x='NAME_INCOME_TYPE', hue='TARGET')
plt.yscale('log')
plt.ylabel("Count ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Income type", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('INCOME TYPE VS TARGET\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Split the data by TARGET: target1 holds the clients with payment
# difficulties (TARGET == 1), target0 holds all other clients (TARGET == 0)

is_target0 = bankloan["TARGET"] == 0
is_target1 = bankloan["TARGET"] == 1
target0 = bankloan.loc[is_target0]
target1 = bankloan.loc[is_target1]

In [None]:
# Show both shapes: a cell only displays its last expression, so listing the
# two shapes on separate lines hid the shape of target0.
target0.shape, target1.shape

In [None]:
# Checking the balance ratio of the dataframes created above
round(len(target0)/len(target1),2)

BIVARIATE ANALYSIS
For the dataframe named target0 (TARGET = 0, all other clients)

In [None]:
#Checking the Credit amount provided to the customers based on their Family status,
#coloured by their Education type (target0)
# Columns are referenced by name through data=, and the unused scale_factor
# variable has been removed.
plt.figure(figsize=(12,8))
sns.stripplot(data=target0, x='NAME_FAMILY_STATUS', y='AMT_CREDIT', hue='NAME_EDUCATION_TYPE')
plt.xticks(rotation=-30)
plt.xlabel("FAMILY STATUS ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("CREDIT AMOUNT ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.title('Credit amount vs Family Status \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.show()

In [None]:
#Checking the Income amount of the customers based on their Family status,
#coloured by their Education type (target0, log-scaled income axis)
# Columns are referenced by name through data=, and the unused scale_factor
# variable has been removed.
plt.figure(figsize=(12,8))
sns.stripplot(data=target0, x='NAME_FAMILY_STATUS', y='AMT_INCOME_TOTAL', hue='NAME_EDUCATION_TYPE')
plt.xticks(rotation=-30)
plt.xlabel("FAMILY STATUS ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("TOTAL INCOME AMOUNT ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yscale('log')
plt.title('Total Income amount vs Family Status \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.show()


In [None]:
#Checking the Credit amount provided to the customers based on their education,
#split by the Contract type of the loan they are applying for (target0, log-scaled)
# Columns are referenced by name through data=, and the unused scale_factor
# variable has been removed.
plt.figure(figsize=(12,8))
sns.boxplot(data=target0, x='NAME_EDUCATION_TYPE', y='AMT_CREDIT', hue='NAME_CONTRACT_TYPE')
plt.xticks(rotation=-30)
plt.xlabel("EDUCATION STATUS ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("CREDIT AMOUNT ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yscale('log')
plt.title('Credit amount vs Education Status \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.show()

In [None]:
#Checking the Income amount of the customers based on their education,
#split by the Contract type of the loan they are applying for (target0, log-scaled)
# Columns are referenced by name through data=, and the unused scale_factor
# variable has been removed.
plt.figure(figsize=(12,8))
sns.barplot(data=target0, x='NAME_EDUCATION_TYPE', y='AMT_INCOME_TOTAL', hue='NAME_CONTRACT_TYPE')
plt.xticks(rotation=-30)
plt.xlabel("EDUCATION STATUS ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Total Income AMOUNT ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yscale('log')
plt.title('Income amount vs Education Status \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.show()


For the dataframe named target1 (TARGET = 1, clients with payment difficulties)

In [None]:
#Checking the Credit amount provided to the customers based on their education,
#coloured by their family status (target1, log-scaled)
# Columns are referenced by name through data=, and the unused scale_factor
# variable has been removed.
plt.figure(figsize=(12,8))
sns.stripplot(data=target1, x='NAME_EDUCATION_TYPE', y='AMT_CREDIT', hue='NAME_FAMILY_STATUS')
plt.xticks(rotation=-30)
plt.xlabel("EDUCATION STATUS ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("CREDIT AMOUNT ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yscale('log')
plt.title('Credit amount vs Education Status \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.show()

In [None]:
#Checking the Income amount of the customers based on their education,
#coloured by their family status (target1, log-scaled)
# Education is now on the x axis and family status is the hue, matching this
# comment, the axis label and the title; the plot previously had the two
# columns swapped. The unused scale_factor variable has been removed.
plt.figure(figsize=(12,8))
sns.stripplot(data=target1, x='NAME_EDUCATION_TYPE', y='AMT_INCOME_TOTAL', hue='NAME_FAMILY_STATUS')
plt.xticks(rotation=-30)
plt.xlabel("EDUCATION STATUS ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("TOTAL INCOME AMOUNT ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yscale('log')
plt.title('Total Income amount vs Education Status \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.show()


In [None]:
#Checking the Credit amount provided to the customers based on their education,
#coloured by the Contract type of the loan they are applying for (target1)
# Columns are referenced by name through data=, and the unused scale_factor
# variable has been removed.
plt.figure(figsize=(12,8))
sns.stripplot(data=target1, x='NAME_EDUCATION_TYPE', y='AMT_CREDIT', hue='NAME_CONTRACT_TYPE')
plt.xticks(rotation=-30)
plt.xlabel("EDUCATION STATUS ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("CREDIT AMOUNT ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yscale('linear')
plt.title('Credit amount vs Education Status \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.show()

In [None]:
#Checking the Income amount of the customers based on their education,
#split by the Contract type of the loan they are applying for (target1, box plot, log-scaled)
# Columns are referenced by name through data=, and the unused scale_factor
# variable has been removed.
plt.figure(figsize=(12,8))
sns.boxplot(data=target1, x='NAME_EDUCATION_TYPE', y='AMT_INCOME_TOTAL', hue='NAME_CONTRACT_TYPE')
plt.xticks(rotation=-30)
plt.xlabel("EDUCATION STATUS ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Total Income AMOUNT ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yscale('log')
plt.title('Income amount vs Education Status \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.show()


In [None]:
#Checking the Income amount of the customers based on their education,
#coloured by the Contract type of the loan they are applying for (target1, strip plot, log-scaled)
# Columns are referenced by name through data=, and the unused scale_factor
# variable has been removed.
plt.figure(figsize=(12,8))
sns.stripplot(data=target1, x='NAME_EDUCATION_TYPE', y='AMT_INCOME_TOTAL', hue='NAME_CONTRACT_TYPE')
plt.xticks(rotation=-30)
plt.xlabel("EDUCATION STATUS ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Total Income AMOUNT ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yscale('log')
plt.title('Income amount vs Education Status \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.show()


In [None]:
#Creating a correlation table between the numeric variables of the target0 dataframe
# The first two columns are skipped by position. Non-numeric columns are
# filtered out explicitly because DataFrame.corr raises on them in pandas 2.x
# instead of silently ignoring them.
target0_correlation = target0.iloc[:, 2:].select_dtypes(include='number').corr()
target0_correlation

In [None]:
# Annotated heatmap of the target0 correlation table
heatmap_title_font = {'fontsize':18, 'fontweight' : 10, 'color' : 'Blue'}
heatmap_axis_font = {'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'}
f, ax = plt.subplots(figsize=(14, 9))
sns.heatmap(target0_correlation, annot=True, cmap='Reds')
plt.title('CORRELATION TABLE FOR target0 \n', fontdict=heatmap_title_font)
plt.xlabel("All columns of target0 ", fontdict=heatmap_axis_font)
plt.ylabel("All Columns of target0 ", fontdict=heatmap_axis_font)
plt.show()

In [None]:
#Creating a correlation table between the numeric variables of the target1 dataframe
# The first two columns are skipped by position. Non-numeric columns are
# filtered out explicitly because DataFrame.corr raises on them in pandas 2.x
# instead of silently ignoring them.
target1_correlation = target1.iloc[:, 2:].select_dtypes(include='number').corr()
target1_correlation

In [None]:
# Annotated heatmap of the target1 correlation table
heatmap_title_font = {'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'}
heatmap_axis_font = {'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'}
f, ax = plt.subplots(figsize=(14, 9))
sns.heatmap(target1_correlation, annot=True, cmap='Blues')
plt.title('CORRELATION TABLE FOR target1 \n', fontdict=heatmap_title_font)
plt.xlabel("All columns of target1 ", fontdict=heatmap_axis_font)
plt.ylabel("All Columns of target1 ", fontdict=heatmap_axis_font)
plt.show()

In [None]:
# Loading previous_application.csv into a dataframe named preloanapp.
# The folder defaults to the original download location; set the
# BANKLOAN_DATA_DIR environment variable to run the notebook on another machine.
previous_data_dir = os.environ.get("BANKLOAN_DATA_DIR", r"C:\Users\ASUS\Downloads")
preloanapp = pd.read_csv(os.path.join(previous_data_dir, "previous_application.csv"))
preloanapp.head()

In [None]:
# (rows, columns) of the previous-application data
preloanapp.shape

In [None]:
# Column dtypes and non-null counts of the previous-application data
preloanapp.info()

In [None]:
# Summary statistics of the previous-application data
preloanapp.describe()

In [None]:
# Null value percentage of every column of preloanapp
preloanapp.isnull().sum()/len(preloanapp)*100

In [None]:
# Number of preloanapp columns whose null count exceeds 40% of the row count
previous_null_counts = preloanapp.isnull().sum()
previous_null_limit = 0.4*len(preloanapp)
appempty1 = previous_null_counts[previous_null_counts.values > previous_null_limit]
len(appempty1)

In [None]:
# Remove the columns listed in appempty1 (those above the 40% null threshold)
sparse_previous_columns = appempty1.index
preloanapp = preloanapp.drop(columns=sparse_previous_columns)


In [None]:
# Column summary of preloanapp after dropping the sparse columns
preloanapp.info()

In [None]:
# Box plot of AMT_APPLICATION from the previous applications
applied_trace = go.Box(y=preloanapp['AMT_APPLICATION'], name='Amount Applied')
applied_title = dict(text="AMOUNT APPLIED", x=0.5, y=0.9, xanchor='center', yanchor='top')

box = go.Figure(applied_trace)
box.update_layout(title=applied_title, width=800, height=450, template='plotly')
iplot(box)

In [None]:
# Bucket AMT_APPLICATION into labelled ranges of 500,000.
# include_lowest=True closes the first interval on the left, so an applied
# amount of exactly 0 lands in '0-500000' instead of becoming NaN.
appliedbins=[0,500000,1000000,1500000,2000000,2500000,3000000,3500000,4000000,4500000,5000000,5500000,6000000,6500000,7000000]
appliedslots = ['0-500000','500000-1000000','1000000-1500000','1500000-2000000','2000000-2500000', '2500000-3000000', '3000000-3500000','3500000-4000000','4000000-4500000','4500000-5000000','5000000-5500000','5500000-6000000','6000000-6500000','6500000-7000000']
preloanapp['Income_applied']=pd.cut(preloanapp['AMT_APPLICATION'],bins=appliedbins,labels=appliedslots,include_lowest=True)

In [None]:
# Number of rows in each Income_applied bucket
preloanapp.Income_applied.value_counts()

In [None]:
#Checking which applied-amount range most previous applications fall into
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=preloanapp, x='Income_applied')
plt.yscale('linear')
plt.ylabel("Count ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Income applied Range", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('AMOUNT APPLIED\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Bucket the previous applications' AMT_CREDIT into labelled ranges of 500,000.
# include_lowest=True closes the first interval on the left, so a credit of
# exactly 0 lands in '0-500000' instead of becoming NaN.
credit1bins=[0,500000,1000000,1500000,2000000,2500000,3000000,3500000,4000000,4500000,5000000,5500000,6000000,6500000,7000000]
credit1slots = ['0-500000','500000-1000000','1000000-1500000','1500000-2000000','2000000-2500000', '2500000-3000000', '3000000-3500000','3500000-4000000','4000000-4500000','4500000-5000000','5000000-5500000','5500000-6000000','6000000-6500000','6500000-7000000']
preloanapp['Income_credit']=pd.cut(preloanapp['AMT_CREDIT'],bins=credit1bins,labels=credit1slots,include_lowest=True)

In [None]:
# Count of previous applications per credit range.
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=preloanapp, x='Income_credit')
plt.yscale('linear')
plt.ylabel("Count ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Income credit Range", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('AMOUNT CREDIT\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Frequency of each distinct value in FLAG_LAST_APPL_PER_CONTRACT
preloanapp.FLAG_LAST_APPL_PER_CONTRACT.value_counts() 

In [None]:
# Count of previous applications per FLAG_LAST_APPL_PER_CONTRACT value.
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=preloanapp, x='FLAG_LAST_APPL_PER_CONTRACT')
plt.ylabel("Count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('LAST APP CONTRACT\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Bucket SELLERPLACE_AREA into labelled ranges.
# include_lowest=True closes the first interval on the left, so a value of
# exactly -1 lands in '-1-9999' instead of becoming NaN.
sellerbins=[-1,9999,19999,29999,39999,49999,59999,69999,79999,89999,99999,109999,119999,409999]
sellerslots = ['-1-9999','9999-19999','19999-29999','29999-39999','39999-49999', '49999-59999', '59999-69999','69999-79999','79999-89999','89999-99999','99999-109999','109999-119999','119999-409999']
preloanapp['Sellerplace_range']=pd.cut(preloanapp['SELLERPLACE_AREA'],bins=sellerbins,labels=sellerslots,include_lowest=True)

In [None]:
# Count of previous applications per seller-place-area range.
# The column is passed by keyword (seaborn 0.12+ reads the first positional
# argument as `data`), and the x label and title now name the plotted
# variable; they had been copied from the credit-range plot.
sns.countplot(data=preloanapp, x='Sellerplace_range')
plt.yscale('linear')
plt.ylabel("Count ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Seller place area Range", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('SELLER PLACE AREA\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
 # Frequency of each distinct value in NAME_CLIENT_TYPE
 preloanapp.NAME_CLIENT_TYPE.value_counts()

In [None]:
# Count of previous applications per client type.
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=preloanapp, x='NAME_CLIENT_TYPE')
plt.xlabel("Client type", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('CLIENT TYPE\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Frequency of each distinct value in NAME_SELLER_INDUSTRY
preloanapp.NAME_SELLER_INDUSTRY.value_counts()

In [None]:
# Count of previous applications per seller industry.
# The column is passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=preloanapp, x='NAME_SELLER_INDUSTRY')
plt.xlabel("Seller industry", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=90)
plt.title('SELLER INDUSTRY\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Join the current applications with the previous applications on SK_ID_CURR
# (pandas' default join type is used, as no `how` is given)
mergedloandf = pd.merge(bankloan, preloanapp, on='SK_ID_CURR')

In [None]:
#Checking how the merged dataframe looks like
mergedloandf.head()

In [None]:
#Finding information about the combined dataframe
mergedloandf.info()

In [None]:
# Rename the merge-suffixed columns: '_x' columns get their plain name back,
# '_y' columns get a '_PREV' suffix
merged_column_names = {
    'NAME_CONTRACT_TYPE_x': 'NAME_CONTRACT_TYPE',
    'AMT_CREDIT_x': 'AMT_CREDIT',
    'AMT_ANNUITY_x': 'AMT_ANNUITY',
    'WEEKDAY_APPR_PROCESS_START_x': 'WEEKDAY_APPR_PROCESS_START',
    'AMT_GOODS_PRICE_x': 'AMT_GOODS_PRICE',
    'HOUR_APPR_PROCESS_START_x': 'HOUR_APPR_PROCESS_START',
    'NAME_CONTRACT_TYPE_y': 'NAME_CONTRACT_TYPE_PREV',
    'AMT_CREDIT_y': 'AMT_CREDIT_PREV',
    'AMT_ANNUITY_y': 'AMT_ANNUITY_PREV',
    'AMT_GOODS_PRICE_y': 'AMT_GOODS_PRICE_PREV',
    'WEEKDAY_APPR_PROCESS_START_y': 'WEEKDAY_APPR_PROCESS_START_PREV',
    'HOUR_APPR_PROCESS_START_y': 'HOUR_APPR_PROCESS_START_PREV',
}
mergedloandf = mergedloandf.rename(columns=merged_column_names)

In [None]:
# Remove the non-relevant columns from the merged dataframe (in place)
merged_nonrelevant = [
    'SK_ID_CURR',
    'WEEKDAY_APPR_PROCESS_START',
    'HOUR_APPR_PROCESS_START',
    'REG_REGION_NOT_LIVE_REGION',
    'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY',
    'LIVE_CITY_NOT_WORK_CITY',
    'WEEKDAY_APPR_PROCESS_START_PREV',
    'HOUR_APPR_PROCESS_START_PREV',
    'FLAG_LAST_APPL_PER_CONTRACT',
    'NFLAG_LAST_APPL_IN_DAY',
]
mergedloandf.drop(columns=merged_nonrelevant, inplace=True)

In [None]:
# Column summary of the merged dataframe after dropping the non-relevant columns
mergedloandf.info()

In [None]:
#Checking for null value percentage
mergedloandf.isnull().sum()/len(mergedloandf)*100

In [None]:
#Checking the Product combination with respect to Type of contract
# Columns are passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=mergedloandf, x='PRODUCT_COMBINATION', hue='NAME_CONTRACT_TYPE')
plt.yscale('log')
plt.title('Count of Product Combinations W.R.T Contract Type \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.legend(bbox_to_anchor=(1.36,1))
plt.xticks(rotation=90)
plt.show()

In [None]:
#Checking Product combination with respect to Status of the Contract
# Columns are passed by keyword: seaborn 0.12+ reads the first positional
# argument as `data`, not as the x variable.
sns.countplot(data=mergedloandf, x='PRODUCT_COMBINATION', hue='NAME_CONTRACT_STATUS')
plt.yscale('log')
plt.title('Count of Product Combinations W.R.T Contract Status \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.legend(bbox_to_anchor=(1,1))
plt.xticks(rotation=90)
plt.show()

In [None]:
#Plotting countplot for checking what loan purpose people are applying for loans
# and are actually getting the loans
# The plot is drawn first and styled afterwards: labels and tick rotations set
# before the bars exist can be overwritten when seaborn draws onto the axes.
# The y label now names the column on the y axis (the loan purpose); the
# contract status is the hue.
plt.figure(figsize=(12,12))
sns.countplot(data=mergedloandf,y='NAME_CASH_LOAN_PURPOSE',order=mergedloandf['NAME_CASH_LOAN_PURPOSE'].value_counts().index,hue='NAME_CONTRACT_STATUS')
plt.xscale('log')
plt.xticks(rotation=30)
plt.yticks(rotation=30)
plt.ylabel("NAME_CASH_LOAN_PURPOSE", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.title('Distribution of Loan Purpose w.r.t Contract Status\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
#Plotting countplot for checking which types of incomes of people are applying for loans
# and are actually capable of paying back the loans
# The plot is drawn first and styled afterwards: labels and tick rotations set
# before the bars exist can be overwritten when seaborn draws onto the axes.
plt.figure(figsize=(8,6))
sns.countplot(data=mergedloandf,y='NAME_INCOME_TYPE',order=mergedloandf['NAME_INCOME_TYPE'].value_counts().index,hue='TARGET')
plt.xscale('log')
plt.xticks(rotation=30)
plt.yticks(rotation=30)
plt.ylabel("NAME_INCOME_TYPE", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.title('Distribution of Income Types w.r.t TARGET\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
#Plotting countplot for checking what loan purpose people are applying for loans
# and are actually capable of paying back
# The plot is drawn first and styled afterwards: labels and tick rotations set
# before the bars exist can be overwritten when seaborn draws onto the axes.
plt.figure(figsize=(12,10))
sns.countplot(data=mergedloandf,y='NAME_CASH_LOAN_PURPOSE',order=mergedloandf.NAME_CASH_LOAN_PURPOSE.value_counts().index,hue='TARGET')
plt.xscale('log')
plt.xticks(rotation=30)
plt.yticks(rotation=30)
plt.ylabel('NAME_CASH_LOAN_PURPOSE', fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("count", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.title('Distribution of Loan Purpose w.r.t TARGET\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
#Checking for Amount credited previously against Purpose of loan with respect to Income type
# The x label and title now describe what is plotted (loan purpose on x,
# income type as hue); they had been copied from the count plots.
plt.figure(figsize=(18,15),dpi=400)
sns.stripplot(data=mergedloandf,y='AMT_CREDIT_PREV',x='NAME_CASH_LOAN_PURPOSE',hue='NAME_INCOME_TYPE')
plt.xticks(rotation=90)
plt.yscale('log')
plt.ylabel("AMT_CREDIT_PREV", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("NAME_CASH_LOAN_PURPOSE", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yticks(rotation=30)
plt.title('Distribution of Amount Credited previously vs Loan Purpose w.r.t Income Type\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
#Checking for Amount Credited vs housing type with respect to target
# The x label now names the column on the x axis (housing type) instead of "count".
plt.figure(figsize=(8,6))
sns.barplot(data=mergedloandf,y='AMT_CREDIT_PREV',hue='TARGET',x='NAME_HOUSING_TYPE')
plt.xticks(rotation=90)
plt.ylabel("AMT_CREDIT_PREV", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("NAME_HOUSING_TYPE", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yticks(rotation=30)
plt.title('Distribution of Amount Credited previously  vs Housing Type w.r.t TARGET\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
#Checking for Amount Credited vs Education type with respect to Contract Status
# The x label now names the column on the x axis (education type) instead of "count".
plt.figure(figsize=(8,6))
sns.stripplot(data=mergedloandf,y='AMT_CREDIT_PREV',hue='NAME_CONTRACT_STATUS',x='NAME_EDUCATION_TYPE')
plt.xticks(rotation=90)
plt.ylabel("AMT_CREDIT_PREV", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("NAME_EDUCATION_TYPE", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.yticks(rotation=30)
plt.yscale('log')
plt.legend(bbox_to_anchor=(1.26,1))
plt.title('Distribution of Amount Credited previously  vs Education Type w.r.t Contract Status\n',fontdict={'fontsize': 15, 'fontweight' : 7, 'color' : 'Blue'})
plt.show()

In [None]:
# Copy of the merged dataframe without the SK_ID_PREV column, used for the
# correlation of the other columns
mergedloandf1 = mergedloandf.drop(columns='SK_ID_PREV')

In [None]:
#Calculating the correlations of the numeric columns of the merged dataframe
# Non-numeric columns are filtered out explicitly because DataFrame.corr raises
# on them in pandas 2.x instead of silently ignoring them; the redundant
# full-frame .iloc[:, :] slice is gone.
mergedloandfcorrelation = mergedloandf1.select_dtypes(include='number').corr()
mergedloandfcorrelation

In [None]:
# Annotated heatmap of the merged-dataframe correlation table
heatmap_title_font = {'fontsize': 18, 'fontweight' : 10, 'color' : 'Red'}
heatmap_axis_font = {'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'}
f, ax = plt.subplots(figsize=(14, 9))
sns.heatmap(mergedloandfcorrelation, annot=True, cmap='Greys')
plt.title('CORRELATION TABLE FOR mergedloandf1 \n', fontdict=heatmap_title_font)
plt.xlabel("All columns of mergedloandf1 ", fontdict=heatmap_axis_font)
plt.ylabel("All Columns of mergedloandf1 ", fontdict=heatmap_axis_font)
plt.show()