# CREDIT EDA CASE STUDY

In [None]:
# Not show the warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Display all columns
pd.set_option('display.max_columns',200)

# APPLICATION DATA

## Read the Application Data.

Read the Application data file provided and store it in a dataframe `application`.

In [None]:
# Read the csv file using 'read_csv'.
application= pd.read_csv("../input/loan-defaulter/application_data.csv")

##  Check the structure of the data

Inspect the dataframe for dimensions, null-values, and summary of different numeric columns:

In [None]:
# Check the number of rows and columns in the dataframe of application
application.shape

In [None]:
# Check the data of top 5 rows of application
application.head()

Check the column-wise info of the dataframe:

In [None]:
# Types of all columns
application.info(verbose=True)

In [None]:
# Count the number of null values in each column
application.isnull().sum()

In [None]:
# Check if each column is null
application.isnull().any()

In [None]:
# List of columns which have null values
application.columns[application.isnull().any()].tolist()

In [None]:
# Check the summary for the numeric columns 
application.describe()

##  Data Quality check and missing values

### Identify missing data

In [None]:
# Percentage of null values in each column
null_percentage = application.isnull().sum() * 100 / len(application)

In [None]:
# Top columns have highest percentages of null values
null_percentage.sort_values(ascending = False).head(50)

From the list above, it can be seen that there are 49 columns having null values more than ~50%. Because they have a lot of missing values, we will remove them.

### Deal with missing data

In [None]:
# Names of the columns whose share of null values exceeds 45% (roughly half)
mostly_null = null_percentage.gt(45)
null_columns = null_percentage.loc[mostly_null].index
null_columns

In [None]:
# Remove those columns
application.drop(null_columns, axis=1, inplace=True)

In [None]:
# See the application data after dropping the columns having null values more than 45%
application.head()

In [None]:
# List of remaining columns which have null values
null_percentage = application.isnull().sum() * 100 / len(application)
null_percentage[null_percentage>0].sort_values(ascending = False)

From the list above, it can be seen that OCCUPATION_TYPE has the highest percentage of missing values. Therefore, we are going to deal with it first.

In [None]:
# Read the OCCUPATION_TYPE column to see what values it have
application.OCCUPATION_TYPE.value_counts()

Laborers are people who do unskilled manual work. Manual work is usually very hard to define in words, and typically does not have a specific name for it. This may be the reason why some people cannot name it in their application, as a result, they leave the field empty. Thus, we will replace the null value in OCCUPATION_TYPE with "Laborers".

In [None]:
# Replace the null value in OCCUPATION_TYPE with "Laborers".
# The result is assigned back to the column: calling fillna(inplace=True) on
# `application[...]` operates on an intermediate object and leaves
# `application` unchanged when pandas Copy-on-Write is active.
application["OCCUPATION_TYPE"] = application["OCCUPATION_TYPE"].fillna("Laborers")

In [None]:
# Check if that column still has null value
application["OCCUPATION_TYPE"].isnull().sum()

In [None]:
# Read the OCCUPATION_TYPE column to see how many rows now have "Laborers"
application.OCCUPATION_TYPE.value_counts()

In [None]:
# List of remaining columns which have null values
null_percentage = application.isnull().sum() * 100 / len(application)
null_percentage[null_percentage>0].sort_values(ascending = False)

In [None]:
# Read the AMT_REQ_CREDIT_BUREAU_YEAR column to see what values it have
application.AMT_REQ_CREDIT_BUREAU_YEAR.value_counts()

From the list of values of AMT_REQ_CREDIT_BUREAU_YEAR, it can be seen that 0 appears in the highest number of rows. 0 means no credit. When people have no credit, they also tend to give no answer to the question. Therefore, now, we will replace the null value in AMT_REQ_CREDIT_BUREAU_YEAR with 0.

In [None]:
# Replace the null value in AMT_REQ_CREDIT_BUREAU_YEAR with 0.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `application[...]` operates on an intermediate object and leaves
# `application` unchanged when pandas Copy-on-Write is active.
application["AMT_REQ_CREDIT_BUREAU_YEAR"] = application["AMT_REQ_CREDIT_BUREAU_YEAR"].fillna(0)

In [None]:
# Check if that column still has null value
application["AMT_REQ_CREDIT_BUREAU_YEAR"].isnull().sum()

In [None]:
# Read the AMT_REQ_CREDIT_BUREAU_YEAR column to see how many rows now have 0.
application.AMT_REQ_CREDIT_BUREAU_YEAR.value_counts()

In [None]:
# List of remaining columns which have null values
null_percentage = application.isnull().sum() * 100 / len(application)
null_percentage[null_percentage>0].sort_values(ascending = False)

In [None]:
# Do the same for columns related to 'AMT_REQ_CREDIT_BUREAU_'
# Check columns related to 'AMT_REQ_CREDIT_BUREAU_'
amt_req_credit_bureau_columns = [column for column in application if column.startswith('AMT_REQ_CREDIT_BUREAU_')]
amt_req_credit_bureau_columns

In [None]:
# Replace the null values in the remaining AMT_REQ_CREDIT_BUREAU_* columns
# (HOUR, DAY, WEEK, MON, QRT) with 0.
# One assignment replaces five copy-pasted cells, and the filled values are
# assigned back to the frame: fillna(inplace=True) on `application[...]`
# operates on an intermediate object and leaves `application` unchanged when
# pandas Copy-on-Write is active.
bureau_request_columns = [
    "AMT_REQ_CREDIT_BUREAU_HOUR",
    "AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK",
    "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_QRT",
]
application[bureau_request_columns] = application[bureau_request_columns].fillna(0)

In [None]:
# List of remaining columns which have null values
null_percentage = application.isnull().sum() * 100 / len(application)
null_percentage[null_percentage>0].sort_values(ascending = False)

In [None]:
# Read the AMT_GOODS_PRICE column to see what values it have
application.AMT_GOODS_PRICE.value_counts()

AMT_GOODS_PRICE is the price of the goods for which the loan is given. We will look at the median, i.e. the middle (typical) goods price.

In [None]:
# Find the middle value
application.AMT_GOODS_PRICE.median()

In [None]:
# Find the mode value
application.AMT_GOODS_PRICE.mode()

Interestingly, the middle value is also the price which is given to the highest number of people. Therefore, we will replace the null value with this value.

In [None]:
# Replace the null value in AMT_GOODS_PRICE with median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `application[...]` operates on an intermediate object and leaves
# `application` unchanged when pandas Copy-on-Write is active.
goods_price_median = application["AMT_GOODS_PRICE"].median()
application["AMT_GOODS_PRICE"] = application["AMT_GOODS_PRICE"].fillna(goods_price_median)

In [None]:
# Check if that column still has null value
application["AMT_GOODS_PRICE"].isnull().sum()

In [None]:
# List of remaining columns which have null values
null_percentage = application.isnull().sum() * 100 / len(application)
null_percentage[null_percentage>0].sort_values(ascending = False)

AMT_ANNUITY is the Loan annuity

In [None]:
# Read the AMT_ANNUITY column to see what values it have
application.AMT_ANNUITY.value_counts()

In [None]:
# Find the middle value
application.AMT_ANNUITY.median()

In [None]:
# Replace the null value in AMT_ANNUITY with median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `application[...]` operates on an intermediate object and leaves
# `application` unchanged when pandas Copy-on-Write is active.
annuity_median = application["AMT_ANNUITY"].median()
application["AMT_ANNUITY"] = application["AMT_ANNUITY"].fillna(annuity_median)

In [None]:
# Check if that column still has null value
application["AMT_ANNUITY"].isnull().sum()

In [None]:
# List of remaining columns which have null values
null_percentage = application.isnull().sum() * 100 / len(application)
null_percentage[null_percentage>0].sort_values(ascending = False)

### Deal with invalid data

In [None]:
# Recall the total number of columns
application.shape

In [None]:
# Types of all columns
application.info(verbose=True)

By looking at the list above, it can be seen that no column has an inappropriate data type. Contract type, gender, income type, education type, etc. (categorical data) have OBJECT type. Number of children, income amount, credit amount, annuity, days_birth, days_employed, etc. (numerical data) have INT or FLOAT type. There is no "Date_of_birth" / "Birthday" column, so we don't need to check that it is in the DATETIME type.

In [None]:
# Check values of TARGET
application.TARGET.value_counts()

=> OK, no strange value

In [None]:
# Check values of NAME_CONTRACT_TYPE
application.NAME_CONTRACT_TYPE.value_counts()

=> OK, no strange value

In [None]:
# Check values of gender
application.CODE_GENDER.value_counts()

In [None]:
# Replacing XNA with "F".
# NOTE(review): "F" is presumably the mode shown by the value_counts() output
# of the previous cell - confirm.
# The result is assigned back to the column: calling replace(inplace=True) on
# `application[...]` operates on an intermediate object and leaves
# `application` unchanged when pandas Copy-on-Write is active.
application["CODE_GENDER"] = application["CODE_GENDER"].replace({"XNA": "F"})

In [None]:
# Check values of gender
application.CODE_GENDER.value_counts()

In [None]:
# Find columns that contains the string 'XNA'
xna_application = application.loc[: , (application == 'XNA').any()]
xna_application

ORGANIZATION_TYPE: Type of organization where client works

In [None]:
# Check values of ORGANIZATION_TYPE
application.ORGANIZATION_TYPE.value_counts()

In [None]:
# Replace XNA with NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# The result is assigned back to the column: calling replace(inplace=True) on
# `application.ORGANIZATION_TYPE` operates on an intermediate object and leaves
# `application` unchanged when pandas Copy-on-Write is active.
application["ORGANIZATION_TYPE"] = application["ORGANIZATION_TYPE"].replace({"XNA": np.nan})

In [None]:
# Check values of ORGANIZATION_TYPE
application.ORGANIZATION_TYPE.value_counts()

In [None]:
# Check values of CNT_CHILDREN
application.CNT_CHILDREN.value_counts()

=> OK, no strange value

In [None]:
# Check the data type of a DAYS_ column
application.dtypes['DAYS_EMPLOYED']

In [None]:
# Check daytime columns
day_columns = [column for column in application if column.startswith('DAYS_')]
day_columns

- DAYS_BIRTH:	Client's age in days at the time of application
- DAYS_EMPLOYED: How many days before the application the person started current employment
- DAYS_REGISTRATION: How many days before the application did client change his registration
- DAYS_ID_PUBLISH: How many days before the application did client change the identity document with which he applied for the loan
- DAYS_LAST_PHONE_CHANGE: How many days before application did client change phone

=> They are the number of days, so they cannot be negative.

In [None]:
# Check to see if there are invalid values for a DAYs_ column (negative value)
application.DAYS_BIRTH[application.DAYS_BIRTH<0]

In [None]:
# Turn every DAYS_* value into its magnitude (negative counts become positive)
application[day_columns] = application[day_columns].abs()

In [None]:
# Check to see if there are invalid values for a DAYs_ column (negative value)
application.DAYS_BIRTH[application.DAYS_BIRTH<0]

In [None]:
# Convert DAYS_BIRTH to years
application["YEARS_BIRTH"] = (application.DAYS_BIRTH/365).astype(int)

In [None]:
# Check values of YEARS_BIRTH
application["YEARS_BIRTH"].value_counts()

### Binning continuous variables

In [None]:
# Types of all columns
application.info(verbose=True)

In [None]:
# Check values of AMT_INCOME_TOTAL
application.AMT_INCOME_TOTAL.value_counts()

In [None]:
# Check IQR of AMT_INCOME_TOTAL
application.AMT_INCOME_TOTAL.describe()

In [None]:
# Plot IQR on a boxplot
application.boxplot(column='AMT_INCOME_TOTAL', return_type='axes');

-> Realise that AMT_INCOME_TOTAL has an outlier -> we will deal with it later.

In [None]:
# Define the range of quantiles to use: q=[0, .2, .4, .6, .8, 1]. Binning AMT_INCOME_RANGE based on quantiles.
range_labels = ['Very Low', 'Low', "Medium", 'High', 'Very high']
application["AMT_INCOME_RANGE"] = pd.qcut(application.AMT_INCOME_TOTAL,
                              q=[0, .2, .4, .6, .8, 1],
                              labels=range_labels)

(Source: https://pbpython.com/pandas-qcut-cut.html)

In [None]:
# Check the distribution
application["AMT_INCOME_RANGE"].value_counts()

In [None]:
# Do the same for AMT_CREDIT
# Check values of AMT_CREDIT
application.AMT_CREDIT.value_counts()

In [None]:
# Check IQR of AMT_CREDIT
application.AMT_CREDIT.describe()

In [None]:
# Binning AMT_CREDIT based on quantiles.
application["AMT_CREDIT_RANGE"] = pd.qcut(application.AMT_CREDIT,
                              q=[0, .2, .4, .6, .8, 1],
                              labels=range_labels)

In [None]:
# Check the distribution
application["AMT_CREDIT_RANGE"].value_counts()

In [None]:
# Check the values of YEARS_BIRTH
application["YEARS_BIRTH"].value_counts()

In [None]:
# Check IQR of YEARS_BIRTH
application.YEARS_BIRTH.describe()

In [None]:
# Binning YEARS_BIRTH to Group of age.
# pd.cut builds right-closed intervals, so without include_lowest the first
# bin is (20, 30] and an applicant aged exactly 20 falls outside every bin and
# becomes NaN. include_lowest=True closes the first bin on the left; the other
# bin boundaries are unchanged.
application['YEARS_BIRTH_RANGE'] = pd.cut(
    application['YEARS_BIRTH'],
    bins=[20, 30, 40, 60, 70],
    labels=['Young Adult', 'Adult', 'Middle Age', 'Senior'],
    include_lowest=True,
)

In [None]:
# Check the values of YEARS_BIRTH
application["YEARS_BIRTH_RANGE"].value_counts()

### Deal with outliers

In [None]:
# Plot AMT_INCOME_TOTAL on a boxplot
application.boxplot(column='AMT_INCOME_TOTAL', return_type='axes',vert=False);

AMT_INCOME_TOTAL is the income of the client. From the boxplot above, there is one record that is an outlier, with a total income of more than 100,000,000.
Let's see who that client is.

In [None]:
# Check the row of the outlier
application[application.AMT_INCOME_TOTAL>=100000000]

Check if that high income is really their income:

In [None]:
# Check all of the income types
application.NAME_INCOME_TYPE.value_counts()

In [None]:
# Check all of the education types
application.NAME_EDUCATION_TYPE.value_counts()

In [None]:
# Check all of the housing types
application.NAME_HOUSING_TYPE.value_counts()

In [None]:
# Check all of the organisation types
application.ORGANIZATION_TYPE.value_counts()

The income is very high, but income type is Working, not business man. There is nothing special in educational level and organisation type. It looks like other normal people.
Therefore, there is a chance that there is a mistake when collecting this data, leading to have this outlier.

In [None]:
# Check all columns again to find other ones with outliers
application.info(verbose=True)

In [None]:
# Scatter plot: AMT_ANNUITY on the x-axis against AMT_CREDIT on the y-axis
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 5))
scatter_ax.set_xlabel("AMT_ANNUITY")
scatter_ax.set_ylabel("AMT_CREDIT")
scatter_ax.scatter(application['AMT_ANNUITY'], application['AMT_CREDIT'])

It can be seen that there is no outlier for AMT_CREDIT, but there is an outlier for AMT_ANNUITY, with the value greater than 250,000.

In [None]:
# Scatter plot: CNT_CHILDREN on the x-axis against CNT_FAM_MEMBERS on the y-axis
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 5))
scatter_ax.set_xlabel("CNT_CHILDREN")
scatter_ax.set_ylabel("CNT_FAM_MEMBERS")
scatter_ax.scatter(application['CNT_CHILDREN'], application['CNT_FAM_MEMBERS'])

There is a linear relationship between the number of family members and the number of children in the family.
There is an outlier having more than 17 children and more than 17 people in the family.

In [None]:
# Scatter plot: YEARS_BIRTH on the x-axis against DAYS_EMPLOYED on the y-axis
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 5))
scatter_ax.set_xlabel("YEARS_BIRTH")
scatter_ax.set_ylabel("DAYS_EMPLOYED")
scatter_ax.scatter(application['YEARS_BIRTH'], application['DAYS_EMPLOYED'])

In [None]:
# Calculate DAYS_EMPLOYED greater than 350,000 to YEARS
application.DAYS_EMPLOYED[application.DAYS_EMPLOYED>350000] / 365

DAYS_BIRTH: Client's age in days at the time of application

DAYS_EMPLOYED: How many days before the application the person started current employment

It can be seen that there is no outlier for YEARS_BIRTH.
There are outliers for DAYS_EMPLOYED, which value is greater than 350,000.
It's more than 1,000 years, which is very unrealistic for normal people (with ages between 20 and 70 years old).
Therefore, there is a chance that this data was collected incorrectly.

In [None]:
# Plot DAYS_REGISTRATION on a boxplot
application.boxplot(column='DAYS_REGISTRATION', return_type='axes',vert=False);

DAYS_REGISTRATION: How many days before the application did client change his registration.

In [None]:
# Express DAYS_REGISTRATION in years
application['YEARS_REGISTRATION'] = application['DAYS_REGISTRATION'].div(365)

# Show the registrations longer than 20000 days, longest first
long_registration = application['DAYS_REGISTRATION'] > 20000
(
    application.loc[long_registration, ['YEARS_REGISTRATION', 'DAYS_REGISTRATION']]
    .sort_values(by=['YEARS_REGISTRATION'], ascending=False)
)

In [None]:
# Scatter plot: YEARS_BIRTH on the x-axis against YEARS_REGISTRATION on the y-axis
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 5))
scatter_ax.set_xlabel("YEARS_BIRTH")
scatter_ax.set_ylabel("YEARS_REGISTRATION")
scatter_ax.scatter(application['YEARS_BIRTH'], application['YEARS_REGISTRATION'])

It can be seen clearly that the highest YEARS_REGISTRATION (the one greater than 67 years and greater than 24,000 days in DAYS_REGISTRATION) is an outlier.

### Drop unnecessary columns

In [None]:
# The contact flags below record whether the client provided phone numbers / an email address.
# They are not needed for this analysis, so they are removed.
contact_flag_columns = [
    'FLAG_MOBIL',
    'FLAG_EMP_PHONE',
    'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE',
    'FLAG_PHONE',
    'FLAG_EMAIL',
]
application.drop(columns=contact_flag_columns, inplace=True)

# Remove every column whose name starts with FLAG_DOCUMENT, then every column
# whose name starts with AMT_REQ_CREDIT_BUREAU.
# `flag_document` is rebound on each pass, so after the loop it holds the
# AMT_REQ_CREDIT_BUREAU columns (not the FLAG_DOCUMENT ones).
for column_prefix in ('^FLAG_DOCUMENT', '^AMT_REQ_CREDIT_BUREAU'):
    flag_document = application.filter(regex=column_prefix, axis=1)
    application.drop(columns=flag_document.columns, inplace=True)

# Remove the external scores and the social-circle counters
other_unneeded_columns = [
    "EXT_SOURCE_2",
    "EXT_SOURCE_3",
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "DEF_30_CNT_SOCIAL_CIRCLE",
    "OBS_60_CNT_SOCIAL_CIRCLE",
    "DEF_60_CNT_SOCIAL_CIRCLE",
]
application.drop(columns=other_unneeded_columns, inplace=True)

# Remove the day-based columns that now have year-based counterparts
# (YEARS_BIRTH and YEARS_REGISTRATION were derived from them earlier)
application.drop(columns=['DAYS_BIRTH', 'DAYS_REGISTRATION'], inplace=True)

##  Analysis

### Check the distribution

Check the distribution of categorical data

In [None]:
# Check all columns again to find categorical data
application.info(verbose=True)

In [None]:
# Function to plot distribution
def plot_distribution(variable):
    """Show a count plot of one column of the notebook-level `application` frame.

    Parameters
    ----------
    variable : str
        Name of the column of `application` whose values are counted; it is
        also used as the figure title.
    """
    fig, axis = plt.subplots(figsize=(20, 10))
    axis.set_title(variable)
    sns.countplot(x=variable, data=application, ax=axis)

    # Slant the category names so that long labels do not overlap
    #(Source: https://stackoverflow.com/questions/42528921/how-to-prevent-overlapping-x-axis-labels-in-sns-countplot#_=_)
    tick_labels = axis.get_xticklabels()
    axis.set_xticklabels(tick_labels, rotation=40, ha="right", fontsize=15)

    plt.tight_layout()
    plt.show()

In [None]:
# Plot distribution of OCCUPATION_TYPE
plot_distribution("OCCUPATION_TYPE")

Most of the clients are laborers. Very few of them are IT staff and HR staff.

In [None]:
# Plot distribution of NAME_INCOME_TYPE
plot_distribution("NAME_INCOME_TYPE")

Most of the clients earn their income by working. Very few of them are unemployed, students, businessmen or on maternity leave.

In [None]:
# Plot distribution of NAME_EDUCATION_TYPE
plot_distribution("NAME_EDUCATION_TYPE")

Most of the clients completed their secondary schools. Very few of them had academic degree.

In [None]:
# Plot distribution of NAME_FAMILY_STATUS
plot_distribution("NAME_FAMILY_STATUS")

Most of the clients are married. Very few of them did not state their marital status.

In [None]:
# Plot distribution of NAME_HOUSING_TYPE
plot_distribution("NAME_HOUSING_TYPE")

Most of the clients are living in a house or an apartment. Very few of them live in an office apartment / co-op apartment.

In [None]:
# Plot distribution of ORGANIZATION_TYPE
plot_distribution("ORGANIZATION_TYPE")

Most of the loan applications are from business entity 3.

### Check the imbalance

In [None]:
# View the data again to see which attribute to use when checking the imbalance
application.head(5)

Check the imbalance of the Target attribute

TARGET: Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases).

In [None]:
# Check the Target attribute
application['TARGET'].value_counts(normalize=True)*100

In [None]:
# Plot it on a pie chart

labels = '0 - all other cases', '1 - client with payment difficulties'
explode = (0, 0.1)  # only "explode" the 2nd slice (i.e. '1 - client with payment difficulties')

# value_counts() orders its result by frequency, not by class value, while
# `labels` and `explode` are written in class order (0, then 1). Reindexing by
# class keeps every slice paired with its own label whichever class is larger.
target_counts = application['TARGET'].value_counts().reindex([0, 1], fill_value=0)

fig1, ax1 = plt.subplots()
ax1.pie(target_counts, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Target Imbalance Distribution')
plt.show()
#(Source: https://matplotlib.org/stable/gallery/pie_and_polar_charts/pie_features.html)

The imbalance is high between Target values.

### Divide data into 2 sets

In [None]:
# Split the applications into two frames according to the TARGET value
is_defaulter = application['TARGET'].eq(1)
is_non_defaulter = application['TARGET'].eq(0)

target_1 = application.loc[is_defaulter]      # clients with payment difficulties (defaulters)
target_0 = application.loc[is_non_defaulter]  # all other cases (non-defaulters)

In [None]:
# View the data of target 1
target_1.head(5)

In [None]:
# View the data of target 0
target_0.head(5)

### Univariate Analysis (Categorical)

In [None]:
# Check all columns again to find categorical data
application.info(verbose=True)

**NAME_CONTRACT_TYPE**

In [None]:
# Check values of NAME_CONTRACT_TYPE
application.NAME_CONTRACT_TYPE.value_counts()

In [None]:
# Count plots of NAME_CONTRACT_TYPE: defaulters on the left, non-defaulters on the right
plt.figure(figsize=(10, 5))
contract_panels = [('Defaulters', target_1), ('Non-Defaulters', target_0)]
for panel_number, (group_name, group_frame) in enumerate(contract_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.title('NAME_CONTRACT_TYPE of ' + group_name)
    sns.countplot(x='NAME_CONTRACT_TYPE', data=group_frame)
plt.show()

Bar charts cannot show the difference between defaulters and non-defaulters clearly. Thus, we use pie charts instead.

In [None]:
# Function to plot distribution
def plot_univariate_pie(variable):
    """Draw side-by-side pie charts of one column for non-defaulters and defaulters.

    The values of `variable` are counted separately in the notebook-level
    frames `target_0` (non-defaulters, left pie) and `target_1` (defaulters,
    right pie), and each pie shows the resulting shares as percentages.

    Every category gets one fixed colour that is used in both pies. Each pie
    is drawn in its own frequency order, so without a shared mapping the same
    colour could stand for different categories in the two charts.

    Parameters
    ----------
    variable : str
        Name of a column present in both `target_0` and `target_1`.
    """
    counts_by_group = [
        ('Non-defaulter', target_0[variable].value_counts()),
        ('Defaulter', target_1[variable].value_counts()),
    ]

    # Category -> colour mapping shared by both pies (first-seen order, no duplicates)
    categories = list(dict.fromkeys(
        category for _, counts in counts_by_group for category in counts.index
    ))
    colour_of = dict(zip(categories, sns.color_palette('husl', len(categories))))

    fig, axes = plt.subplots(1, 2, figsize=(20, 20))
    for ax, (title, counts) in zip(axes, counts_by_group):
        ax.pie(counts,
               colors=[colour_of[category] for category in counts.index],
               autopct='%1.1f%%',
               shadow=True, startangle=90)
        ax.set_title(title)
        ax.legend(counts.index, loc="lower right")

    plt.show()

In [None]:
# Plot on a pie chart
plot_univariate_pie("NAME_CONTRACT_TYPE")

Compared with non-defaulters, defaulters have a higher percentage of revolving loans, while non-defaulters have a higher percentage of cash loans.

**GENDER**

In [None]:
# Plot gender on a pie chart
plot_univariate_pie("CODE_GENDER")

Female is the majority in both data sets, both Defaulters and non-Defaulters. The percentage of males in the Defaulter is more than that in the non-Defaulters.

**OCCUPATION TYPE**

In [None]:
# Count plots of OCCUPATION_TYPE: defaulters on top, non-defaulters below
plt.figure(figsize=(10, 10))

occupation_panels = [
    ('OCCUPATION_TYPE OF DEFAULTERS', target_1),
    ('OCCUPATION_TYPE OF NON-DEFAULTERS', target_0),
]
for row_number, (heading, group_frame) in enumerate(occupation_panels, start=1):
    plt.subplot(2, 1, row_number)
    plt.title(heading)
    ax = sns.countplot(x='OCCUPATION_TYPE', data=group_frame)

    # Slant the occupation names so that they stay readable
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right", fontsize=15)
    plt.tight_layout()

plt.show()

It is quite similar for both defaulters and non-defaulters. Most of the clients are laborers. Very few of them are IT staff and HR staff.

**FLAG_OWN_CAR**

FLAG_OWN_CAR: Flag if the client owns a car

In [None]:
# Plot car owner on a pie chart
plot_univariate_pie("FLAG_OWN_CAR")

The majority of the bank's clients do not own a car. The percentage of non-defaulters having a car is higher than that of defaulters.

**FLAG_OWN_REALTY**

FLAG_OWN_REALTY: Flag if client owns a house or flat

In [None]:
# Plot on a pie chart
plot_univariate_pie("FLAG_OWN_REALTY")

The majority of the bank's clients do own a house / flat. The percentages of non-defaulters and defaulters having a house / flat are quite similar.

**NAME_TYPE_SUITE**

NAME_TYPE_SUITE: Who was accompanying client when he was applying for the loan

In [None]:
# Plot on a pie chart
plot_univariate_pie("NAME_TYPE_SUITE")

Most of the clients have no one who accompanied them.

**INCOME SOURCE**

NAME_INCOME_TYPE: Client's income type (businessman, working, maternity leave, ...).

In [None]:
# Income sources of non-defaulters and defaulters (one pie chart per group)
plot_univariate_pie("NAME_INCOME_TYPE")

Defaulters have a higher percentage of income from working and from being a commercial associate, and a lower percentage of income from pensions, than non-defaulters do. Some non-defaulters have income from their own businesses or as students. Defaulters do not have income from those sources.

**NAME_EDUCATION_TYPE**

NAME_EDUCATION_TYPE: Level of highest education the client achieved

In [None]:
# Plot on a pie chart
plot_univariate_pie("NAME_EDUCATION_TYPE")

Comparing the percentages of education types between clients with payment difficulties and clients without, it can be observed that the former have a higher percentage of secondary education than the latter, while the latter have a higher percentage of higher education than the former.

**NAME_FAMILY_STATUS**

NAME_FAMILY_STATUS: Family status of the client.

In [None]:
# Plot on a pie chart
plot_univariate_pie("NAME_FAMILY_STATUS")

The percentage of non-defaulters who are married is higher than that of defaulters, while defaulters have a higher percentage of single people and people in a civil marriage.

**NAME_HOUSING_TYPE**

NAME_HOUSING_TYPE: What is the housing situation of the client (renting, living with parents, ...)

In [None]:
# Plot on a pie chart
plot_univariate_pie("NAME_HOUSING_TYPE")

The percentage of non-defaulters who have a house / apartment is higher than that of defaulters, while defaulters have a higher percentage of people living with their parents.

**AGE**

In [None]:
# Plot on a pie chart
plot_univariate_pie("YEARS_BIRTH_RANGE")

The distribution of age is quite similar between defaulters and non-defaulters: most people are middle-aged, and the fewest are seniors.

The percentages of non-defaulters in middle age and senior are higher than that of defaulters, while defaulters have higher percentages of adults and young adults.

**INCOME RANGE**

In [None]:
# Plot on a pie chart
plot_univariate_pie("AMT_INCOME_RANGE")

For both Defaulters and non-defaulters, most of people have a low income.

There is not much difference between defaulters and non-defaulters. The percentage of defaulters having low income is higher than that of non-defaulters.

**ORGANIZATION TYPE**

In [None]:
# Plot on a pie chart
plot_univariate_pie("ORGANIZATION_TYPE")

There is no major difference between defaulters and non-defaulters.

### Univariate Analysis (Continuous)

In [None]:
# Check all columns again to find numerical data
application.info(verbose=True)

In [None]:
# Function to plot univariate
def plot_univariate_displot(variable):
    """Compare one numeric column between defaulters and non-defaulters.

    Left panel: density-normalised histograms with KDE curves of `variable`
    for the notebook-level frames `target_1` (defaulters) and `target_0`
    (non-defaulters), overlaid on one axis. Right panel: one box plot per
    group.

    Parameters
    ----------
    variable : str
        Name of a numeric column present in both `target_0` and `target_1`.
    """
    # Drop missing values up front: a NaN in the data makes matplotlib's box
    # plot statistics NaN, which leaves that group's box empty.
    data_0 = target_0[variable].dropna()
    data_1 = target_1[variable].dropna()

    # Plot on 2 charts side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # histplot(kde=True, stat='density') is the documented replacement for the
    # deprecated sns.distplot. Colours are fixed per group so that the legend,
    # built from the `label` of each plot, cannot pair a name with the wrong
    # artist.
    ax1.set_title('Distribution Plot')
    sns.histplot(data_1, kde=True, stat='density', alpha=0.4,
                 color='C0', label='Defaulters', ax=ax1)
    sns.histplot(data_0, kde=True, stat='density', alpha=0.4,
                 color='C1', label='Non-Defaulters', ax=ax1)
    ax1.legend(loc="center right")

    # Box plots; tick labels are set after drawing because the `labels`
    # keyword of Axes.boxplot is deprecated in recent matplotlib releases.
    labels = ['Defaulters', 'Non-Defaulters']
    ax2.set_title('Box Plot')
    ax2.boxplot([data_1, data_0])
    ax2.set_xticklabels(labels)

    plt.show()

**Credit Amount**

In [None]:
# Distribution and Box plot for 'AMT_CREDIT'
plot_univariate_displot('AMT_CREDIT')

The distribution is quite similar between Defaulters and non-Defaulters. Therefore, from the boxplot, it can be seen that the medians of defaulter and non-defaulters are almost equal. The outliers of non-defaulters have higher values than defaulters.

**Loan Annuity**

In [None]:
# Distribution and Box plot for 'AMT_ANNUITY'
plot_univariate_displot('AMT_ANNUITY')

Similar to the credit amount, the distribution is quite similar between Defaulters and non-Defaulters. Therefore, from the boxplot, it can be seen that the medians of defaulter and non-defaulters are almost equal. The outliers of non-defaulters have higher values than defaulters.

**Goods Price**

For consumer loans it is the price of the goods for which the loan is given.

In [None]:
# Distribution and Box plot for 'AMT_GOODS_PRICE'
plot_univariate_displot('AMT_GOODS_PRICE')

Similar to the credit amount, the distribution is quite similar between Defaulters and non-Defaulters. Therefore, from the boxplot, it can be seen that the medians of defaulter and non-defaulters are almost equal. The outliers of non-defaulters have higher values than defaulters.

### Bivariate Analysis (Categorical vs Continuous Variables)

In [None]:
# Check all columns again to find variables
application.info(verbose=True)

In [None]:
# Function to boxplot bivariate
def boxplot_bivariate(variable_1, variable_2):
    """Box plots of a numeric column per category, for defaulters and non-defaulters.

    Draws two panels from the notebook-level frames `target_1` (defaulters,
    left) and `target_0` (non-defaulters, right). Both panels list the
    categories of `variable_1` in the same order so that boxes at the same
    x position can be compared directly.

    Parameters
    ----------
    variable_1 : str
        Categorical column placed on the x-axis.
    variable_2 : str
        Numeric column placed on the y-axis.
    """
    # One category order shared by both panels: the declared order for a
    # categorical column, otherwise the sorted distinct values of both groups.
    pooled = pd.concat([target_1[variable_1], target_0[variable_1]])
    if pooled.dtype.name == 'category':
        order = list(pooled.cat.categories)
    else:
        order = sorted(pooled.dropna().unique())

    plt.figure(figsize=(20, 5))
    plt.subplot(1, 2, 1)
    plt.title('Defaulters')
    sns.boxplot(x=variable_1, y=variable_2, data=target_1, order=order)

    plt.subplot(1, 2, 2)
    plt.title('Non-Defaulters')
    sns.boxplot(x=variable_1, y=variable_2, data=target_0, order=order)

    plt.show()

**Education & Amount of credit**

In [None]:
# Call the boxplot function
boxplot_bivariate('NAME_EDUCATION_TYPE','AMT_CREDIT')

Similar for both defaulters and non-defaulters, people with academic degree and high education have more credits.

**Family Status & Amount of credit**

In [None]:
# Call the boxplot function
boxplot_bivariate('NAME_FAMILY_STATUS','AMT_CREDIT')

Similar for both defaulters and non-defaulters, people who are married, civil married and separated have more credits.

**Income range & Amount of credit**

In [None]:
# Call the boxplot function
boxplot_bivariate('AMT_INCOME_RANGE','AMT_CREDIT')

Similar for both defaulters and non-defaulters, the higher the income is, the higher amount of credit is.

**Money-related variables together**

In [None]:
# Function to plot bivariates on heatmaps
def heatmaps_bivariate(list_values, index):
    """Heatmaps of per-group means for defaulters and non-defaulters.

    For each of the notebook-level frames `target_1` (defaulters, left) and
    `target_0` (non-defaulters, right), builds a pivot table holding the mean
    of every column in `list_values` for each value of `index`, and draws it
    as an annotated heatmap.

    Parameters
    ----------
    list_values : list of str
        Numeric columns to average.
    index : str
        Column whose values form the rows of the heatmap.
    """
    plt.figure(figsize=[20, 5])

    panels = (('Defaulters', target_1), ('Non-Defaulters', target_0))
    for position, (title, subset) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.title(title)
        # The string 'mean' gives the same result as np.mean without the
        # deprecation warning pandas raises for NumPy callables.
        res = pd.pivot_table(data=subset, values=list_values, index=index, aggfunc='mean')
        # fmt=',.1f' prints full numbers; the default '.2g' renders large
        # values in scientific notation (e.g. 6e+05). No `center` is passed:
        # the previous 0.117 lay far below every plotted value, so only half
        # of the diverging colormap was ever used.
        sns.heatmap(res, annot=True, fmt=',.1f', cmap="RdYlGn")

    plt.show()

In [None]:
# Heatmaps between Income Range and Amount of credit, annuity and goods price
heatmaps_bivariate(['AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE'],"AMT_INCOME_RANGE")

The heatmaps show that the average amounts of credit and the average amounts of goods price increase when the income increases. For the applications with very low income, the average amounts of credit, average amounts of annuity and average amounts of goods price of defaulters are higher than those of non-defaulters. For the applications with higher income ranges, average amounts of annuity and average amounts of goods price of defaulters are lower than those of non-defaulters.

**Time-related variables together**

In [None]:
# Heatmaps between age range and DAYS_EMPLOYED, YEARS_REGISTRATION and DAYS_ID_PUBLISH
heatmaps_bivariate(['DAYS_EMPLOYED', 'YEARS_REGISTRATION', 'DAYS_ID_PUBLISH'],"YEARS_BIRTH_RANGE")

The heatmaps show that the mean number of days the clients have been employed in their current positions before the application (DAYS_EMPLOYED) is highest in the group of seniors for both Defaulters and non-Defaulters.

For all age groups younger than seniors, the mean values of DAYS_EMPLOYED (the number of days the clients have been employed in their current positions before the application), DAYS_ID_PUBLISH (the number of days since the clients last updated their identity document before applying for the loan) and YEARS_REGISTRATION (the number of years since the clients last updated their registration before applying for the loan) of Non-Defaulters are always higher than those of Defaulters.

For the senior age group, the mean values of DAYS_EMPLOYED of the two sets of data are equal. The mean values of DAYS_ID_PUBLISH and YEARS_REGISTRATION of Non-Defaulters are higher than those of Defaulters.

### Bivariate Analysis (Categorical vs Categorical Variables)

In [None]:
# Check all columns again to find variables
application.info(verbose=True)

In [None]:
# Function to countplot categorical variables
def countplot_bivariate(variable_1,variable_2):
    """Draw two side-by-side count plots of ``variable_1`` coloured by ``variable_2``.

    The left panel ('Defaulters') is drawn from the notebook-level ``target_1``
    frame and the right panel ('Non-Defaulters') from ``target_0``.

    Parameters
    ----------
    variable_1 : str
        Column placed on the x axis.
    variable_2 : str
        Column used as the hue (one bar colour per category).
    """
    fig, (defaulters_ax, non_defaulters_ax) = plt.subplots(1, 2, figsize=(20,5))

    defaulters_ax.set_title('Defaulters')
    sns.countplot(x=variable_1,hue=variable_2,data=target_1,palette='Set1',ax=defaulters_ax)

    non_defaulters_ax.set_title('Non-Defaulters')
    sns.countplot(x=variable_1,hue=variable_2,data=target_0,palette='Set1',ax=non_defaulters_ax)

    plt.show()

**Contract & Gender**

In [None]:
# Call the countplot function
countplot_bivariate('NAME_CONTRACT_TYPE','CODE_GENDER')

For both Defaulters and non-Defaulters, there are more females having all types of contracts, including cash loans and revolving loans.

**Income range & Education**

In [None]:
# Call the countplot function
countplot_bivariate('NAME_EDUCATION_TYPE','AMT_INCOME_RANGE')

The majority of Defaulters and non-Defaulters just finished their secondary schools and have low income.

**Income range & Credit Range**

In [None]:
# Call the countplot function
countplot_bivariate('AMT_CREDIT_RANGE','AMT_INCOME_RANGE')

There is no significant insight from here.

### Bivariate Analysis (Continuous vs Continuous Variables)

In [None]:
# Check all columns again to find variables
application.info(verbose=True)

**A number of continuous variables together**

In [None]:
# Function to plot pairplot
def pairplot_bivariate(df,list_values):
    """Show a seaborn pair plot of the columns ``list_values`` of ``df``.

    Missing values are replaced with 0 before plotting, so rows with missing
    data appear as points at 0 on the affected axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    list_values : list of str
        Names of the columns to include in the pair plot.
    """
    sns.pairplot(df[list_values].fillna(0))
    plt.show()

In [None]:
#Pairplot for Defaulters
pairplot_bivariate(target_1,['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'YEARS_BIRTH', 'DAYS_EMPLOYED', 'YEARS_REGISTRATION'])

In [None]:
#Pairplot for Non Defaulters
pairplot_bivariate(target_0,['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'YEARS_BIRTH', 'DAYS_EMPLOYED', 'YEARS_REGISTRATION'])

From the pair plots above, it can be seen that there is a high correlation between credit amount and goods price. 

### Find correlation between different variables

#### Correlation for clients with payment difficulties (Defaulters)

In [None]:
# Check all columns again to find variables
application.info(verbose=True)

In [None]:
# Correlation between variables.
# Only numeric/bool columns are passed to corr(): since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead.
corr_target_1 = round(target_1.select_dtypes(include=['number', 'bool']).corr(),2)
corr_target_1

In [None]:
# Plot heatmap to identify the correlation between different variables in the dataset for Defaulters - Clients with payment difficulties
plt.figure(figsize = (18,6))
sns.heatmap(corr_target_1, annot = True, fmt='.2g',cmap= 'coolwarm')
plt.show()

In [None]:
# Remove duplicated pairs in the correlation table
def get_redundant_pairs(df):
    '''Return the label pairs on and below the diagonal of a correlation matrix.

    Each pair is ``(cols[i], cols[j])`` with ``j <= i``, where ``cols`` are the
    columns of ``df``. Dropping these keeps every variable pair exactly once and
    removes the self-correlations.
    '''
    names = df.columns
    return {
        (names[outer], names[inner])
        for outer in range(df.shape[1])
        for inner in range(outer + 1)
    }

def get_top_abs_correlations(df, n=10):
    '''Return the ``n`` largest absolute values of the correlation matrix ``df``.

    The matrix is flattened into a Series indexed by label pairs, the redundant
    pairs (diagonal and one triangle) are dropped, and the remaining values are
    sorted in descending order.
    '''
    ranked = (
        df.abs()
        .unstack()
        .drop(labels=get_redundant_pairs(df))
        .sort_values(ascending=False)
    )
    return ranked[0:n]

#(Source: https://stackoverflow.com/questions/17778394/list-highest-correlation-pairs-from-a-large-correlation-matrix-in-pandas)

In [None]:
print("The top 10 correlation pairs for Defaulters are:")
get_top_abs_correlations(corr_target_1, 10)

#### Top 10 Correlation for other clients (Non Defaulters)

In [None]:
# Correlation between variables.
# Only numeric/bool columns are passed to corr(): since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead.
corr_target_0 = round(target_0.select_dtypes(include=['number', 'bool']).corr(),2)
corr_target_0

In [None]:
# Plot heatmap to identify the correlation between different variables in the dataset for Non-Defaulters (corr_target_0)
plt.figure(figsize = (18,6))
sns.heatmap(corr_target_0, annot = True, fmt='.2g',cmap= 'coolwarm')
plt.show()

In [None]:
print("Top 10 correlation pairs for Non Defaulters are:")
get_top_abs_correlations(corr_target_0, 10)

In [None]:
print("Top 10 correlation pairs for Defaulters are:")
get_top_abs_correlations(corr_target_1, 10)

Top 10 correlation pairs are the same for both Defaulters and Non Defaulters.

# PREVIOUS APPLICATION DATA

## Read the Previous Application Data.

Read the Previous Application data file provided and store it in a dataframe `previous_application`.

In [None]:
# Read the csv file using 'read_csv'.
previous_application= pd.read_csv("../input/loan-defaulter/previous_application.csv")

##  Check the structure of the data

Inspect the dataframe for dimensions, null-values, and summary of different numeric columns:

In [None]:
# Check the number of rows and columns in the dataframe of previous applications
previous_application.shape

In [None]:
# Check the data of the top 5 rows of previous applications
previous_application.head()

Check the column-wise info of the dataframe:

In [None]:
# Types of all columns
previous_application.info(verbose=True)

In [None]:
# Count the number of null values in each column
previous_application.isnull().sum()

In [None]:
# Check if each column is null
previous_application.isnull().any()

In [None]:
# List of columns which have null values
previous_application.columns[previous_application.isnull().any()].tolist()

In [None]:
# Check the summary for the numeric columns 
previous_application.describe()

In [None]:
# Check the SK_ID_CURR column.
previous_application.SK_ID_CURR.value_counts()

-> One current loan can have 0,1,2 or more previous loan applications in Home Credit.

##  Data Quality check and missing values

### Identify missing data

In [None]:
# Percentage of null values in each column
null_percentage = previous_application.isnull().sum() * 100 / len(previous_application)

In [None]:
# Top columns have highest percentages of null values
null_percentage.sort_values(ascending = False).head(50)

From the list above, it can be seen that several columns have more than ~45% null values. Because they have a lot of missing values, we will remove them.

### Deal with missing data

In [None]:
# List all of the columns having null values more than ~50% (ie. 45%)
null_columns = null_percentage[null_percentage.values>45].index
null_columns

In [None]:
# Remove those columns
previous_application.drop(null_columns, axis=1, inplace=True)

In [None]:
# See the application data after dropping the columns having null values more than 45%
previous_application.head()

In [None]:
# List of remaining columns which have null values
null_percentage = previous_application.isnull().sum() * 100 / len(previous_application)
null_percentage[null_percentage>0].sort_values(ascending = False)

### Deal with invalid data

In [None]:
# Replace the placeholder strings 'XNA' and 'XAP' with NaN so they are treated as missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
previous_application=previous_application.replace(['XNA', 'XAP'], np.nan)

In [None]:
# See values of Contract Status
previous_application['NAME_CONTRACT_STATUS'].value_counts()

## Analysis

### Univariate Analysis (Categorical)

In [None]:
# Check all columns again to find categorical data
previous_application.info(verbose=True)

**NAME_CONTRACT_TYPE**

In [None]:
# Check values of NAME_CONTRACT_TYPE
previous_application.NAME_CONTRACT_TYPE.value_counts()

In [None]:
# Count plot of the contract types of previous applications
fig, ax = plt.subplots(figsize=(10,5))
ax.set_title('CONTRACT_TYPE of previous application')
sns.countplot(x='NAME_CONTRACT_TYPE',data=previous_application,ax=ax)

plt.show()

There are more applications for consumer loans and cash loans than revolving loans.

**NAME_CONTRACT_STATUS**

In [None]:
# Pie chart of the value counts of NAME_CONTRACT_STATUS
data = previous_application["NAME_CONTRACT_STATUS"].value_counts()
labels = data.index

fig, ax = plt.subplots(figsize=(10,5))
ax.pie(data, autopct='%1.1f%%', shadow=True, startangle=90)
ax.set_title('Contract statuses of previous applications')
ax.legend(labels, loc="lower right")

plt.show()

Most applications were approved. Only a very small percentage of applications were unused offers.

**REASONS FOR REJECTING APPLICATIONS**

In [None]:
# Pie chart of the value counts of CODE_REJECT_REASON
data = previous_application["CODE_REJECT_REASON"].value_counts()
labels = data.index

fig, ax = plt.subplots(figsize=(10,5))
ax.pie(data, autopct='%1.1f%%', shadow=True, startangle=90)
ax.set_title('Reject reasons of previous applications')
ax.legend(labels, loc="lower left")

plt.show()

"HC" was the reason why most of the previous applications were rejected.

**CLIENT TYPES**

Was the client old or new client when applying for the previous application.

In [None]:
# Pie chart of the value counts of NAME_CLIENT_TYPE
data = previous_application["NAME_CLIENT_TYPE"].value_counts()
labels = data.index

fig, ax = plt.subplots(figsize=(10,5))
ax.pie(data, autopct='%1.1f%%', shadow=True, startangle=90)
ax.set_title('Client types of previous applications')
ax.legend(labels, loc="lower left")

plt.show()

The majority of previous applications were from repeaters.

**TYPE OF PORTFOLIO**

In [None]:
# Pie chart of the value counts of NAME_PORTFOLIO
data = previous_application["NAME_PORTFOLIO"].value_counts()
labels = data.index

fig, ax = plt.subplots(figsize=(10,5))
ax.pie(data, autopct='%1.1f%%', shadow=True, startangle=90)
ax.set_title('PORTFOLIO TYPES of previous applications')
ax.legend(labels, loc="lower left")

plt.show()

Most of the previous applications were for POS.

**PAYMENT METHODS**

In [None]:
# Pie chart of the value counts of NAME_PAYMENT_TYPE
data = previous_application["NAME_PAYMENT_TYPE"].value_counts()
labels = data.index

fig, ax = plt.subplots(figsize=(10,5))
ax.pie(data, autopct='%1.1f%%', shadow=True, startangle=90)
ax.set_title('Payment Methods of previous applications')
ax.legend(labels, loc="lower left")

plt.show()

99% of clients paid cash through the bank.

**WEEK DAY**

In [None]:
# Pie chart of the value counts of WEEKDAY_APPR_PROCESS_START
data = previous_application["WEEKDAY_APPR_PROCESS_START"].value_counts()
labels = data.index

fig, ax = plt.subplots(figsize=(10,5))
ax.pie(data, autopct='%1.1f%%', shadow=True, startangle=90)
ax.set_title('On which day of the week did the client apply for previous application?')
ax.legend(labels, loc="lower left")

plt.show()

There were fewer applications on weekends than on weekdays.

**GOODS CATEGORY**

In [None]:
# Count plot of the goods categories of previous applications
fig, ax = plt.subplots(figsize=(10,5))
ax.set_title('What kind of goods did the client apply for in the previous application?')
sns.countplot(x="NAME_GOODS_CATEGORY",data=previous_application,palette='Set1',ax=ax)
# Rotate the category names so the long labels do not overlap
plt.xticks(rotation=90)
plt.show()

The majority of clients applied for mobiles, consumer electronics, computers, audio/video and furnitures.

### Univariate Analysis (Continuous)

In [None]:
# Check all columns again to find categorical data
previous_application.info(verbose=True)

In [None]:
# Function to plot univariate
def plot_univariate(variable):
    """Show a distribution plot and a box plot of one ``previous_application`` column.

    Parameters
    ----------
    variable : str
        Name of the column of the notebook-level ``previous_application`` frame to plot.
    """
    values = previous_application[variable]

    # Two panels side by side: distribution on the left, box plot on the right
    fig, (dist_ax, box_ax) = plt.subplots(1,2,figsize=(20,8))

    dist_ax.set_title('Distribution Plot')
    sns.distplot(values,ax=dist_ax)

    # The target axes is passed explicitly rather than relying on the current axes
    box_ax.set_title('Box Plot')
    sns.boxplot(x=values,ax=box_ax)

    plt.show()

**Credit Amount**

In [None]:
# Distribution and Box plot for 'AMT_CREDIT'
plot_univariate('AMT_CREDIT')

There were some outliers. Most of the amount of the credit was less than 500000.

**Annuity Amount**

In [None]:
# Distribution and Box plot for 'AMT_ANNUITY'
plot_univariate('AMT_ANNUITY')

Similar to the amount of credit, there were some outliers. Most of the amount of the annuity was less than 50000.

**Goods Price Amount**

In [None]:
# Distribution and Box plot for 'AMT_GOODS_PRICE'
plot_univariate('AMT_GOODS_PRICE')

Similar to the credit amount, there were some outliers. Most of the goods prices were less than 500000.

# Analyse Application data together with Previous Application data

## Merge Application data together with Previous Application data

In [None]:
# Check Shape of Application data
application.shape

In [None]:
# Check all columns of Application data
application.head()

In [None]:
# Check Shape of Previous Application data
previous_application.shape

In [None]:
# Check all columns of Previous Application data
previous_application.head()

=> Notice that some columns of Application and Previous Application have the same names.

=> Change names of those columns in Previous Application data.

In [None]:
# Append the suffix "_PREV" to every column name of Previous Application data.
# NOTE: running this cell more than once appends the suffix again.
previous_application.columns = previous_application.columns + "_PREV"

In [None]:
# Check all columns of Previous Application data
previous_application.head()

In [None]:
# Restore the original names of the two ID columns (SK_ID_PREV and SK_ID_CURR),
# which also received the "_PREV" suffix; SK_ID_CURR is the merge key used below
previous_application.rename({'SK_ID_PREV_PREV': 'SK_ID_PREV', 'SK_ID_CURR_PREV': 'SK_ID_CURR'}, axis=1, inplace=True)

In [None]:
# Check all columns of Previous Application data
previous_application.head()

In [None]:
# Merge application and previous application data on 'SK_ID_CURR'
data_merge = application.merge(previous_application,on='SK_ID_CURR',how='inner')
data_merge.shape

In [None]:
# Check all columns again to find categorical data
data_merge.info(verbose=True)

## Analysis

### Univariate Analysis (Categorical)

**Previous CONTRACT_STATUS of Defaulters vs Non-Defaulters**

In [None]:
# Function to plot distribution
def plot_univariate_pie_merge(variable):
    """Draw two pie charts of the value counts of ``variable`` in ``data_merge``.

    The left chart ('Defaulters') uses the rows with TARGET == 1 and the right
    chart ('Non-Defaulters') uses the rows with TARGET == 0.

    Parameters
    ----------
    variable : str
        Name of the column of the notebook-level ``data_merge`` frame to plot.
    """
    fig, axes = plt.subplots(1,2,figsize=(20,20))
    panels = ((1, 'Defaulters'), (0, 'Non-Defaulters'))

    for ax, (target_value, panel_title) in zip(axes, panels):
        counts = data_merge[data_merge['TARGET']==target_value][variable].value_counts()
        ax.pie(counts, autopct='%1.1f%%',
               shadow=True, startangle=90)
        ax.set_title(panel_title)
        ax.legend(counts.index, loc="lower right")

    plt.show()

In [None]:
# Plot on a pie chart
plot_univariate_pie_merge("NAME_CONTRACT_STATUS_PREV")

The percentage of applications from defaulters being refused is higher than that of the non-defaulters.

The percentage of applications from non-defaulters being approved is higher than that of the defaulters.

**NAME_CONTRACT_TYPE**

In [None]:
# Check values of NAME_CONTRACT_TYPE
data_merge.NAME_CONTRACT_TYPE_PREV.value_counts()

In [None]:
# Plot on a pie chart
plot_univariate_pie_merge("NAME_CONTRACT_TYPE_PREV")

The percentages of applications from defaulters for cash loans and revolving loans were higher than those of the non-defaulters.

The percentage of applications from non-defaulters for consumer loans was higher than that of the defaulters.

**CLIENT TYPES**

Was the client old or new client when applying for the previous application.

In [None]:
# Plot on a pie chart
plot_univariate_pie_merge("NAME_CLIENT_TYPE_PREV")

The percentages of defaulters previous applications from new and repeaters clients were higher than those of the non-defaulters.

The percentage of non-defaulters previous applications from refreshed clients was higher than those of the defaulters.

**NAME_CASH_LOAN_PURPOSE**

In [None]:
# Plot on a pie chart
plot_univariate_pie_merge("NAME_CASH_LOAN_PURPOSE_PREV")

The percentage of defaulters' previous applications in which the client refused to name the goal was higher than that of the non-defaulters.

**PAYMENT METHODS**

In [None]:
# Plot on a pie chart
plot_univariate_pie_merge("NAME_PAYMENT_TYPE_PREV")

99% of clients paid cash through the bank.
The previous payment methods of defaulters and non-defaulters were similar.

### Univariate Analysis (Continuous)

In [None]:
# Check all columns again to find categorical data
data_merge.info(verbose=True)

In [None]:
# Function to plot univariate
def plot_univariate_displot(variable):
    """Overlay the distribution plots of ``variable`` for defaulters and non-defaulters.

    The values are taken from the notebook-level ``data_merge`` frame, split on
    TARGET == 1 ('Defaulters') and TARGET == 0 ('Non-Defaulters').

    Parameters
    ----------
    variable : str
        Name of the ``data_merge`` column to plot.
    """
    non_defaulter_values = data_merge.loc[data_merge['TARGET']==0, variable]
    defaulter_values = data_merge.loc[data_merge['TARGET']==1, variable]

    plt.figure(figsize=(10,5))
    plt.title('Distribution Plot')

    # Defaulters are drawn first, so the legend entries follow the same order
    sns.distplot(defaulter_values,label='Defaulters')
    sns.distplot(non_defaulter_values,label='Non-Defaulters')
    plt.legend(['Defaulters', 'Non-Defaulters'], loc="center right")

    plt.show()

**Credit Amount**

In [None]:
# Distribution and Box plot for 'AMT_CREDIT'
plot_univariate_displot('AMT_CREDIT_PREV')

There were some outliers. Most of the credit amounts were less than 500000. The pattern is quite similar for defaulters and non-defaulters.

**Application Amount**

For how much credit did client ask on the previous application?

In [None]:
# Distribution and Box plot for 'AMT_APPLICATION'
plot_univariate_displot('AMT_APPLICATION_PREV')

Similar to the amount of credit, there were some outliers in the amount the clients applied for. The pattern is quite similar for defaulters and non-defaulters.

**Goods Price Amount**

In [None]:
# Distribution and Box plot for 'AMT_GOODS_PRICE'
plot_univariate_displot('AMT_GOODS_PRICE_PREV')

Similar to the credit amount, there were some outliers. Most of the goods prices were less than 500000. The pattern is quite similar for defaulters and non-defaulters.

### Bivariate Analysis (Categorical vs Categorical Variables)

In [None]:
# Check all columns of the merged data again to find variables
# (the *_PREV columns used below exist only in data_merge, not in application)
data_merge.info(verbose=True)

In [None]:
# Function to countplot categorical variables
def countplot_bivariate(variable_1,variable_2):
    """Draw side-by-side count plots of ``variable_1`` coloured by ``variable_2`` for the merged data.

    This definition replaces the earlier ``countplot_bivariate``: the rows are taken
    from the notebook-level ``data_merge`` frame, split on TARGET == 1
    ('Defaulters', left) and TARGET == 0 ('Non-Defaulters', right).

    Parameters
    ----------
    variable_1 : str
        Column placed on the x axis.
    variable_2 : str
        Column used as the hue.
    """
    non_defaulters = data_merge[data_merge['TARGET']==0]
    defaulters = data_merge[data_merge['TARGET']==1]
    panels = (('Defaulters', defaulters), ('Non-Defaulters', non_defaulters))

    plt.figure(figsize=(20,5))
    for position, (panel_title, frame) in enumerate(panels, start=1):
        plt.subplot(1,2,position)
        plt.title(panel_title)
        sns.countplot(x=variable_1,hue=variable_2,data=frame,palette='Set1')

    plt.show()

**Previous Contract & Gender**

In [None]:
# Call the countplot function
countplot_bivariate('NAME_CONTRACT_TYPE_PREV','CODE_GENDER')

Similar to current loans, for both Defaulters and non-Defaulters, there are more females having all types of previous contracts.

**Contract types & statuses**

In [None]:
# Call the countplot function
countplot_bivariate('NAME_CONTRACT_STATUS_PREV','NAME_CONTRACT_TYPE_PREV')

There is no significant insight from here.

### Bivariate Analysis (Categorical vs Continuous Variables)

In [None]:
# Check all columns again to find variables
data_merge.info(verbose=True)

In [None]:
# Function to boxplot bivariate
def boxplot_bivariate(variable_1,variable_2):
    """Draw side-by-side box plots of ``variable_2`` grouped by ``variable_1`` for the merged data.

    The rows are taken from the notebook-level ``data_merge`` frame, split on
    TARGET == 1 ('Defaulters', left) and TARGET == 0 ('Non-Defaulters', right).

    Parameters
    ----------
    variable_1 : str
        Column placed on the x axis (the grouping variable).
    variable_2 : str
        Column placed on the y axis (the values summarised by each box).
    """
    non_defaulters = data_merge[data_merge['TARGET']==0]
    defaulters = data_merge[data_merge['TARGET']==1]

    fig, (defaulters_ax, non_defaulters_ax) = plt.subplots(1,2,figsize=(20,5))

    defaulters_ax.set_title('Defaulters')
    sns.boxplot(x=variable_1,y=variable_2,data=defaulters,ax=defaulters_ax)

    non_defaulters_ax.set_title('Non-Defaulters')
    sns.boxplot(x=variable_1,y=variable_2,data=non_defaulters,ax=non_defaulters_ax)

    plt.show()

**Previous Contract Status & Amount of credit**

In [None]:
# Call the boxplot function
boxplot_bivariate('NAME_CONTRACT_STATUS_PREV','AMT_CREDIT_PREV')

Similar for both defaulters and non-defaulters, applications being refused had higher credits.

**Previous Contract Status & Amount of credit clients asked for**

In [None]:
# Call the boxplot function
boxplot_bivariate('NAME_CONTRACT_STATUS_PREV','AMT_APPLICATION_PREV')

Similar for both defaulters and non-defaulters, applications being refused had higher credits.

**Money-related variables together**

In [None]:
# Split the merged data on TARGET; these two frames are reused by the cells below
target_0 = data_merge[data_merge['TARGET']==0]
target_1 = data_merge[data_merge['TARGET']==1]

# Heatmaps of the mean previous-application amounts per income range,
# one panel for each of the two frames
amount_columns = ['AMT_APPLICATION_PREV','AMT_CREDIT_PREV', 'AMT_ANNUITY_PREV', 'AMT_GOODS_PRICE_PREV']
panels = [('Defaulters', target_1), ('Non-Defaulters', target_0)]

plt.figure(figsize=[20,5])

for position, (panel_title, frame) in enumerate(panels, start=1):
    plt.subplot(1,2,position)
    plt.title(panel_title)
    res = pd.pivot_table(data=frame,values=amount_columns,index="AMT_INCOME_RANGE",aggfunc=np.mean)
    sns.heatmap(res,annot=True,cmap="RdYlGn",center=0.117)

plt.show()

Similar to the current application data, the heatmaps show that the average amounts of credit and the average amounts of goods price increase when the income increases. For the applications with very low income, the average amounts of credit, average amounts of annuity and average amounts of goods price of defaulters are higher than those of non-defaulters. For the applications with higher income ranges, average amounts of annuity and average amounts of goods price of defaulters are lower than those of non-defaulters.

### Bivariate Analysis (Continuous vs Continuous Variables)

In [None]:
# Check all columns again to find variables
data_merge.info(verbose=True)

**A number of continuous variables together**

In [None]:
#Pairplot for Defaulters
pair = target_1[['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'AMT_ANNUITY_PREV', 'AMT_APPLICATION_PREV', 'AMT_CREDIT_PREV','AMT_GOODS_PRICE_PREV']].fillna(0)
sns.pairplot(pair)
plt.show()

In [None]:
#Pairplot for Non Defaulters
pair = target_0[['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'AMT_ANNUITY_PREV', 'AMT_APPLICATION_PREV', 'AMT_CREDIT_PREV','AMT_GOODS_PRICE_PREV']].fillna(0)
sns.pairplot(pair)
plt.show()

From the pair plots above, it can be seen that there is a high correlation between credit amount and goods price. Similarly, there are a high correlation between previous credit amount and previous goods price and a high correlation between previous credit applied by clients and previous goods price.

### Find correlation between different variables

#### Correlation for clients with payment difficulties (Defaulters)

In [None]:
# Correlation between variables of the merged data.
# Only numeric/bool columns are passed to corr(): since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead.
corr_target_1 = round(target_1.select_dtypes(include=['number', 'bool']).corr(),2)
corr_target_1

In [None]:
# Plot heatmap to identify the correlation between different variables in the dataset for Defaulters - Clients with payment difficulties
plt.figure(figsize = (18,6))
sns.heatmap(corr_target_1, annot = True, fmt='.2g',cmap= 'coolwarm')
plt.show()

In [None]:
print("The top 10 correlation pairs for Defaulters are:")
get_top_abs_correlations(corr_target_1, 10)

#### Top 10 Correlation for other clients (Non Defaulters)

In [None]:
# Correlation between variables of the merged data.
# Only numeric/bool columns are passed to corr(): since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead.
corr_target_0 = round(target_0.select_dtypes(include=['number', 'bool']).corr(),2)
corr_target_0

In [None]:
# Plot heatmap to identify the correlation between different variables in the merged dataset for Non-Defaulters (corr_target_0)
plt.figure(figsize = (18,6))
sns.heatmap(corr_target_0, annot = True, fmt='.2g',cmap= 'coolwarm')
plt.show()

In [None]:
print("Top 10 correlation pairs for Non Defaulters are:")
get_top_abs_correlations(corr_target_0, 10)

In [None]:
print("Top 10 correlation pairs for Defaulters are:")
get_top_abs_correlations(corr_target_1, 10)

Top 10 correlation pairs are the same for both Defaulters and Non Defaulters.

# Conclusion - Driving factors

##  Loan Clients in general:

* Laborers occupation
* Secondary education
* Married people
* Middle age
* Low income
* Applied the loans for goods price less than 2,000,000

## Defaulters

* More clients with low income
* More adults and young adults
* More single people
* Secondary education
* More ‘Refused’ previous applications
* More ‘Revolving Loans’ previous applications

## Non-Defaulters

* More clients with high income
* More middle-age clients and seniors
* More married people
* Higher education
* More ‘approved’ previous applications
* More ‘Consumer Loans' previous applications