#### 1. filter out warnings and import required libraries

In [2]:
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting/data libraries —
# consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


#### 2. Load application data and previous data

In [4]:
# Load the current-application data set. The raw-string prefix only stops backslashes in the
# path from being treated as escape sequences; it does not prevent FileNotFoundError. The path
# is relative, so the CSV has to be in the working directory.
app_data = pd.read_csv(r"application_data.csv")

FileNotFoundError: [Errno 2] File application_data.csv does not exist: 'application_data.csv'

In [None]:
# Load the previous-application data set (relative path, resolved against the working directory).
prev_app = pd.read_csv(r"previous_application.csv")

#### 3. check the data for data types and null values

In [None]:
# Print the per-column summary (dtype and non-null count); verbose=True lists every column.
app_data.info(verbose=True)

In [None]:
# Show the null count of every column without truncating the output.
# The fully qualified option name is used because the bare 'max_rows' pattern matches more than
# one option in recent pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_rows', None)
app_data.isnull().sum()

In [None]:
# Shape of the subset whose CODE_GENDER is the placeholder 'XNA' (first element = number of rows)

app_data[app_data['CODE_GENDER']=='XNA'].shape

In [None]:
# Shape of the subset whose ORGANIZATION_TYPE is the placeholder 'XNA' (first element = number of rows)

app_data[app_data['ORGANIZATION_TYPE']=='XNA'].shape

In [None]:
# Frequency of each CODE_GENDER value
app_data['CODE_GENDER'].value_counts()

In [None]:
# Replace the placeholder gender 'XNA' with 'F', then re-check the counts.
#
# Rationale (from the original analysis): there are almost twice as many females as males, so
# assigning the 'XNA' rows to 'F' should not affect the analysis. These rows could equally have
# been deleted, because 'XNA' makes up only roughly 0.001 % of the entire data set.

xna_gender = app_data['CODE_GENDER'] == 'XNA'
app_data.loc[xna_gender, 'CODE_GENDER'] = 'F'
app_data['CODE_GENDER'].value_counts()


For organisation type, the data being categorical, it is difficult to impute the missing values with a mean, median or mode. At the same time, the XNA values make up only 18.07 % of the data, so deleting those rows should not materially affect the analysis.

In [None]:
# Drop the rows whose ORGANIZATION_TYPE is the placeholder 'XNA',
# then confirm that none are left (the first element of the shape should be 0).

xna_rows = app_data.index[app_data['ORGANIZATION_TYPE'] == 'XNA']
app_data = app_data.drop(index=xna_rows)
app_data[app_data['ORGANIZATION_TYPE'] == 'XNA'].shape

In [None]:
# Bucket AMT_INCOME_TOTAL into 20,000-wide ranges up to 400,000 plus one open-ended top range.
#
# The labels are derived from the bin edges so the two cannot drift apart. The earlier
# hand-written list omitted '140000-160000', which shifted every later label one bin up and
# named the open-ended bin (income above 400,000) '420000 and above'.

bins = list(range(0, 400001, 20000)) + [10000000000]

# One 'low-high' label per closed range, then a label for the open-ended last bin.
slot = [f'{low}-{high}' for low, high in zip(bins[:-2], bins[1:-1])]
slot.append('400000 and above')

app_data['AMT_INCOME_RANGE'] = pd.cut(app_data['AMT_INCOME_TOTAL'], bins, labels=slot)

In [None]:
# Bucket AMT_CREDIT: one wide first range (0-150,000), 50,000-wide ranges up to 900,000,
# and an open-ended top range.

bins = [0, *range(150000, 900001, 50000), 1000000000]

# 'low-high' labels for the closed ranges, followed by the label of the open-ended bin.
slots = [f'{low}-{high}' for low, high in zip(bins[:-2], bins[1:-1])]
slots.append('900000 and above')

app_data['AMT_CREDIT_RANGE'] = pd.cut(app_data['AMT_CREDIT'], bins=bins, labels=slots)

In [None]:
# Preview the first rows of the application data
app_data.head()

#### 4. Condense the data set by deleting the columns/variables that are not relevant.
We will be keeping the following variables:
 - SK_ID_CURR
 - TARGET
 - NAME_CONTRACT_TYPE
 - CODE_GENDER
 - FLAG_OWN_CAR
 - FLAG_OWN_REALTY
 - CNT_CHILDREN
 - AMT_INCOME_TOTAL
 - AMT_CREDIT
 - AMT_ANNUITY
 - AMT_GOODS_PRICE
 - NAME_INCOME_TYPE
 - NAME_EDUCATION_TYPE
 - NAME_FAMILY_STATUS
 - NAME_HOUSING_TYPE
 - DAYS_BIRTH
 - OCCUPATION_TYPE
 - CNT_FAM_MEMBERS
 - ORGANIZATION_TYPE
 - OBS_30_CNT_SOCIAL_CIRCLE
 - DEF_30_CNT_SOCIAL_CIRCLE
 - OBS_60_CNT_SOCIAL_CIRCLE
 - DEF_60_CNT_SOCIAL_CIRCLE
 

In [None]:
# Condensed data set: the columns listed above plus the two derived range columns.
app_data_cnd = app_data.loc[:, [
    "SK_ID_CURR",
    "TARGET",
    "NAME_CONTRACT_TYPE",
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "CNT_CHILDREN",
    "AMT_INCOME_TOTAL",
    "AMT_CREDIT",
    "AMT_ANNUITY",
    "AMT_GOODS_PRICE",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "DAYS_BIRTH",
    "OCCUPATION_TYPE",
    "CNT_FAM_MEMBERS",
    "ORGANIZATION_TYPE",
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "DEF_30_CNT_SOCIAL_CIRCLE",
    "OBS_60_CNT_SOCIAL_CIRCLE",
    "DEF_60_CNT_SOCIAL_CIRCLE",
    "AMT_INCOME_RANGE",
    "AMT_CREDIT_RANGE",
]]

#### 5. Create data frames for Target = 1 and Target = 0

In [None]:
# Split the condensed data into one frame per TARGET value.
target0_app = app_data_cnd[app_data_cnd["TARGET"] == 0]
target1_app = app_data_cnd[app_data_cnd["TARGET"] == 1]

In [None]:
# Preview the first rows of the TARGET == 0 subset
target0_app.head()

In [None]:
# (rows, columns) of the TARGET == 0 subset
target0_app.shape

In [None]:
# (rows, columns) of the TARGET == 1 subset
target1_app.shape

In [None]:
# Imbalance ratio (a ratio, not a percentage): number of TARGET == 0 rows per TARGET == 1 row,
# rounded to two decimals.

# The original analysis treats target0 as the majority class and target1 as the minority.

round(len(target0_app)/len(target1_app),2)

In [None]:
# Sort the TARGET == 0 rows by income range — presumably so the plot below lists the ranges
# in ascending order.
target0_app = target0_app.sort_values(by='AMT_INCOME_RANGE')

In [None]:
# Number of applications per income range for Target = 0.
#
# A count plot draws exactly one bar per category, in category order. plt.hist would instead
# spread the 21 income ranges over its default 10 bins, merging neighbouring ranges into a
# single bar that no longer lines up with the tick labels.

plt.figure(figsize=[12,3])
sns.countplot(x='AMT_INCOME_RANGE', data=target0_app)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Sort the TARGET == 1 rows by income range — presumably so the plot below lists the ranges
# in ascending order.
target1_app = target1_app.sort_values(by='AMT_INCOME_RANGE')

In [None]:
# Number of applications per income range for Target = 1.
#
# A count plot draws exactly one bar per category, in category order. plt.hist would instead
# spread the 21 income ranges over its default 10 bins, merging neighbouring ranges into a
# single bar that no longer lines up with the tick labels.

plt.figure(figsize=[12,3])
sns.countplot(x='AMT_INCOME_RANGE', data=target1_app)
plt.xticks(rotation=45)
plt.show()

Distribution by Income Type

In [None]:
# Distribution of NAME_INCOME_TYPE among Target = 0 applications

plt.figure(figsize=(8, 3))
plt.hist(target0_app["NAME_INCOME_TYPE"])
plt.xticks(rotation=45)
plt.show()

In [None]:
# Distribution of NAME_INCOME_TYPE among Target = 1 applications

plt.figure(figsize=(8, 3))
plt.hist(target1_app["NAME_INCOME_TYPE"])
plt.xticks(rotation=45)
plt.show()

Distribution by Contract Type

In [None]:
# Distribution of NAME_CONTRACT_TYPE among Target = 0 applications

plt.figure(figsize=(4, 4))
plt.hist(target0_app["NAME_CONTRACT_TYPE"])
plt.xticks(rotation=45)
plt.show()

In [None]:
# Distribution of NAME_CONTRACT_TYPE among Target = 1 applications

plt.figure(figsize=(4, 4))
plt.hist(target1_app["NAME_CONTRACT_TYPE"])
plt.xticks(rotation=45)
plt.show()

Outlier

In [None]:
# Pair plot of the Target = 0 subset (shown under the "Outlier" heading, so presumably
# used to spot outliers by eye).
sns.pairplot(target0_app)
# The module is imported as lowercase `plt`; the previous `Plt.show()` raised NameError.
plt.show()

In [None]:
# Pair plot of the Target = 1 subset
sns.pairplot(target1_app)

Correlation

In [None]:
# Spearman correlation between the numeric columns, computed separately for target 0 and 1.
#
# The first two columns of each frame are skipped, and only numeric dtypes are kept: the frames
# also hold string and categorical columns, and recent pandas versions raise on those inside
# .corr() instead of silently leaving them out.

target0_corr = target0_app.iloc[:, 2:].select_dtypes(include='number')
target1_corr = target1_app.iloc[:, 2:].select_dtypes(include='number')

target0 = target0_corr.corr(method='spearman')
target1 = target1_corr.corr(method='spearman')

In [None]:
# Display the Spearman correlation matrix for target 0

target0

In [None]:
# Display the Spearman correlation matrix for target 1

target1

In [None]:
# Preview the first rows of the (now sorted) TARGET == 0 subset
target0_app.head()

In [None]:
# Plot the correlations computed above as heat maps, which makes them easy to visualize.


def targets_corr(data, title):
    """Draw ``data`` as an annotated heat map titled ``title`` and show it.

    Args:
        data: 2-D data accepted by ``sns.heatmap``; in this notebook, a
            correlation matrix.
        title: Text placed above the plot.

    Note:
        Title size and padding are set through ``plt.rcParams``, so they remain
        in effect for figures created after this call.
    """
    # Title styling is applied through the global matplotlib settings.
    plt.rcParams.update({'axes.titlesize': 25, 'axes.titlepad': 70})

    plt.figure(figsize=(15, 10))

    # Red-yellow-green colour map, with each cell annotated with its value.
    sns.heatmap(data, annot=True, cmap="RdYlGn")

    plt.title(title)
    plt.yticks(rotation=0)
    plt.show()

In [None]:
# Heat map of the target 0 correlation matrix

targets_corr(data=target0,title='Correlation for target 0')

In [None]:
# Heat map of the target 1 correlation matrix

targets_corr(data=target1,title='Correlation for target 1')