In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the applications table from a CSV in the current working directory.
df = pd.read_csv("application.csv")

## Bivariate Analysis

In [4]:
## Occupation type
# Overall mean of TARGET, shown first as a baseline for the per-occupation means.
df['TARGET'].mean()
# Per OCCUPATION_TYPE: count of SK_ID_CURR and mean of TARGET, as a flat table.
(
    df.groupby(['OCCUPATION_TYPE'])
    .agg(counts=('SK_ID_CURR', 'count'), avg_default=('TARGET', 'mean'))
    .reset_index()
)

0.08155567117585848

Unnamed: 0,OCCUPATION_TYPE,counts,avg_default
0,Accountants,513,0.052632
1,Cleaning staff,218,0.091743
2,Cooking staff,307,0.071661
3,Core staff,1405,0.051957
4,Drivers,932,0.121245
5,HR staff,26,0.153846
6,High skill tech staff,536,0.070896
7,IT staff,31,0.0
8,Laborers,2783,0.113906
9,Low-skill Laborers,113,0.176991


In [5]:
from scipy.stats import chi2_contingency
def chisquare(df, var1, var2):
    """Run a chi-square test of independence between two columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both columns.
    var1, var2 : str
        Column names; ``var1`` supplies the rows and ``var2`` the columns
        of the contingency table.

    Returns
    -------
    The result of ``scipy.stats.chi2_contingency`` applied to the
    ``pd.crosstab`` of the two columns (statistic, p-value, degrees of
    freedom and expected frequencies).
    """
    observed = pd.crosstab(index=df[var1], columns=df[var2])
    return chi2_contingency(observed)

In [6]:
# Chi-square test of independence: OCCUPATION_TYPE vs. TARGET.
chisquare(df, var1='OCCUPATION_TYPE', var2='TARGET')

Chi2ContingencyResult(statistic=118.048498061163, pvalue=3.6263791906785435e-17, dof=17, expected_freq=array([[4.68050876e+02, 4.49491238e+01],
       [1.98898813e+02, 1.91011871e+01],
       [2.80100622e+02, 2.68993782e+01],
       [1.28189373e+03, 1.23106275e+02],
       [8.50338044e+02, 8.16619559e+01],
       [2.37218768e+01, 2.27812323e+00],
       [4.89035613e+02, 4.69643867e+01],
       [2.82837761e+01, 2.71622386e+00],
       [2.53915319e+03, 2.43846806e+02],
       [1.03098926e+02, 9.90107405e+00],
       [9.73509327e+02, 9.34906727e+01],
       [3.95060486e+02, 3.79395138e+01],
       [1.21346523e+02, 1.16534765e+01],
       [2.82837761e+01, 2.71622386e+00],
       [1.48809158e+03, 1.42908423e+02],
       [5.29180328e+01, 5.08196721e+00],
       [3.08384398e+02, 2.96156020e+01],
       [5.38304127e+01, 5.16958734e+00]]))

In [7]:
def bivariate_cat(df, var1, var2, id_col='SK_ID_CURR'):
    """Summarise ``var2`` within each level of the categorical ``var1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    var1 : str
        Column to group by. Rows whose ``var1`` is missing are left out,
        because ``groupby`` drops NaN keys by default.
    var2 : str
        Column whose per-group mean is reported as ``avg_default``.
    id_col : str, default 'SK_ID_CURR'
        Column whose non-null values are counted to produce ``counts``.
        It was previously hard-coded; the default keeps existing calls
        unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per level of ``var1`` with columns ``var1``, ``counts``
        and ``avg_default``.
    """
    summary = (
        df.groupby([var1])
        .agg(counts=(id_col, 'count'), avg_default=(var2, 'mean'))
        .reset_index()
    )
    return summary

In [8]:
# FLAG_OWN_CAR
# Count and mean TARGET for each FLAG_OWN_CAR level.
bivariate_cat(df, var1='FLAG_OWN_CAR', var2='TARGET')

Unnamed: 0,FLAG_OWN_CAR,counts,avg_default
0,N,10179,0.084488
1,Y,5197,0.075813


In [9]:
# Chi-square test of independence: FLAG_OWN_CAR vs. TARGET.
chisquare(df, var1='FLAG_OWN_CAR', var2='TARGET')

Chi2ContingencyResult(statistic=3.3415004792077827, pvalue=0.06755298030701706, dof=1, expected_freq=array([[9348.8448231,  830.1551769],
       [4773.1551769,  423.8448231]]))

In [10]:
## DAYS_BIRTH

# Negate DAYS_BIRTH, convert days to years (365 per year) and round to the
# nearest whole year. Presumably DAYS_BIRTH is stored as a negative day
# count, which is why the sign is flipped -- confirm against the data dictionary.
df['age'] = (-df['DAYS_BIRTH'] / 365).round(0)
bivariate_cat(df, var1='age', var2='TARGET')

Unnamed: 0,age,counts,avg_default
0,21.0,24,0.125
1,22.0,97,0.154639
2,23.0,169,0.183432
3,24.0,186,0.134409
4,25.0,208,0.129808
5,26.0,221,0.090498
6,27.0,335,0.092537
7,28.0,461,0.097614
8,29.0,345,0.084058
9,30.0,426,0.126761


In [11]:
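# Two types of groupings are used for numeric variables in this notebook:
# 1. Ethical grouping - hand-picked ranges, as in the age buckets below
#    (the quantile-based "Mathematical Bucketing" is used for later variables)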
def age_buckets(x):
    """Map an age in years to a ten-year bucket label.

    Parameters
    ----------
    x : number
        Age in years.

    Returns
    -------
    str or float
        '20-30' for ages below 30, '30-40' for [30, 40), '40-50' for
        [40, 50), '50-60' for [50, 60) and '60+' otherwise. A missing age
        (NaN/None) returns NaN rather than a label.
    """
    # NaN fails every `<` comparison, so without this guard a missing age
    # would fall through to the last branch and be reported as '60+'.
    if pd.isna(x):
        return np.nan
    if x < 30:
        return '20-30'
    if x < 40:
        return '30-40'
    if x < 50:
        return '40-50'
    if x < 60:
        return '50-60'
    return '60+'

# Label every row with its age bucket, then show how many rows land in each.
df['age_buckets'] = df['age'].map(age_buckets)
df['age_buckets'].value_counts()

age_buckets
30-40    4109
40-50    3873
50-60    3367
20-30    2046
60+      1981
Name: count, dtype: int64

In [12]:
# The per-bucket summary goes to the system clipboard (nothing is displayed
# for it); the chi-square result for age bucket vs. TARGET is the cell output.
bivariate_cat(df, var1='age_buckets', var2='TARGET').to_clipboard()
chisquare(df, var1='age_buckets', var2='TARGET')

Chi2ContingencyResult(statistic=100.29109108665699, pvalue=8.52852827970991e-21, dof=4, expected_freq=array([[1879.13709677,  166.86290323],
       [3773.88774714,  335.11225286],
       [3557.13488554,  315.86511446],
       [3092.40205515,  274.59794485],
       [1819.4382154 ,  161.5617846 ]]))

In [13]:
# OCCUPATION_TYPE, FLAG_OWN_CAR, age

# Future analysis - Look at Drivers, Laborers and Sales Staff vs Age




In [14]:
# AMT_ANNUITY
# Mathematical Bucketing: split into 10 quantile bins, stored as integer bin
# codes (labels=False) instead of interval labels.
df['amt_annuity_buckets'] = pd.qcut(df['AMT_ANNUITY'], q=10, labels=False)
# Summary table goes to the clipboard; the chi-square result is displayed.
bivariate_cat(df, var1='amt_annuity_buckets', var2='TARGET').to_clipboard()
chisquare(df, var1='amt_annuity_buckets', var2='TARGET')

Chi2ContingencyResult(statistic=46.208385821005265, pvalue=5.50873683039202e-07, dof=9, expected_freq=array([[1412.55921951,  125.44078049],
       [1437.35707317,  127.64292683],
       [1386.84292683,  123.15707317],
       [1411.64078049,  125.35921951],
       [1414.39609756,  125.60390244],
       [1409.80390244,  125.19609756],
       [1412.55921951,  125.44078049],
       [1411.64078049,  125.35921951],
       [1411.64078049,  125.35921951],
       [1412.55921951,  125.44078049]]))

In [15]:
from scipy.stats import f_oneway

# One AMT_ANNUITY series per distinct TARGET value.
groups = []
for group in df['TARGET'].unique():
    groups.append(df.loc[df['TARGET'].eq(group), 'AMT_ANNUITY'])

# The same split spelled out for the two TARGET values used below:
# group1 holds TARGET == 1 rows, group2 holds TARGET == 0 rows.
group1 = df.loc[df['TARGET'].eq(1), 'AMT_ANNUITY']
group2 = df.loc[df['TARGET'].eq(0), 'AMT_ANNUITY']

In [16]:
# Missing-value counts per group, checked before running the ANOVA.
group1.isna().sum()
group2.isna().sum()

0

1

In [17]:
# One-way ANOVA of AMT_ANNUITY between the TARGET == 1 and TARGET == 0 groups.
# Missing values are dropped from BOTH groups: a NaN in either input would
# otherwise propagate into the F statistic, and cleaning only group2 relied
# on a manual null check that may not hold after a data refresh.
f_oneway(group1.dropna(), group2.dropna())

F_onewayResult(statistic=1.495929384772008, pvalue=0.22131750484058052)

In [35]:
# AMT_INCOME_TOTAL: one-way ANOVA of income between the two TARGET groups.
# `chisquare` (and its chi2_contingency import) is already defined in an
# earlier cell and is not used here, so the identical re-definition that
# used to sit at the top of this cell has been removed.
group1 = df.loc[df['TARGET'] == 1, 'AMT_INCOME_TOTAL']
group2 = df.loc[df['TARGET'] == 0, 'AMT_INCOME_TOTAL']
# Drop missing values from both groups so a NaN in either one cannot turn
# the F statistic into NaN.
f_oneway(group1.dropna(), group2.dropna())

F_onewayResult(statistic=8.688196083251267, pvalue=0.003207586837755278)

In [37]:
# REGION_POPULATION_RELATIVE
# Mathematical Bucketing: split into 5 quantile bins, stored as integer bin
# codes (labels=False) instead of interval labels.
df['REGION_POPULATION_RELATIVE_buckets'] = pd.qcut(
    df['REGION_POPULATION_RELATIVE'], q=5, labels=False
)
# Summary table goes to the clipboard; the chi-square result is displayed.
bivariate_cat(df, var1='REGION_POPULATION_RELATIVE_buckets', var2='TARGET').to_clipboard()
chisquare(df, var1='REGION_POPULATION_RELATIVE_buckets', var2='TARGET')

Chi2ContingencyResult(statistic=36.69464151773973, pvalue=2.0819997287379061e-07, dof=4, expected_freq=array([[2846.25897503,  252.74102497],
       [2823.29786681,  250.70213319],
       [2923.40829865,  259.59170135],
       [2900.44719043,  257.55280957],
       [2628.58766909,  233.41233091]]))

In [39]:
from scipy.stats import f_oneway

# One REGION_POPULATION_RELATIVE series per distinct TARGET value.
groups = []
for group in df['TARGET'].unique():
    groups.append(df.loc[df['TARGET'].eq(group), 'REGION_POPULATION_RELATIVE'])

# group1 holds TARGET == 1 rows, group2 holds TARGET == 0 rows.
group1 = df.loc[df['TARGET'].eq(1), 'REGION_POPULATION_RELATIVE']
group2 = df.loc[df['TARGET'].eq(0), 'REGION_POPULATION_RELATIVE']

In [43]:
# Repeats the REGION_POPULATION_RELATIVE bucket summary (sent to the clipboard)
# and the chi-square test against TARGET (displayed).
bivariate_cat(df, var1='REGION_POPULATION_RELATIVE_buckets', var2='TARGET').to_clipboard()
chisquare(df, var1='REGION_POPULATION_RELATIVE_buckets', var2='TARGET')

Chi2ContingencyResult(statistic=36.69464151773973, pvalue=2.0819997287379061e-07, dof=4, expected_freq=array([[2846.25897503,  252.74102497],
       [2823.29786681,  250.70213319],
       [2923.40829865,  259.59170135],
       [2900.44719043,  257.55280957],
       [2628.58766909,  233.41233091]]))

In [41]:
# REGION_POPULATION_RELATIVE: one-way ANOVA between the two TARGET groups.
# `chisquare` (and its chi2_contingency import) is already defined in an
# earlier cell and is not used here, so the identical re-definition that
# used to sit at the top of this cell has been removed.
group1 = df.loc[df['TARGET'] == 1, 'REGION_POPULATION_RELATIVE']
group2 = df.loc[df['TARGET'] == 0, 'REGION_POPULATION_RELATIVE']
# Drop missing values from both groups so a NaN in either one cannot turn
# the F statistic into NaN.
f_oneway(group1.dropna(), group2.dropna())

F_onewayResult(statistic=27.054286209685028, pvalue=2.0036043163930077e-07)

In [47]:
# AMT_CREDIT
# Mathematical Bucketing: split into 10 quantile bins, stored as integer bin
# codes (labels=False) instead of interval labels.
df['AMT_CREDIT_buckets'] = pd.qcut(df['AMT_CREDIT'], q=10, labels=False)
# Summary table goes to the clipboard; the chi-square result is displayed.
bivariate_cat(df, var1='AMT_CREDIT_buckets', var2='TARGET').to_clipboard()
chisquare(df, var1='AMT_CREDIT_buckets', var2='TARGET')

Chi2ContingencyResult(statistic=52.151740799174874, pvalue=4.231250871613609e-08, dof=9, expected_freq=array([[1491.55359001,  132.44640999],
       [1452.9789282 ,  129.0210718 ],
       [1294.08805931,  114.91194069],
       [1418.99648803,  126.00351197],
       [1403.38293444,  124.61706556],
       [1417.15959938,  125.84040062],
       [1408.89360042,  125.10639958],
       [1582.47957856,  140.52042144],
       [1242.6551769 ,  110.3448231 ],
       [1409.81204475,  125.18795525]]))