# Distribution of non-categorical features

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import gc
from matplotlib.pyplot import figure

In [2]:
# Load the training data from its parquet file.
df_train = pd.read_parquet(
    "../input/amex-parquet/train_data.parquet"
)

In [3]:
# Columns removed before the distribution analysis. Per the original note, the
# EDA + data-wrangling step gives the details for these choices, and the
# categorical features are removed as well.
df_train1 = df_train.drop(
    columns=[
        'B_29', 'D_82', 'D_75', 'D_77', 'D_143', 'D_119', 'D_74', 'D_141',
        'D_104', 'S_24', 'S_7', 'B_15', 'B_33', 'B_37', 'B_23', 'B_11', 'S_2', 'B_30',
        'B_38', 'D_63', 'D_64', 'D_66', 'D_68', 'D_114', 'D_116', 'D_117', 'D_120',
        'D_126', 'customer_ID',
    ]
)

In [4]:
# Summarise the remaining columns, their dtypes and the frame's memory usage.
df_train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Columns: 162 entries, P_2 to target
dtypes: float32(160), int64(2)
memory usage: 3.4 GB


In [5]:
## Distribution of features for 'target=1' and 'target=0'
# Only the plotting style is configured here. No figure is created up front:
# show_kdeplots builds its own figure with plt.subplots, so a figure opened
# here would stay empty and never be drawn on.
plt.style.use('Solarize_Light2')
def show_kdeplots(letter, figsize):
    """Draw a grid of per-class KDE plots for features whose name starts with ``letter``.

    Reads the module-level ``df_train1`` frame. Every selected feature gets its
    own subplot with one filled density curve per ``target`` value (hue order
    1, then 0). The grid is five subplots wide.

    Parameters
    ----------
    letter : str
        Column-name prefix selecting the features to plot (e.g. ``'D'``).
    figsize : tuple
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Select features by prefix only. 'target' is appended explicitly as the hue
    # column rather than being picked up through a 't' prefix and assumed to be
    # the last entry of the list.
    feature_cols = [c for c in df_train1.columns
                    if c.startswith(letter) and c != 'target']
    if not feature_cols:
        print("No columns start with", repr(letter), "- nothing to plot.")
        return

    df_tmp = df_train1[feature_cols + ['target']]
    plt_cols = 5
    # Size the grid from the feature count alone so that 'target' never adds
    # an extra, empty row.
    plt_rows = math.ceil(len(feature_cols) / plt_cols)

    # squeeze=False keeps `axes` two-dimensional even when there is one row.
    fig, axes = plt.subplots(plt_rows, plt_cols, figsize=figsize, squeeze=False)
    for i, ax in enumerate(axes.ravel()):
        if i >= len(feature_cols):
            # Trailing cells of the grid have no feature: hide them instead of
            # leaving empty frames in the figure.
            ax.set_visible(False)
            continue
        # NOTE(review): `label` is kept from the original call; with
        # legend=False it is not shown anywhere - confirm whether a legend
        # was intended.
        sns.kdeplot(x=feature_cols[i], hue='target', hue_order=[1,0], label=['Default','Paid'], data=df_tmp,
                    fill=True, linewidth=2, legend=False, ax=ax)
        ax.tick_params(left=False, bottom=False, labelsize=5)
        ax.xaxis.get_label().set_fontsize(10)
        ax.set_ylabel('')

    sns.despine(fig=fig, bottom=True, trim=True)
    plt.tight_layout(rect=[0, 0.2, 1, 0.99])
    plt.show()

<Figure size 1440x720 with 0 Axes>

In [6]:
# Overlapping area of the normalized histograms of 'target=0' and 'target=1'
def return_overlapping_area(df, col, bins=50):
    """Return how much the per-class histograms of ``col`` overlap.

    ``col`` is cut into ``bins`` equal-width intervals with ``pd.cut``. For each
    ``target`` value the bin counts are divided by that class's number of
    non-missing observations, and the smaller of the class shares is summed
    over all bins. Identical class distributions give 1.0; distributions that
    share no bin give 0.0. Missing values in ``col`` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'target'`` column and the column ``col``.
    col : str
        Name of the numeric feature to evaluate.
    bins : int or sequence, default 50
        Forwarded to ``pd.cut``; the default reproduces the original behaviour.

    Returns
    -------
    float
        Sum over bins of the minimum per-class share.
    """
    binned = pd.cut(df[col], bins)

    # observed=False keeps every (target, bin) pair, and fill_value=0 makes an
    # empty pair count as 0 rather than NaN. A NaN would be skipped by
    # min(axis=1) below and wrongly inflate the overlap.
    counts = (
        df.groupby(['target', binned], observed=False)
        .size()
        .unstack(level=0, fill_value=0)   # rows: bins, columns: target values
    )

    # Count only the column of interest; counting the whole frame just to read
    # one column is needlessly expensive on wide data.
    totals = df.groupby('target')[col].count()

    shares = counts / totals              # normalize each class to sum to 1
    return_result = shares.min(axis=1).sum()

    del binned, counts, totals, shares
    gc.collect()

    return return_result

In [7]:
# Uncomment to draw the KDE grid for the 'D' features (left disabled in this run):
#show_kdeplots('D', (15,30))

In [8]:
# Overlap score for every feature whose name starts with 'D',
# ordered from the highest overlap to the lowest.
D = [name for name in df_train1.columns if name.startswith('D')]
over_lapping_D = [return_overlapping_area(df_train1, name) for name in D]
df_D = pd.DataFrame({'D': D, 'over_lapping_D': over_lapping_D})
df_D = df_D.sort_values(by='over_lapping_D', ascending=False)

In [9]:
# Show the 20 'D' features with the highest overlap score.
df_D.head(20)

Unnamed: 0,D,over_lapping_D
36,D_87,1.0
20,D_61,0.999981
59,D_123,0.999981
23,D_69,0.999927
47,D_106,0.99987
22,D_65,0.999415
72,D_137,0.998253
50,D_109,0.998057
9,D_49,0.996691
70,D_135,0.995603


In [10]:
# Overlap score for every feature whose name starts with 'P',
# ordered from the highest overlap to the lowest.
P = [name for name in df_train1.columns if name.startswith('P')]
over_lapping_P = [return_overlapping_area(df_train1, name) for name in P]
df_P = pd.DataFrame({'P': P, 'over_lapping_P': over_lapping_P})
df_P = df_P.sort_values(by='over_lapping_P', ascending=False)
df_P

Unnamed: 0,P,over_lapping_P
2,P_4,0.808585
1,P_3,0.73993
0,P_2,0.360163


In [11]:
# Overlap score for every feature whose name starts with 'B'.
B = [name for name in df_train1.columns if name.startswith('B')]
over_lapping_B = [return_overlapping_area(df_train1, name) for name in B]
df_B = pd.DataFrame({'B': B, 'over_lapping_B': over_lapping_B})
# Display only: the sorted view is shown, df_B itself keeps its original order.
df_B.sort_values(by='over_lapping_B', ascending=False)

Unnamed: 0,B,over_lapping_B
29,B_40,0.999946
9,B_10,0.999892
5,B_6,0.999852
10,B_12,0.999772
23,B_27,0.99973
11,B_13,0.999699
22,B_26,0.999547
4,B_5,0.998508
18,B_21,0.993434
25,B_31,0.992295


In [12]:
# Overlap score for every feature whose name starts with 'S'.
S = [name for name in df_train1.columns if name.startswith('S')]
over_lapping_S = [return_overlapping_area(df_train1, name) for name in S]
df_S = pd.DataFrame({'S': S, 'over_lapping_S': over_lapping_S})
# Display only: the sorted view is shown, df_S itself keeps its original order.
df_S.sort_values(by='over_lapping_S', ascending=False)

Unnamed: 0,S,over_lapping_S
15,S_23,0.999924
14,S_22,0.999792
1,S_5,0.999671
6,S_12,0.999332
12,S_19,0.998704
17,S_26,0.998193
11,S_18,0.99628
9,S_16,0.993683
10,S_17,0.990192
13,S_20,0.965843
