In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns
from matplotlib import cm
import statsmodels.api as sm

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

%matplotlib inline

  from pandas.core import datetools


In [5]:
train = pd.read_csv('./train.csv')
bincat = pd.read_csv('./data/class_associated_bincat_features.csv')
ordinal = pd.read_csv('./data/class_associated_ordinal_features.csv')
data = pd.concat([bincat, ordinal, train.target], axis=1)
target = data.target

In [55]:
len(train.columns)

59

In [7]:
num_bincat_features = bincat.shape[1]
num_ordinal_features = ordinal.shape[1]
num_features = data.shape[1]

print('Total # features: %d' % num_features)
print('# bincat features: %d' % num_bincat_features)
print('# ordinal features: %d' % num_ordinal_features)

Total # features: 144
# bincat features: 132
# ordinal features: 11


In [12]:
bincat_features = bincat.columns
ordinal_features = ordinal.columns

In [21]:
bincat_features

Index(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
       'ps_ind_12_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin',
       'ps_car_01_cat_NaN', 'ps_car_01_cat_0',
       ...
       'ps_ind_04_cat_0', 'ps_ind_04_cat_1', 'ps_ind_05_cat_NaN',
       'ps_ind_05_cat_0', 'ps_ind_05_cat_1', 'ps_ind_05_cat_2',
       'ps_ind_05_cat_3', 'ps_ind_05_cat_4', 'ps_ind_05_cat_5',
       'ps_ind_05_cat_6'],
      dtype='object', length=132)

In [20]:
# There should be no (1, 1) pair
significance_level = 0.05
table = sm.stats.Table.from_data(bincat.iloc[:, 0:2])
rslt = table.test_nominal_association()
pval = rslt.pvalue
if pval < significance_level:
    text = '  NOT  independent'
else:
    text = 'likely independent'
#print('%s-%s pair is %s (p-value=%.3f)' % (bincat_features[0], bincat_features[1], text, pval))
print(table.table_orig)

ps_ind_07_bin       0       1
ps_ind_06_bin                
0              207863  152989
1              234360       0


In [44]:
important_features = pd.read_csv('./data/important_features1.csv')
important_features = sorted(important_features.iloc[:,0].tolist())

In [46]:
print(len(important_features))
important_features

127


['ps_car_01_cat_0',
 'ps_car_01_cat_1',
 'ps_car_01_cat_11',
 'ps_car_01_cat_4',
 'ps_car_01_cat_5',
 'ps_car_01_cat_6',
 'ps_car_01_cat_7',
 'ps_car_01_cat_8',
 'ps_car_01_cat_9',
 'ps_car_01_cat_NaN',
 'ps_car_02_cat_0',
 'ps_car_03_cat_0',
 'ps_car_03_cat_1',
 'ps_car_03_cat_NaN',
 'ps_car_04_cat_0',
 'ps_car_04_cat_1',
 'ps_car_04_cat_2',
 'ps_car_04_cat_6',
 'ps_car_04_cat_7',
 'ps_car_04_cat_8',
 'ps_car_04_cat_9',
 'ps_car_05_cat_0',
 'ps_car_05_cat_1',
 'ps_car_05_cat_NaN',
 'ps_car_06_cat_0',
 'ps_car_06_cat_1',
 'ps_car_06_cat_10',
 'ps_car_06_cat_11',
 'ps_car_06_cat_12',
 'ps_car_06_cat_13',
 'ps_car_06_cat_14',
 'ps_car_06_cat_15',
 'ps_car_06_cat_16',
 'ps_car_06_cat_17',
 'ps_car_06_cat_2',
 'ps_car_06_cat_4',
 'ps_car_06_cat_5',
 'ps_car_06_cat_8',
 'ps_car_06_cat_9',
 'ps_car_07_cat_0',
 'ps_car_07_cat_1',
 'ps_car_07_cat_NaN',
 'ps_car_08_cat_0',
 'ps_car_09_cat_0',
 'ps_car_09_cat_1',
 'ps_car_09_cat_4',
 'ps_car_09_cat_NaN',
 'ps_car_11_cat_10',
 'ps_car_11_cat_100'

In [53]:
features = ['ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
             'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
             'ps_car_09_cat', 'ps_car_11_cat',
             'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15',
             'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_06_bin', 'ps_ind_07_bin',
             'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_12_bin', 'ps_ind_14',
             'ps_ind_15', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin',
             'ps_reg_01', 'ps_reg_02', 'ps_reg_03']

In [54]:
len(features)

29

In [52]:
significance_level = 0.05

for i in range(10):
    f1 = important_features[0]
    f2 = important_features[i]
    table = sm.stats.Table.from_data(bincat.loc[:, [f1, f2]])
    rslt = table.test_nominal_association()
    pval = rslt.pvalue
    if pval < significance_level:
        text = 'NOT independent'
    else:
        text = 'Independent'
    print('==== %s vs %s: %s (p-val=%.3f) ====' % (f1, f2, text, pval))
    print(table.table_orig)

==== ps_car_01_cat_0 vs ps_car_01_cat_0: NOT independent (p-val=0.000) ====
ps_car_01_cat_0       0     1
ps_car_01_cat_0              
0                589308     0
1                     0  5904
==== ps_car_01_cat_0 vs ps_car_01_cat_1: NOT independent (p-val=0.000) ====
ps_car_01_cat_1       0     1
ps_car_01_cat_0              
0                587941  1367
1                  5904     0
==== ps_car_01_cat_0 vs ps_car_01_cat_11: NOT independent (p-val=0.000) ====
ps_car_01_cat_11       0       1
ps_car_01_cat_0                 
0                 381735  207573
1                   5904       0
==== ps_car_01_cat_0 vs ps_car_01_cat_4: NOT independent (p-val=0.000) ====
ps_car_01_cat_4       0      1
ps_car_01_cat_0               
0                563134  26174
1                  5904      0
==== ps_car_01_cat_0 vs ps_car_01_cat_5: NOT independent (p-val=0.000) ====
ps_car_01_cat_5       0      1
ps_car_01_cat_0               
0                571166  18142
1                  5904      0