Manually Reduce Number of Categories In Each Feature

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
%matplotlib inline

from matplotlib import cm
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from feature_processing import (create_contingency_table,
                                calculate_conditional_prob_bin,
                                encode_my_categorical_labels,
                                calculate_conditional_prob_cat,
                                estimate_cond_prob_density,
                                bin_myFeature)
import feature_analysis as fa

  from pandas.core import datetools


In [2]:
from importlib import reload

In [4]:
# Load the training set; expects train.csv in the current working directory.
train = pd.read_csv('train.csv', header=0)

# Get different kinds of features
# Pick predictor columns by name instead of by position. `train.columns[:-1]`
# only works when the label is the last column; otherwise it silently drops the
# real last feature and leaves 'id'/'target' inside `other_fs`.
# errors='ignore' keeps this working if either column is absent.
all_fs = train.columns.drop(['id', 'target'], errors='ignore')
binary_fs = sorted(f for f in all_fs if '_bin' in f)
categorical_fs = sorted(f for f in all_fs if '_cat' in f)
bincat_fs = binary_fs + categorical_fs
# Everything that is neither tagged '_bin' nor '_cat'.
other_fs = sorted(f for f in all_fs if f not in bincat_fs)

# Column subsets used by the per-feature analysis cells.
categoricals = train[categorical_fs]
binaries = train[binary_fs]

In [5]:
# Report how many distinct levels each categorical feature takes.
for feature_name in categorical_fs:
    distinct_levels = categoricals[feature_name].unique()
    print('%s: # unique values: %d' % (feature_name, len(distinct_levels)))

ps_car_01_cat: # unique values: 13
ps_car_02_cat: # unique values: 3
ps_car_03_cat: # unique values: 3
ps_car_04_cat: # unique values: 10
ps_car_05_cat: # unique values: 3
ps_car_06_cat: # unique values: 18
ps_car_07_cat: # unique values: 3
ps_car_08_cat: # unique values: 2
ps_car_09_cat: # unique values: 6
ps_car_10_cat: # unique values: 3
ps_car_11_cat: # unique values: 104
ps_ind_02_cat: # unique values: 5
ps_ind_04_cat: # unique values: 3
ps_ind_05_cat: # unique values: 8


In [73]:
# Re-import the feature_analysis module (bound to `fa`) so that edits made to
# its source file are picked up without restarting the kernel.
reload(fa)

<module 'feature_analysis' from '/home/ryohayama/python_current/porto_seguro/feature_analysis.py'>

In [7]:
# Class label column, kept on its own for the per-feature comparisons below.
target = train['target']

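## Selection Criteria
- For each category, calculate the ratio of the conditional probability of being in class 1 (given that category) to the overall (bulk) class 1 frequency. The tables below print this ratio minus 1, i.e. the relative gain or loss.
- Create new categories by binning the original categories based on those ratios.
- Binning criteria: <5% gain, 5-10% gain, 10-20% gain, 20% and above, etc. The cut-offs are subjective.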

In [13]:
# ps_car_01_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_car_01_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
-1     -0.291950  7.718207
 0     -0.007702  0.203609
 1     -0.010763  0.284528
 2     -0.006707  0.177322
 3     -0.001143  0.030216
 4     -0.002816  0.074449
 5     -0.002904  0.076780
 6      0.006937 -0.183402
 7      0.007968 -0.210636
 8     -0.007213  0.190687
 9     -0.018653  0.493136
 10    -0.000548  0.014491
 11    -0.005327  0.140834


6 Groups: (-1), (0, 1), (2, 8, 11), (3, 4, 5, 10), (6, 7), (9)

In [27]:
# ps_car_02_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_car_02_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
-1      0.037826 -1.000000
 0     -0.013554  0.358323
 1      0.002777 -0.073409


2 Groups: (-1, 1), (0)

In [28]:
# ps_car_03_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_car_03_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
-1      0.003882 -0.102636
 0     -0.003207  0.084782
 1     -0.012299  0.325132


No change needed

In [29]:
# ps_car_04_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_car_04_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
0       0.003110 -0.082222
1      -0.012845  0.339585
2      -0.011162  0.295077
3      -0.015687  0.414705
4       0.015265 -0.403550
5      -0.036440  0.963361
6      -0.023379  0.618062
7      -0.059237  1.566023
8      -0.014221  0.375965
9      -0.026404  0.698037


5 Groups: (0, 4), (1, 2, 3, 8), (5), (6, 9), (7)

In [30]:
# ps_car_05_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_car_05_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
-1      0.004918 -0.130016
 0     -0.003802  0.100501
 1     -0.004158  0.109913


2 Groups: (-1), (0, 1)

In [31]:
# ps_car_06_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_car_06_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
0       0.003191 -0.084366
1       0.002348 -0.062082
2      -0.030630  0.809764
3      -0.001794  0.047430
4       0.002694 -0.071226
5      -0.031807  0.840864
6      -0.000366  0.009675
7      -0.002446  0.064663
8      -0.034204  0.904248
9      -0.019258  0.509120
10     -0.007947  0.210081
11      0.004907 -0.129715
12     -0.012630  0.333889
13     -0.021326  0.563796
14      0.002235 -0.059096
15     -0.015230  0.402641
16     -0.009059  0.239502
17     -0.028418  0.751279


5 Groups: (0, 1, 4, 11, 14), (2, 5, 8, 17), (3, 6, 7), (9, 13, 15), (10, 12, 16)

In [32]:
# ps_car_07_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_car_07_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
-1     -0.043292  1.144500
 0     -0.015296  0.404365
 1      0.001745 -0.046123


No change needed

In [33]:
# ps_car_08_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_car_08_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
0      -0.008807  0.232824
1       0.001777 -0.046986


No change needed (only two levels, so this is effectively a binary feature)

In [34]:
# ps_car_09_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_car_09_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
-1     -0.057019  1.507397
 0      0.003221 -0.085151
 1     -0.021274  0.562421
 2      0.000134 -0.003535
 3      0.001394 -0.036852
 4     -0.015415  0.407512


3 Groups: (-1), (0, 2, 3), (1, 4)

In [35]:
# ps_car_10_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_car_10_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
0       0.002142 -0.056634
1      -0.000017  0.000439
2      -0.003451  0.091233


No change needed

<b>Come back to ps_car_11_cat later because it has 104 categories</b>

In [37]:
# ps_ind_02_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_ind_02_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
-1     -0.154364  4.080872
 1      0.000750 -0.019834
 2     -0.001756  0.046420
 3     -0.000799  0.021114
 4     -0.004497  0.118882


4 Groups: (-1), (1), (2, 3), (4)

In [38]:
# ps_ind_04_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_ind_04_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
-1     -0.374804  9.908572
 0      0.001720 -0.045470
 1     -0.002279  0.060259


No change needed

In [39]:
# ps_ind_05_cat -- print the first element of the 'ratio'-mode deviation result,
# shifted by -1 so that a ratio of exactly 1 is shown as 0.
print(
    fa.FeatureVsTarget(categoricals['ps_ind_05_cat'], target)
    .calculate_deviation(mode='ratio')[0]
    - 1
)

target         0         1
-1     -0.048644  1.285999
 0      0.002680 -0.070857
 1     -0.011808  0.312162
 2     -0.040556  1.072180
 3     -0.006798  0.179715
 4     -0.016204  0.428372
 5     -0.012523  0.331071
 6     -0.023754  0.627984


5 Groups: (-1, 2), (0), (1, 4, 5), (3), (6)