In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
%matplotlib inline

from importlib import reload
from matplotlib import cm

from sklearn.decomposition import PCA

import feature_analysis as fa
import ps.joint_probability as psjp

import warnings
warnings.simplefilter("ignore", category=PendingDeprecationWarning)
warnings.simplefilter("ignore", category=DeprecationWarning)

  from pandas.core import datetools


In [2]:
# Read the training table; row 0 of the CSV supplies the column names.
train = pd.read_csv('train.csv', header=0)

# Everything from the third column onward is treated as a feature
# (presumably the first two columns are the row id and the target -- confirm).
all_fs = train.columns[2:]

# Partition the feature names by the marker substring they contain.
binary_fs = sorted(name for name in all_fs if '_bin' in name)
categorical_fs = sorted(name for name in all_fs if '_cat' in name)
other_fs = sorted(
    name for name in all_fs
    if name not in binary_fs and name not in categorical_fs
)

target = train['target']

In [3]:
# Base rate of the target: sum of `target` divided by the number of rows
# (the positive-class frequency, assuming `target` is coded 0/1 -- confirm).
num_samples, num_target = len(target), np.sum(target)
freq_target = num_target / num_samples
freq_target

0.036447517859182946

### ind_bin

In [4]:
# Features whose name carries both the '_bin' and the '_ind' marker,
# kept in the column order of the training table.
ind_bins_fs = [name for name in all_fs
               if '_bin' in name and '_ind' in name]
ind_bins = train[ind_bins_fs]

ind_bins.columns

Index(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
       'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin'],
      dtype='object')

In [5]:
# Re-import the local `ps.joint_probability` module so that edits made to it on
# disk are picked up without restarting the kernel.
reload(psjp)

<module 'ps.joint_probability' from '/home/ryohayama/python_current/porto_seguro/ps/joint_probability.py'>

In [6]:
# Build an optimizer over the ind_* binary feature frame. verbose=True is
# requested; the progress lines printed by later calls presumably come from it.
jpopt = psjp.JointProbabilityOptimizer(ind_bins, verbose=True)

In [7]:
# Inspect the feature names held by the optimizer built from `ind_bins`.
jpopt.feature_list_

['ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin']

In [8]:
%%time
# Run the elimination pass against the target. Per the printed log, the full
# feature set is evaluated once ("Original") and then once with each feature
# left out. `res` is indexed as res[0] / res[1] when the table is built below.
res = jpopt.selectByElimination(target)

 0/11: Processing Original
 1/11: Processing without ps_ind_06_bin
 2/11: Processing without ps_ind_07_bin
 3/11: Processing without ps_ind_08_bin
 4/11: Processing without ps_ind_09_bin
 5/11: Processing without ps_ind_10_bin
 6/11: Processing without ps_ind_11_bin
 7/11: Processing without ps_ind_12_bin
 8/11: Processing without ps_ind_13_bin
 9/11: Processing without ps_ind_16_bin
10/11: Processing without ps_ind_17_bin
11/11: Processing without ps_ind_18_bin
CPU times: user 1min 30s, sys: 711 ms, total: 1min 31s
Wall time: 1min 31s


In [9]:
# Tabulate the elimination results, largest importance first.
(
    pd.DataFrame({'exp_val': res[0], 'importance_': res[1]})
    .sort_values(by='importance_', ascending=False)
)

Unnamed: 0,exp_val,importance_
ps_ind_17_bin,855.039591,0.01223528
ps_ind_12_bin,861.914458,0.004293249
ps_ind_13_bin,861.985986,0.004210617
ps_ind_11_bin,863.484227,0.002479809
ps_ind_10_bin,863.597897,0.002348495
ps_ind_16_bin,864.160795,0.00169822
ps_ind_18_bin,864.57118,0.001224132
ps_ind_07_bin,865.630826,2.220446e-16
ps_ind_06_bin,865.630826,1.110223e-16
initial,865.630826,0.0


In [10]:
# Work on a copy so the optimizer's own feature list is left untouched, then
# drop the two features excluded from the follow-up run.
# NOTE(review): the table displayed above does not list these two features, so
# the reason for dropping them is not visible here -- confirm.
new_list = list(jpopt.feature_list_)
for dropped in ('ps_ind_08_bin', 'ps_ind_09_bin'):
    new_list.remove(dropped)
new_feature = train[new_list]

In [11]:
# Sanity check: list the columns kept in the reduced feature frame.
new_feature.columns

Index(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_10_bin', 'ps_ind_11_bin',
       'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_16_bin', 'ps_ind_17_bin',
       'ps_ind_18_bin'],
      dtype='object')

In [12]:
%%time
# Repeat the elimination analysis on the reduced ind_* binary feature frame.
jpopt2 = psjp.JointProbabilityOptimizer(new_feature, verbose=True)
res2 = jpopt2.selectByElimination(target)

 0/ 9: Processing Original
 1/ 9: Processing without ps_ind_06_bin
 2/ 9: Processing without ps_ind_07_bin
 3/ 9: Processing without ps_ind_10_bin
 4/ 9: Processing without ps_ind_11_bin
 5/ 9: Processing without ps_ind_12_bin
 6/ 9: Processing without ps_ind_13_bin
 7/ 9: Processing without ps_ind_16_bin
 8/ 9: Processing without ps_ind_17_bin
 9/ 9: Processing without ps_ind_18_bin
CPU times: user 1min 15s, sys: 358 ms, total: 1min 15s
Wall time: 1min 15s


In [13]:
# Tabulate the elimination results for the reduced set, largest importance first.
(
    pd.DataFrame({'exp_val_removed': res2[0], 'importance_': res2[1]})
    .sort_values(by='importance_', ascending=False)
)

Unnamed: 0,exp_val_removed,importance_
ps_ind_07_bin,848.175564,0.013909
ps_ind_17_bin,849.473272,0.0124
ps_ind_06_bin,850.736883,0.010931
ps_ind_12_bin,856.52182,0.004206
ps_ind_13_bin,857.290724,0.003312
ps_ind_11_bin,858.043018,0.002437
ps_ind_10_bin,858.484651,0.001924
ps_ind_16_bin,858.685634,0.00169
ps_ind_18_bin,859.157138,0.001142
initial,860.139383,0.0


In [14]:
# Binary features whose name also carries the '_calc' marker
# (taken from the sorted `binary_fs` list).
calc_bins_fs = [name for name in binary_fs
                if '_bin' in name and '_calc' in name]
calc_bins = train[calc_bins_fs]
calc_bins_fs

['ps_calc_15_bin',
 'ps_calc_16_bin',
 'ps_calc_17_bin',
 'ps_calc_18_bin',
 'ps_calc_19_bin',
 'ps_calc_20_bin']

In [15]:
%%time
# Elimination analysis on the calc_* binary feature frame.
jpopt3 = psjp.JointProbabilityOptimizer(calc_bins, verbose=True)
res3 = jpopt3.selectByElimination(target)

 0/ 6: Processing Original
 1/ 6: Processing without ps_calc_15_bin
 2/ 6: Processing without ps_calc_16_bin
 3/ 6: Processing without ps_calc_17_bin
 4/ 6: Processing without ps_calc_18_bin
 5/ 6: Processing without ps_calc_19_bin
 6/ 6: Processing without ps_calc_20_bin
CPU times: user 51.8 s, sys: 160 ms, total: 52 s
Wall time: 52 s


In [16]:
# Tabulate the calc_* elimination results, largest importance first.
(
    pd.DataFrame({'exp_val_removed': res3[0], 'importance_': res3[1]})
    .sort_values(by='importance_', ascending=False)
)

Unnamed: 0,exp_val_removed,importance_
ps_calc_17_bin,791.570945,0.002123
ps_calc_19_bin,791.770398,0.001872
ps_calc_18_bin,791.772855,0.001869
ps_calc_15_bin,791.815768,0.001815
ps_calc_16_bin,791.86214,0.001756
ps_calc_20_bin,792.08762,0.001472
initial,793.255422,0.0


In [17]:
# Categorical features whose name contains 'car'.
car_cats_fs = [f for f in categorical_fs if 'car' in f]

# Shift every category code up by one, mirroring how the ind_* categoricals
# are prepared (`train[ind_cat_fs] + 1`).
# Presumably this turns a -1 "missing" code into 0 so all codes are
# non-negative -- confirm against the data dictionary.
car_cats = train[car_cats_fs] + 1

# Hand-picked split into two groups; by their names, presumably the
# low-cardinality and the high-cardinality categoricals -- confirm.
car_cat_small_fs = ['ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat',
                     'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat']
car_cat_big_fs = [f for f in car_cats_fs if f not in car_cat_small_fs]

# Slice the *shifted* frame. The original sliced `train` directly, which
# silently discarded the +1 shift computed above.
# NOTE(review): outputs recorded further down in this notebook were produced
# from the unshifted frames; re-run the dependent cells after this change.
car_cat_small = car_cats[car_cat_small_fs]
car_cat_big = car_cats[car_cat_big_fs]

print(car_cat_small_fs)
print(car_cat_big_fs)

['ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat']
['ps_car_01_cat', 'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_11_cat']


In [18]:
%%time
# Elimination analysis on the `car_cat_small` group of car_* categoricals.
jpopt4 = psjp.JointProbabilityOptimizer(car_cat_small, verbose=True)
res4 = jpopt4.selectByElimination(target)

 0/ 7: Processing Original
 1/ 7: Processing without ps_car_02_cat
 2/ 7: Processing without ps_car_03_cat
 3/ 7: Processing without ps_car_05_cat
 4/ 7: Processing without ps_car_07_cat
 5/ 7: Processing without ps_car_08_cat
 6/ 7: Processing without ps_car_09_cat
 7/ 7: Processing without ps_car_10_cat
CPU times: user 59.9 s, sys: 83.8 ms, total: 60 s
Wall time: 1min


In [19]:
# Tabulate the `car_cat_small` elimination results, largest importance first.
(
    pd.DataFrame({'exp_val_removed': res4[0], 'importance_': res4[1]})
    .sort_values(by='importance_', ascending=False)
)

Unnamed: 0,exp_val_removed,importance_
ps_car_07_cat,863.501334,0.042701
ps_car_09_cat,868.451593,0.037213
ps_car_02_cat,873.955724,0.031111
ps_car_08_cat,881.151259,0.023134
ps_car_03_cat,887.098088,0.016541
ps_car_10_cat,888.145933,0.015379
ps_car_05_cat,888.630236,0.014842
initial,902.018149,0.0


In [20]:
%%time
# Elimination analysis on the `car_cat_big` group of car_* categoricals.
jpopt5 = psjp.JointProbabilityOptimizer(car_cat_big, verbose=True)
res5 = jpopt5.selectByElimination(target)

 0/ 4: Processing Original
 1/ 4: Processing without ps_car_01_cat
 2/ 4: Processing without ps_car_04_cat
 3/ 4: Processing without ps_car_06_cat
 4/ 4: Processing without ps_car_11_cat
CPU times: user 36.8 s, sys: 35.4 ms, total: 36.8 s
Wall time: 36.9 s


In [21]:
# Tabulate the `car_cat_big` elimination results, largest importance first.
(
    pd.DataFrame({'exp_val_removed': res5[0], 'importance_': res5[1]})
    .sort_values(by='importance_', ascending=False)
)

Unnamed: 0,exp_val_removed,importance_
ps_car_01_cat,862.548433,0.123288
ps_car_11_cat,919.561125,0.06534
ps_car_06_cat,941.458706,0.043082
ps_car_04_cat,941.678964,0.042859
initial,983.845102,0.0


In [22]:
# Categorical features whose name contains 'ind', with every code shifted up
# by one (presumably so a -1 "missing" code becomes 0 -- confirm).
ind_cat_fs = [name for name in categorical_fs if 'ind' in name]
ind_cat = 1 + train[ind_cat_fs]

In [23]:
%%time
# Elimination analysis on the shifted ind_* categorical feature frame.
jpopt6 = psjp.JointProbabilityOptimizer(ind_cat, verbose=True)
res6 = jpopt6.selectByElimination(target)

 0/ 3: Processing Original
 1/ 3: Processing without ps_ind_02_cat
 2/ 3: Processing without ps_ind_04_cat
 3/ 3: Processing without ps_ind_05_cat
CPU times: user 29.2 s, sys: 35.9 ms, total: 29.3 s
Wall time: 29.3 s


In [24]:
# Tabulate the ind_* categorical elimination results, largest importance first.
(
    pd.DataFrame({'exp_val_removed': res6[0], 'importance_': res6[1]})
    .sort_values(by='importance_', ascending=False)
)

Unnamed: 0,exp_val_removed,importance_
ps_ind_05_cat,806.805947,0.047702
ps_ind_04_cat,841.440606,0.006822
ps_ind_02_cat,842.243571,0.005874
initial,847.220418,0.0


## Combinatorial and binary (pairwise) selection

In [89]:
# Re-import `ps.joint_probability` again to pick up edits made to the module
# since the previous reload.
# NOTE(review): execution counts from here on are out of order, so the module
# version each cell below actually ran against is not certain -- re-run top to bottom.
reload(psjp)

<module 'ps.joint_probability' from '/home/ryohayama/python_current/porto_seguro/ps/joint_probability.py'>

In [86]:
%%time
# Score every 2-feature combination (N=2) of the ind_* categoricals; the log
# prints one expected value E per combination.
jpopt7 = psjp.JointProbabilityOptimizer(ind_cat, verbose=True)
res7 = jpopt7.combinatorialSelection(target, N=2)

 1/ 3: Processed ('ps_ind_02_cat', 'ps_ind_04_cat') E=806.81
 2/ 3: Processed ('ps_ind_02_cat', 'ps_ind_05_cat') E=841.44
 3/ 3: Processed ('ps_ind_04_cat', 'ps_ind_05_cat') E=842.24
CPU times: user 22.1 s, sys: 31.9 ms, total: 22.2 s
Wall time: 22.2 s


In [87]:
# Tabulate the per-pair expected values, highest first.
(
    pd.DataFrame({'exp_val': res7})
    .sort_values(by='exp_val', ascending=False)
)

Unnamed: 0,Unnamed: 1,exp_val
ps_ind_04_cat,ps_ind_05_cat,842.243571
ps_ind_02_cat,ps_ind_05_cat,841.440606
ps_ind_02_cat,ps_ind_04_cat,806.805947


In [90]:
%%time
# Score every combination size of the ind_* categoricals; per the log this
# covers the 1-, 2- and 3-feature combinations in turn.
jpopt7f = psjp.JointProbabilityOptimizer(ind_cat, verbose=True)
res7f = jpopt7f.exhaustiveCombinatorialSelection(target)

==== 1_feature_combination ====
 1/ 3: Processed ('ps_ind_02_cat',) E=796.28
 2/ 3: Processed ('ps_ind_04_cat',) E=803.67
 3/ 3: Processed ('ps_ind_05_cat',) E=830.33
==== 2_feature_combination ====
 1/ 3: Processed ('ps_ind_02_cat', 'ps_ind_04_cat') E=806.81
 2/ 3: Processed ('ps_ind_02_cat', 'ps_ind_05_cat') E=841.44
 3/ 3: Processed ('ps_ind_04_cat', 'ps_ind_05_cat') E=842.24
==== 3_feature_combination ====
 1/ 1: Processed ('ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat') E=847.22
CPU times: user 51.6 s, sys: 112 ms, total: 51.7 s
Wall time: 51.7 s


In [91]:
# Report the best expected value and the feature combination that achieved it,
# one per line.
print(jpopt7f.best_expected_value_, jpopt7f.best_combination_, sep='\n')

847.220417877
('ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat')


In [94]:
# Show the raw result: a dict keyed by '<k>_feature_combination', each mapping
# a tuple of feature names to its expected value.
res7f

{'1_feature_combination': {('ps_ind_02_cat',): 796.28070601434399,
  ('ps_ind_04_cat',): 803.66766394256877,
  ('ps_ind_05_cat',): 830.33226417225774},
 '2_feature_combination': {('ps_ind_02_cat',
   'ps_ind_04_cat'): 806.80594721365856,
  ('ps_ind_02_cat', 'ps_ind_05_cat'): 841.44060627987335,
  ('ps_ind_04_cat', 'ps_ind_05_cat'): 842.24357073789861},
 '3_feature_combination': {('ps_ind_02_cat',
   'ps_ind_04_cat',
   'ps_ind_05_cat'): 847.22041787727562}}

In [95]:
%%time
# Exhaustive combination search over the calc_* binary features.
jpopt3f = psjp.JointProbabilityOptimizer(calc_bins, verbose=True)
res3f = jpopt3f.exhaustiveCombinatorialSelection(target)

==== 1_feature_combination ====
 1/ 6: Processed ('ps_calc_15_bin',) E=790.70
 2/ 6: Processed ('ps_calc_16_bin',) E=790.70
 3/ 6: Processed ('ps_calc_17_bin',) E=790.69
 4/ 6: Processed ('ps_calc_18_bin',) E=790.70
 5/ 6: Processed ('ps_calc_19_bin',) E=790.76
 6/ 6: Processed ('ps_calc_20_bin',) E=790.72
==== 2_feature_combination ====
 1/15: Processed ('ps_calc_15_bin', 'ps_calc_16_bin') E=790.71
 2/15: Processed ('ps_calc_15_bin', 'ps_calc_17_bin') E=790.72
 3/15: Processed ('ps_calc_15_bin', 'ps_calc_18_bin') E=790.73
 4/15: Processed ('ps_calc_15_bin', 'ps_calc_19_bin') E=790.81
 5/15: Processed ('ps_calc_15_bin', 'ps_calc_20_bin') E=790.74
 6/15: Processed ('ps_calc_16_bin', 'ps_calc_17_bin') E=790.71
 7/15: Processed ('ps_calc_16_bin', 'ps_calc_18_bin') E=790.71
 8/15: Processed ('ps_calc_16_bin', 'ps_calc_19_bin') E=790.77
 9/15: Processed ('ps_calc_16_bin', 'ps_calc_20_bin') E=790.76
10/15: Processed ('ps_calc_17_bin', 'ps_calc_18_bin') E=790.75
11/15: Processed ('ps_calc_17_

In [52]:
%%time
# Score every pair of the reduced ind_* binary features (36 pairs per the log).
# NOTE(review): cells with higher execution counts call
# `combinatorialSelection(target, N=2)` for pair-wise scoring instead; confirm
# that `binarySelection` still exists in the current version of the module.
jpopt8 = psjp.JointProbabilityOptimizer(new_feature, verbose=True)
res8 = jpopt8.binarySelection(target)

 1/36: Processing -  ('ps_ind_06_bin', 'ps_ind_07_bin')
 2/36: Processing -  ('ps_ind_06_bin', 'ps_ind_10_bin')
 3/36: Processing -  ('ps_ind_06_bin', 'ps_ind_11_bin')
 4/36: Processing -  ('ps_ind_06_bin', 'ps_ind_12_bin')
 5/36: Processing -  ('ps_ind_06_bin', 'ps_ind_13_bin')
 6/36: Processing -  ('ps_ind_06_bin', 'ps_ind_16_bin')
 7/36: Processing -  ('ps_ind_06_bin', 'ps_ind_17_bin')
 8/36: Processing -  ('ps_ind_06_bin', 'ps_ind_18_bin')
 9/36: Processing -  ('ps_ind_07_bin', 'ps_ind_10_bin')
10/36: Processing -  ('ps_ind_07_bin', 'ps_ind_11_bin')
11/36: Processing -  ('ps_ind_07_bin', 'ps_ind_12_bin')
12/36: Processing -  ('ps_ind_07_bin', 'ps_ind_13_bin')
13/36: Processing -  ('ps_ind_07_bin', 'ps_ind_16_bin')
14/36: Processing -  ('ps_ind_07_bin', 'ps_ind_17_bin')
15/36: Processing -  ('ps_ind_07_bin', 'ps_ind_18_bin')
16/36: Processing -  ('ps_ind_10_bin', 'ps_ind_11_bin')
17/36: Processing -  ('ps_ind_10_bin', 'ps_ind_12_bin')
18/36: Processing -  ('ps_ind_10_bin', 'ps_ind_1

In [54]:
# Tabulate the per-pair expected values for the ind_* binaries, highest first.
(
    pd.DataFrame({'exp_val': res8})
    .sort_values(by='exp_val', ascending=False)
)

Unnamed: 0,Unnamed: 1,exp_val
ps_ind_07_bin,ps_ind_17_bin,844.485051
ps_ind_06_bin,ps_ind_17_bin,842.574782
ps_ind_06_bin,ps_ind_16_bin,831.189598
ps_ind_07_bin,ps_ind_16_bin,829.520927
ps_ind_06_bin,ps_ind_07_bin,823.707287
ps_ind_17_bin,ps_ind_18_bin,821.714021
ps_ind_16_bin,ps_ind_17_bin,821.494871
ps_ind_12_bin,ps_ind_17_bin,820.484558
ps_ind_13_bin,ps_ind_17_bin,819.533361
ps_ind_11_bin,ps_ind_17_bin,819.517595


In [61]:
%%time
# Score every pair of the `car_cat_small` categoricals (21 pairs per the log).
# NOTE(review): cells with higher execution counts call
# `combinatorialSelection(target, N=2)` for pair-wise scoring instead; confirm
# that `binarySelection` still exists in the current version of the module.
jpopt9 = psjp.JointProbabilityOptimizer(car_cat_small, verbose=True)
res9 = jpopt9.binarySelection(target)

 1/21: Processed ('ps_car_02_cat', 'ps_car_03_cat') E=831.54
 2/21: Processed ('ps_car_02_cat', 'ps_car_05_cat') E=820.69
 3/21: Processed ('ps_car_02_cat', 'ps_car_07_cat') E=835.45
 4/21: Processed ('ps_car_02_cat', 'ps_car_08_cat') E=819.22
 5/21: Processed ('ps_car_02_cat', 'ps_car_09_cat') E=825.95
 6/21: Processed ('ps_car_02_cat', 'ps_car_10_cat') E=811.54
 7/21: Processed ('ps_car_03_cat', 'ps_car_05_cat') E=813.58
 8/21: Processed ('ps_car_03_cat', 'ps_car_07_cat') E=834.76
 9/21: Processed ('ps_car_03_cat', 'ps_car_08_cat') E=816.80
10/21: Processed ('ps_car_03_cat', 'ps_car_09_cat') E=828.36
11/21: Processed ('ps_car_03_cat', 'ps_car_10_cat') E=812.96
12/21: Processed ('ps_car_05_cat', 'ps_car_07_cat') E=826.70
13/21: Processed ('ps_car_05_cat', 'ps_car_08_cat') E=808.87
14/21: Processed ('ps_car_05_cat', 'ps_car_09_cat') E=826.17
15/21: Processed ('ps_car_05_cat', 'ps_car_10_cat') E=801.64
16/21: Processed ('ps_car_07_cat', 'ps_car_08_cat') E=825.61
17/21: Processed ('ps_ca

In [62]:
# Tabulate the per-pair expected values for `car_cat_small`, highest first.
(
    pd.DataFrame({'exp_val': res9})
    .sort_values(by='exp_val', ascending=False)
)

Unnamed: 0,Unnamed: 1,exp_val
ps_car_02_cat,ps_car_07_cat,835.446078
ps_car_03_cat,ps_car_07_cat,834.760239
ps_car_07_cat,ps_car_09_cat,832.819819
ps_car_02_cat,ps_car_03_cat,831.543154
ps_car_03_cat,ps_car_09_cat,828.359375
ps_car_05_cat,ps_car_07_cat,826.696892
ps_car_05_cat,ps_car_09_cat,826.174973
ps_car_02_cat,ps_car_09_cat,825.945368
ps_car_07_cat,ps_car_08_cat,825.61182
ps_car_02_cat,ps_car_05_cat,820.690695
