# Setting up a class for converting features into joint probability space

<b>Groupings:</b>
- ind_bin (no missing values)
- calc_bin (no missing values)
- car_cat (add 1 to all values so that missing values become their own category, i.e. cat=0)
  1. ['ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat',
      'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat']
  2. The rest
- ind_cat
<br><br>

<b>Processing:</b>
1. Create a df with two columns:
  1. id from the original data
  2. tuple of features above (joint_vector)
2. Create a crosstab (i.e. contingency table) of the joint_vector with target vector
  - Note: Index for this will be the tuples
3. Create two new columns on the crosstab:
  1. Total count for each row/tuple
  2. Cond. Proba. of being in class1 given the tuple (divide class1 count by total)
4. Merge the 1st df with the crosstab['proba'] on the tuples.
  - Note1: Retain all the df tuples
  - Note2: In the test set, there might be some tuples not seen in the train set.
    In that case, check for NaN and replace them (e.g. with zeros, or with the
    overall target frequency as tried further below).
5. Create a new df with just id and the proba column
6. Combine the different groups of proba-converted features.
<br><br>

<b>Optimization</b><br>
Some features probably won't sharpen the joint probability estimate even when they are combined with others. One could potentially improve the discriminatory power of the joint distribution by omitting such features from each grouping.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline

from collections import namedtuple
from importlib import reload
from matplotlib import cm
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.metrics import (make_scorer, roc_auc_score, 
                             classification_report, 
                             precision_recall_curve,
                             roc_curve)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
#from xgboost import XGBClassifier

from feature_processing import (create_contingency_table,
                                calculate_conditional_prob_bin,
                                encode_my_categorical_labels,
                                calculate_conditional_prob_cat,
                                estimate_cond_prob_density,
                                bin_myFeature)
import feature_analysis as fa
import porto_seguro as ps

import warnings
warnings.simplefilter("ignore", category=PendingDeprecationWarning)
warnings.simplefilter("ignore", category=DeprecationWarning)

  from pandas.core import datetools


In [2]:
def my_gini(y_true, y_probas):
    '''
    Normalized Gini coefficient, computed as 2 * AUC - 1.

    y_true : (n_samples,)
        true binary class labels
    y_probas : (n_samples,) or (n_samples, n_classes)
        predicted scores. A 2-D array is treated as predict_proba-style
        output and its second column (index 1) is used; a 1-D array is
        used as-is as the positive-class score.

    Returns the Gini value as a float.
    '''
    scores = np.asarray(y_probas)
    # scikit-learn scorers built with needs_proba=True may hand over only the
    # positive-class column for binary targets, so accept both layouts.
    if scores.ndim == 2:
        scores = scores[:, 1]
    auc = roc_auc_score(y_true, scores)
    return 2 * auc - 1

# Wrap my_gini as a scikit-learn scorer: larger values are better, and the
# metric is fed predicted probabilities rather than predicted class labels.
gini_scorer = make_scorer(my_gini, needs_proba=True, greater_is_better=True)

In [3]:
# Load the raw train / test tables; the first row of each file is the header.
train = pd.read_csv('train.csv', header=0)
test = pd.read_csv('test.csv', header=0)

# Every column after the first two is treated as a feature, then bucketed by
# the '_bin' / '_cat' marker in its name; whatever is left goes to other_fs.
all_fs = train.columns[2:]
binary_fs = sorted(f for f in all_fs if '_bin' in f)
categorical_fs = sorted(f for f in all_fs if '_cat' in f)
already_grouped = set(binary_fs) | set(categorical_fs)
other_fs = sorted(f for f in all_fs if f not in already_grouped)

# Per-group views of the training data plus the target column.
binaries = train[binary_fs]
categoricals = ps.fuseCategoricalFeatures(train[categorical_fs])
cont_ordinals = train[other_fs]
target = train.target

train.shape, binaries.shape, categoricals.shape, cont_ordinals.shape

((595212, 59), (595212, 17), (595212, 14), (595212, 26))

In [4]:
train.columns

Index(['id', 'target', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03',
       'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin',
       'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin',
       'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01',
       'ps_reg_02', 'ps_reg_03', 'ps_car_01_cat', 'ps_car_02_cat',
       'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
       'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
       'ps_car_11_cat', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14',
       'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04',
       'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
       'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
       'ps_calc_19_bin', 'ps_calc_20_bin'],


In [5]:
# Overall target rate in the training set: sum of the target column divided
# by the number of rows (the positive-class frequency for a 0/1 target).
num_samples = target.shape[0]
num_target = target.sum()
freq_target = num_target / num_samples
freq_target

0.036447517859182946

In [6]:
class JointProbability:
    def __init__(self):
        self.contingency = None
    
    def fit(self, id_, feature, target, label=None):
        '''
        id_ : (n_samples,)
            column vector containing ids for each sample
        feature: (n_samples, n_features)
            features to be joined
        target : (n_sampes, )
            column vector containing class label of each sample
        '''
        if label is None:
            label = 'cond_proba'
        else:
            label = label
            
        df1 = pd.DataFrame()
        df1['id'] = id_
        df1['combined_feature'] = feature.apply(tuple, axis=1)
        
        contingency = pd.crosstab(df1['combined_feature'], target)
        contingency[label] = contingency[1]/(contingency[0] + contingency[1])
        self.contingency = contingency.reset_index()
        
        return
    
    
    def transform(self, id_, feature, label=None):
        if label is None:
            label = 'cond_proba'
        else:
            label = label
        
        df1 = pd.DataFrame()
        df1['id'] = id_
        df1['combined_feature'] = feature.apply(tuple, axis=1)
        
        if self.contingency is None:
            print('Error: Object has to be fit first')
            return
        
        df2 = df1.merge(self.contingency[['combined_feature', label]], how='left',
                       left_on='combined_feature', right_on='combined_feature')
        
        return df2[['id', label]]
        
        
    def fit_transform(self, id_, feature, target, label=None):
        '''
        id_ : (n_samples,)
            column vector containing ids for each sample
        feature: (n_samples, n_features)
            features to be joined
        target : (n_sampes, )
            column vector containing class label of each sample
        '''
        if label is None:
            label = 'cond_proba'
        else:
            label = label
            
        df1 = pd.DataFrame()
        df1['id'] = id_
        df1['combined_feature'] = feature.apply(tuple, axis=1)
        
        contingency = pd.crosstab(df1['combined_feature'], target)
        contingency[label] = contingency[1]/(contingency[0] + contingency[1])
        self.contingency = contingency.reset_index()
        
        df2 = df1.merge(self.contingency[['combined_feature', label]], how='left',
                   left_on='combined_feature', right_on='combined_feature')
    
        return df2[['id', label]]

### Try on ind_bins

In [7]:
# Binary features of the 'ind' group: names containing both markers.
ind_bins_fs = [f for f in all_fs if '_bin' in f and '_ind' in f]
ind_bins, ind_bins_test = train[ind_bins_fs], test[ind_bins_fs]

ind_bins.columns

Index(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
       'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin'],
      dtype='object')

In [8]:
jp = JointProbability()
jp.fit(train.id, ind_bins, target, label='ind_bin_proba')

In [9]:
a1 = jp.transform(train.id, ind_bins, label='ind_bin_proba')

In [10]:
jp = JointProbability()
a2 = jp.fit_transform(train.id, ind_bins, target, label='ind_bin_proba')

In [11]:
a1 == a2

Unnamed: 0,id,ind_bin_proba
0,True,True
1,True,True
2,True,True
3,True,True
4,True,True
5,True,True
6,True,True
7,True,True
8,True,True
9,True,True


In [12]:
a1.head()

Unnamed: 0,id,ind_bin_proba
0,7,0.072793
1,9,0.044951
2,13,0.036999
3,16,0.026372
4,17,0.026372


In [13]:
test.id.shape, test.shape

((892816,), (892816, 58))

In [14]:
res = jp.transform(test.id, ind_bins_test, label='ind_bin_proba')

In [15]:
ind_bins_test.shape, res.shape

((892816, 11), (892816, 2))

In [16]:
res['ind_bin_proba'].unique()

array([ 0.04157676,  0.03143546,  0.03479532,  0.0263716 ,  0.07279267,
        0.04429821,  0.0254136 ,  0.03699938,  0.04495058,  0.02976439,
        0.03207998,  0.06195575,  0.04774508,  0.04236012,  0.04311716,
        0.04429337,  0.        ,  0.0626506 ,  0.03205128,  0.05979761,
        0.04564315,  0.03424658,  0.03738318,  0.06728538,  0.03813038,
        0.03401361,  0.02272727,  0.05128205,  0.04054054,  0.06382979,
        0.05882353,  0.06666667,  0.03846154,  0.05660377,  0.028125  ,
        0.02380952,  0.13333333,  0.28571429,  0.09166667,  0.12173913,
        0.06024096,  0.08333333,         nan,  0.03125   ,  0.07142857,
        0.06451613,  0.2       ,  0.5       ,  0.14285714,  0.03174603,
        0.18181818,  0.03030303,  0.10714286,  0.25      ,  0.125     ,
        0.1       ,  0.10344828,  0.06896552,  1.        ])

In [17]:
len(res[res['ind_bin_proba'].isnull()])

46

In [18]:
# Tuples that never occurred in training have no estimate (NaN); fall back to
# the overall training target frequency. Only the probability column is
# filled, and np.NaN (removed in NumPy 2.0) is no longer needed.
res2 = res.fillna({'ind_bin_proba': freq_target})

In [19]:
res2['ind_bin_proba'].unique()

array([ 0.04157676,  0.03143546,  0.03479532,  0.0263716 ,  0.07279267,
        0.04429821,  0.0254136 ,  0.03699938,  0.04495058,  0.02976439,
        0.03207998,  0.06195575,  0.04774508,  0.04236012,  0.04311716,
        0.04429337,  0.        ,  0.0626506 ,  0.03205128,  0.05979761,
        0.04564315,  0.03424658,  0.03738318,  0.06728538,  0.03813038,
        0.03401361,  0.02272727,  0.05128205,  0.04054054,  0.06382979,
        0.05882353,  0.06666667,  0.03846154,  0.05660377,  0.028125  ,
        0.02380952,  0.13333333,  0.28571429,  0.09166667,  0.12173913,
        0.06024096,  0.08333333,  0.03644752,  0.03125   ,  0.07142857,
        0.06451613,  0.2       ,  0.5       ,  0.14285714,  0.03174603,
        0.18181818,  0.03030303,  0.10714286,  0.25      ,  0.125     ,
        0.1       ,  0.10344828,  0.06896552,  1.        ])

In [20]:
len(res2[res2['ind_bin_proba'].isnull()])

0

### calc_bins

In [21]:
# Binary features of the 'calc' group: names containing both markers.
calc_bins_fs = [f for f in binary_fs if '_bin' in f and '_calc' in f]
calc_bins, calc_bins_test = train[calc_bins_fs], test[calc_bins_fs]
calc_bins_fs

['ps_calc_15_bin',
 'ps_calc_16_bin',
 'ps_calc_17_bin',
 'ps_calc_18_bin',
 'ps_calc_19_bin',
 'ps_calc_20_bin']

In [22]:
jp2 = JointProbability()
jp2.fit(train.id, calc_bins, target, label='calc_bin_proba')
a3_train = jp2.transform(train.id, calc_bins, label='calc_bin_proba')
a3_test = jp2.transform(test.id, calc_bins_test, label='calc_bin_proba')

In [23]:
train.shape, a3_train.shape

((595212, 59), (595212, 2))

In [24]:
test.shape, a3_test.shape

((892816, 58), (892816, 2))

In [25]:
a3_test.iloc[:, 1].unique()

array([ 0.03587201,  0.03357851,  0.03749122,  0.03217113,  0.04086765,
        0.03581337,  0.0379685 ,  0.0358176 ,  0.0361297 ,  0.03699972,
        0.03792441,  0.03382664,  0.0359114 ,  0.03605727,  0.03796999,
        0.03589329,  0.03398208,  0.03674328,  0.03640309,  0.03633984,
        0.03378995,  0.03430942,  0.03659989,  0.03663723,  0.03946707,
        0.03853144,  0.03266713,  0.0352896 ,  0.03107789,  0.03908555,
        0.03403331,  0.0272045 ,  0.0381738 ,  0.03691363,  0.03658537,
        0.03381669,  0.03566787,  0.03342416,  0.03805478,  0.04458599,
        0.03837561,  0.03921569,  0.03435583,  0.03762493,  0.04084211,
        0.03917051,  0.03805497,  0.04229848,  0.04044295,  0.04106776,
        0.03246014,  0.04096229,  0.03155604,  0.04076739,  0.03588749,
        0.02955665,  0.0261324 ,  0.01565996,  0.0298103 ,  0.04217926,
        0.08095238,  0.01577287,  0.03030303,  0.03937008])

### Test on categoricals

In [26]:
car_cats_fs = [f for f in categorical_fs if 'car' in f]

# Add one to make missing value a category with 0 label
car_cats = train[car_cats_fs] + 1

car_cat_small_fs = ['ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat',
                     'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat']
# Remaining car categoricals, taken from the name list rather than by
# iterating the DataFrame.
car_cat_big_fs = [f for f in car_cats_fs if f not in car_cat_small_fs]

# Select from the shifted frame so the +1 recoding above is actually applied.
car_cat_small = car_cats[car_cat_small_fs]
car_cat_big = car_cats[car_cat_big_fs]

# Apply the same +1 shift to the test set so train and test tuples match.
car_cat_small_test = test[car_cat_small_fs] + 1
car_cat_big_test = test[car_cat_big_fs] + 1

In [27]:
jp3 = JointProbability()
jp3.fit(train.id, car_cat_small, target, label='car_cat_small_proba')
a4_train = jp3.transform(train.id, car_cat_small, label='car_cat_small_proba')
a4_test = jp3.transform(test.id, car_cat_small_test, label='car_cat_small_proba')

In [28]:
a4_train.shape, a4_test.shape

((595212, 2), (892816, 2))

In [29]:
a4_train.head(5)

Unnamed: 0,id,car_cat_small_proba
0,7,0.050438
1,9,0.028005
2,13,0.028005
3,16,0.033865
4,17,0.028005


In [30]:
a4_test.head(5)

Unnamed: 0,id,car_cat_small_proba
0,0,0.028005
1,1,0.031067
2,2,0.028005
3,3,0.028005
4,4,0.028005


In [31]:
a4_test.iloc[:, 1].unique()

array([ 0.02800542,  0.0310666 ,  0.04752475,  0.08196721,  0.03951424,
        0.0306845 ,  0.03359006,  0.05822744,  0.05942948,  0.0389968 ,
        0.06929638,  0.0271418 ,  0.04213695,  0.05043828,  0.04917726,
        0.02516657,  0.06593407,  0.04814341,  0.03839567,  0.0423016 ,
        0.03006823,  0.03143816,  0.04819277,  0.03688063,  0.0451137 ,
        0.07287449,  0.03807928,  0.05383265,  0.04014168,  0.0652819 ,
        0.03362665,  0.04381245,  0.03797468,  0.0513834 ,  0.0371517 ,
        0.03193003,  0.06945607,  0.07875895,  0.03359173,  0.04780115,
        0.07017544,  0.05588235,  0.06096131,  0.03878217,  0.03637686,
        0.06512605,  0.        ,  0.03993005,  0.02756245,  0.0939759 ,
        0.07822686,  0.11538462,  0.03386454,  0.05463183,  0.05380477,
        0.05562644,  0.05987879,  0.04189723,  0.04395604,  0.04587156,
        0.02893309,  0.07824427,  0.03477949,  0.02949062,  0.10569106,
        0.03745318,  0.053407  ,  0.04888889,  0.04780711,  0.06

In [32]:
reload(ps)

<module 'porto_seguro' from '/home/ryohayama/python_current/porto_seguro/porto_seguro.py'>

In [33]:
jp4 = ps.JointProbability()

In [34]:
jp4

<porto_seguro.JointProbability at 0x7fdf70611470>

## Processing Test Data

### convert bins and categoricals

In [35]:
# Feature-name lists for each group that is converted to a joint probability.
ind_bins_fs = [f for f in all_fs 
               if '_bin' in f
               if '_ind' in f]
calc_bins_fs = [f for f in binary_fs 
               if '_bin' in f
               if '_calc' in f]
car_cats_fs = [f for f in categorical_fs if 'car' in f]
car_cat_small_fs = ['ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat',
                     'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat']
# Derive the "big" group from the name list defined just above, so this cell
# does not depend on a DataFrame created in an earlier cell.
car_cat_big_fs = [f for f in car_cats_fs if f not in car_cat_small_fs]
ind_cat_fs = [f for f in categorical_fs if 'ind' in f]


# Binary groups are used as-is.
ind_bins = train[ind_bins_fs]
ind_bins_test = test[ind_bins_fs]

calc_bins = train[calc_bins_fs]
calc_bins_test = test[calc_bins_fs]

# Categorical groups are shifted by one, identically for train and test.
car_cat_small = train[car_cat_small_fs] + 1
car_cat_big = train[car_cat_big_fs] + 1

car_cat_small_test = test[car_cat_small_fs] + 1
car_cat_big_test = test[car_cat_big_fs] + 1

ind_cat = train[ind_cat_fs] + 1
ind_cat_test = test[ind_cat_fs] + 1

In [66]:
jp1 = ps.JointProbability()
proba_train1 = jp1.fit_transform(train.id, ind_bins, target, label='ind_bin_proba')
proba_test1 = jp1.transform(test.id, ind_bins_test, label='ind_bin_proba')

In [67]:
jp2 = ps.JointProbability()
proba_train2 = jp2.fit_transform(train.id, calc_bins, target, label='calc_bin_proba')
proba_test2 = jp2.transform(test.id, calc_bins_test, label='calc_bin_proba')

In [68]:
jp3 = ps.JointProbability()
proba_train3 = jp3.fit_transform(train.id, car_cat_small, target, label='car_cat_proba1')
proba_test3 = jp3.transform(test.id, car_cat_small_test, label='car_cat_proba1')

In [69]:
jp4 = ps.JointProbability()
proba_train4 = jp4.fit_transform(train.id, car_cat_big, target, label='car_cat_proba2')
proba_test4 = jp4.transform(test.id, car_cat_big_test, label='car_cat_proba2')

In [70]:
jp5 = ps.JointProbability()
proba_train5 = jp5.fit_transform(train.id, ind_cat, target, label='ind_cat_proba')
proba_test5 = jp5.transform(test.id, ind_cat_test, label='ind_cat_proba')

In [71]:
# Keep only the probability column (position 1) of each per-group result and
# place the five groups side by side.
# NOTE(review): test-side columns can presumably still hold NaN for feature
# tuples not seen in training - confirm whether they should be filled here.
train_parts = (proba_train1, proba_train2, proba_train3, proba_train4, proba_train5)
test_parts = (proba_test1, proba_test2, proba_test3, proba_test4, proba_test5)

proba_train = pd.concat([part.iloc[:, 1] for part in train_parts], axis=1)
proba_test = pd.concat([part.iloc[:, 1] for part in test_parts], axis=1)

In [72]:
proba_train.shape, proba_test.shape

((595212, 5), (892816, 5))

In [59]:
reload(ps)

<module 'porto_seguro' from '/home/ryohayama/python_current/porto_seguro/porto_seguro.py'>

In [62]:
cont_ordinals = train[other_fs]
cont_ordinals.columns

Index(['ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05',
       'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10',
       'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_ind_01',
       'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02',
       'ps_reg_03'],
      dtype='object')

In [61]:
%%time
ord_proba_train, ord_proba_test = ps.convertAllOrdinals(cont_ordinals, target, test=test[other_fs], verbose=True)

1/26 - Processing ps_calc_01_proba
2/26 - Processing ps_calc_02_proba
3/26 - Processing ps_calc_03_proba
4/26 - Processing ps_calc_04_proba
5/26 - Processing ps_calc_05_proba
6/26 - Processing ps_calc_06_proba
7/26 - Processing ps_calc_07_proba
8/26 - Processing ps_calc_08_proba
9/26 - Processing ps_calc_09_proba
10/26 - Processing ps_calc_10_proba
11/26 - Processing ps_calc_11_proba
12/26 - Processing ps_calc_12_proba
13/26 - Processing ps_calc_13_proba
14/26 - Processing ps_calc_14_proba
15/26 - Processing ps_car_11_proba
16/26 - Processing ps_car_12_proba
17/26 - Processing ps_car_13_proba
18/26 - Processing ps_car_14_proba
19/26 - Processing ps_car_15_proba
20/26 - Processing ps_ind_01_proba
21/26 - Processing ps_ind_03_proba
22/26 - Processing ps_ind_14_proba
23/26 - Processing ps_ind_15_proba
24/26 - Processing ps_reg_01_proba
25/26 - Processing ps_reg_02_proba
26/26 - Processing ps_reg_03_proba
CPU times: user 7min 31s, sys: 10.8 s, total: 7min 42s
Wall time: 7min 39s


In [63]:
ord_proba_train.shape, ord_proba_test.shape

((595212, 26), (892816, 26))

In [64]:
ord_proba_train.head(5)

Unnamed: 0,ps_calc_01_proba,ps_calc_02_proba,ps_calc_03_proba,ps_calc_04_proba,ps_calc_05_proba,ps_calc_06_proba,ps_calc_07_proba,ps_calc_08_proba,ps_calc_09_proba,ps_calc_10_proba,...,ps_car_13_proba,ps_car_14_proba,ps_car_15_proba,ps_ind_01_proba,ps_ind_03_proba,ps_ind_14_proba,ps_ind_15_proba,ps_reg_01_proba,ps_reg_02_proba,ps_reg_03_proba
0,0.03646,0.036653,0.036569,0.036238,0.036416,0.035856,0.037201,0.036056,0.035772,0.035926,...,0.038135,0.034162,0.041611,0.036703,0.038238,0.036301,0.03135,0.038495,0.03008,0.034786
1,0.036705,0.035906,0.036934,0.036815,0.036416,0.036244,0.035555,0.037104,0.035772,0.036472,...,0.029289,0.034343,0.029716,0.033699,0.041305,0.036301,0.039499,0.0388,0.03595,0.035853
2,0.037187,0.036696,0.035805,0.036815,0.036599,0.036244,0.037201,0.037104,0.036717,0.036472,...,0.029937,0.034555,0.038935,0.040509,0.036296,0.036301,0.030678,0.038008,0.032287,0.028393
3,0.03646,0.036205,0.035805,0.036815,0.036617,0.036031,0.037201,0.037104,0.036884,0.036159,...,0.0274,0.036728,0.026866,0.033238,0.030755,0.036301,0.036074,0.041095,0.03008,0.032475
4,0.037041,0.036908,0.035256,0.036815,0.036599,0.036363,0.036004,0.036056,0.036717,0.035816,...,0.02802,0.034173,0.026866,0.033238,0.053244,0.036301,0.032327,0.038495,0.04189,0.037689


In [65]:
ord_proba_test.head(5)

Unnamed: 0,ps_calc_01_proba,ps_calc_02_proba,ps_calc_03_proba,ps_calc_04_proba,ps_calc_05_proba,ps_calc_06_proba,ps_calc_07_proba,ps_calc_08_proba,ps_calc_09_proba,ps_calc_10_proba,...,ps_car_13_proba,ps_car_14_proba,ps_car_15_proba,ps_ind_01_proba,ps_ind_03_proba,ps_ind_14_proba,ps_ind_15_proba,ps_reg_01_proba,ps_reg_02_proba,ps_reg_03_proba
0,0.03545,0.036588,0.036343,0.036393,0.036416,0.036363,0.036004,0.037142,0.036717,0.037609,...,0.030823,0.034439,0.040571,0.033238,0.038862,0.036301,0.030678,0.031729,0.032322,0.032943
1,0.037041,0.036653,0.036847,0.036238,0.03609,0.037164,0.036759,0.036056,0.036717,0.036472,...,0.029075,0.034263,0.032915,0.044386,0.038238,0.036301,0.03926,0.041095,0.038478,0.036075
2,0.03646,0.036908,0.036343,0.036815,0.03609,0.036031,0.036759,0.037142,0.036606,0.035816,...,0.038818,0.034715,0.038935,0.040509,0.029917,0.036301,0.0335,0.034311,0.032287,0.039321
3,0.03545,0.036653,0.036654,0.036815,0.036416,0.036031,0.036004,0.036056,0.035772,0.035319,...,0.030156,0.034264,0.029716,0.033238,0.041493,0.036301,0.038997,0.024206,0.03008,0.028393
4,0.036925,0.036908,0.036995,0.036238,0.036617,0.036031,0.037201,0.036056,0.036884,0.035816,...,0.035231,0.034343,0.038935,0.040509,0.041305,0.036301,0.038997,0.041095,0.03595,0.036989


In [78]:
train_probas = pd.concat([ord_proba_train, proba_train], axis=1)
test_probas = pd.concat([ord_proba_test, proba_test], axis=1)

In [79]:
train_probas.shape, test_probas.shape

((595212, 31), (892816, 31))

In [80]:
train_probas.head(5)

Unnamed: 0,ps_calc_01_proba,ps_calc_02_proba,ps_calc_03_proba,ps_calc_04_proba,ps_calc_05_proba,ps_calc_06_proba,ps_calc_07_proba,ps_calc_08_proba,ps_calc_09_proba,ps_calc_10_proba,...,ps_ind_14_proba,ps_ind_15_proba,ps_reg_01_proba,ps_reg_02_proba,ps_reg_03_proba,ind_bin_proba,calc_bin_proba,car_cat_proba1,car_cat_proba2,ind_cat_proba
0,0.03646,0.036653,0.036569,0.036238,0.036416,0.035856,0.037201,0.036056,0.035772,0.035926,...,0.036301,0.03135,0.038495,0.03008,0.034786,0.072793,0.035872,0.050438,0.032401,0.033319
1,0.036705,0.035906,0.036934,0.036815,0.036416,0.036244,0.035555,0.037104,0.035772,0.036472,...,0.036301,0.039499,0.0388,0.03595,0.035853,0.044951,0.035893,0.028005,0.026249,0.031885
2,0.037187,0.036696,0.035805,0.036815,0.036599,0.036244,0.037201,0.037104,0.036717,0.036472,...,0.036301,0.030678,0.038008,0.032287,0.028393,0.036999,0.035893,0.028005,0.023227,0.039012
3,0.03646,0.036205,0.035805,0.036815,0.036617,0.036031,0.037201,0.037104,0.036884,0.036159,...,0.036301,0.036074,0.041095,0.03008,0.032475,0.026372,0.037491,0.033865,0.023859,0.031885
4,0.037041,0.036908,0.035256,0.036815,0.036599,0.036363,0.036004,0.036056,0.036717,0.035816,...,0.036301,0.032327,0.038495,0.04189,0.037689,0.026372,0.032667,0.028005,0.030589,0.033319


In [81]:
train_probas.describe()

Unnamed: 0,ps_calc_01_proba,ps_calc_02_proba,ps_calc_03_proba,ps_calc_04_proba,ps_calc_05_proba,ps_calc_06_proba,ps_calc_07_proba,ps_calc_08_proba,ps_calc_09_proba,ps_calc_10_proba,...,ps_ind_14_proba,ps_ind_15_proba,ps_reg_01_proba,ps_reg_02_proba,ps_reg_03_proba,ind_bin_proba,calc_bin_proba,car_cat_proba1,car_cat_proba2,ind_cat_proba
count,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,...,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0
mean,0.036445,0.036465,0.036461,0.03645,0.036437,0.036452,0.036468,0.03634,0.036453,0.0364332,...,0.036406,0.036484,0.036446,0.036337,0.03636121,0.036448,0.036448,0.036448,0.036448,0.036448
std,0.000531,0.000524,0.000523,0.00036,0.000402,0.000553,0.000678,0.000432,0.000413,0.0007919283,...,0.001132,0.004152,0.005039,0.006766,0.006336163,0.011221,0.002075,0.013676,0.018014,0.009745
min,0.03545,0.03521,0.035256,0.035856,0.03609,9e-06,0.033531,0.015746,0.035683,2.873815e-07,...,0.036301,0.030678,0.024206,0.029951,1.870839e-07,0.0,0.01566,0.0,0.0,0.0
25%,0.036006,0.036205,0.036343,0.036238,0.036234,0.036031,0.036004,0.036056,0.036389,0.03595479,...,0.036301,0.032327,0.034311,0.032287,0.03165348,0.026372,0.035872,0.028005,0.025173,0.031885
50%,0.03646,0.036653,0.036654,0.036393,0.036416,0.036244,0.036387,0.036175,0.036606,0.03622631,...,0.036301,0.038997,0.038495,0.032322,0.0349921,0.03208,0.03613,0.031438,0.032868,0.034126
75%,0.036925,0.036813,0.036847,0.036815,0.036599,0.037164,0.036759,0.036397,0.036717,0.03693915,...,0.036301,0.039499,0.041095,0.04189,0.04125127,0.041577,0.037924,0.03993,0.043974,0.035995
max,0.037187,0.037046,0.036995,0.037704,0.045744,0.037782,0.058398,0.037792,0.036884,0.04697094,...,0.187267,0.045025,0.041095,0.061372,0.1508572,1.0,0.080952,1.0,1.0,0.417722


In [82]:
test_probas.head(5)

Unnamed: 0,ps_calc_01_proba,ps_calc_02_proba,ps_calc_03_proba,ps_calc_04_proba,ps_calc_05_proba,ps_calc_06_proba,ps_calc_07_proba,ps_calc_08_proba,ps_calc_09_proba,ps_calc_10_proba,...,ps_ind_14_proba,ps_ind_15_proba,ps_reg_01_proba,ps_reg_02_proba,ps_reg_03_proba,ind_bin_proba,calc_bin_proba,car_cat_proba1,car_cat_proba2,ind_cat_proba
0,0.03545,0.036588,0.036343,0.036393,0.036416,0.036363,0.036004,0.037142,0.036717,0.037609,...,0.036301,0.030678,0.031729,0.032322,0.032943,0.041577,0.035872,0.028005,0.025173,0.035995
1,0.037041,0.036653,0.036847,0.036238,0.03609,0.037164,0.036759,0.036056,0.036717,0.036472,...,0.036301,0.03926,0.041095,0.038478,0.036075,0.031435,0.033579,0.031067,0.039267,0.033319
2,0.03646,0.036908,0.036343,0.036815,0.03609,0.036031,0.036759,0.037142,0.036606,0.035816,...,0.036301,0.0335,0.034311,0.032287,0.039321,0.034795,0.037491,0.028005,0.039386,0.031885
3,0.03545,0.036653,0.036654,0.036815,0.036416,0.036031,0.036004,0.036056,0.035772,0.035319,...,0.036301,0.038997,0.024206,0.03008,0.028393,0.026372,0.032171,0.028005,0.026772,0.031885
4,0.036925,0.036908,0.036995,0.036238,0.036617,0.036031,0.037201,0.036056,0.036884,0.035816,...,0.036301,0.038997,0.041095,0.03595,0.036989,0.031435,0.035872,0.028005,0.041156,0.031885


In [83]:
test_probas.describe()

Unnamed: 0,ps_calc_01_proba,ps_calc_02_proba,ps_calc_03_proba,ps_calc_04_proba,ps_calc_05_proba,ps_calc_06_proba,ps_calc_07_proba,ps_calc_08_proba,ps_calc_09_proba,ps_calc_10_proba,...,ps_ind_14_proba,ps_ind_15_proba,ps_reg_01_proba,ps_reg_02_proba,ps_reg_03_proba,ind_bin_proba,calc_bin_proba,car_cat_proba1,car_cat_proba2,ind_cat_proba
count,892816.0,892816.0,892816.0,892816.0,892816.0,892816.0,892816.0,892816.0,892816.0,892816.0,...,892816.0,892816.0,892816.0,892816.0,892816.0,892770.0,892816.0,892717.0,892561.0,892797.0
mean,0.036446,0.036464,0.036461,0.036451,0.036438,0.036452,0.036469,0.03634,0.036453,0.03643476,...,0.036406,0.03649,0.03645,0.036346,0.03637199,0.036444,0.036444,0.036418,0.036382,0.03646
std,0.000532,0.000523,0.000523,0.00036,0.000405,0.000555,0.000684,0.000438,0.000413,0.0007931254,...,0.001197,0.004152,0.005038,0.006771,0.006342216,0.011196,0.002051,0.013381,0.017878,0.009762
min,0.03545,0.03521,0.035256,0.035856,0.03609,0.011868,0.033531,0.000728,0.035683,2.873815e-07,...,0.036301,0.030678,0.024206,0.029951,1.870839e-07,0.0,0.01566,0.0,0.0,0.0
25%,0.036006,0.036205,0.036343,0.036238,0.036234,0.036031,0.036004,0.036056,0.036389,0.03595479,...,0.036301,0.032327,0.034311,0.032287,0.03165348,0.026372,0.035872,0.028005,0.025173,0.031885
50%,0.03646,0.036617,0.036654,0.036393,0.036416,0.036244,0.036387,0.036175,0.036606,0.03622631,...,0.036301,0.038997,0.038495,0.032322,0.0349921,0.03208,0.03613,0.031438,0.032868,0.034126
75%,0.036925,0.036813,0.036847,0.036815,0.036599,0.037164,0.036759,0.036397,0.036717,0.03693915,...,0.036301,0.039499,0.041095,0.04189,0.04125127,0.041577,0.037924,0.03993,0.043974,0.035995
max,0.037187,0.037046,0.036995,0.037704,0.045744,0.037782,0.058398,0.037792,0.036884,0.04697094,...,0.187267,0.045025,0.041095,0.061372,0.1508572,1.0,0.080952,1.0,1.0,0.417722


In [84]:
train_probas.to_csv('./data/train_probas.csv', index=False)
test_probas.to_csv('./data/test_probas.csv', index=False)