# Setting up a class for converting features into joint probability space

<b>Groupings:</b>
- ind_bin (no missing values)
- calc_bin (no missing values)
- car_cat (add 1 to all values to create a category for missing values, i.e. cat=0)
  1. ['ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat',
      'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat']
  2. The rest
- ind_cat
<br><br>

<b>Processing:</b>
1. Create a df with two columns:
  1. id from the original data
  2. tuple of features above (joint_vector)
2. Create a crosstab (i.e. contingency table) of the joint_vector with target vector
  - Note: Index for this will be the tuples
3. Create two new columns on the crosstab:
  1. Total count for each row/tuple
  2. Cond. Proba. of being in class1 given the tuple (divide class1 count by total)
4. Merge the 1st df with the crosstab['proba'] on the tuples.
  - Note1: Retain all the df tuples
  - Note2: In the test set, there might be some tuples not seen in train set.
    In that case, check for NaN, replace them with zeros.
5. Create a new df with just id and the proba column
6. Combine the different groups of proba-converted features.
<br><br>

<b>Optimization</b><br>
Some features probably won't yield any higher joint probability even when they are combined. One could potentially improve the discriminatory power of the joint distribution by omitting some features from each grouping.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline

from collections import namedtuple
from importlib import reload
from matplotlib import cm
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.metrics import (make_scorer, roc_auc_score, 
                             classification_report, 
                             precision_recall_curve,
                             roc_curve)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
#from xgboost import XGBClassifier

from feature_processing import (create_contingency_table,
                                calculate_conditional_prob_bin,
                                encode_my_categorical_labels,
                                calculate_conditional_prob_cat,
                                estimate_cond_prob_density,
                                bin_myFeature)
import feature_analysis as fa
import porto_seguro as ps

import warnings
warnings.simplefilter("ignore", category=PendingDeprecationWarning)
warnings.simplefilter("ignore", category=DeprecationWarning)

  from pandas.core import datetools


In [2]:
def my_gini(y_true, y_probas):
    '''
    Gini coefficient derived from the ROC AUC (gini = 2*AUC - 1).

    y_true : (n_samples,)
        true binary class labels
    y_probas : (n_samples, 2) or (n_samples,)
        either the two-column class-probability matrix (the second
        column is taken as the positive class) or the positive-class
        probabilities alone
    '''
    y_probas = np.asarray(y_probas)
    # A scorer built with needs_proba=True may pass only the
    # positive-class column for binary targets, so accept both layouts
    # instead of indexing a second axis that might not exist.
    positive_proba = y_probas[:, 1] if y_probas.ndim == 2 else y_probas
    auc = roc_auc_score(y_true, positive_proba)
    gini = 2*auc - 1
    return gini

gini_scorer = make_scorer(my_gini, needs_proba=True, greater_is_better=True)

In [3]:
# Load the training data; the first row of the file holds the column names.
train = pd.read_csv('train.csv', header=0)

# Everything after the first two columns is treated as a feature.
all_fs = train.columns[2:]

# Partition the feature names into three sorted groups by naming pattern.
binary_fs = sorted(name for name in all_fs if '_bin' in name)
categorical_fs = sorted(name for name in all_fs if '_cat' in name)
other_fs = sorted(
    name for name in all_fs
    if not ('_bin' in name or '_cat' in name)
)

# Matching data slices; the categorical group goes through the project helper.
binaries = train[binary_fs]
categoricals = ps.fuseCategoricalFeatures(train[categorical_fs])
cont_ordinals = train[other_fs]
target = train['target']

# Display the shape of each piece as a sanity check.
tuple(frame.shape for frame in (train, binaries, categoricals, cont_ordinals))

((595212, 59), (595212, 17), (595212, 14), (595212, 26))

In [4]:
# Display every column name of the training frame (id, target, then features).
train.columns

Index(['id', 'target', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03',
       'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin',
       'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin',
       'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01',
       'ps_reg_02', 'ps_reg_03', 'ps_car_01_cat', 'ps_car_02_cat',
       'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
       'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
       'ps_car_11_cat', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14',
       'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04',
       'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
       'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
       'ps_calc_19_bin', 'ps_calc_20_bin'],


In [5]:
# Fraction of samples whose target is 1 (positive count / sample count).
num_samples = target.shape[0]
num_target = target.sum()
freq_target = num_target / num_samples
freq_target

0.036447517859182946

In [None]:
class JointProbability:
    def __init__(self):
        self.contingency = None
    
    def fit(self, id_, feature, target, label=None):
        '''
        id_ : (n_samples,)
            column vector containing ids for each sample
        feature: (n_samples, n_features)
            features to be joined
        target : (n_sampes, )
            column vector containing class label of each sample
        '''
        if label is None:
            label = 'cond_proba'
        else:
            label = label
            
        df1 = pd.DataFrame()
        df1['id'] = id_
        df1['combined_feature'] = feature.apply(tuple, axis=1)
        
        if self.contingency is None:
            contingency = pd.crosstab(df1['combined_feature'], target)
            contingency[label] = contingency[1]/(contingency[0] + contingency[1])
            self.contingency = contingency.reset_index()
        
        df2 = pd.merge(df1, contingency[['combined_feature', label]], 
                   left_on='combined_feature', right_on='combined_feature')
    
        return df2[['id', label]]
    
    def transform(self, id_, feature, label=None):
        pass

In [28]:
def calculate_joint_proba(id_, feature, target, label=False):
    '''
    id_ : (n_samples,)
        column vector containing ids for each sample
    feature: (n_samples, n_features)
        features to be joined
    target : (n_sampes, )
        column vector containing class label of each sample
    '''
    if label:
        label = label
    else:
        label = 'cond_proba'
    df1 = pd.DataFrame()
    df1['id'] = id_
    df1['feature_combo'] = feature.apply(tuple, axis=1)
    
    contingency = pd.crosstab(df1['feature_combo'], target)
    contingency['total'] = contingency[0] + contingency[1]
    contingency[label] = contingency[1]/contingency['total']
    
    contingency = contingency.reset_index()
    
    df2 = pd.merge(df1, contingency[['feature_combo', label]], 
                   left_on='feature_combo', right_on='feature_combo')
    
    return df2[['id', label]]

In [26]:
# Keep only the binary features of the "ind" group, in their original order.
ind_bins_fs = [name for name in all_fs
               if '_ind' in name and '_bin' in name]
ind_bins = train[ind_bins_fs]
ind_bins.columns

Index(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
       'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin'],
      dtype='object')

In [29]:
%%time
# Time the joint-probability encoding of the ps_ind_*_bin group on the training set.
calculate_joint_proba(train['id'], ind_bins, target, label='ind_bin_proba')

CPU times: user 14.7 s, sys: 300 ms, total: 15 s
Wall time: 15.1 s


Unnamed: 0,id,feature_combo,ind_bin_proba
0,7,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)",0.072793
1,125,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)",0.072793
2,138,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)",0.072793
3,235,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)",0.072793
4,286,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)",0.072793
5,305,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)",0.072793
6,316,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)",0.072793
7,344,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)",0.072793
8,531,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)",0.072793
9,573,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)",0.072793
