In order to construct Bayesian networks on our world happiness dataset, we first had to discretize all the continuous features. Different types of discretization techniques (supervised, manual and unsupervised) have been explored in the past, and the conclusions suggest that supervised algorithms such as Fayyad and Irani (1993) [http://refhub.elsevier.com/S1364-8152(17)31326-9/sref9] and Kononenko (1995) [http://refhub.elsevier.com/S1364-8152(17)31326-9/sref14], which are based on maximizing the predictability of the output variable, generally outperform unsupervised methods (which perform discretization on the basis of distributions). However, a downside of using supervised techniques is the resulting large CPTs (Conditional Probability Tables). Manual discretization, on the other hand, offers greater flexibility in selecting appropriate intervals that are also physically interpretable, but it might not offer the best predictive power. To address this tradeoff we used a supervised learning algorithm and then manually removed extra bins that were found to be unessential as per the literature, thus preventing a large CPT.

## Implementation of Fayyad and Irani's MDLP criterion discretization algorithm
https://github.com/navicto/Discretization-MDLPC

In [2]:
from __future__ import division

__author__ = 'Victor Ruiz, vmr11@pitt.edu'

import pandas as pd
import numpy as np
from math import log

def entropy_numpy(data_classes, base=2):
    '''
    Computes the entropy of a set of labels (class instantiations)

    Class frequencies are obtained with a single np.unique pass instead of re-scanning the labels once per
    class, so every label value that np.unique reports (NaN included) contributes a non-zero proportion.
    :param data_classes: array-like with the labels of the examples in a dataset
    :param base: logarithm base for computation
    :return: value of entropy (0 when there are no labels)
    '''
    N = len(data_classes)
    if N == 0:
        return 0  # no labels, no uncertainty

    _, counts = np.unique(data_classes, return_counts=True)

    ent = 0  # initialize entropy
    # tolist() yields plain Python ints so the result stays a built-in float
    for count in counts.tolist():
        proportion = count / N
        ent -= proportion * log(proportion, base)

    return ent

def cut_point_information_gain_numpy(X, y, cut_point):
    '''
    Returns the information gain obtained by splitting a numeric attribute in two according to cut_point
    :param X: array with the values of the numeric attribute
    :param y: array with the class of each example, indexed with the same boolean masks as X
    :param cut_point: threshold at which to partition the numeric attribute (values <= cut_point go left)
    :return: entropy of y minus the size-weighted entropies of the left and right sides of the split
    '''
    # boolean masks selecting each side of the split
    at_or_below = X <= cut_point
    above = X > cut_point

    total = len(X)
    n_below = at_or_below.sum()
    n_above = above.sum()

    # each side's entropy weighted by the fraction of examples that fall on it
    weighted_below = (n_below / total) * entropy_numpy(y[at_or_below])
    weighted_above = (n_above / total) * entropy_numpy(y[above])

    return entropy_numpy(y) - weighted_below - weighted_above

In [4]:
from __future__ import division
__author__ = 'Victor Ruiz, vmr11@pitt.edu'
import numpy as np
#from Entropy import entropy_numpy, cut_point_information_gain_numpy
from math import log
from sklearn.base import TransformerMixin
from sklearn import datasets
from sklearn.model_selection import train_test_split

def previous_item(a, val):
    '''
    Returns the element of a stored just before the first occurrence of val
    :param a: array to search
    :param val: value to locate in a
    :return: a[i - 1], where i is the first index at which a equals val; when i is 0 the index -1 selects
        the last element of a
    '''
    first_match = np.nonzero(a == val)[0][0]
    return a[first_match - 1]

class MDLP_Discretizer(TransformerMixin):
    def __init__(self, features=None, raw_data_shape=None):
        '''
        Initializes the discretizer object by resolving which columns are going to be discretized.
        No data is stored here; the data itself is supplied later through fit().
            self._bin_descriptions = dictionary that will hold the textual description of each feature's bins
            self._col_idx = indices of the columns to discretize
        :param features: None (every column is discretized), a sequence/array of integer column indices, or a
            boolean mask with one entry per column
        :param raw_data_shape: shape tuple of the data, (n_rows, n_columns); required when features is None or a
            boolean mask, because the number of columns cannot be derived otherwise
        :raises ValueError: if raw_data_shape is missing when required, if a boolean mask does not have one entry
            per column, or if features is neither an integer nor a boolean array
        :return:
        '''
        #Initialize descriptions of discretization bins
        self._bin_descriptions = {}

        #Create array with attr indices to discretize
        if features is None:  # Assume all columns are numeric and need to be discretized
            if raw_data_shape is None:
                raise ValueError("If feautes=None, raw_data_shape must be a non-empty tuple")
            self._col_idx = range(raw_data_shape[1])
            return

        if not isinstance(features, np.ndarray):
            features = np.array(features)

        if np.issubdtype(features.dtype, np.integer):
            self._col_idx = features
        elif np.issubdtype(features.dtype, np.bool_):  # features passed as mask
            if raw_data_shape is None:
                raise ValueError('If features is a boolean array, raw_data_shape must be != None')
            # the data has not been seen yet at construction time, so the mask is validated against the
            # declared shape
            if len(features) != raw_data_shape[1]:
                raise ValueError('Column boolean mask must be of dimensions (NColumns,)')
            # flat array of the selected column positions (np.where alone would return a tuple)
            self._col_idx = np.flatnonzero(features)
        else:
            raise ValueError('features argument must a np.array of column indices or a boolean mask')

    def fit(self, X, y):
        '''
        Learns the cut points of every feature selected for discretization.
        Stores the data and class labels, pre-computes the candidate boundary points, runs the recursive MDLP
        partitioning for each feature and generates the textual descriptions of the resulting bins.
        :param X: 2-D array-like with the data to discretize (rows are examples, columns are features)
        :param y: array-like with the class label of each row of X
        :raises ValueError: if y is None
        :return: self
        '''
        if y is None:
            raise ValueError('y (class labels) is required to fit the MDLP discretizer')

        # np.asarray leaves ndarrays untouched and lets DataFrames / Series / lists be used as well
        self._data_raw = np.asarray(X)  # original input data
        self._class_labels = np.asarray(y).reshape(-1, 1)  # make sure class labels is a column vector
        self._classes = np.unique(self._class_labels)

        # columns that will not be discretized (empty when every column is discretized)
        self._ignore_col_idx = np.array(
            [var for var in range(self._data_raw.shape[1]) if var not in self._col_idx], dtype=int)

        # initialize feature bins cut points
        self._cuts = {f: [] for f in self._col_idx}

        # pre-compute all boundary points in dataset
        self._boundaries = self.compute_boundary_points_all_features()

        # get cuts for all features
        self.all_features_accepted_cutpoints()

        #generate bin string descriptions
        self.generate_bin_descriptions()

        return self

    def transform(self, X, inplace=False):
        '''
        Replaces the values of the discretized columns of X with their bin codes
        :param X: data to discretize; it is handed to apply_cutpoints
        :param inplace: if True X itself is passed on (and therefore overwritten), otherwise X.copy() is used
        :return: whatever apply_cutpoints returns for the chosen array
        '''
        target = X if inplace else X.copy()
        return self.apply_cutpoints(target)
    def fit_transform(self, X, y=None, **fit_params):
        '''
        Fits the discretizer on (X, y) and returns the discretized version of X.
        X is transformed on a copy: transforming it in place would overwrite the caller's raw data, which is
        also the array that fit() keeps as the reference data.
        :param X: data to fit on and to discretize
        :param y: class label of each row of X; mandatory because the MDLP criterion is supervised
        :param fit_params: accepted for signature compatibility; not used
        :raises ValueError: if y is None
        :return: discretized copy of X
        '''
        if y is None:
            raise ValueError('y (class labels) is required: MDLP discretization is supervised')
        self.fit(X, y)
        return self.transform(X, inplace=False)

    def MDLPC_criterion(self, X, y, feature_idx, cut_point):
        '''
        Determines whether a partition is accepted according to the MDLPC criterion (Fayyad and Irani, 1993):
            Gain(A, T; S) > (log2(N - 1) + delta) / N
            delta = log2(3^k - 2) - [k * Ent(S) - k1 * Ent(S1) - k2 * Ent(S2)]
        where S1 / S2 are the examples with value <= / > cut_point and k, k1, k2 the number of distinct classes
        in S, S1 and S2.
        :param X: values of the feature in the current partition
        :param y: class of each example in the current partition
        :param feature_idx: index of the feature of interest (not used by the computation)
        :param cut_point: proposed cut_point
        :return: True/False, whether to accept the partition
        '''
        #split the partition by cut_point
        left_mask = X <= cut_point
        right_mask = X > cut_point
        y_left = y[left_mask]
        y_right = y[right_mask]

        N = len(X)  # number of examples in current partition

        # the three entropies feed both the gain and the delta term, so they are computed only once
        partition_entropy = entropy_numpy(y)
        entropy_left = entropy_numpy(y_left)
        entropy_right = entropy_numpy(y_right)

        #information gain obtained when splitting data at cut_point
        cut_point_gain = partition_entropy - (left_mask.sum() / N) * entropy_left - \
            (right_mask.sum() / N) * entropy_right

        #compute delta term in MDLPC criterion
        k = len(np.unique(y))
        k_left = len(np.unique(y_left))
        k_right = len(np.unique(y_right))
        # 3 ** k is evaluated with Python integers, so it stays exact for any number of classes
        delta = log(3 ** k - 2, 2) - (k * partition_entropy) + (k_left * entropy_left) + (k_right * entropy_right)

        #to split or not to split
        gain_threshold = (log(N - 1, 2) + delta) / N

        return bool(cut_point_gain > gain_threshold)

    def feature_boundary_points(self, values):
        '''
        Given the values of an attribute, find all potential cut_points (boundary points).
        A boundary point is the midpoint between two consecutive distinct values whose examples, taken
        together, do not all share a single class.
        :param values: 1-D array with the attribute value of every example (NaN marks missing values); it is
            paired row by row with self._class_labels
        :return: array with potential cut_points
        '''
        missing_mask = np.isnan(values)
        data_partition = np.concatenate([values[:, np.newaxis], self._class_labels], axis=1)
        data_partition = data_partition[~missing_mask]
        #sort data by values
        data_partition = data_partition[data_partition[:, 0].argsort()]
        sorted_values = data_partition[:, 0]
        sorted_classes = data_partition[:, 1]

        #Unique values in column (each could be a bin boundary) and the row at which each one starts
        unique_vals, starts = np.unique(sorted_values, return_index=True)
        ends = np.append(starts[1:], sorted_values.size)  # one past the last row of each value

        boundaries = []
        for i in range(1, unique_vals.size):  # By definition first unique value cannot be a boundary
            # rows are sorted by value, so the examples of two consecutive values form one contiguous slice
            merged_classes = np.unique(sorted_classes[starts[i - 1]:ends[i]])
            if merged_classes.size > 1:
                boundaries.append((unique_vals[i] + unique_vals[i - 1]) / 2)
        return np.array(boundaries)

    def compute_boundary_points_all_features(self):
        '''
        Computes all possible boundary points for each column in self._col_idx (features to discretize)
        :return: array with one column per column of the raw data; the column of a discretized feature holds
            its boundary points padded with NaN, every other column is all NaN. Rows that are NaN in every
            column are dropped
        '''
        def padded_cutpoints_array(arr, N):
            # pad with NaN so that every feature yields a column of the same length
            cutpoints = self.feature_boundary_points(arr)
            padding = np.full(N - len(cutpoints), np.nan)
            return np.concatenate([cutpoints, padding])

        # start from NaN (not uninitialized memory) so that columns which are not discretized cannot affect
        # the all-NaN row filter below
        boundaries = np.full(self._data_raw.shape, np.nan)
        boundaries[:, self._col_idx] = np.apply_along_axis(
            padded_cutpoints_array, 0, self._data_raw[:, self._col_idx], self._data_raw.shape[0])
        mask = np.all(np.isnan(boundaries), axis=1)
        return boundaries[~mask]

    def boundaries_in_partition(self, X, feature_idx):
        '''
        From the pre-computed boundary points of a feature, keep those lying strictly between the smallest
        and the largest value of the given partition
        :param X: values of the feature in the data partition
        :param feature_idx: column index of the feature in self._boundaries
        :return: sorted unique boundary points strictly inside (X.min(), X.max())
        '''
        lowest = X.min()
        highest = X.max()
        feature_boundaries = self._boundaries[:, feature_idx]
        inside = (feature_boundaries > lowest) & (feature_boundaries < highest)
        return np.unique(feature_boundaries[inside])

    def best_cut_point(self, X, y, feature_idx):
        '''
        Selects the best cut point for a feature in a data partition based on information gain
        :param X: values of the feature in the data partition
        :param y: class of each example in the data partition
        :param feature_idx: column index of the feature
        :return: value of cut point with highest information gain (if many, picks first). None if no candidates
        '''
        candidates = self.boundaries_in_partition(X, feature_idx=feature_idx)
        if candidates.size == 0:
            return None

        scored = [(cut, cut_point_information_gain_numpy(X, y, cut_point=cut)) for cut in candidates]
        # max() keeps the first of several equally good candidates
        best_cut, _ = max(scored, key=lambda pair: pair[1])
        return best_cut

    def single_feature_accepted_cutpoints(self, X, y, feature_idx):
        '''
        Computes the cuts for binning a feature according to the MDLP criterion.
        The best candidate cut of the partition is evaluated; when it is accepted it is stored in
        self._cuts[feature_idx] and both resulting sub-partitions are examined recursively.
        :param X: values of the feature in the partition under examination
        :param y: class of each example in the partition
        :param feature_idx: index of the feature, used as key in self._cuts
        :return: None; accepted cuts are accumulated (in ascending order) in self._cuts[feature_idx]
        '''
        #Delete missing data
        mask = np.isnan(X)
        X = X[~mask]
        y = y[~mask]

        #stop if constant or null feature values
        if len(np.unique(X)) < 2:
            return

        #determine whether to cut and where
        cut_candidate = self.best_cut_point(X, y, feature_idx)
        if cut_candidate is None:
            return
        if not self.MDLPC_criterion(X, y, feature_idx, cut_candidate):
            return  # if partition wasn't accepted, there's nothing else to do

        #now we have two new partitions that need to be examined
        left_mask = X <= cut_candidate
        right_mask = X > cut_candidate
        left_partition = X[left_mask]
        right_partition = X[right_mask]
        if (left_partition.size == 0) or (right_partition.size == 0):
            return  # extreme point selected, don't partition

        self._cuts[feature_idx].append(cut_candidate)  # accept partition
        self.single_feature_accepted_cutpoints(left_partition, y[left_mask], feature_idx)
        self.single_feature_accepted_cutpoints(right_partition, y[right_mask], feature_idx)
        #order cutpoints in ascending order
        self._cuts[feature_idx] = sorted(self._cuts[feature_idx])

    def all_features_accepted_cutpoints(self):
        '''
        Runs the cut point search for every column listed in self._col_idx
        :return: None; the search stores its results in self._cuts
        '''
        for feature_idx in self._col_idx:
            column = self._data_raw[:, feature_idx]
            self.single_feature_accepted_cutpoints(X=column, y=self._class_labels, feature_idx=feature_idx)

    def generate_bin_descriptions(self):
        '''
        Builds a textual description of the bins of every feature in self._col_idx from self._cuts.
        self._bin_descriptions[feature] maps each bin code to its label: '<low>_to_<high>' for a feature
        that was cut, or the single label 'All' (bin 0) for a feature with no accepted cut points. Every
        feature gets an entry, so a description from a previous fit is always replaced.
        :return: None; results are stored in self._bin_descriptions
        '''
        for attr in self._col_idx:
            feature_cuts = list(self._cuts[attr])
            if len(feature_cuts) == 0:
                bin_labels = ['All']
            else:
                edges = [-np.inf] + feature_cuts + [np.inf]
                bin_labels = ['%s_to_%s' % (str(low), str(high)) for low, high in zip(edges[:-1], edges[1:])]
            self._bin_descriptions[attr] = dict(enumerate(bin_labels))


    def apply_cutpoints(self, data):
        '''
        Discretizes data in place by replacing the values of every column in self._col_idx with bin codes.
        With cut points c_0 < ... < c_(m-1), a value x gets code i when c_(i-1) < x <= c_i, code 0 when
        x <= c_0 and code m when x > c_(m-1). A value equal to a cut point therefore lands in the lower bin,
        matching the "<= cut point goes left" rule used while the cuts are searched. A feature without cut
        points gets code 0 everywhere. Missing values (NaN) stay NaN in every case.
        :param data: 2-D array whose columns self._col_idx are overwritten with the bin codes
        :return: data, the same array that was passed in
        '''
        for attr in self._col_idx:
            column = data[:, attr]
            missing = np.isnan(column)  # remember missing entries before the column is overwritten
            cuts = np.asarray(self._cuts[attr], dtype=float)
            # side='left' returns the number of cut points strictly smaller than each value
            discretized_col = np.searchsorted(cuts, column, side='left').astype('float')
            discretized_col[missing] = np.nan
            data[:, attr] = discretized_col
        return data

In [8]:
# Load the continuous data, then separate the 'Life Ladder' column from the remaining columns
dataset = pd.read_csv('BN_continous_data.csv')
dataset_x = dataset.drop(columns=['Life Ladder'])
dataset_y = dataset['Life Ladder']
dataset_x.head()

Unnamed: 0,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,"gini of household income reported in Gallup, by wp5-year"
0,7.471376,0.559072,53.0,0.522566,0.053188,0.793246,0.564953,0.348332,0.32499,-1.855448,-1.394648,0.418629
1,7.472197,0.49088,52.799999,0.427011,-0.110382,0.954393,0.496349,0.371326,0.261179,-1.896539,-1.440218,0.286599
2,7.458603,0.507516,52.599998,0.373536,-0.082319,0.927606,0.424125,0.404904,0.364666,-1.870725,-1.438761,0.290681
3,7.458469,0.419973,52.400002,0.393656,-0.096549,0.923849,0.351387,0.502474,0.341482,-1.874237,-1.424542,0.37493
4,9.337564,0.638411,68.099998,0.729819,-0.017473,0.901071,0.675244,0.321706,0.40091,0.257297,-0.13296,0.41654


In [19]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
#from MDLP import MDLP_Discretizer

def main(input_csv='BN_continous_data.csv', output_csv='BN_data_dis.csv', target='Life Ladder',
         test_size=0.33, random_state=None):
    '''
    Use-case example: discretizes every feature of a CSV dataset with MDLP_Discretizer.
    The cut points are learned on a training split only and then applied to both the training and the test
    split; both discretized splits (each with the target column appended) are printed, concatenated and
    written to output_csv.
    :param input_csv: path of the CSV file with the continuous data
    :param output_csv: path of the CSV file where the discretized data is written
    :param target: name of the column used as class label; all the other columns are discretized
    :param test_size: proportion of rows held out of the fit, passed to train_test_split
    :param random_state: seed passed to train_test_split; None keeps the split random on every run
    :return:
    '''
    #read dataset
    dataset = pd.read_csv(input_csv)
    # NOTE(review): the printed output suggests the target is continuous, while MDLP treats every
    # distinct target value as a class - confirm this is intended
    labels = dataset[target].to_numpy()
    features = dataset.drop([target], axis=1).to_numpy()
    numeric_features = np.arange(features.shape[1])  # all features in this dataset are numeric. These will be discretized

    #Split between training and test
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state)

    #Initialize discretizer object and fit to training data
    discretizer = MDLP_Discretizer(features=numeric_features)
    discretizer.fit(X_train, y_train)

    #apply the same discretization to the training and test sets
    discretized_frames = []
    for split, split_labels in ((X_train, y_train), (X_test, y_test)):
        frame = pd.DataFrame(discretizer.transform(split))
        frame[target] = split_labels
        print(frame.head())
        discretized_frames.append(frame)

    #store training rows followed by test rows in a single file
    df_row_reindex = pd.concat(discretized_frames, ignore_index=True)
    df_row_reindex.to_csv(output_csv, index=False)


if __name__ == '__main__':
    main()

      0     1     2     3     4      5     6     7     8     9    10    11  \
0  41.0  56.0  42.0  71.0  99.0   97.0  68.0   1.0  23.0  48.0  43.0  50.0   
1  84.0  97.0  74.0  47.0  29.0  101.0  69.0  45.0   8.0  89.0  81.0   2.0   
2  18.0   8.0   1.0  46.0  62.0   43.0  51.0  93.0  90.0  28.0  33.0  88.0   
3  83.0  94.0  78.0  99.0  63.0   68.0  23.0  60.0   9.0  93.0  87.0  22.0   
4  21.0   8.0   3.0  26.0  58.0   52.0  44.0  96.0  64.0  31.0  38.0  81.0   

   Life Ladder  
0     6.425144  
1     6.243429  
2     4.542546  
3     6.166838  
4     5.268375  
      0     1     2     3     4     5     6     7      8     9    10    11  \
0  22.0  38.0  20.0  12.0  40.0  40.0  47.0  49.0   23.0  23.0  22.0  64.0   
1   9.0  14.0  29.0  96.0  72.0   0.0  70.0  60.0  106.0  28.0  75.0  99.0   
2  48.0  15.0  71.0  64.0  69.0  92.0  55.0  76.0   45.0  69.0  57.0  63.0   
3  39.0  35.0  58.0  31.0  21.0  37.0  28.0  86.0   56.0  30.0  72.0  27.0   
4  24.0  45.0  42.0   9.0  29.0  62.0  

In [16]:
# NOTE: X_train_discretized and y_train are assigned inside main() and are not module-level names, so
# running this cell on its own raises NameError (see the recorded output of this cell).
pd_x_train_dis= pd.DataFrame(X_train_discretized)
pd_x_train_dis['Life Ladder'] = y_train 
print(pd_x_train_dis.head())

NameError: name 'X_train_discretized' is not defined

In [None]:
# NOTE: X_test_discretized and y_test are assigned inside main() and are not module-level names, so this
# cell cannot run on its own; main() already prints the same preview.
pd_x_test_dis= pd.DataFrame(X_test_discretized)
pd_x_test_dis['Life Ladder'] = y_test
print(pd_x_test_dis.head())