In [211]:
import numpy as np
import pandas as pd


class DataHandler:
    """Split a dataset by variable type and convert it to/from dummy coding.

    Dummy columns follow the ``<variable>_<level>`` naming convention:

    * categorical: ``sales_IT``; merged levels become ``sales_IT <OR> hr``
    * continuous:  ``satisfaction_level_(0.089, 0.11]``
    * integer (merged bins only): ``number_project_2 <OR> 3``
    """

    def __init__(self, data, var_dict):
        """Slice ``data`` into typed sub-frames.

        Parameters
        ----------
        data : pandas.DataFrame
            Source data containing every column named in ``var_dict``.
        var_dict : dict
            Must provide the keys ``'categorical_vars'``, ``'integer_vars'``
            and ``'continuous_vars'`` (lists of column names) and
            ``'class_var'`` (a single column name).
        """
        self.categorical_vars = data[var_dict['categorical_vars']].astype(str)
        self.integer_vars = data[var_dict['integer_vars']].astype(np.int32)
        self.continuous_vars = data[var_dict['continuous_vars']].astype(np.float32)
        self.class_var = data[var_dict['class_var']]
        self.input_vars = (var_dict['categorical_vars']
                           + var_dict['continuous_vars']
                           + var_dict['integer_vars'])
        self.n_variables = len(self.input_vars)

    def get_dummy_coded_data(self, init_discretize_method='equal_freq',
                             n_init_bins=20, bins_by_variable=None):
        """Return the input variables as one dummy-coded DataFrame.

        Parameters
        ----------
        init_discretize_method : str
            Used only when ``bins_by_variable`` is empty. One of
            ``'equal_width'`` (``pd.cut``), ``'equal_freq'`` (``pd.qcut``),
            ``'scale_numeric'`` (standardize, no binning) or ``'dummy_only'``
            (leave continuous columns untouched).
        n_init_bins : int
            Number of initial bins for the two discretizing methods.
        bins_by_variable : dict, optional
            Per-variable bin definitions. Continuous entries need a
            ``'split_point'`` list; integer entries need ``'bins'`` as a list
            of value lists; any other entry is treated as categorical and
            needs ``'bins'`` as a list of ``' <OR> '``-joined level names.

        Returns
        -------
        pandas.DataFrame
            Categorical, integer and continuous parts concatenated column-wise.

        Raises
        ------
        NotImplementedError
            If ``init_discretize_method`` is not one of the supported names.

        Notes
        -----
        Without ``bins_by_variable`` the integer columns are returned as raw
        integers (they are not dummy-coded in that path).
        """
        continuous_vars = self.continuous_vars.copy()
        integer_vars = self.integer_vars.copy()
        categorical_vars = pd.get_dummies(self.categorical_vars.copy())

        if not bins_by_variable:

            if init_discretize_method == 'equal_width':
                for var in self.continuous_vars.columns:
                    continuous_vars[var] = pd.cut(continuous_vars[var], bins=n_init_bins)
            elif init_discretize_method == 'equal_freq':
                for var in self.continuous_vars.columns:
                    continuous_vars[var] = pd.qcut(continuous_vars[var], q=n_init_bins)
            elif init_discretize_method == 'scale_numeric':
                mean, std = continuous_vars.mean(), continuous_vars.std()
                continuous_vars = (continuous_vars - mean) / std
            elif init_discretize_method == 'dummy_only':
                pass
            else:
                raise NotImplementedError

        else:
            for var in bins_by_variable:

                if var in self.continuous_vars.columns:
                    bins = bins_by_variable[var]['split_point']
                    continuous_vars[var] = pd.cut(continuous_vars[var], bins=bins)

                elif var in self.integer_vars.columns:
                    for merged_bin in bins_by_variable[var]['bins']:
                        # The column name must be a plain string carrying the
                        # variable prefix so get_bins_by_variable_from_data can
                        # parse it back ("<var>_<v1> <OR> <v2>").
                        bin_name = '{}_{}'.format(
                            var, ' <OR> '.join(str(x) for x in merged_bin))
                        integer_vars[bin_name] = integer_vars[var].isin(merged_bin)
                    # The raw integer column is replaced by its bin indicators.
                    integer_vars = integer_vars.drop(var, axis=1)

                else:
                    for merged_bin in bins_by_variable[var]['bins']:
                        cols = ['{}_{}'.format(var, x) for x in merged_bin.split(' <OR> ')]
                        if len(cols) >= 2:
                            # Prefix with the variable name so that equal level
                            # names of different variables cannot collide.
                            merged_name = '{}_{}'.format(var, merged_bin)
                            categorical_vars[merged_name] = categorical_vars[cols].sum(axis=1)
                            categorical_vars = categorical_vars.drop(cols, axis=1)

        continuous_vars = pd.get_dummies(continuous_vars)

        return pd.concat([categorical_vars, integer_vars, continuous_vars], axis=1)

    def get_bins_by_variable_from_data(self, dummy_coded_data):
        """Recover bin definitions from the column names of a dummy-coded frame.

        Parameters
        ----------
        dummy_coded_data : pandas.DataFrame
            Frame whose columns follow the ``<variable>_<level>`` convention.

        Returns
        -------
        dict
            ``{var: {'bins': [...], 'split_point': [...]}}`` for every
            continuous variable, and ``{var: {'bins': [[int, ...], ...]}}``
            for every integer variable that has at least one dummy column.
            Integer variables still present as raw columns are skipped.
        """

        def get_variable_name(dummy_variable_name):
            # Everything before the last underscore; '' if there is none.
            return dummy_variable_name.rpartition('_')[0]

        def get_interval_and_split_points(dummy_variable_name):
            # Column suffix looks like "(0.089, 0.11]".
            interval = dummy_variable_name.split('_')[-1]
            left, right = interval.split(', ')[:2]
            begin = float(left.replace('(', ''))
            end = float(right.replace(']', ''))
            return interval, begin, end

        bins_by_variable = dict()

        for var in self.continuous_vars.columns:

            bins = []
            split_points = set()

            dummy_vars = [x for x in dummy_coded_data.columns
                          if var == get_variable_name(x)]

            for dummy_var in dummy_vars:
                interval, begin, end = get_interval_and_split_points(dummy_var)
                bins.append(interval)
                split_points.update([begin, end])

            bins_by_variable[var] = dict(bins=bins, split_point=sorted(split_points))

        for var in self.integer_vars.columns:

            dummy_vars = [x for x in dummy_coded_data.columns
                          if var == get_variable_name(x)]

            # Each dummy column "<var>_<v1> <OR> <v2>" is one bin of integers.
            bins = [[int(x) for x in dummy_var[len(var) + 1:].split(' <OR> ')]
                    for dummy_var in dummy_vars]

            # A raw (un-binned) integer column yields no dummy columns; leave
            # it out so feeding the result back does not drop the variable.
            if bins:
                bins_by_variable[var] = dict(bins=bins)

        return bins_by_variable


In [212]:
from embed_bins import BinEmbedder

In [213]:
import pandas as pd
%matplotlib inline

In [214]:
# Column names of the input data, grouped by how each variable is encoded,
# plus the name of the target (class) column.
var_dict = {
    'categorical_vars': ['Work_accident', 'promotion_last_5years', 'sales', 'salary'],
    'integer_vars': ['number_project', 'time_spend_company'],
    'continuous_vars': ['satisfaction_level', 'last_evaluation', 'average_montly_hours'],
    'class_var': 'left',
}

In [215]:
# Read the CSV at data/HR_comma_sep.csv and wrap it in a DataHandler that
# splits the columns according to the groups listed in var_dict.
data_handler = DataHandler(pd.read_csv('data/HR_comma_sep.csv'), var_dict)

In [216]:
# Dummy-code the data with the method's defaults
# (init_discretize_method='equal_freq', n_init_bins=20).
dummy_coded = data_handler.get_dummy_coded_data()

In [217]:
# Inspect the names of the generated columns.
dummy_coded.columns

Index(['Work_accident_0', 'Work_accident_1', 'promotion_last_5years_0',
       'promotion_last_5years_1', 'sales_IT', 'sales_RandD',
       'sales_accounting', 'sales_hr', 'sales_management', 'sales_marketing',
       'sales_product_mng', 'sales_sales', 'sales_support', 'sales_technical',
       'salary_high', 'salary_low', 'salary_medium', 'number_project',
       'time_spend_company', 'satisfaction_level_(0.089, 0.11]',
       'satisfaction_level_(0.11, 0.21]', 'satisfaction_level_(0.21, 0.36]',
       'satisfaction_level_(0.36, 0.4]', 'satisfaction_level_(0.4, 0.44]',
       'satisfaction_level_(0.44, 0.49]', 'satisfaction_level_(0.49, 0.53]',
       'satisfaction_level_(0.53, 0.57]', 'satisfaction_level_(0.57, 0.61]',
       'satisfaction_level_(0.61, 0.64]', 'satisfaction_level_(0.64, 0.68]',
       'satisfaction_level_(0.68, 0.72]', 'satisfaction_level_(0.72, 0.75]',
       'satisfaction_level_(0.75, 0.78]', 'satisfaction_level_(0.78, 0.82]',
       'satisfaction_level_(0.82, 0.8

In [218]:
# Instantiate the embedder (imported from embed_bins) with its default arguments.
bin_embedder = BinEmbedder()

In [219]:
# Learn bin embeddings from the dummy-coded frame for a single epoch.
# NOTE(review): the saved output of this cell is an IndexError, and the cells
# after it carry no execution count — confirm this step runs before relying
# on anything downstream.
bin_embedder.learn_bin_embeddings(dummy_coded, data_handler.n_variables, n_epoch=1)

IndexError: index 716568 is out of bounds for axis 1 with size 629958

In [None]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


class BinMerger:
    """Merge the bins of each variable by clustering their embeddings.

    ``embedding_by_column`` maps dummy-column names of the form
    ``<variable>_<level>`` to embedding vectors. For every variable the
    embeddings of its bins are clustered, the number of clusters being chosen
    by the best silhouette score, and bins in the same cluster are merged.
    """

    def __init__(self, embedding_by_column, clustering_method='agglomerative'):
        """Store the embeddings and validate the clustering method.

        Parameters
        ----------
        embedding_by_column : dict
            ``{dummy_column_name: embedding_vector}``.
        clustering_method : str
            ``'kmeans'`` or ``'agglomerative'``.

        Raises
        ------
        ValueError
            If ``clustering_method`` is not one of the two supported names.
        """
        self.embedding_by_column = embedding_by_column
        if clustering_method in ['kmeans', 'agglomerative']:
            self.clustering_method = clustering_method
        else:
            raise ValueError('Available method = [kmeans, agglomerative]')

    def _get_cols_and_embeddings(self, variable):
        """Return the dummy columns of ``variable`` and their embeddings.

        Columns are matched on the ``<variable>_`` prefix rather than by
        substring, so a variable whose name merely occurs inside another
        column name does not pick up unrelated columns.
        """
        prefix = variable + '_'
        cols = []
        embeddings = []
        for col, embedding in self.embedding_by_column.items():
            if col.startswith(prefix):
                cols.append(col)
                embeddings.append(embedding)
        return cols, embeddings

    def _fit_predict(self, n_cluster, embeddings):
        """Cluster ``embeddings`` into ``n_cluster`` groups with the configured method."""
        if self.clustering_method == 'kmeans':
            return KMeans(n_cluster).fit_predict(embeddings)
        return AgglomerativeClustering(n_cluster).fit_predict(embeddings)

    def _cluster_embeddings(self, embeddings):
        """Cluster embeddings, choosing the cluster count by silhouette score.

        Candidate counts are ``2 .. len(embeddings) - 1``. With fewer than
        three embeddings there is no valid candidate, so every bin is kept as
        its own cluster (i.e. nothing is merged).

        Raises
        ------
        ValueError
            If ``embeddings`` is empty.
        """
        n_embeddings = len(embeddings)
        if n_embeddings == 0:
            raise ValueError('No embeddings to cluster')
        if n_embeddings < 3:
            return np.arange(n_embeddings)

        # Determine the optimal number of clusters.
        scores = [
            silhouette_score(embeddings, self._fit_predict(n_cluster, embeddings))
            for n_cluster in range(2, n_embeddings)
        ]

        # Cluster again with the optimal number; scores[0] belongs to n=2.
        best_n = np.argmax(scores) + 2
        return self._fit_predict(best_n, embeddings)

    @staticmethod
    def _interval_lower_bound(col):
        """Parse the lower bound out of a column name ending in ``(a, b]``."""
        return float(col.rsplit('(', 1)[1].split(',')[0])

    def _get_cols_by_cluster(self, cols, cluster_label, v_type):
        """Group columns into mergeable sets.

        For ``'continuous'`` variables only *adjacent* intervals sharing a
        label are grouped, so the result is keyed by a running counter. For
        ``'integer'`` and ``'categorical'`` variables columns are grouped by
        cluster label.

        Raises
        ------
        ValueError
            If ``v_type`` is not one of the three supported names.
        """
        cols_by_cluster = dict()

        if v_type == 'continuous':
            pairs = list(zip(cols, cluster_label))
            try:
                # Order intervals numerically; plain string order would put
                # "(130.0, ..." before "(95.999, ...".
                ordered = sorted(pairs, key=lambda p: self._interval_lower_bound(p[0]))
            except (IndexError, ValueError):
                # Names that are not intervals: fall back to string order.
                ordered = sorted(pairs, key=lambda p: p[0])

            cnt, prev_label = -1, -1
            for col, label in ordered:
                if prev_label == label:
                    cols_by_cluster[cnt].append(col)
                else:
                    cnt += 1
                    cols_by_cluster[cnt] = [col]
                prev_label = label

        elif v_type in ('integer', 'categorical'):
            for col, label in zip(cols, cluster_label):
                cols_by_cluster.setdefault(label, []).append(col)

        else:
            raise ValueError('Available v_type = [continuous, integer, categorical]')

        return cols_by_cluster

    def _merge_bins(self, variable, v_type):
        """Return ``(merged_bins, split_points)`` for one variable.

        * continuous: ``merged_bins`` is a list of interval strings and
          ``split_points`` the sorted list of their boundaries.
        * integer: ``merged_bins`` is a list of lists of consecutive integers.
        * categorical: ``merged_bins`` is a list of ``' <OR> '``-joined levels.

        ``split_points`` is empty for the non-continuous types.
        """

        def get_category_level_name(variable, col_name):
            # Strip the "<variable>_" prefix.
            return col_name[len(variable) + 1:]

        merged_bins = list()
        split_points = set()

        cols, embeddings = self._get_cols_and_embeddings(variable)

        # Do not merge if #bins <= 2.
        if v_type == 'categorical' and (len(cols) <= 2):
            merged_bins = [get_category_level_name(variable, x) for x in cols]
            return merged_bins, split_points

        cluster_label = self._cluster_embeddings(embeddings)
        cols_by_cluster = self._get_cols_by_cluster(cols, cluster_label, v_type)

        for cluster_cols in cols_by_cluster.values():
            levels = [get_category_level_name(variable, x) for x in cluster_cols]

            if v_type == 'continuous':
                # "(a, b]" splits on the space into "(a," and "b]".
                begin = levels[0].split(' ')[0]
                end = levels[-1].split(' ')[1]
                merged_bins.append(' '.join([begin, end]))

                begin_point = float(begin.replace('(', '').replace(',', ''))
                end_point = float(end.replace(']', '').replace(',', ''))
                split_points.update([begin_point, end_point])

            elif v_type == 'integer':
                # Split the cluster into runs of consecutive integers so that
                # only neighbouring values end up in the same bin.
                current_bin = []
                prev_value = None
                for value in sorted(int(x) for x in levels):
                    if prev_value is not None and value - 1 == prev_value:
                        current_bin.append(value)
                    else:
                        if current_bin:
                            merged_bins.append(current_bin)
                        current_bin = [value]
                    prev_value = value
                if current_bin:
                    merged_bins.append(current_bin)

            elif v_type == 'categorical':
                merged_bins.append(' <OR> '.join(levels))

        split_points = sorted(split_points)

        return merged_bins, split_points

    def get_merged_bins_by_var(self, var_dict,
                               merge_continuous_var=True,
                               merge_integer_vars=True,
                               merge_categorical_var=True):
        """Merge bins for every variable of the enabled types.

        Parameters
        ----------
        var_dict : dict
            Provides ``'continuous_vars'``, ``'integer_vars'`` and
            ``'categorical_vars'`` lists of variable names.
        merge_continuous_var, merge_integer_vars, merge_categorical_var : bool
            Switch merging on or off per variable type.

        Returns
        -------
        dict
            ``{var: {'bins': ..., 'split_point': ...}}`` for continuous
            variables and ``{var: {'bins': ...}}`` for the other types.
        """
        bins_by_variable = dict()

        if merge_continuous_var:
            for var in var_dict['continuous_vars']:
                merged_bins, split_points = self._merge_bins(var, v_type='continuous')
                bins_by_variable[var] = dict(bins=merged_bins, split_point=split_points)

        if merge_integer_vars:
            for var in var_dict['integer_vars']:
                merged_bins, _ = self._merge_bins(var, v_type='integer')
                bins_by_variable[var] = dict(bins=merged_bins)

        if merge_categorical_var:
            for var in var_dict['categorical_vars']:
                merged_bins, _ = self._merge_bins(var, v_type='categorical')
                bins_by_variable[var] = dict(bins=merged_bins)

        return bins_by_variable


In [None]:
# Build a BinMerger from the embedder's per-column embeddings, using the
# default clustering method ('agglomerative').
bin_merger = BinMerger(bin_embedder.embedding_by_column)

In [None]:
# Merge bins for the continuous and integer variables only; merging of
# categorical variables is switched off.
bins_by_var = bin_merger.get_merged_bins_by_var(var_dict, merge_categorical_var=False)

In [None]:
# Display the merged bin definitions.
bins_by_var

In [None]:
# Re-encode the data using the merged bins and list the resulting columns.
data_handler.get_dummy_coded_data(bins_by_variable=bins_by_var).columns