In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from data_handler import DataHandler
from semantic_binning import SemanticBinning
%matplotlib inline

In [2]:
# Load the HR CSV into a DataFrame and display its column names.
hr_data = pd.read_csv('data/HR_comma_sep.csv')
hr_data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')

In [3]:
# Column roles for the HR dataset: which columns are treated as categorical,
# which as numerical, and which one is the class label. This mapping is handed
# to Experiment, which forwards it to DataHandler and SemanticBinning.
var_dict = {
    'categorical_vars': [
        'Work_accident',
        'promotion_last_5years',
        'sales',
        'salary',
        'number_project',
        'time_spend_company',
    ],
    'numerical_vars': [
        'satisfaction_level',
        'last_evaluation',
        'average_montly_hours',
    ],
    'class_var': 'left',
}

In [78]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.preprocessing import StandardScaler

from data_handler import DataHandler
from semantic_binning import SemanticBinning


class Experiment:
    """Compare several encodings of a tabular dataset on classification and clustering.

    The encodings compared are: dummy coding only, dummy coding with
    standardized numerical variables, equal-width binning, equal-frequency
    binning and semantic binning.  The encoding itself is delegated to
    ``DataHandler`` and ``SemanticBinning``; this class only orchestrates the
    splits, the models and the scoring.
    """

    def __init__(self, data_path, var_dict, n_init_bins_list=(5, 10, 15, 20), random_state=42):
        """Load the dataset and prepare the shared ``SemanticBinning`` instance.

        Parameters
        ----------
        data_path : str
            Path of the CSV file, read with ``pd.read_csv``.
        var_dict : dict
            Must contain ``'class_var'`` and ``'numerical_vars'``; it is also
            passed unchanged to ``DataHandler`` and ``SemanticBinning``.
        n_init_bins_list : iterable of int
            Initial bin counts tried for semantic binning.
        random_state : int
            Seed for the fold shuffle and for the seeded estimators.
        """
        self.data = pd.read_csv(data_path)
        self.var_dict = var_dict
        # Store a private copy so neither the default nor a caller-owned list
        # is shared between instances.
        self.n_init_bins_list = list(n_init_bins_list)
        self.random_state = random_state

        self.semantic_binning = SemanticBinning(self.var_dict, embedding_dim=4,
                                                max_iter=100, verbose=False)
        self.class_var = self.data[var_dict['class_var']]
        self.n_class = len(self.class_var.unique())

    def _make_cv_folds(self, cv):
        """Return ``cv`` pairs ``(train_positions, validation_positions)``.

        Row positions ``0 .. len(self.data) - 1`` are shuffled with
        ``self.random_state`` and split into ``cv`` nearly equal parts; each
        part serves once as the validation set.

        Raises
        ------
        ValueError
            If ``cv`` is smaller than 2 (no training part would remain).
        """
        if cv < 2:
            raise ValueError('cv must be at least 2, got {}'.format(cv))

        idxs = np.arange(0, len(self.data))

        # Seeds NumPy's *global* RNG. Kept as-is because downstream helpers may
        # draw from the global RNG as well -- TODO confirm before localizing.
        np.random.seed(self.random_state)
        np.random.shuffle(idxs)

        folds = np.array_split(idxs, cv)

        return [
            (np.concatenate(folds[:i] + folds[i + 1:]), folds[i])
            for i in range(cv)
        ]

    def _get_classification_score(self, trn_x, val_x, trn_y, val_y, models):
        """Fit each requested model on the training data and score it on validation.

        Parameters
        ----------
        trn_x, val_x : feature matrices accepted by scikit-learn estimators.
        trn_y, val_y : matching class labels.
        models : container of str
            Any of ``'DT'``, ``'LR'``, ``'NB'``; other names are ignored.

        Returns
        -------
        dict
            Validation accuracy keyed by model name, in DT, LR, NB order.
        """
        # Factories rather than instances so that only requested models are built.
        builders = (
            ('DT', lambda: DecisionTreeClassifier(min_samples_leaf=0.01,
                                                  random_state=self.random_state)),
            ('LR', lambda: LogisticRegression(C=1.0, random_state=self.random_state)),
            ('NB', lambda: BernoulliNB(binarize=None)),
        )

        score = dict()
        for name, build in builders:
            if name not in models:
                continue
            clf = build()
            clf.fit(trn_x, trn_y)
            score[name] = accuracy_score(val_y, clf.predict(val_x))

        return score

    def classification_performance(self, cv=3, random_state=42):
        """Cross-validate every encoding and collect per-fold accuracies.

        Parameters
        ----------
        cv : int
            Number of folds.
        random_state : int
            Unused; the folds and models are seeded from ``self.random_state``.
            Kept so existing calls that pass it keep working.

        Returns
        -------
        tuple
            ``(raw_scores, sc_scores, ewb_scores, efb_scores, sb_scores)``.
            The first two are lists with one score dict per fold; the last
            three are dicts mapping a bin count to such a list.
        """
        raw_scores = []
        sc_scores = []
        ewb_scores = dict()
        efb_scores = dict()
        sb_scores = dict()

        numerical_vars = self.var_dict['numerical_vars']
        # Binning method name -> accumulator; iterated in this order per bin count.
        binning_runs = (('equal_width', ewb_scores), ('equal_freq', efb_scores))

        for trn_idx, val_idx in self._make_cv_folds(cv):

            # The fold arrays hold row positions, so select positionally.
            trn_x, val_x = self.data.iloc[trn_idx], self.data.iloc[val_idx]
            trn_y, val_y = self.class_var.iloc[trn_idx], self.class_var.iloc[val_idx]

            trn_data_handler = DataHandler(trn_x, self.var_dict)
            val_data_handler = DataHandler(val_x, self.var_dict)

            # Dummy Coding Only
            trn_raw = trn_data_handler.get_dummy_coded_data('dummy_only')
            val_raw = val_data_handler.get_dummy_coded_data('dummy_only')
            raw_scores.append(
                self._get_classification_score(trn_raw, val_raw, trn_y, val_y, ['DT', 'LR']))

            # Dummy Coding + Scaling for Numerical Vars
            # The scaler is fitted on the training fold only and reused on validation.
            scaler = StandardScaler()
            trn_sc, val_sc = trn_x.copy(), val_x.copy()
            trn_sc[numerical_vars] = scaler.fit_transform(trn_sc[numerical_vars])
            val_sc[numerical_vars] = scaler.transform(val_sc[numerical_vars])
            trn_sc = DataHandler(trn_sc, self.var_dict).get_dummy_coded_data('dummy_only')
            val_sc = DataHandler(val_sc, self.var_dict).get_dummy_coded_data('dummy_only')
            sc_scores.append(
                self._get_classification_score(trn_sc, val_sc, trn_y, val_y, ['DT', 'LR']))

            for n_bins in range(2, 21):
                # Equal Width Binning, then Equal Freq Binning
                for method, collected in binning_runs:
                    trn_binned = trn_data_handler.get_dummy_coded_data(method, n_init_bins=n_bins)
                    # Bins come from the training fold and are applied to validation.
                    bins = trn_data_handler.get_bins_by_variable_from_data(trn_binned)
                    val_binned = val_data_handler.get_dummy_coded_data(bins_by_variable=bins)
                    fold_score = self._get_classification_score(
                        trn_binned, val_binned, trn_y, val_y, ['DT', 'LR', 'NB'])
                    collected.setdefault(n_bins, []).append(fold_score)

            for n_init_bins in self.n_init_bins_list:

                # Semantic Binning
                trn_sb = self.semantic_binning.fit_transform(trn_x, n_init_bins=n_init_bins)
                val_sb = self.semantic_binning.transform(val_x)
                sb_score = self._get_classification_score(
                    trn_sb, val_sb, trn_y, val_y, ['DT', 'LR', 'NB'])
                sb_scores.setdefault(n_init_bins, []).append(sb_score)

        return raw_scores, sc_scores, ewb_scores, efb_scores, sb_scores

    def clustering_performance(self, n_init_bins=None):
        """Print the adjusted mutual information of clusterings built on each encoding.

        Every encoding of the full dataset is clustered into ``self.n_class``
        groups with k-means and agglomerative clustering, and the cluster
        labels are compared with the class variable.

        Parameters
        ----------
        n_init_bins : int, optional
            Bin count used for the equal-width, equal-frequency and semantic
            encodings.  Defaults to the first entry of ``self.n_init_bins_list``
            (the instance has no single ``n_init_bins`` attribute).

        Raises
        ------
        ValueError
            If ``n_init_bins`` is omitted while ``self.n_init_bins_list`` is empty.
        """
        if n_init_bins is None:
            if len(self.n_init_bins_list) == 0:
                raise ValueError('n_init_bins_list is empty; pass n_init_bins explicitly')
            n_init_bins = self.n_init_bins_list[0]

        def get_clustering_score(X, method):
            if method == 'kmeans':
                # Seeded so repeated runs report the same score.
                clusterer = KMeans(n_clusters=self.n_class, random_state=self.random_state)
            elif method == 'agglomerative':
                clusterer = AgglomerativeClustering(n_clusters=self.n_class)
            else:
                raise ValueError('unknown clustering method: {!r}'.format(method))
            cluster_label = clusterer.fit_predict(X)
            return adjusted_mutual_info_score(self.class_var, cluster_label)

        data_handler = DataHandler(self.data, self.var_dict)

        dummy_coded = data_handler.get_dummy_coded_data(init_discretize_method='dummy_only')
        scale_numeric = data_handler.get_dummy_coded_data(init_discretize_method='scale_numeric')
        ewb = data_handler.get_dummy_coded_data(init_discretize_method='equal_width',
                                                n_init_bins=n_init_bins)
        efb = data_handler.get_dummy_coded_data(init_discretize_method='equal_freq',
                                                n_init_bins=n_init_bins)
        sb = self.semantic_binning.fit_transform(self.data, n_init_bins=n_init_bins)

        encodings = [
            ('Dummy Coding Only', dummy_coded),
            ('Scale Numerical Vars', scale_numeric),
            ('Equal Width Binning', ewb),
            ('Equal Freq Binning', efb),
            ('Semantic Binning', sb),
        ]

        print('Clustering Performance By Discretization Method')

        for method in ['kmeans', 'agglomerative']:
            print('> Clustering Algorithm = {}'.format(method))
            for label, X in encodings:
                print('>> {} = {}'.format(label, get_clustering_score(X, method)))


In [80]:
# Build the experiment on the HR CSV with the default bin-count list and seed.
exp = Experiment('data/HR_comma_sep.csv', var_dict)

In [82]:
# Run the cross-validated comparison with the default cv=3. The first two
# results are lists of per-fold score dicts; the last three map a bin count
# to such a list.
raw_scores, sc_scores, ewb_scores, efb_scores, sb_scores = exp.classification_performance()

In [85]:
def compute_cv_score(scores, models=['DT', 'LR']):
    """Print the mean accuracy and twice its standard deviation for each model.

    Parameters
    ----------
    scores : iterable of dict
        One dict per fold, mapping a model name to its accuracy.
    models : list of str
        Model names to report, in printing order.
    """
    summary = 'Accuracy = {0:0.3f} (+/- {1:0.3f})'
    for name in models:
        print(name)
        per_fold = np.array([fold_result[name] for fold_result in scores])
        mean_accuracy = np.mean(per_fold)
        spread = 2 * np.std(per_fold)
        print(summary.format(mean_accuracy, spread))

def compute_cv_score_by_n_bins(scores, models=['DT', 'LR', 'NB']):
    """Print, per model and per bin count, the mean accuracy and twice its std.

    Parameters
    ----------
    scores : dict
        Maps a bin count to a list of per-fold dicts (model name -> accuracy).
    models : list of str
        Model names to report, in printing order.
    """
    row = '#Bins = {0}, Accuracy = {1:0.3f} (+/- {2:0.3f})'
    for name in models:
        print(name)
        for bin_count, fold_results in scores.items():
            per_fold = np.array([fold_result[name] for fold_result in fold_results])
            print(row.format(bin_count, np.mean(per_fold), 2 * np.std(per_fold)))

In [86]:
# Dummy coding only: mean accuracy (+/- 2 std) over the folds for DT and LR.
compute_cv_score(raw_scores)

DT
Accuracy = 0.959 (+/- 0.003)
LR
Accuracy = 0.896 (+/- 0.008)


In [87]:
# Dummy coding with standardized numerical variables, for DT and LR.
# NOTE(review): the recorded output is identical to the unscaled run, for LR as
# well as DT -- confirm that the scaled columns actually reach the models.
compute_cv_score(sc_scores)

DT
Accuracy = 0.959 (+/- 0.003)
LR
Accuracy = 0.896 (+/- 0.008)


In [88]:
# Equal-width binning: accuracy per bin count for DT, LR and NB.
# NOTE(review): the recorded output lists #Bins 2-9 only, while
# classification_performance iterates range(2, 21); the output presumably
# predates the current class definition -- re-run to confirm.
compute_cv_score_by_n_bins(ewb_scores)

DT
#Bins = 2, Accuracy = 0.941 (+/- 0.005)
#Bins = 3, Accuracy = 0.949 (+/- 0.006)
#Bins = 4, Accuracy = 0.941 (+/- 0.002)
#Bins = 5, Accuracy = 0.937 (+/- 0.008)
#Bins = 6, Accuracy = 0.944 (+/- 0.002)
#Bins = 7, Accuracy = 0.943 (+/- 0.009)
#Bins = 8, Accuracy = 0.937 (+/- 0.003)
#Bins = 9, Accuracy = 0.938 (+/- 0.003)
LR
#Bins = 2, Accuracy = 0.894 (+/- 0.010)
#Bins = 3, Accuracy = 0.912 (+/- 0.008)
#Bins = 4, Accuracy = 0.906 (+/- 0.006)
#Bins = 5, Accuracy = 0.922 (+/- 0.006)
#Bins = 6, Accuracy = 0.926 (+/- 0.002)
#Bins = 7, Accuracy = 0.941 (+/- 0.004)
#Bins = 8, Accuracy = 0.938 (+/- 0.001)
#Bins = 9, Accuracy = 0.945 (+/- 0.005)
NB
#Bins = 2, Accuracy = 0.843 (+/- 0.011)
#Bins = 3, Accuracy = 0.889 (+/- 0.010)
#Bins = 4, Accuracy = 0.870 (+/- 0.011)
#Bins = 5, Accuracy = 0.881 (+/- 0.008)
#Bins = 6, Accuracy = 0.892 (+/- 0.009)
#Bins = 7, Accuracy = 0.905 (+/- 0.002)
#Bins = 8, Accuracy = 0.908 (+/- 0.003)
#Bins = 9, Accuracy = 0.914 (+/- 0.006)


In [89]:
# Equal-frequency binning: accuracy per bin count for DT, LR and NB.
# NOTE(review): the recorded output lists #Bins 2-9 only, while
# classification_performance iterates range(2, 21); the output presumably
# predates the current class definition -- re-run to confirm.
compute_cv_score_by_n_bins(efb_scores)

DT
#Bins = 2, Accuracy = 0.929 (+/- 0.013)
#Bins = 3, Accuracy = 0.945 (+/- 0.005)
#Bins = 4, Accuracy = 0.926 (+/- 0.008)
#Bins = 5, Accuracy = 0.942 (+/- 0.001)
#Bins = 6, Accuracy = 0.943 (+/- 0.004)
#Bins = 7, Accuracy = 0.938 (+/- 0.004)
#Bins = 8, Accuracy = 0.939 (+/- 0.008)
#Bins = 9, Accuracy = 0.937 (+/- 0.006)
LR
#Bins = 2, Accuracy = 0.892 (+/- 0.008)
#Bins = 3, Accuracy = 0.914 (+/- 0.008)
#Bins = 4, Accuracy = 0.908 (+/- 0.005)
#Bins = 5, Accuracy = 0.917 (+/- 0.009)
#Bins = 6, Accuracy = 0.925 (+/- 0.007)
#Bins = 7, Accuracy = 0.940 (+/- 0.002)
#Bins = 8, Accuracy = 0.925 (+/- 0.007)
#Bins = 9, Accuracy = 0.930 (+/- 0.005)
NB
#Bins = 2, Accuracy = 0.853 (+/- 0.014)
#Bins = 3, Accuracy = 0.882 (+/- 0.013)
#Bins = 4, Accuracy = 0.870 (+/- 0.012)
#Bins = 5, Accuracy = 0.884 (+/- 0.010)
#Bins = 6, Accuracy = 0.890 (+/- 0.008)
#Bins = 7, Accuracy = 0.904 (+/- 0.006)
#Bins = 8, Accuracy = 0.898 (+/- 0.006)
#Bins = 9, Accuracy = 0.902 (+/- 0.008)


In [91]:
# Semantic binning: accuracy per initial bin count for DT, LR and NB.
compute_cv_score_by_n_bins(sb_scores)

DT
#Bins = 5, Accuracy = 0.933 (+/- 0.018)
#Bins = 10, Accuracy = 0.938 (+/- 0.005)
#Bins = 15, Accuracy = 0.943 (+/- 0.016)
#Bins = 20, Accuracy = 0.943 (+/- 0.013)
LR
#Bins = 5, Accuracy = 0.910 (+/- 0.017)
#Bins = 10, Accuracy = 0.922 (+/- 0.001)
#Bins = 15, Accuracy = 0.949 (+/- 0.007)
#Bins = 20, Accuracy = 0.941 (+/- 0.023)
NB
#Bins = 5, Accuracy = 0.871 (+/- 0.024)
#Bins = 10, Accuracy = 0.884 (+/- 0.015)
#Bins = 15, Accuracy = 0.916 (+/- 0.011)
#Bins = 20, Accuracy = 0.895 (+/- 0.050)


In [10]:
# NOTE(review): this cell's execution count (10) is lower than that of the cell
# defining Experiment (78), and the log below is verbose although the class
# builds SemanticBinning with verbose=False; the output presumably comes from
# an older definition -- re-run after Restart & Run All to confirm.
%time exp.clustering_performance()

>>> Iteration = 10000, Loss = 0.3017544746398926
>>> Iteration = 20000, Loss = 0.30021047592163086
>>> Iteration = 30000, Loss = 0.3046160042285919
>>> Iteration = 40000, Loss = 0.30355164408683777
>>> Iteration = 50000, Loss = 0.3017103970050812
>>> Iteration = 60000, Loss = 0.307841032743454
>>> Iteration = 70000, Loss = 0.30641624331474304
>>> Iteration = 80000, Loss = 0.30312803387641907
>>> Iteration = 90000, Loss = 0.3049249053001404
>>> Iteration = 100000, Loss = 0.3034362494945526
>>> Iteration = 110000, Loss = 0.3036462068557739
>>> Iteration = 120000, Loss = 0.3026193380355835
>>> Iteration = 130000, Loss = 0.2992166578769684
>>> Iteration = 140000, Loss = 0.3016679883003235
>>> Iteration = 150000, Loss = 0.30351972579956055
>>> Iteration = 160000, Loss = 0.30218303203582764
>>> Iteration = 170000, Loss = 0.3054898977279663
>>> Iteration = 180000, Loss = 0.29904675483703613
>>> Iteration = 190000, Loss = 0.3019198775291443
>>> Iteration = 200000, Loss = 0.3020668923854828
>>>

In [None]:
# NOTE(review): get_cv_error_for_model is not defined in any cell visible in
# this notebook and this cell has no execution count, so the output below
# cannot be reproduced from the code shown here -- confirm where the helper lives.
%time get_cv_error_for_model(LogisticRegression(), hr_data, var_dict)

3 fold cv score
>> dummy_only
Accuracy: 0.896 (+/- 0.004)
>> equal_width
Accuracy: 0.942 (+/- 0.006)
>> equal_freq
Accuracy: 0.937 (+/- 0.002)
>> semantic_binning
Accuracy: 0.908 (+/- 0.010)
CPU times: user 41min 52s, sys: 50.2 s, total: 42min 42s
Wall time: 42min 40s


In [None]:
# NOTE(review): get_cv_error_for_model is not defined in any cell visible in
# this notebook and this cell has no execution count, so the output below
# cannot be reproduced from the code shown here -- confirm where the helper lives.
%time get_cv_error_for_model(DecisionTreeClassifier(), hr_data, var_dict)

3 fold cv score
>> dummy_only
Accuracy: 0.975 (+/- 0.001)
>> equal_width
Accuracy: 0.967 (+/- 0.004)
>> equal_freq
Accuracy: 0.964 (+/- 0.005)
>> semantic_binning
Accuracy: 0.964 (+/- 0.001)
CPU times: user 43min 50s, sys: 43.9 s, total: 44min 34s
Wall time: 44min 44s


In [None]:
# NOTE(review): get_cv_error_for_model is not defined in any cell visible in
# this notebook and this cell has no execution count, so the output below
# cannot be reproduced from the code shown here -- confirm where the helper lives.
%time get_cv_error_for_model(BernoulliNB(), hr_data, var_dict)

3 fold cv score
>> dummy_only
Accuracy: 0.879 (+/- 0.007)
>> equal_width
Accuracy: 0.920 (+/- 0.006)
>> equal_freq
Accuracy: 0.907 (+/- 0.005)
>> semantic_binning
Accuracy: 0.855 (+/- 0.010)
CPU times: user 45min 21s, sys: 44.8 s, total: 46min 6s
Wall time: 46min 20s
