In [136]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import accuracy_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.preprocessing import StandardScaler

from data_handler import DataHandler
from semantic_binning import SemanticBinning

%matplotlib inline


class Experiment:
    """Benchmark several encodings of a data set's numerical variables.

    The data set is read from a CSV file. Each encoding is scored in two
    ways: by the cross-validated accuracy of classifiers trained on it
    (``classification_performance``) and by the agreement between clusters
    found on it and the class variable (``clustering_performance``).

    Encodings compared: dummy coding only, dummy coding with standardised
    numerical variables, equal-width binning, equal-frequency binning and
    the project's ``SemanticBinning``.
    """

    # Clustering algorithms understood by ``clustering_performance``.
    _CLUSTERING_METHODS = ('kmeans', 'agglomerative')

    def __init__(self, data_path, var_dict, n_bins_range=range(2, 5),
                 n_init_bins_list=(5, 10, 15, 20), random_state=42):
        """Load the data and store the experiment settings.

        Parameters
        ----------
        data_path : str
            Path of the CSV file, read with ``pd.read_csv``.
        var_dict : dict
            Variable description. This class reads ``'class_var'`` (name of
            the label column) and ``'numerical_vars'`` (columns to scale);
            the whole dict is also handed to ``DataHandler`` and
            ``SemanticBinning``.
        n_bins_range : iterable of int
            Bin counts tried for equal-width / equal-frequency binning.
        n_init_bins_list : iterable of int
            Initial bin counts tried for semantic binning.
        random_state : int
            Seed used for the fold shuffle and for the estimators that
            accept a ``random_state``.
        """
        self.data = pd.read_csv(data_path)
        self.var_dict = var_dict
        self.n_bins_range = n_bins_range
        # Copy into a list so the instance never shares the caller's object.
        self.n_init_bins_list = list(n_init_bins_list)
        self.random_state = random_state

        self.semantic_binning = SemanticBinning(self.var_dict, embedding_dim=4,
                                                max_iter=100, verbose=False)
        self.class_var = self.data[var_dict['class_var']]
        self.n_class = len(self.class_var.unique())

    def _make_cv_folds(self, cv):
        """Split the row positions into ``cv`` shuffled folds.

        Returns a list of ``(train_positions, validation_positions)`` tuples
        of NumPy integer arrays. The values are positions in ``self.data``
        (0 .. n_rows - 1), not index labels.

        Raises
        ------
        ValueError
            If ``cv`` is smaller than 2 (no training rows would remain) or
            larger than the number of rows (some folds would be empty).
        """
        n_rows = len(self.data)
        if not 2 <= cv <= n_rows:
            raise ValueError(
                'cv must be between 2 and the number of rows ({0}), got {1}'
                .format(n_rows, cv))

        idxs = np.arange(n_rows)

        # NOTE(review): this reseeds NumPy's *global* generator, so anything
        # that later draws from np.random is affected too. Kept unchanged
        # because downstream steps may rely on it -- TODO confirm.
        np.random.seed(self.random_state)
        np.random.shuffle(idxs)

        folds = np.array_split(idxs, cv)

        cv_folds = []
        for i in range(cv):
            # Every fold except the i-th one forms the training part.
            train_idxs = np.concatenate(folds[:i] + folds[i + 1:])
            cv_folds.append((train_idxs, folds[i]))

        return cv_folds

    def _get_classification_score(self, trn_x, val_x, trn_y, val_y, models):
        """Fit the requested classifiers and return their validation accuracy.

        ``models`` is a collection of the codes ``'DT'`` (decision tree),
        ``'LR'`` (logistic regression) and ``'NB'`` (Bernoulli naive Bayes).
        The result is a dict mapping each requested code to its accuracy on
        ``val_x`` / ``val_y``; other codes are ignored.
        """
        score = dict()

        if 'DT' in models:
            dt = DecisionTreeClassifier(min_samples_leaf=0.01, random_state=self.random_state)
            dt.fit(trn_x, trn_y)
            score['DT'] = accuracy_score(val_y, dt.predict(val_x))

        if 'LR' in models:
            lr = LogisticRegression(C=1.0, random_state=self.random_state)
            lr.fit(trn_x, trn_y)
            score['LR'] = accuracy_score(val_y, lr.predict(val_x))

        if 'NB' in models:
            # binarize=None: the features are used as given, without
            # thresholding them first.
            nb = BernoulliNB(binarize=None)
            nb.fit(trn_x, trn_y)
            score['NB'] = accuracy_score(val_y, nb.predict(val_x))

        return score

    def classification_performance(self, cv=3):
        """Cross-validate every encoding with the supported classifiers.

        Returns a dict with the keys

        * ``'raw_scores'`` / ``'sc_scores'``: one ``{'DT': acc, 'LR': acc}``
          dict per fold, for dummy coding without / with scaling of the
          numerical variables;
        * ``'ewb_scores'`` / ``'efb_scores'``: ``{n_bins: [per-fold dicts]}``
          for equal-width / equal-frequency binning (DT, LR and NB);
        * ``'sb_scores'``: ``{n_init_bins: [per-fold dicts]}`` for semantic
          binning (DT, LR and NB).
        """
        raw_scores, sc_scores = list(), list()
        ewb_scores, efb_scores, sb_scores = dict(), dict(), dict()
        binned_models = ['DT', 'LR', 'NB']

        for trn_idx, val_idx in self._make_cv_folds(cv):

            # The fold arrays hold row positions, so select with .iloc; this
            # stays correct even if the frame's index is not 0..n-1.
            trn_x, val_x = self.data.iloc[trn_idx], self.data.iloc[val_idx]
            trn_y, val_y = self.class_var.iloc[trn_idx], self.class_var.iloc[val_idx]

            # NOTE(review): training and validation parts are dummy coded by
            # separate DataHandler instances -- confirm the resulting column
            # sets always match.
            trn_data_handler = DataHandler(trn_x, self.var_dict)
            val_data_handler = DataHandler(val_x, self.var_dict)

            # Dummy Coding Only
            trn_raw = trn_data_handler.get_dummy_coded_data('dummy_only')
            val_raw = val_data_handler.get_dummy_coded_data('dummy_only')
            raw_scores.append(
                self._get_classification_score(trn_raw, val_raw, trn_y, val_y, ['DT', 'LR']))

            # Dummy Coding + Scaling for Numerical Vars
            # The scaler is fitted on the training part only and then applied
            # to the validation part.
            scaler = StandardScaler()
            numerical_vars = self.var_dict['numerical_vars']
            trn_sc, val_sc = trn_x.copy(), val_x.copy()
            trn_sc[numerical_vars] = scaler.fit_transform(trn_sc[numerical_vars])
            val_sc[numerical_vars] = scaler.transform(val_sc[numerical_vars])
            trn_sc = DataHandler(trn_sc, self.var_dict).get_dummy_coded_data('dummy_only')
            val_sc = DataHandler(val_sc, self.var_dict).get_dummy_coded_data('dummy_only')
            sc_scores.append(
                self._get_classification_score(trn_sc, val_sc, trn_y, val_y, ['DT', 'LR']))

            # Equal Width Binning, then Equal Freq Binning, for each bin count
            for n_bins in self.n_bins_range:
                for strategy, strategy_scores in (('equal_width', ewb_scores),
                                                  ('equal_freq', efb_scores)):
                    trn_binned = trn_data_handler.get_dummy_coded_data(strategy, n_bins)
                    # Bins are derived from the training part and reused to
                    # encode the validation part.
                    bins = trn_data_handler.get_bins_by_variable_from_data(trn_binned)
                    val_binned = val_data_handler.get_dummy_coded_data(bins_by_variable=bins)
                    binned_score = self._get_classification_score(
                        trn_binned, val_binned, trn_y, val_y, binned_models)
                    strategy_scores.setdefault(n_bins, []).append(binned_score)

            # Semantic Binning
            for n_init_bins in self.n_init_bins_list:
                trn_sb = self.semantic_binning.fit_transform(trn_x, n_init_bins)
                val_sb = self.semantic_binning.transform(val_x)
                sb_score = self._get_classification_score(
                    trn_sb, val_sb, trn_y, val_y, binned_models)
                sb_scores.setdefault(n_init_bins, []).append(sb_score)

        scores = dict(raw_scores=raw_scores, sc_scores=sc_scores,
                      ewb_scores=ewb_scores, efb_scores=efb_scores,
                      sb_scores=sb_scores)
        return scores

    def clustering_performance(self, methods=('kmeans', 'agglomerative')):
        """Cluster every encoding of the full data and compare with the labels.

        Each encoding is clustered into ``self.n_class`` clusters and scored
        with ``adjusted_mutual_info_score`` against the class variable.

        Parameters
        ----------
        methods : iterable of str
            Any of ``'kmeans'`` and ``'agglomerative'``.

        Returns
        -------
        dict
            ``scores['dummy_only'][method]`` and
            ``scores['scale_numeric'][method]`` are single scores;
            ``scores['equal_width'][method][n_bins]``,
            ``scores['equal_freq'][method][n_bins]`` and
            ``scores['semantic_binning'][method][n_init_bins]`` are scores
            per bin setting.

        Raises
        ------
        ValueError
            If ``methods`` contains an unsupported name.
        """
        methods = list(methods)
        unknown = [m for m in methods if m not in self._CLUSTERING_METHODS]
        if unknown:
            # Fail before any expensive work instead of hitting an unbound
            # local variable halfway through.
            raise ValueError(
                'Unknown clustering method(s) {0}; expected a subset of {1}'
                .format(unknown, list(self._CLUSTERING_METHODS)))

        def get_clustering_score(X, method):
            if method == 'kmeans':
                clusterer = KMeans(n_clusters=self.n_class,
                                   random_state=self.random_state)
            else:  # 'agglomerative' -- guaranteed by the validation above
                clusterer = AgglomerativeClustering(n_clusters=self.n_class)
            cluster_label = clusterer.fit_predict(X)
            return adjusted_mutual_info_score(self.class_var, cluster_label)

        scores = dict()

        data_handler = DataHandler(self.data, self.var_dict)

        # Encodings without a bin-count setting.
        for strategy in ('dummy_only', 'scale_numeric'):
            scores[strategy] = dict()
            coded = data_handler.get_dummy_coded_data(strategy)
            for method in methods:
                scores[strategy][method] = get_clustering_score(coded, method)

        # Binned encodings: build each encoding once per bin count and reuse
        # it for every clustering method.
        for strategy in ('equal_width', 'equal_freq'):
            scores[strategy] = dict()
            for method in methods:
                scores[strategy][method] = dict()
            for n_bins in self.n_bins_range:
                binned = data_handler.get_dummy_coded_data(strategy, n_bins)
                for method in methods:
                    scores[strategy][method][n_bins] = get_clustering_score(binned, method)

        scores['semantic_binning'] = dict()
        for method in methods:
            scores['semantic_binning'][method] = dict()
        for n_init_bins in self.n_init_bins_list:
            sb = self.semantic_binning.fit_transform(self.data, n_init_bins)
            for method in methods:
                sb_score = get_clustering_score(sb, method)
                scores['semantic_binning'][method][n_init_bins] = sb_score

        return scores

def compute_cv_score(scores, models=['DT', 'LR']):
    """Print mean accuracy and spread across folds for each model.

    ``scores`` is a sequence with one dict per fold, mapping a model code to
    that fold's accuracy. For every code in ``models`` the mean over the
    folds is printed together with twice the standard deviation.
    """
    report_line = 'Accuracy = {0:0.3f} (+/- {1:0.3f})'
    for model_name in models:
        print(model_name)
        per_fold = np.array([fold_scores[model_name] for fold_scores in scores])
        average = per_fold.mean()
        spread = per_fold.std() * 2
        print(report_line.format(average, spread))
        print('')

def compute_cv_score_by_n_bins(scores, models=['DT', 'LR', 'NB']):
    """Print, per model and bin setting, mean accuracy and spread over folds.

    ``scores`` maps a bin setting to a list with one dict per fold; each dict
    maps a model code to that fold's accuracy. The spread printed is twice
    the standard deviation across folds.
    """
    report_line = '#Bins = {0}, Accuracy = {1:0.3f} (+/- {2:0.3f})'
    for model_name in models:
        print(model_name)
        for bin_setting, fold_dicts in scores.items():
            per_fold = np.array([fold_scores[model_name] for fold_scores in fold_dicts])
            print(report_line.format(bin_setting, per_fold.mean(), per_fold.std() * 2))
        print('')
        
def print_clustering_score(scores, metric_name='AMI'):
    """Print one line per clustering method with its score.

    Parameters
    ----------
    scores : dict
        Maps a clustering method name to a single numeric score.
    metric_name : str
        Label printed in front of each score. Defaults to ``'AMI'`` because
        the scores in this notebook are produced by
        ``adjusted_mutual_info_score`` (adjusted, not normalized, mutual
        information).
    """
    for method, score in scores.items():
        print('{0}, {1} = {2:0.4f}'.format(method, metric_name, score))
    
def print_clustering_score_by_n_bins(scores, metric_name='AMI'):
    """Print, for each clustering method, the score of every bin setting.

    Parameters
    ----------
    scores : dict
        Maps a clustering method name to a dict of
        ``{bin setting: numeric score}``.
    metric_name : str
        Label printed in front of each score. Defaults to ``'AMI'`` because
        the scores in this notebook are produced by
        ``adjusted_mutual_info_score`` (adjusted, not normalized, mutual
        information).
    """
    for method, score_by_n_bins in scores.items():
        print(method)
        for n_bins, score in score_by_n_bins.items():
            print('#Bins = {0}, {1} = {2:0.4f}'.format(n_bins, metric_name, score))

In [99]:
# Load the HR data set on its own, only to inspect which columns it offers
# before describing them in var_dict.
hr_data = pd.read_csv('data/HR_comma_sep.csv')
hr_data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')

In [100]:
# Roles of the HR data set's columns, as consumed by Experiment.
# 'number_project' and 'time_spend_company' are deliberately listed with the
# categorical variables; 'left' is the class label.
var_dict = {
    'categorical_vars': [
        'Work_accident',
        'promotion_last_5years',
        'sales',
        'salary',
        'number_project',
        'time_spend_company',
    ],
    'numerical_vars': [
        'satisfaction_level',
        'last_evaluation',
        'average_montly_hours',
    ],
    'class_var': 'left',
}

In [101]:
# Build the experiment on the HR data: 2..20 bins for equal-width/equal-freq
# binning and six initial bin counts for semantic binning.
# NOTE(review): the outputs recorded further down list only 2-4 bins and
# initial bin counts 5-20, and the execution counts are out of order, so those
# outputs appear to come from a run with narrower settings -- re-run the
# notebook top to bottom to refresh them.
exp = Experiment('data/HR_comma_sep.csv', 
                 var_dict, 
                 n_bins_range=range(2, 21),
                 n_init_bins_list=[5, 10, 15, 20, 25, 30])

In [102]:
# Run the 3-fold classification comparison of all encodings.
clf_scores = exp.classification_performance(cv=3)

In [111]:
# Accuracy with dummy coding only (numerical variables left as they are).
compute_cv_score(clf_scores['raw_scores'])

DT
Accuracy = 0.959 (+/- 0.003)

LR
Accuracy = 0.896 (+/- 0.008)



In [112]:
# Accuracy with dummy coding plus standardised numerical variables.
compute_cv_score(clf_scores['sc_scores'])

DT
Accuracy = 0.959 (+/- 0.003)

LR
Accuracy = 0.896 (+/- 0.008)



In [113]:
# Accuracy per bin count with equal-width binning.
compute_cv_score_by_n_bins(clf_scores['ewb_scores'])

DT
#Bins = 2, Accuracy = 0.941 (+/- 0.005)
#Bins = 3, Accuracy = 0.949 (+/- 0.006)
#Bins = 4, Accuracy = 0.941 (+/- 0.002)

LR
#Bins = 2, Accuracy = 0.894 (+/- 0.010)
#Bins = 3, Accuracy = 0.912 (+/- 0.008)
#Bins = 4, Accuracy = 0.906 (+/- 0.006)

NB
#Bins = 2, Accuracy = 0.843 (+/- 0.011)
#Bins = 3, Accuracy = 0.889 (+/- 0.010)
#Bins = 4, Accuracy = 0.870 (+/- 0.011)



In [114]:
# Accuracy per bin count with equal-frequency binning.
compute_cv_score_by_n_bins(clf_scores['efb_scores'])

DT
#Bins = 2, Accuracy = 0.929 (+/- 0.013)
#Bins = 3, Accuracy = 0.945 (+/- 0.005)
#Bins = 4, Accuracy = 0.926 (+/- 0.008)

LR
#Bins = 2, Accuracy = 0.892 (+/- 0.008)
#Bins = 3, Accuracy = 0.914 (+/- 0.008)
#Bins = 4, Accuracy = 0.908 (+/- 0.005)

NB
#Bins = 2, Accuracy = 0.853 (+/- 0.014)
#Bins = 3, Accuracy = 0.882 (+/- 0.013)
#Bins = 4, Accuracy = 0.870 (+/- 0.012)



In [115]:
# Accuracy with semantic binning; here "#Bins" is the initial bin count
# (n_init_bins) passed to SemanticBinning.fit_transform.
compute_cv_score_by_n_bins(clf_scores['sb_scores'])

DT
#Bins = 5, Accuracy = 0.934 (+/- 0.015)
#Bins = 10, Accuracy = 0.936 (+/- 0.008)
#Bins = 15, Accuracy = 0.930 (+/- 0.011)
#Bins = 20, Accuracy = 0.942 (+/- 0.013)

LR
#Bins = 5, Accuracy = 0.901 (+/- 0.021)
#Bins = 10, Accuracy = 0.924 (+/- 0.003)
#Bins = 15, Accuracy = 0.924 (+/- 0.024)
#Bins = 20, Accuracy = 0.958 (+/- 0.001)

NB
#Bins = 5, Accuracy = 0.864 (+/- 0.023)
#Bins = 10, Accuracy = 0.880 (+/- 0.013)
#Bins = 15, Accuracy = 0.885 (+/- 0.026)
#Bins = 20, Accuracy = 0.927 (+/- 0.011)



In [108]:
# Run the clustering comparison on the full data set and time it.
%time clustering_scores = exp.clustering_performance()

CPU times: user 2min 14s, sys: 44.7 s, total: 2min 58s
Wall time: 2min 9s


In [125]:
# One top-level entry per encoding.
clustering_scores.keys()

dict_keys(['dummy_only', 'scale_numeric', 'equal_width', 'equal_freq', 'semantic_binning'])

In [128]:
# Cluster/label agreement (adjusted_mutual_info_score) with dummy coding only.
print_clustering_score(clustering_scores['dummy_only'])

kmeans, NMI = 0.0017
agglomerative, NMI = 0.0103


In [129]:
# Cluster/label agreement (adjusted_mutual_info_score) for the 'scale_numeric' encoding.
print_clustering_score(clustering_scores['scale_numeric'])

kmeans, NMI = 0.0004
agglomerative, NMI = 0.2091


In [137]:
# Cluster/label agreement per bin count with equal-width binning.
print_clustering_score_by_n_bins(clustering_scores['equal_width'])

kmeans
#Bins = 2, NMI = 0.0001
#Bins = 3, NMI = 0.0279
#Bins = 4, NMI = 0.0131
agglomerative
#Bins = 2, NMI = 0.1436
#Bins = 3, NMI = 0.2005
#Bins = 4, NMI = 0.2232


In [138]:
# Cluster/label agreement per bin count with equal-frequency binning.
print_clustering_score_by_n_bins(clustering_scores['equal_freq'])

kmeans
#Bins = 2, NMI = 0.0011
#Bins = 3, NMI = 0.0934
#Bins = 4, NMI = 0.0131
agglomerative
#Bins = 2, NMI = 0.0144
#Bins = 3, NMI = 0.2531
#Bins = 4, NMI = 0.1808


In [139]:
# Cluster/label agreement with semantic binning; here "#Bins" is the initial
# bin count (n_init_bins) passed to SemanticBinning.fit_transform.
print_clustering_score_by_n_bins(clustering_scores['semantic_binning'])

kmeans
#Bins = 5, NMI = 0.0131
#Bins = 10, NMI = 0.0131
#Bins = 15, NMI = 0.0131
#Bins = 20, NMI = 0.0131
agglomerative
#Bins = 5, NMI = 0.0290
#Bins = 10, NMI = 0.1874
#Bins = 15, NMI = 0.1972
#Bins = 20, NMI = 0.0781
