In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from data_handler import DataHandler
from semantic_binning import SemanticBinning
%matplotlib inline

In [2]:
# Load the HR dataset once for the cross-validation cells further down,
# and display its column names.
hr_data = pd.read_csv('data/HR_comma_sep.csv')
hr_data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')

In [3]:
# Role of every column in the HR dataset; this mapping is what gets handed to
# DataHandler and SemanticBinning. Note that the two count-like columns
# (number_project, time_spend_company) are listed as categorical here.
var_dict = {
    'categorical_vars': [
        'Work_accident',
        'promotion_last_5years',
        'sales',
        'salary',
        'number_project',
        'time_spend_company',
    ],
    'numerical_vars': [
        'satisfaction_level',
        'last_evaluation',
        'average_montly_hours',
    ],
    'class_var': 'left',
}

In [None]:
import numpy as np
import pandas as pd

from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_mutual_info_score

from data_handler import DataHandler
from semantic_binning import SemanticBinning


class Experiment:
    """Compare discretization strategies on one labelled dataset.

    The dataset is read from a CSV file. ``var_dict`` describes its columns and
    is passed unchanged to ``DataHandler`` and ``SemanticBinning``;
    ``var_dict['class_var']`` names the class column.

    Attributes are plain instance fields set in ``__init__``:
    ``data`` (the loaded frame), ``var_dict``, ``semantic_binning`` (an
    unfitted ``SemanticBinning`` built with ``embedding_dim=8`` and
    ``max_iter=300000``), ``class_var`` (the class column) and ``n_class``
    (number of distinct class values).
    """

    # Clustering back-ends understood by clustering_performance.
    CLUSTERING_METHODS = ('kmeans', 'agglomerative')

    def __init__(self, data_path, var_dict):
        """Load the CSV at ``data_path`` and prepare the semantic-binning model.

        Parameters
        ----------
        data_path : str
            Path handed to ``pandas.read_csv``.
        var_dict : dict
            Column description; must contain the key ``'class_var'``.
        """
        self.data = pd.read_csv(data_path)
        self.var_dict = var_dict
        self.semantic_binning = SemanticBinning(self.var_dict, embedding_dim=8,
                                                max_iter=300000)
        self.class_var = self.data[var_dict['class_var']]
        self.n_class = len(self.class_var.unique())

    def classification_performance(self):
        """Placeholder for a classification benchmark; not implemented yet.

        Currently only constructs a ``DataHandler`` and returns ``None``.
        """
        # TODO: implement the classification comparison.
        DataHandler(self.data, self.var_dict)
        return None

    def clustering_performance(self, method='agglomerative'):
        """Print a clustering score for each encoding of the dataset.

        Five encodings of ``self.data`` are built (dummy coding only, scaled
        numeric variables, equal-width bins, equal-frequency bins, semantic
        binning). Each one is clustered into ``self.n_class`` clusters and the
        adjusted mutual information between the cluster labels and the class
        column is printed.

        ``SemanticBinning.fit_transform`` is invoked on every call, so the
        semantic-binning model is refit each time this method runs.

        Parameters
        ----------
        method : {'agglomerative', 'kmeans'}, default 'agglomerative'
            Clustering algorithm applied to every encoding.

        Raises
        ------
        ValueError
            If ``method`` is not one of ``CLUSTERING_METHODS``. The check runs
            before any encoding is built, so a typo fails immediately instead
            of after the semantic-binning fit.
        """
        if method not in self.CLUSTERING_METHODS:
            raise ValueError(
                'Unknown clustering method {!r}; expected one of {}'.format(
                    method, self.CLUSTERING_METHODS))

        def get_score(X, method):
            # `method` has already been validated, so anything that is not
            # k-means is agglomerative clustering.
            if method == 'kmeans':
                clusterer = KMeans(n_clusters=self.n_class)
            else:
                clusterer = AgglomerativeClustering(n_clusters=self.n_class)
            cluster_label = clusterer.fit_predict(X)
            return adjusted_mutual_info_score(self.class_var, cluster_label)

        data_handler = DataHandler(self.data, self.var_dict)

        # Build every encoding before anything is printed, so output produced
        # while SemanticBinning trains appears above the score table.
        encodings = [
            ('Dummy Coding Only',
             data_handler.get_dummy_coded_data(init_discretize_method='dummy_only')),
            ('Scale Numerical Vars',
             data_handler.get_dummy_coded_data(init_discretize_method='scale_numeric')),
            ('Equal Width Binning',
             data_handler.get_dummy_coded_data(init_discretize_method='equal_width',
                                               n_init_bins=10)),
            ('Equal Freq Binning',
             data_handler.get_dummy_coded_data(init_discretize_method='equal_freq',
                                               n_init_bins=10)),
            ('Semantic Binning',
             self.semantic_binning.fit_transform(self.data, n_init_bins=10)),
        ]

        print('Clustering Performance By Discretization Method')
        for label, X in encodings:
            print('> {} = {}'.format(label, get_score(X, method)))


In [None]:
# Reads the CSV and constructs the SemanticBinning model; fit_transform is
# only called later, inside clustering_performance.
exp = Experiment('data/HR_comma_sep.csv', var_dict)

In [None]:
# Default method is agglomerative clustering. Every call refits SemanticBinning
# (constructed with max_iter=300000), so the iteration log below belongs to this cell.
%time exp.clustering_performance()

>>> Iteration = 10000, Loss = 0.2976953387260437
>>> Iteration = 20000, Loss = 0.2967853546142578
>>> Iteration = 30000, Loss = 0.2917010486125946
>>> Iteration = 40000, Loss = 0.2964842617511749
>>> Iteration = 50000, Loss = 0.29677262902259827
>>> Iteration = 60000, Loss = 0.2947201728820801
>>> Iteration = 70000, Loss = 0.2959824502468109
>>> Iteration = 80000, Loss = 0.29302340745925903
>>> Iteration = 90000, Loss = 0.29622188210487366
>>> Iteration = 100000, Loss = 0.29901406168937683
>>> Iteration = 110000, Loss = 0.2971545159816742
>>> Iteration = 120000, Loss = 0.29810118675231934
>>> Iteration = 130000, Loss = 0.29306966066360474
>>> Iteration = 140000, Loss = 0.2939961850643158
>>> Iteration = 150000, Loss = 0.2978173792362213
>>> Iteration = 160000, Loss = 0.29650187492370605
>>> Iteration = 170000, Loss = 0.29137149453163147
>>> Iteration = 180000, Loss = 0.2971870005130768
>>> Iteration = 190000, Loss = 0.293679803609848
>>> Iteration = 200000, Loss = 0.2939714193344116
>>

In [None]:
# Same comparison with k-means. SemanticBinning is refit again here, and KMeans
# is created without a random_state, so scores may differ between runs.
%time exp.clustering_performance(method='kmeans')

>>> Iteration = 10000, Loss = 0.2975062131881714
>>> Iteration = 20000, Loss = 0.29386061429977417
>>> Iteration = 30000, Loss = 0.29456761479377747
>>> Iteration = 40000, Loss = 0.2939647436141968
>>> Iteration = 50000, Loss = 0.29233407974243164
>>> Iteration = 60000, Loss = 0.2894141972064972
>>> Iteration = 70000, Loss = 0.29430869221687317
>>> Iteration = 80000, Loss = 0.29418352246284485
>>> Iteration = 90000, Loss = 0.29267388582229614
>>> Iteration = 100000, Loss = 0.29420921206474304
>>> Iteration = 110000, Loss = 0.2952078580856323
>>> Iteration = 120000, Loss = 0.297040730714798
>>> Iteration = 130000, Loss = 0.2932296693325043
>>> Iteration = 140000, Loss = 0.2955964505672455
>>> Iteration = 150000, Loss = 0.2965798079967499
>>> Iteration = 160000, Loss = 0.2964361310005188
>>> Iteration = 170000, Loss = 0.29272913932800293
>>> Iteration = 180000, Loss = 0.2942306399345398
>>> Iteration = 190000, Loss = 0.29308053851127625
>>> Iteration = 200000, Loss = 0.29546141624450684


In [9]:
def get_cv_error_for_model(model, data, var_dict, cv=3, random_state=None):
    """Print k-fold cross-validated accuracy of ``model`` under four encodings.

    The rows of ``data`` are shuffled once and cut into ``cv`` folds. For each
    fold the model is refit and scored on four encodings of the same rows:
    dummy coding only, equal-width binning, equal-frequency binning and
    semantic binning. For the three binning encodings the bins are derived
    from the training rows and then applied to the validation rows.

    NOTE(review): the 'dummy_only' baseline encodes training and validation
    rows independently; this presumes both yield the same columns -- confirm
    against DataHandler.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Refit for every fold / encoding combination.
    data : pandas.DataFrame
        Full dataset including the class column. Rows are selected by
        position, so any index (not only the default RangeIndex) works.
    var_dict : dict
        Column description passed to ``DataHandler`` and ``SemanticBinning``;
        ``var_dict['class_var']`` names the class column.
    cv : int, default 3
        Number of folds; must be at least 2.
    random_state : int or None, default None
        Seed for the fold shuffle only (it is not forwarded to the model or to
        SemanticBinning). ``None`` keeps the previous behaviour of shuffling
        with NumPy's global random state.

    Returns
    -------
    None
        For each encoding, the mean accuracy and twice its standard deviation
        across folds are printed.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 2 (no training rows would remain).
    """
    if cv < 2:
        raise ValueError('cv must be at least 2, got {}'.format(cv))

    def split_idxs_for_cross_validation(cv):
        # Row *positions*, shuffled once, then cut into `cv` near-equal folds.
        idxs = np.arange(len(data))
        if random_state is None:
            np.random.shuffle(idxs)
        else:
            np.random.RandomState(random_state).shuffle(idxs)
        folds = np.array_split(idxs, cv)
        # Fold i validates; all remaining folds together form the training set.
        return [
            (np.concatenate(folds[:i] + folds[i + 1:]), folds[i])
            for i in range(cv)
        ]

    def get_val_score(trn_x, trn_y, val_x, val_y):
        model.fit(trn_x, trn_y)
        return accuracy_score(val_y, model.predict(val_x))

    def print_cv_score(scores, setting_name):
        scores = np.array(scores)
        print('>> {}'.format(setting_name))
        print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

    semantic_binning = SemanticBinning(var_dict, max_iter=300000, verbose=False)
    class_var = data[var_dict['class_var']]

    settings = ('dummy_only', 'equal_width', 'equal_freq', 'semantic_binning')
    scores = {name: [] for name in settings}

    for trn_idx, val_idx in split_idxs_for_cross_validation(cv):
        # The fold indices are positions, hence .iloc rather than label lookup.
        trn_y = class_var.iloc[trn_idx]
        val_y = class_var.iloc[val_idx]

        trn_data_handler = DataHandler(data.iloc[trn_idx], var_dict)
        val_data_handler = DataHandler(data.iloc[val_idx], var_dict)

        trn_dummy_coded = trn_data_handler.get_dummy_coded_data('dummy_only')
        val_dummy_coded = val_data_handler.get_dummy_coded_data('dummy_only')
        scores['dummy_only'].append(
            get_val_score(trn_dummy_coded, trn_y, val_dummy_coded, val_y))

        # Unsupervised binning: learn the bins on the training rows, then
        # re-use exactly those bins for the validation rows.
        for binning in ('equal_width', 'equal_freq'):
            trn_binned = trn_data_handler.get_dummy_coded_data(binning, n_init_bins=10)
            bins = trn_data_handler.get_bins_by_variable_from_data(trn_binned)
            val_binned = val_data_handler.get_dummy_coded_data(bins_by_variable=bins)
            scores[binning].append(
                get_val_score(trn_binned, trn_y, val_binned, val_y))

        # Re-slice instead of re-using the frames given to DataHandler, which
        # is not guaranteed here to leave its input untouched.
        trn_sb = semantic_binning.fit_transform(data.iloc[trn_idx], n_init_bins=10)
        val_sb = semantic_binning.transform(data.iloc[val_idx])
        scores['semantic_binning'].append(
            get_val_score(trn_sb, trn_y, val_sb, val_y))

    print('{} fold cv score'.format(cv))
    for name in settings:
        print_cv_score(scores[name], name)

In [None]:
# Logistic regression with the default 3 folds; SemanticBinning is refit once per fold.
%time get_cv_error_for_model(LogisticRegression(), hr_data, var_dict)

3 fold cv score
>> dummy_only
Accuracy: 0.896 (+/- 0.004)
>> equal_width
Accuracy: 0.942 (+/- 0.006)
>> equal_freq
Accuracy: 0.937 (+/- 0.002)
>> semantic_binning
Accuracy: 0.908 (+/- 0.010)
CPU times: user 41min 52s, sys: 50.2 s, total: 42min 42s
Wall time: 42min 40s


In [None]:
# Same comparison for a decision tree (default 3 folds). Folds are reshuffled
# on every call, so they differ from the ones used in the other model cells.
%time get_cv_error_for_model(DecisionTreeClassifier(), hr_data, var_dict)

3 fold cv score
>> dummy_only
Accuracy: 0.975 (+/- 0.001)
>> equal_width
Accuracy: 0.967 (+/- 0.004)
>> equal_freq
Accuracy: 0.964 (+/- 0.005)
>> semantic_binning
Accuracy: 0.964 (+/- 0.001)
CPU times: user 43min 50s, sys: 43.9 s, total: 44min 34s
Wall time: 44min 44s


In [None]:
# Same comparison for Bernoulli naive Bayes (default 3 folds).
%time get_cv_error_for_model(BernoulliNB(), hr_data, var_dict)

3 fold cv score
>> dummy_only
Accuracy: 0.879 (+/- 0.007)
>> equal_width
Accuracy: 0.920 (+/- 0.006)
>> equal_freq
Accuracy: 0.907 (+/- 0.005)
>> semantic_binning
Accuracy: 0.855 (+/- 0.010)
CPU times: user 45min 21s, sys: 44.8 s, total: 46min 6s
Wall time: 46min 20s
