In [274]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# import hdtree
from hdtree import HDTreeClassifier
from information_measure import EntropyMeasure
# import information_measure
# import split_rule
from split_rule import LessThanHalfOfSplit, SingleCategorySplit, FixedValueSplit, TwentyQuantileSplit, LogisticRegressionSingleSplit, AbstractQuantileSplit, TwentyQuantileRangeSplit, TwoQuantileRangeSplit, LogisticRegressionDoubleCategorySplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, roc_curve, roc_auc_score, plot_roc_curve, auc
import pickle
from joblib import dump, load
import json

In [275]:
# code to implement 5 fold cross validation and report p, r, f1 and accuracy
def five_fold(X, y, clf, folds=5, return_fold_scores=False):
    """Run stratified k-fold cross-validation and print macro-averaged metrics.

    Parameters
    ----------
    X : array indexable with integer index arrays (e.g. numpy.ndarray)
        Feature matrix, one row per sample.
    y : array indexable with integer index arrays
        Label vector aligned with the rows of X.
    clf : object exposing ``fit(X, y)`` and ``predict(X)``
        ``fit`` is called again on the training rows of every fold.
    folds : int, default 5
        Number of stratified folds.
    return_fold_scores : bool, default False
        When False (the original behaviour) the mean accuracy is returned.
        When True the list of per-fold accuracies is returned instead, which
        is what a significance test comparing two models needs as input.

    Returns
    -------
    float or list of float
        Mean accuracy over the folds, or the per-fold accuracies when
        ``return_fold_scores`` is True.
    """
    skf = StratifiedKFold(n_splits=folds)
    precisions, recalls, f1s, accuracies = [], [], [], []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        # Predictions are cast to float before scoring (presumably because the
        # classifier returns labels in a different type than y - TODO confirm).
        y_pred = [float(label) for label in clf.predict(X_test)]

        precisions.append(precision_score(y_test, y_pred, average='macro'))
        recalls.append(recall_score(y_test, y_pred, average='macro'))
        f1s.append(f1_score(y_test, y_pred, average='macro'))
        accuracies.append(accuracy_score(y_test, y_pred))

    # Averages are taken over the requested number of folds, as before.
    print('Precision: ', sum(precisions)/folds)
    print('Recall: ', sum(recalls)/folds)
    print('F1: ', sum(f1s)/folds)
    print('Accuracy: ', sum(accuracies)/folds)

    if return_fold_scores:
        return [float(score) for score in accuracies]
    return sum(accuracies)/folds

In [276]:
# code for t test to evaluate two models
def t_test(model1, model2):
    """Compare two models' accuracies with an independent two-sample t-test.

    Null hypothesis: model1 and model2 are the same.

    Parameters
    ----------
    model1, model2 : float or sequence of float
        Accuracies of each model (for example one value per CV fold). A bare
        float is treated as a single observation.

    Returns
    -------
    tuple of (float, float)
        The t statistic and the two-sided p value. Both are NaN when the test
        is undefined, e.g. fewer than two accuracies for either model.

    The verdict is printed. model1 is reported as the "single attribute"
    model and model2 as the "double attribute" model.
    """
    import numpy as np
    from scipy import stats

    # Flatten rather than wrap in a list: wrapping a list of accuracies gave a
    # (1, n) array, i.e. n separate tests with one observation per group.
    scores1 = np.asarray(model1, dtype=float).ravel()
    scores2 = np.asarray(model2, dtype=float).ravel()

    print("NULL HYPOTHESIS: model1 and model2 are the same")
    if scores1.size < 2 or scores2.size < 2:
        # The sample variance needs at least two observations per group.
        t, p = float('nan'), float('nan')
    else:
        t, p = (float(value) for value in stats.ttest_ind(scores1, scores2))

    if np.isnan(p):
        print('Fail to reject null hypothesis')
        print('(t test is undefined for these inputs, e.g. fewer than two accuracies per model)')
    elif p < 0.05:
        print('Reject null hypothesis')
        if np.mean(scores1) > np.mean(scores2):
            print('single attribute split model is better than double attribute split model')
        else:
            print('double attribute split model is better than single attribute split model')
    else:
        print('Fail to reject null hypothesis')
    return t, p

In [277]:
# code for wilcoxon signed rank test to evaluate two models
def wilcoxon_test(model1, model2):
    """Compare two models' paired accuracies with the Wilcoxon signed-rank test.

    Null hypothesis: model1 and model2 are the same.

    Parameters
    ----------
    model1, model2 : float or sequence of float
        Paired accuracies (entry i of both must come from the same fold/run).
        A bare float is treated as a single observation.

    Returns
    -------
    tuple of (float, float)
        The test statistic and p value. Both are NaN when there is nothing to
        test (no observations, or every paired difference is zero).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of accuracies.

    The verdict is printed. model1 is reported as the "single attribute"
    model and model2 as the "double attribute" model.
    """
    from scipy.stats import wilcoxon

    # Flatten rather than wrap in a list, which would add a spurious leading
    # axis to a list of accuracies.
    scores1 = np.asarray(model1, dtype=float).ravel()
    scores2 = np.asarray(model2, dtype=float).ravel()
    if scores1.size != scores2.size:
        raise ValueError('wilcoxon_test needs paired accuracies: model1 and model2 must have the same length')

    print("NULL HYPOTHESIS: model1 and model2 are the same")
    if scores1.size == 0 or not np.any(scores1 - scores2):
        # The signed-rank statistic is undefined when all differences are zero.
        t, p = float('nan'), float('nan')
    else:
        t, p = (float(value) for value in wilcoxon(scores1, scores2))

    if np.isnan(p):
        print('Fail to reject null hypothesis')
        print('(wilcoxon test is undefined for these inputs: no non-zero paired differences)')
    elif p < 0.05:
        print('Reject null hypothesis')
        if np.mean(scores1) > np.mean(scores2):
            print('single attribute split model is better than double attribute split model')
        else:
            print('double attribute split model is better than single attribute split model')
    else:
        print('Fail to reject null hypothesis')
    return t, p

In [278]:
# code for mann whitney u test to evaluate two models
def mannwhitneyu_test(model1, model2):
    """Compare two models' accuracies with the Mann-Whitney U test.

    Null hypothesis: model1 and model2 are the same.

    Parameters
    ----------
    model1, model2 : float or sequence of float
        Accuracies of each model (for example one value per CV fold). A bare
        float is treated as a single observation.

    Returns
    -------
    tuple of (float, float)
        The U statistic and the two-sided p value. Both are NaN when there is
        nothing to test (an empty sample, or all pooled values identical).

    The verdict is printed. model1 is reported as the "single attribute"
    model and model2 as the "double attribute" model.
    """
    from scipy.stats import mannwhitneyu

    # Flatten rather than wrap in a list, which would add a spurious leading
    # axis to a list of accuracies.
    scores1 = np.asarray(model1, dtype=float).ravel()
    scores2 = np.asarray(model2, dtype=float).ravel()

    print("NULL HYPOTHESIS: model1 and model2 are the same")
    pooled = np.concatenate((scores1, scores2))
    if scores1.size == 0 or scores2.size == 0 or pooled.max() == pooled.min():
        # No ranking information is available, so skip the scipy call.
        t, p = float('nan'), float('nan')
    else:
        # The null hypothesis is "the same", so request the two-sided test
        # explicitly instead of relying on the scipy default.
        t, p = (float(value) for value in
                mannwhitneyu(scores1, scores2, alternative='two-sided'))

    if np.isnan(p):
        print('Fail to reject null hypothesis')
        print('(mann whitney u test is undefined for these inputs: empty or all-identical accuracies)')
    elif p < 0.05:
        print('Reject null hypothesis')
        if np.mean(scores1) > np.mean(scores2):
            print('single attribute split model is better than double attribute split model')
        else:
            print('double attribute split model is better than single attribute split model')
    else:
        print('Fail to reject null hypothesis')
    return t, p

In [279]:
# code for kruskal wallis test to evaluate two models
def kruskalwallis_test(model1, model2):
    """Compare two models' accuracies with the Kruskal-Wallis H test.

    Null hypothesis: model1 and model2 are the same.

    Parameters
    ----------
    model1, model2 : float or sequence of float
        Accuracies of each model (for example one value per CV fold). A bare
        float is treated as a single observation.

    Returns
    -------
    tuple of (float, float)
        The H statistic and the p value. Both are NaN when there is nothing
        to test (an empty sample, or all pooled values identical).

    The verdict is printed. model1 is reported as the "single attribute"
    model and model2 as the "double attribute" model.
    """
    from scipy.stats import kruskal

    # Each argument to kruskal is one sample, so pass flat 1-D arrays instead
    # of wrapping the inputs in another list.
    scores1 = np.asarray(model1, dtype=float).ravel()
    scores2 = np.asarray(model2, dtype=float).ravel()

    print("NULL HYPOTHESIS: model1 and model2 are the same")
    pooled = np.concatenate((scores1, scores2))
    if scores1.size == 0 or scores2.size == 0 or pooled.max() == pooled.min():
        # No ranking information is available, so skip the scipy call.
        t, p = float('nan'), float('nan')
    else:
        t, p = (float(value) for value in kruskal(scores1, scores2))

    if np.isnan(p):
        print('Fail to reject null hypothesis')
        print('(kruskal wallis test is undefined for these inputs: empty or all-identical accuracies)')
    elif p < 0.05:
        print('Reject null hypothesis')
        if np.mean(scores1) > np.mean(scores2):
            print('single attribute split model is better than double attribute split model')
        else:
            print('double attribute split model is better than single attribute split model')
    else:
        print('Fail to reject null hypothesis')
    return t, p

In [280]:
# code for chi squared test to evaluate two models
def chi_squared_test(model1, model2):
    """Run a chi-squared goodness-of-fit test on the two models' mean accuracies.

    Null hypothesis: model1 and model2 are the same.

    Parameters
    ----------
    model1, model2 : float or sequence of float
        Accuracies of each model. Each input is reduced to its mean, so a
        bare float behaves exactly as it did before and a list of per-fold
        accuracies no longer produces an array-valued p value.

    Returns
    -------
    tuple of (float, float)
        The chi-squared statistic and p value (NaN for empty input or when
        scipy returns NaN).

    NOTE(review): ``chisquare`` treats its input as observed frequencies. For
    two values a and b the statistic is (a - b)**2 / (a + b), which is at most
    1.0 for accuracies in [0, 1] (p of about 0.32 with one degree of freedom),
    so this test cannot reject at the 0.05 level on accuracy proportions.
    Consider comparing counts of correct/incorrect predictions instead.

    The verdict is printed. model1 is reported as the "single attribute"
    model and model2 as the "double attribute" model.
    """
    from scipy.stats import chisquare

    scores1 = np.asarray(model1, dtype=float).ravel()
    scores2 = np.asarray(model2, dtype=float).ravel()

    print("NULL HYPOTHESIS: model1 and model2 are the same")
    if scores1.size == 0 or scores2.size == 0:
        t, p = float('nan'), float('nan')
    else:
        # One observed value per model, as in the original scalar usage.
        observed = [scores1.mean(), scores2.mean()]
        t, p = (float(value) for value in chisquare(observed))

    if np.isnan(p):
        print('Fail to reject null hypothesis')
        print('(chi squared test is undefined for these inputs)')
    elif p < 0.05:
        print('Reject null hypothesis')
        if np.mean(scores1) > np.mean(scores2):
            print('single attribute split model is better than double attribute split model')
        else:
            print('double attribute split model is better than single attribute split model')
    else:
        print('Fail to reject null hypothesis')
    return t, p

In [281]:
def checkStatisticalTests(model1, model2):
    """Run every significance test in this notebook on the same pair of inputs.

    For each test a banner line is printed and then the test function is
    called with ``(model1, model2)``; the test functions print their own
    verdicts. Nothing is returned.
    """
    suite = (
        ('t test results: ', t_test),
        ('\nwilcoxon test results: ', wilcoxon_test),
        ('\nmann whitney u test results: ', mannwhitneyu_test),
        ('\nkruskal wallis test results: ', kruskalwallis_test),
        ('\nchi squared test results: ', chi_squared_test),
    )
    for banner, run_test in suite:
        print(banner)
        run_test(model1, model2)

# Dataset 1
## Median Dataset

In [282]:
# Load the train/test CSVs of the median variant of dataset 1 (paths are relative to the notebook).
train_data = pd.read_csv('../../data/train1median.csv')
test_data = pd.read_csv('../../data/test1median.csv')

In [283]:
# Drop the 'Unnamed: 0' column (presumably a saved index - TODO confirm).
# In-place: re-running this cell without reloading raises KeyError because the column is gone.
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [284]:
# Stack train and test rows; cross-validation below runs over the combined frame.
totalData = pd.concat([train_data, test_data], axis=0)

In [285]:
# y is the last column; X keeps every column (iloc[:, :]), so y is also a column of X.
# NOTE(review): if the last column is the target, the features include the label - confirm this is intended.
X, y = totalData.iloc[:, :], totalData.iloc[:, -1]

In [286]:
# All column names of train_data, passed to the trees as attribute_names.
col_names = [*train_data.columns[:]]

### Single Attribute

In [287]:
# "Single attribute" configuration: the tree may only use the two split rules listed below.
treeData1Median = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    # presumably the depth limit and minimum leaf size - see the HDTree documentation
                    max_levels=5,
                    min_samples_at_leaf=10)

In [288]:
# five_fold prints the averaged metrics and returns the mean accuracy (a single float).
singAcc = five_fold(X.values, y.values, treeData1Median)

Precision:  0.8402636497628009
Recall:  0.9215745544777804
F1:  0.8726566032081594
Accuracy:  0.9639780679604646


### Multiple Attributes

In [289]:
# "Multiple attributes" configuration: only LogisticRegressionDoubleCategorySplit is allowed.
# Rebinds treeData1Median, replacing the single-attribute tree defined earlier.
treeData1Median = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [290]:
# five_fold prints the averaged metrics and returns the mean accuracy (a single float).
multiAcc = five_fold(X.values, y.values, treeData1Median)

Precision:  0.8526173666502894
Recall:  0.7371005338747274
F1:  0.7661645534468129
Accuracy:  0.9507611283457182


In [291]:
# singAcc and multiAcc are the single mean-accuracy floats returned by five_fold, so every
# test below receives one observation per model.
checkStatisticalTests(singAcc, multiAcc)

t test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

wilcoxon test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

mann whitney u test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

kruskal wallis test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

chi squared test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


# Dataset 2

In [292]:
# code to collect, per stratified fold, the predicted probability of the second class
def five_fold_dataset2(X, y, clf, folds=5):
    """Fit ``clf`` on each stratified fold and collect test-fold probabilities.

    Parameters
    ----------
    X : array indexable with integer index arrays (e.g. numpy.ndarray)
        Feature matrix, one row per sample.
    y : array indexable with integer index arrays
        Label vector aligned with the rows of X; also used for stratification.
    clf : object exposing ``fit(X, y)`` and ``predict_proba(X)``
        ``fit`` is called again on the training rows of every fold.
    folds : int, default 5
        Number of stratified folds.

    Returns
    -------
    list
        One entry per fold, in fold order: column 1 of
        ``clf.predict_proba(X_test)`` for that fold's test rows
        (presumably the positive-class probability - TODO confirm the
        classifier's column order).

    No metrics are computed or printed here.
    """
    skf = StratifiedKFold(n_splits=folds)
    probabs = []
    for train_index, test_index in skf.split(X, y):
        clf.fit(X[train_index], y[train_index])
        probabs.append(clf.predict_proba(X[test_index])[:, 1])
    return probabs

In [293]:
def performFiveFold(probabs1, probabs2, probabs3, X, y, folds=5, return_fold_scores=False):
    """Score a three-way argmax over per-fold one-vs-rest probabilities.

    For fold i the three probability vectors are placed side by side and the
    predicted label is ``argmax + 1``, i.e. ``probabs1`` votes for label 1,
    ``probabs2`` for label 2 and ``probabs3`` for label 3. Predictions are
    scored against ``y`` on the i-th test fold of ``StratifiedKFold`` over
    ``(X, y)``.

    Parameters
    ----------
    probabs1, probabs2, probabs3 : sequence of numpy arrays
        Entry i holds the scores for the i-th test fold.
    X, y : arrays indexable with integer index arrays
        Only used to regenerate the folds and to read the true labels.
    folds : int, default 5
        Number of stratified folds.
    return_fold_scores : bool, default False
        When False (the original behaviour) the mean accuracy is returned;
        when True the list of per-fold accuracies is returned instead.

    Returns
    -------
    float or list of float

    Raises
    ------
    ValueError
        If a fold's probability vectors do not have one row per test label.

    NOTE(review): the folds are recomputed here from ``y``. The probability
    entries are only aligned with these test rows if they were produced from
    splits that selected the same rows in the same order. StratifiedKFold
    derives its folds from the labels it is given, so verify this whenever
    the probabilities were generated with different label vectors.
    """
    skf = StratifiedKFold(n_splits=folds)
    precisions, recalls, f1s, accuracies = [], [], [], []

    for idx, (_, test_index) in enumerate(skf.split(X, y)):
        y_test = y[test_index]
        probabs = np.hstack((probabs1[idx].reshape(-1, 1),
                             probabs2[idx].reshape(-1, 1),
                             probabs3[idx].reshape(-1, 1)))
        if probabs.shape[0] != len(y_test):
            raise ValueError(
                'fold %d: got %d probability rows for %d test labels; the '
                'probabilities were not produced on the same folds'
                % (idx, probabs.shape[0], len(y_test)))

        # Column k of the stacked scores corresponds to class label k + 1.
        y_pred = np.argmax(probabs, axis=1) + 1
        precisions.append(precision_score(y_test, y_pred, average='macro'))
        recalls.append(recall_score(y_test, y_pred, average='macro'))
        f1s.append(f1_score(y_test, y_pred, average='macro'))
        accuracies.append(accuracy_score(y_test, y_pred))

    print('Precision: ', sum(precisions)/folds)
    print('Recall: ', sum(recalls)/folds)
    print('F1: ', sum(f1s)/folds)
    print('Accuracy: ', sum(accuracies)/folds)

    if return_fold_scores:
        return [float(score) for score in accuracies]
    return sum(accuracies)/folds

In [294]:
# Dataset 2 is loaded as three separate file pairs ("Ones", "Twos", "Threes"); each gets its own
# probability model below (presumably one-vs-rest targets for labels 1, 2 and 3 - TODO confirm).
train1_data = pd.read_csv('../../data/train2Ones.csv')
test1_data = pd.read_csv('../../data/test2Ones.csv')

In [295]:
# Both 'Unnamed: 0' and 'Unnamed: 0.1' are dropped (presumably saved index columns - TODO confirm).
# In-place: re-running this cell without reloading raises KeyError.
train1_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test1_data.drop(['Unnamed: 0'], axis=1, inplace=True)
train1_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)
test1_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)

In [296]:
totalData1 = pd.concat([train1_data, test1_data], axis=0)

In [297]:
# y1 is the last column; X1 keeps every column, so y1 is also a column of X1.
# NOTE(review): if the last column is the target, the features include the label - confirm.
X1, y1 = totalData1.iloc[:, :], totalData1.iloc[:, -1]

In [298]:
# Not referenced by the tree definitions later in this notebook, which pass col_names.
col1_names = [*train1_data.columns[:]]

In [299]:
# Same preparation for the "Twos" files.
train2_data = pd.read_csv('../../data/train2Twos.csv')
test2_data = pd.read_csv('../../data/test2Twos.csv')

In [300]:
train2_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test2_data.drop(['Unnamed: 0'], axis=1, inplace=True)
train2_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)
test2_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)

In [301]:
totalData2 = pd.concat([train2_data, test2_data], axis=0)

In [302]:
# As above: X2 contains every column, including the one used as y2.
X2, y2 = totalData2.iloc[:, :], totalData2.iloc[:, -1]

In [303]:
# Not referenced by the tree definitions later in this notebook, which pass col_names.
col2_names = [*train2_data.columns[:]]

In [304]:
# Same preparation for the "Threes" files.
train3_data = pd.read_csv('../../data/train2Threes.csv')
test3_data = pd.read_csv('../../data/test2Threes.csv')

In [305]:
train3_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test3_data.drop(['Unnamed: 0'], axis=1, inplace=True)
train3_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)
test3_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)

In [306]:
totalData3 = pd.concat([train3_data, test3_data], axis=0)

In [307]:
# As above: X3 contains every column, including the one used as y3.
X3, y3 = totalData3.iloc[:, :], totalData3.iloc[:, -1]

In [308]:
# Not referenced by the tree definitions later in this notebook, which pass col_names.
col3_names = [*train3_data.columns[:]]

In [309]:
# Full dataset 2. This rebinds train_data, test_data, totalData, X, y and col_names,
# replacing the dataset 1 objects that used these names earlier in the notebook.
train_data = pd.read_csv('../../data/train2.csv')
test_data = pd.read_csv('../../data/test2.csv')

In [310]:
# Only 'Unnamed: 0' is dropped here (the Ones/Twos/Threes files also drop 'Unnamed: 0.1').
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [311]:
totalData = pd.concat([train_data, test_data], axis=0)

In [312]:
# y is the last column; X keeps every column, so y is also a column of X.
X, y = totalData.iloc[:, :], totalData.iloc[:, -1]

In [313]:
# Used as attribute_names by all three dataset 2 trees below.
col_names = [*train_data.columns[:]]

### Single Attribute

In [314]:
# "Single attribute" configuration, one identically configured tree per Ones/Twos/Threes dataset.
# All three receive col_names (from train2.csv); col1_names/col2_names/col3_names are not used.
treeData2Ones = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)
treeData2Twos = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)
treeData2Threes = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [315]:
# Each call runs its own StratifiedKFold over that dataset's labels (y1, y2, y3) and returns,
# per fold, predict_proba(...)[:, 1] for the test rows.
onesProbab = five_fold_dataset2(X1.values, y1.values, treeData2Ones)
twosProbab = five_fold_dataset2(X2.values, y2.values, treeData2Twos)
threesProbab = five_fold_dataset2(X3.values, y3.values, treeData2Threes)

In [316]:
# performFiveFold re-splits (X, y) and pairs fold i with the i-th probability arrays.
# NOTE(review): those arrays came from splits stratified on y1/y2/y3, not on y - confirm that
# the test rows of each fold actually coincide.
singAcc = performFiveFold(onesProbab, twosProbab, threesProbab, X.values, y.values, 5)

Precision:  0.7380282494943191
Recall:  0.4893432040428185
F1:  0.5315554623060486
Accuracy:  0.8197115384615385


### Multiple Attributes

In [317]:
# "Multiple attributes" configuration: only LogisticRegressionDoubleCategorySplit is allowed.
# Rebinds the three tree names; all three again receive col_names (from train2.csv).
treeData2Ones = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()],
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)
treeData2Twos = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()],
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)
treeData2Threes = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()],
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [318]:
# Overwrites the single-attribute probability lists with the multi-attribute ones.
onesProbab = five_fold_dataset2(X1.values, y1.values, treeData2Ones)
twosProbab = five_fold_dataset2(X2.values, y2.values, treeData2Twos)
threesProbab = five_fold_dataset2(X3.values, y3.values, treeData2Threes)

In [319]:
# NOTE(review): same fold-alignment assumption as the single-attribute run - the probability
# arrays come from splits stratified on y1/y2/y3 while the scoring folds are stratified on y.
multiAcc = performFiveFold(onesProbab, twosProbab, threesProbab, X.values, y.values, 5)

Precision:  0.6303684351344956
Recall:  0.40202834898012385
F1:  0.41584704813210943
Accuracy:  0.7923076923076924


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [320]:
# singAcc and multiAcc are the single mean-accuracy floats returned by performFiveFold, so
# every test below receives one observation per model.
checkStatisticalTests(singAcc, multiAcc)

t test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

wilcoxon test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

mann whitney u test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

kruskal wallis test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

chi squared test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


# Dataset 3
## One Hot Dataset

In [321]:
# Load the train/test CSVs of the one-hot variant of dataset 3. This rebinds train_data,
# test_data, totalData, X, y and col_names, replacing the dataset 2 objects.
train_data = pd.read_csv('../../data/train3onehot.csv')
test_data = pd.read_csv('../../data/test3onehot.csv')

In [322]:
# Drop the 'Unnamed: 0' column (presumably a saved index - TODO confirm).
# In-place: re-running this cell without reloading raises KeyError.
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [323]:
totalData = pd.concat([train_data, test_data], axis=0)

In [324]:
# y is the last column; X keeps every column (iloc[:, :]), so y is also a column of X.
# NOTE(review): if the last column is the target, the features include the label - confirm.
X, y = totalData.iloc[:, :], totalData.iloc[:, -1]

In [325]:
# All column names of train_data, passed to the trees as attribute_names.
col_names = [*train_data.columns[:]]

### Single Attribute

In [326]:
# "Single attribute" configuration: the tree may only use the two split rules listed below.
treeData3OHT = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [327]:
# five_fold prints the averaged metrics and returns the mean accuracy (a single float).
singAcc = five_fold(X.values, y.values, treeData3OHT)

Precision:  0.7780456056956732
Recall:  0.668827072385594
F1:  0.7049257170217207
Accuracy:  0.9047827505102646


### Multiple Attributes

In [328]:
# "Multiple attributes" configuration: only LogisticRegressionDoubleCategorySplit is allowed.
# Rebinds treeData3OHT, replacing the single-attribute tree defined earlier.
treeData3OHT = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [329]:
# five_fold prints the averaged metrics and returns the mean accuracy (a single float).
multiAcc = five_fold(X.values, y.values, treeData3OHT)

Precision:  0.763879372953405
Recall:  0.6569969445431458
F1:  0.6913703612080084
Accuracy:  0.901064507217237


In [330]:
# singAcc and multiAcc are the single mean-accuracy floats returned by five_fold, so every
# test below receives one observation per model.
checkStatisticalTests(singAcc, multiAcc)

t test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

wilcoxon test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

mann whitney u test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

kruskal wallis test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis

chi squared test results: 
NULL HYPOTHESIS: model1 and model2 are the same
Fail to reject null hypothesis


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
