In [84]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# import hdtree
from hdtree import HDTreeClassifier
from information_measure import EntropyMeasure
# import information_measure
# import split_rule
from split_rule import LessThanHalfOfSplit, SingleCategorySplit, FixedValueSplit, TwentyQuantileSplit, LogisticRegressionSingleSplit, AbstractQuantileSplit, TwentyQuantileRangeSplit, TwoQuantileRangeSplit, LogisticRegressionDoubleCategorySplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, roc_curve, roc_auc_score, plot_roc_curve, auc
import pickle
from joblib import dump, load
import json

In [85]:
# code to implement 5 fold cross validation and report p, r, f1 and accuracy
def five_fold(X, y, clf, folds=5):
    """Run stratified k-fold cross-validation and print the fold-averaged scores.

    On every fold ``clf`` is refit on the training rows, its predictions for
    the held-out rows are cast to ``float``, and the fold is scored with
    macro-averaged precision, recall and F1 plus plain accuracy.

    Parameters
    ----------
    X : feature matrix; indexed with the integer index arrays produced by the
        splitter (``X[train_index]``).
    y : label vector, indexed the same way; also used to stratify the folds.
    clf : classifier exposing ``fit`` and ``predict``. The same instance is
        refit on each fold.
    folds : number of stratified folds (also the divisor of the averages).

    Nothing is returned: the four averages are printed.
    """
    splitter = StratifiedKFold(n_splits=folds)
    # Running sums, keyed by the exact label printed in front of each average.
    totals = {'Precision: ': 0, 'Recall: ': 0, 'F1: ': 0, 'Accuracy: ': 0}
    for fit_rows, eval_rows in splitter.split(X, y):
        clf.fit(X[fit_rows], y[fit_rows])
        truth = y[eval_rows]
        # Predictions are converted to plain floats before being scored.
        guesses = list(map(float, clf.predict(X[eval_rows])))
        totals['Precision: '] += precision_score(truth, guesses, average='macro')
        totals['Recall: '] += recall_score(truth, guesses, average='macro')
        totals['F1: '] += f1_score(truth, guesses, average='macro')
        totals['Accuracy: '] += accuracy_score(truth, guesses)
    # Report the mean of each score over the folds.
    for label, total in totals.items():
        print(label, total/folds)

# Dataset 1
## Median Dataset

In [86]:
train_data = pd.read_csv('../../data/train1median.csv')
test_data = pd.read_csv('../../data/test1median.csv')

In [87]:
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [88]:
totalData = pd.concat([train_data, test_data], axis=0)

In [89]:
X, y = totalData.iloc[:, :], totalData.iloc[:, -1]

In [90]:
col_names = [*train_data.columns[:]]

### Single Attribute

In [91]:
treeData1Median = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [93]:
five_fold(X.values, y.values, treeData1Median)

Precision:  0.8402636497628009
Recall:  0.9215745544777804
F1:  0.8726566032081594
Accuracy:  0.9639780679604646


### Multiple Attributes

In [94]:
treeData1Median = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [96]:
five_fold(X.values, y.values, treeData1Median)

Precision:  0.8526173666502894
Recall:  0.7371005338747274
F1:  0.7661645534468129
Accuracy:  0.9507611283457182


# Dataset 2

In [181]:
# code to implement 5 fold cross validation and report p, r, f1 and accuracy
def five_fold_dataset2(X, y, clf, folds=5, stratify=None):
    """Collect, per fold, column 1 of ``clf.predict_proba`` on the held-out rows.

    Unlike ``five_fold`` this function computes no metrics and prints nothing;
    it only returns the held-out scores so that several runs can be combined
    afterwards.

    Parameters
    ----------
    X : feature matrix; indexed with the integer index arrays produced by the
        splitter (``X[train_index]``).
    y : label vector used to fit ``clf`` on each fold.
    clf : classifier exposing ``fit`` and ``predict_proba``. The same instance
        is refit on each fold.
    folds : number of stratified folds.
    stratify : optional label vector used ONLY to build the folds. When omitted
        the folds are stratified on ``y`` (the original behaviour).
        StratifiedKFold derives fold membership from the labels it is given,
        so runs whose outputs are later compared row by row (for example
        several binary relabelings of the same rows) only share folds if they
        are all given the same ``stratify`` vector.

    Returns
    -------
    list
        One entry per fold, in fold order: ``clf.predict_proba(X_test)[:, 1]``
        for that fold's held-out rows.
    """
    fold_labels = y if stratify is None else stratify
    splitter = StratifiedKFold(n_splits=folds)
    fold_scores = []
    for train_index, test_index in splitter.split(X, fold_labels):
        # Refit on this fold's training rows, then score only the held-out rows.
        clf.fit(X[train_index], y[train_index])
        fold_scores.append(clf.predict_proba(X[test_index])[:, 1])
    return fold_scores

In [210]:
def performFiveFold(probabs1, probabs2, probabs3, X, y, folds=5):
    """Score three per-fold probability lists as a 3-class argmax classifier.

    For fold ``i`` the three score vectors ``probabs1[i]``, ``probabs2[i]`` and
    ``probabs3[i]`` are placed side by side as columns; the predicted label of
    a row is the 1-based index of its largest column (1, 2 or 3). Predictions
    are compared with ``y`` on the held-out rows of fold ``i`` of a
    ``StratifiedKFold`` split of ``(X, y)``.

    Parameters
    ----------
    probabs1, probabs2, probabs3 : sequences with one array per fold; each
        array must support ``.reshape(-1, 1)``.
    X : only passed to the splitter; its values are never read here.
    y : true labels, indexed with the splitter's integer index arrays.
    folds : number of stratified folds (also the divisor of the averages).

    Nothing is returned: the fold-averaged macro precision, recall, F1 and the
    accuracy are printed.

    NOTE(review): this assumes row ``j`` of every ``probabs*[i]`` belongs to
    row ``j`` of this function's fold-``i`` held-out set. That only holds if
    the probability lists were produced with the same fold assignment as
    ``StratifiedKFold(n_splits=folds).split(X, y)`` -- confirm at the call site.
    """
    skf = StratifiedKFold(n_splits=folds)
    p, r, f1, acc = 0, 0, 0, 0
    # Only the held-out indices are needed; the training part of each split
    # (and any slice of X) was never used, so it is no longer materialised.
    for idx, (_, test_index) in enumerate(skf.split(X, y)):
        y_test = y[test_index]
        # One column per probability list, one row per held-out sample.
        probabs = np.hstack((probabs1[idx].reshape(-1, 1),
                             probabs2[idx].reshape(-1, 1),
                             probabs3[idx].reshape(-1, 1)))
        # Column 0/1/2 with the highest score maps to label 1/2/3.
        y_pred = np.argmax(probabs, axis=1) + 1
        p += precision_score(y_test, y_pred, average='macro')
        r += recall_score(y_test, y_pred, average='macro')
        f1 += f1_score(y_test, y_pred, average='macro')
        acc += accuracy_score(y_test, y_pred)
    print('Precision: ', p/folds)
    print('Recall: ', r/folds)
    print('F1: ', f1/folds)
    print('Accuracy: ', acc/folds)

In [211]:
train1_data = pd.read_csv('../../data/train2Ones.csv')
test1_data = pd.read_csv('../../data/test2Ones.csv')

In [184]:
train1_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test1_data.drop(['Unnamed: 0'], axis=1, inplace=True)
train1_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)
test1_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)

In [185]:
totalData1 = pd.concat([train1_data, test1_data], axis=0)

In [186]:
X1, y1 = totalData1.iloc[:, :], totalData1.iloc[:, -1]

In [187]:
col1_names = [*train1_data.columns[:]]

In [188]:
train2_data = pd.read_csv('../../data/train2Twos.csv')
test2_data = pd.read_csv('../../data/test2Twos.csv')

In [189]:
train2_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test2_data.drop(['Unnamed: 0'], axis=1, inplace=True)
train2_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)
test2_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)

In [190]:
totalData2 = pd.concat([train2_data, test2_data], axis=0)

In [191]:
X2, y2 = totalData2.iloc[:, :], totalData2.iloc[:, -1]

In [192]:
col2_names = [*train2_data.columns[:]]

In [193]:
train3_data = pd.read_csv('../../data/train2Threes.csv')
test3_data = pd.read_csv('../../data/test2Threes.csv')

In [194]:
train3_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test3_data.drop(['Unnamed: 0'], axis=1, inplace=True)
train3_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)
test3_data.drop(['Unnamed: 0.1'], axis=1, inplace=True)

In [195]:
totalData3 = pd.concat([train3_data, test3_data], axis=0)

In [196]:
X3, y3 = totalData3.iloc[:, :], totalData3.iloc[:, -1]

In [197]:
col3_names = [*train3_data.columns[:]]

In [198]:
train_data = pd.read_csv('../../data/train2.csv')
test_data = pd.read_csv('../../data/test2.csv')

In [199]:
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [200]:
totalData = pd.concat([train_data, test_data], axis=0)

In [201]:
X, y = totalData.iloc[:, :], totalData.iloc[:, -1]

In [202]:
col_names = [*train_data.columns[:]]

### Single Attribute

In [203]:
treeData2Ones = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)
treeData2Twos = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)
treeData2Threes = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [204]:
onesProbab = five_fold_dataset2(X1.values, y1.values, treeData2Ones)
twosProbab = five_fold_dataset2(X2.values, y2.values, treeData2Twos)
threesProbab = five_fold_dataset2(X3.values, y3.values, treeData2Threes)

In [212]:
performFiveFold(onesProbab, twosProbab, threesProbab, X.values, y.values, 5)

Precision:  0.7380282494943191
Recall:  0.4893432040428185
F1:  0.5315554623060486
Accuracy:  0.8197115384615385


### Multiple Attributes

In [213]:
treeData2Ones = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()],
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)
treeData2Twos = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()],
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)
treeData2Threes = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()],
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [214]:
onesProbab = five_fold_dataset2(X1.values, y1.values, treeData2Ones)
twosProbab = five_fold_dataset2(X2.values, y2.values, treeData2Twos)
threesProbab = five_fold_dataset2(X3.values, y3.values, treeData2Threes)

In [215]:
performFiveFold(onesProbab, twosProbab, threesProbab, X.values, y.values, 5)

Precision:  0.6303684351344956
Recall:  0.40202834898012385
F1:  0.41584704813210943
Accuracy:  0.7923076923076924


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Dataset 3
## One Hot Dataset

In [97]:
train_data = pd.read_csv('../../data/train3onehot.csv')
test_data = pd.read_csv('../../data/test3onehot.csv')

In [98]:
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [99]:
totalData = pd.concat([train_data, test_data], axis=0)

In [100]:
X, y = totalData.iloc[:, :], totalData.iloc[:, -1]

In [101]:
col_names = [*train_data.columns[:]]

### Single Attribute

In [102]:
treeData3OHT = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [104]:
five_fold(X.values, y.values, treeData3OHT)

Precision:  0.7780456056956732
Recall:  0.668827072385594
F1:  0.7049257170217207
Accuracy:  0.9047827505102646


### Multiple Attributes

In [105]:
treeData3OHT = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [106]:
five_fold(X.values, y.values, treeData3OHT)

Precision:  0.763879372953405
Recall:  0.6569969445431458
F1:  0.6913703612080084
Accuracy:  0.901064507217237
