In [84]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# import hdtree
from hdtree import HDTreeClassifier
from information_measure import EntropyMeasure
# import information_measure
# import split_rule
from split_rule import LessThanHalfOfSplit, SingleCategorySplit, FixedValueSplit, TwentyQuantileSplit, LogisticRegressionSingleSplit, AbstractQuantileSplit, TwentyQuantileRangeSplit, TwoQuantileRangeSplit, LogisticRegressionDoubleCategorySplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, roc_curve, roc_auc_score, plot_roc_curve, auc
import pickle
from joblib import dump, load
import json

In [85]:
# code to implement 5 fold cross validation and report p, r, f1 and accuracy
def five_fold(X, y, clf, folds=5):
    # X is the feature matrix
    # y is the label vector
    # clf is the classifier
    # folds is the number of folds
    # returns the average p, r, f1 and accuracy
    # create the folds
    skf = StratifiedKFold(n_splits=folds)
    # initialize the scores
    p, r, f1, acc = 0, 0, 0, 0
    # loop through the folds
    for train_index, test_index in skf.split(X, y):
        # split the data into train and test
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # fit the model
        clf.fit(X_train, y_train)
        # predict the labels
        y_pred = clf.predict(X_test)
        y_pred = [float(i) for i in y_pred]
        # calculate the scores
        p += precision_score(y_test, y_pred, average='macro')
        r += recall_score(y_test, y_pred, average='macro')
        f1 += f1_score(y_test, y_pred, average='macro')
        acc += accuracy_score(y_test, y_pred)
    # return the average scores
    print('Precision: ', p/folds)
    print('Recall: ', r/folds)
    print('F1: ', f1/folds)
    print('Accuracy: ', acc/folds)

# Dataset 1
## Median Dataset

In [86]:
train_data = pd.read_csv('../../data/train1median.csv')
test_data = pd.read_csv('../../data/test1median.csv')

In [87]:
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [88]:
totalData = pd.concat([train_data, test_data], axis=0)

In [89]:
X, y = totalData.iloc[:, :], totalData.iloc[:, -1]

In [90]:
col_names = [*train_data.columns[:]]

### Single Attribute

In [91]:
treeData1Median = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [93]:
five_fold(X.values, y.values, treeData1Median)

Precision:  0.8402636497628009
Recall:  0.9215745544777804
F1:  0.8726566032081594
Accuracy:  0.9639780679604646


### Multiple Attributes

In [94]:
treeData1Median = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [96]:
five_fold(X.values, y.values, treeData1Median)

Precision:  0.8526173666502894
Recall:  0.7371005338747274
F1:  0.7661645534468129
Accuracy:  0.9507611283457182


# Dataset 2

In [54]:
# code to implement 5 fold cross validation and report p, r, f1 and accuracy
def five_fold_dataset2(X, y, clf, folds=5):
    # X is the feature matrix
    # y is the label vector
    # clf is the classifier
    # folds is the number of folds
    # returns the average p, r, f1 and accuracy
    # create the folds
    skf = StratifiedKFold(n_splits=folds, random_state=42)
    # initialize the scores
    probabs = []
    actuals = []
    # loop through the folds
    for train_index, test_index in skf.split(X, y):
        # split the data into train and test
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # fit the model
        clf.fit(X_train, y_train)
        # predict the labels
        probabs.append(clf.predict_proba(X_test))
        # probs = clf.predict_proba(X_test)
        # y_pred = clf.predict(X_test)
        # y_pred = [float(i) for i in y_pred]
        # calculate the scores
        # p += precision_score(y_test, y_pred, average='macro')
        # r += recall_score(y_test, y_pred, average='macro')
        # f1 += f1_score(y_test, y_pred, average='macro')
        # acc += accuracy_score(y_test, y_pred)
    # return the average scores
    # print('Precision: ', p/folds)
    # print('Recall: ', r/folds)
    # print('F1: ', f1/folds)
    # print('Accuracy: ', acc/folds)
    return probabs

In [None]:
def performFiveFold(probabs1, probabs2, probabs3, y_true)

# Dataset 3
## One Hot Dataset

In [97]:
train_data = pd.read_csv('../../data/train3onehot.csv')
test_data = pd.read_csv('../../data/test3onehot.csv')

In [98]:
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [99]:
totalData = pd.concat([train_data, test_data], axis=0)

In [100]:
X, y = totalData.iloc[:, :], totalData.iloc[:, -1]

In [101]:
col_names = [*train_data.columns[:]]

### Single Attribute

In [102]:
treeData3OHT = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionSingleSplit.build(), TwoQuantileRangeSplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [104]:
five_fold(X.values, y.values, treeData3OHT)

Precision:  0.7780456056956732
Recall:  0.668827072385594
F1:  0.7049257170217207
Accuracy:  0.9047827505102646


### Multiple Attributes

In [105]:
treeData3OHT = HDTreeClassifier(
                    allowed_splits=[LogisticRegressionDoubleCategorySplit.build()], 
                    information_measure=EntropyMeasure(), 
                    attribute_names=col_names, 
                    max_levels=5,
                    min_samples_at_leaf=10)

In [106]:
five_fold(X.values, y.values, treeData3OHT)

Precision:  0.763879372953405
Recall:  0.6569969445431458
F1:  0.6913703612080084
Accuracy:  0.901064507217237
