In [1]:
import numpy as np
import pandas as pd
import scikit_posthocs as sp
import time
import copy
from tools import *

# bayes classifier
from sklearn.naive_bayes import GaussianNB

# decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# random forest classifier
from sklearn.ensemble import RandomForestClassifier

# simple train/test dataset split function
from sklearn.model_selection import train_test_split

# K-fold cross-validation
from sklearn.model_selection import cross_val_score,StratifiedKFold

# normalization function provided by sklearn
from sklearn import preprocessing

# f1 measure
from sklearn.metrics import f1_score

# kfold function provided by sklearn
from sklearn.model_selection import cross_val_score

# scipy friedman test
from scipy.stats import friedmanchisquare

In [2]:
# Load the raw CSV; the file has no header row.
data = pd.read_csv('spambase.data', header=None)

# Convert the frame to a single array, then slice it column-wise.
samples = np.array(data)
# X: feature matrix (every column except the last)
X = samples[:, :-1]
# Y: label vector (the last column)
Y = samples[:, -1]

In [3]:
# Initialize the three classifiers under comparison.
# The tree-based models draw random numbers while fitting, so they get a fixed
# random_state to make the notebook reproducible under Restart & Run All.
clf_bayes = GaussianNB()
clf_dt = DecisionTreeClassifier(criterion="entropy", random_state=42)
# NOTE(review): the rationale for n_estimators=73 / max_features="log2" is not
# recorded in this notebook — TODO document how these values were chosen.
clf_rf = RandomForestClassifier(
    criterion="gini",
    max_features="log2",
    n_estimators=73,
    random_state=42,
)

In [4]:
# 10-fold cross-validation, stratified on the class label.
# shuffle=True needs a fixed random_state for the folds to be reproducible.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Order matters: it fixes the column order of every result row below
# (Naive Bayes, Decision Tree, Random Forest).
classifiers = (clf_bayes, clf_dt, clf_rf)

# One row per fold, one column per classifier.
t = []    # training time in seconds
acc = []  # accuracy on the held-out fold
f1 = []   # F1 score on the held-out fold

for train_index, test_index in skf.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    fold_acc = []
    fold_t = []
    fold_f1 = []

    for clf in classifiers:
        # Time only the fit call; perf_counter is a monotonic high-resolution
        # clock, which suits short durations better than wall-clock time.time().
        start = time.perf_counter()
        clf.fit(X_train, Y_train)
        fold_t.append(time.perf_counter() - start)

        # Predict once and derive both quality measures from the same predictions.
        y_pred = clf.predict(X_test)
        # Accuracy = fraction of test samples whose prediction equals the label.
        # (The original cell referenced accuracy variables that were never
        # computed, which raised NameError on a fresh kernel.)
        fold_acc.append(float(np.mean(y_pred == Y_test)))
        fold_f1.append(f1_score(Y_test, y_pred))

    acc.append(fold_acc)
    f1.append(fold_f1)
    t.append(fold_t)

In [5]:
# Labels and layout passed to the table helper (generate_measure_table is not
# defined in this notebook; presumably it comes from `tools`).
column_name = ['Naive Bayes', 'Decision Tree','Random Forest']
column_size = len(column_name)
# Row labels D1..D10, one per fold.
fold_index = ['D' + str(fold_no) for fold_no in range(1, 11)]

# Accuracy per fold, followed by the same blank lines as separate prints would give.
df_acc = generate_measure_table(acc, fold_index, column_name, column_size)
print('------ Accuracy of 3 Algorithms for each fold ------', df_acc, '\n', sep='\n')

# F1 per fold.
df_f1 = generate_measure_table(f1, fold_index, column_name, column_size)
print('--------- F1 of 3 Algorithms for each fold ---------', df_f1, '\n', sep='\n')

# Training time per fold (last table, so no trailing blank lines).
df_t = generate_measure_table(t, fold_index, column_name, column_size)
print('-- Time Consumption of 3 Algorithms for each fold --', df_t, sep='\n')



------ Accuracy of 3 Algorithms for each fold ------
    Naive Bayes Decision Tree Random Forest
D1     0.806941      0.906725      0.937093
D2     0.802603      0.937093      0.956616
D3     0.813449      0.915401      0.947939
D4     0.813043      0.928261       0.96087
D5     0.858696      0.919565      0.967391
D6     0.836957      0.906522      0.947826
D7     0.836957      0.928261      0.971739
D8     0.830435      0.917391      0.936957
D9     0.810458      0.925926       0.96732
D10    0.812636      0.928105      0.965142
---    --------      --------      --------
avg    0.822217      0.921325      0.955889
std   0.0177698    0.00998863     0.0127588


--------- F1 of 3 Algorithms for each fold ---------
    Naive Bayes Decision Tree Random Forest
D1     0.796339      0.881543       0.91922
D2     0.793651      0.920548      0.943503
D3     0.799065      0.893733      0.934066
D4     0.803653      0.909589          0.95
D5     0.844869      0.899183      0.958904
D6     0.821

In [6]:
# Rank the three algorithms within every fold, for each measure.
# NOTE(review): the last argument is 0 for accuracy/F1 and 1 for time; presumably
# it selects the ranking direction (higher-is-better vs lower-is-better) —
# confirm against generate_friedman_table.
# These assignments rebind df_acc / df_f1 / df_t to the rank tables.
df_acc = generate_friedman_table(acc, fold_index, column_name, column_size, 0)
print('------ Friedman test on Accuracy ------', df_acc, '\n', sep='\n')

df_f1 = generate_friedman_table(f1, fold_index, column_name, column_size, 0)
print('--------- Friedman test on F1 ---------', df_f1, '\n', sep='\n')

df_t = generate_friedman_table(t, fold_index, column_name, column_size, 1)
print('-- Friedman test on Time Consumption --', df_t, sep='\n')


------ Friedman test on Accuracy ------
         Naive Bayes Decision Tree Random Forest
D1                 3             2             1
D2                 3             2             1
D3                 3             2             1
D4                 3             2             1
D5                 3             2             1
D6                 3             2             1
D7                 3             2             1
D8                 3             2             1
D9                 3             2             1
D10                3             2             1
--------    --------      --------      --------
avg_rank           3             2             1


--------- Friedman test on F1 ---------
         Naive Bayes Decision Tree Random Forest
D1                 3             2             1
D2                 3             2             1
D3                 3             2             1
D4                 3             2             1
D5                 3             2  

In [12]:
# Friedman test: each algorithm contributes one sample, i.e. one column of the
# fold-by-algorithm result matrix.
acc_np = np.array(acc)
f1_np = np.array(f1)
t_np = np.array(t)

# Transposing turns the three columns into three rows, which are then unpacked
# as the separate positional samples friedmanchisquare expects.
# `stat` is overwritten each time; only the p-values are used afterwards.
stat, p_acc = friedmanchisquare(*acc_np.T)
stat, p_f1 = friedmanchisquare(*f1_np.T)
stat, p_t = friedmanchisquare(*t_np.T)

def judge_p(p, alpha, perf_str):
    """Print the outcome of comparing a p-value with a significance level.

    Args:
        p: p-value to evaluate.
        alpha: significance threshold.
        perf_str: name of the measure, inserted into the printed sentence.

    Prints two lines: the comparison, then the verdict. H0 is reported as
    rejected whenever ``p > alpha`` does not hold (so also when p equals alpha).
    Returns None.
    """
    reject_h0 = not (p > alpha)
    if reject_h0:
        print('p = ', p, ' < ', alpha)
        print('Significance difference exists between the', perf_str, 'performance (reject H0)')
        return
    print('p = ', p, ' > ', alpha)
    print('No significance difference between the', perf_str, 'performance (fail to reject H0)')

# Significance level shared by all three tests.
alpha = 0.05

# (headline, p-value, measure name) for each reported test, in print order.
for headline, p_value, measure in (
    ('The friedman Test result on Accuracy', p_acc, 'Accuracy'),
    ('The friedman Test result on F1', p_f1, 'F1'),
    ('The friedman Test result on Computational performance', p_t, 'Computational'),
):
    print(headline)
    judge_p(p_value, alpha, measure)
    print()


The friedman Test result on Accuracy
p =  4.539992976248486e-05  <  0.05
Significance difference exists between the Accuracy performance (reject H0)

The friedman Test result on F1
p =  4.539992976248486e-05  <  0.05
Significance difference exists between the F1 performance (reject H0)

The friedman Test result on Computational performance
p =  4.539992976248486e-05  <  0.05
Significance difference exists between the Computational performance (reject H0)



In [8]:
# Nemenyi post-hoc comparison after the Friedman test, printed in turn for
# accuracy, F1 and training time (rows are folds, columns are algorithms).
for fold_scores in (acc, f1, t):
    print(sp.posthoc_nemenyi_friedman(fold_scores))

          0         1         2
0 -1.000000  0.065303  0.001000
1  0.065303 -1.000000  0.065303
2  0.001000  0.065303 -1.000000
          0         1         2
0 -1.000000  0.065303  0.001000
1  0.065303 -1.000000  0.065303
2  0.001000  0.065303 -1.000000
          0         1         2
0 -1.000000  0.065303  0.001000
1  0.065303 -1.000000  0.065303
2  0.001000  0.065303 -1.000000
