In [1]:
import numpy as np
import pandas as pd
import time
import copy
from tools import *

# bayes classifier
from sklearn.naive_bayes import GaussianNB

# decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# random forest classifier
from sklearn.ensemble import RandomForestClassifier

# simple train/test dataset split function
from sklearn.model_selection import train_test_split

# K-fold cross-validation
from sklearn.model_selection import cross_val_score,StratifiedKFold

# f1 measure
from sklearn.metrics import f1_score

# scipy friedman test
from scipy.stats import friedmanchisquare

# nemenyi test function inside
import scikit_posthocs as sp

In [2]:
# ---------------------- tool functions , just run and skip it ----------------------

# generate the per-fold measure table (with mean / std summary rows)
def generate_measure_table(data, row_index, column_name, column_size):
    """Build a table of per-fold measure values plus summary rows.

    Parameters
    ----------
    data : list of rows (one row per fold, one value per algorithm).
    row_index : labels for the fold rows.
    column_name : labels for the algorithm columns.
    column_size : number of algorithm columns in ``data``.

    Returns
    -------
    pandas.DataFrame
        The fold rows followed by a ``'--------'`` separator row, an
        ``'avg     '`` row (column means) and a ``'std'`` row (column sample
        standard deviations), with ``column_name`` as the column labels.
    """
    df = pd.DataFrame(data, index=row_index)

    # statistics are computed on the fold rows only, before any summary row
    # is appended to the frame
    measured = df.iloc[:, :column_size]
    mean = measured.mean().values.tolist()
    std = measured.std().values.tolist()

    # the separator is sized from column_size so the table is not limited to
    # exactly three algorithms
    df.loc['--------'] = ['--------'] * column_size
    df.loc['avg     '] = mean
    df.loc['std'] = std

    # add column name
    df.columns = column_name
    return df

# generate friedman table
def _average_ranks(row, ascending):
    # Rank the values of one fold from 1 to len(row); values that compare equal
    # share the mean of the positions they occupy.
    n = len(row)
    order = sorted(range(n), key=lambda i: row[i], reverse=not ascending)
    ranks = [0] * n
    start = 0
    while start < n:
        # extend [start, stop] over the run of equal values
        stop = start
        while stop + 1 < n and row[order[stop + 1]] == row[order[start]]:
            stop += 1
        if stop == start:
            shared = start + 1  # untied: keep an integer rank
        else:
            shared = (start + stop) / 2 + 1  # mean of 1-based positions
        for pos in range(start, stop + 1):
            ranks[order[pos]] = shared
        start = stop + 1
    return ranks


def generate_friedman_table(data, row_index, column_name, column_size, is_time):
    """Build the per-fold rank table with an average-rank summary row.

    Parameters
    ----------
    data : list of rows (one row per fold, one value per algorithm); it is
        not modified.
    row_index : labels for the fold rows.
    column_name : labels for the algorithm columns.
    column_size : number of algorithm columns in ``data``.
    is_time : ``0`` ranks the largest value in a row as 1 (higher is better);
        any other value ranks the smallest value as 1 (lower is better).

    Returns
    -------
    pandas.DataFrame
        The rank rows followed by a ``'--------'`` separator row and an
        ``'avg_rank'`` row holding the mean rank of every column.

    Notes
    -----
    Tied values within a fold receive the average of the ranks they span,
    instead of arbitrary distinct ranks decided by column order.
    """
    ascending = is_time != 0
    data_rank = [_average_ranks(row, ascending) for row in data]

    df = pd.DataFrame(data_rank, index=row_index)

    # mean rank per algorithm, computed before the summary rows are appended
    mean = df.iloc[:, :column_size].mean().values.tolist()

    # separator sized from column_size rather than a fixed three entries
    df.loc['--------'] = ['--------'] * column_size
    df.loc['avg_rank'] = mean

    # add column name
    df.columns = column_name
    return df

# run the Friedman test and compare its statistic Q with a critical value
def judge_stat_of_Friedman(data, stat, perf_str):
    """Print whether the Friedman statistic of ``data`` exceeds ``stat``.

    Parameters
    ----------
    data : 2-D array-like, one row per fold and one column per algorithm.
    stat : critical value the Friedman statistic Q is compared against.
    perf_str : name of the measure, used only in the printed messages.

    Every column of ``data`` is passed to ``friedmanchisquare`` as one sample,
    so the number of algorithms is not fixed at three. Nothing is returned;
    the verdict is printed.
    """
    data_np = np.array(data)
    # rows of the transpose are the per-algorithm samples
    q, _ = friedmanchisquare(*data_np.T)
    if q < stat:
        print('Q = ',q , ' < ', stat)
        print('No significant difference between the',perf_str,'performance (fail to reject H0)')
    else:
        print('Q = ',q , ' > ', stat)
        print('Significant difference exists between the',perf_str,'performance (reject H0)')

# Nemenyi post-hoc comparison between every pair of algorithms
def generate_nemenyi_table(data, column_name, measure):
    """Run ``sp.posthoc_nemenyi_friedman`` on ``data`` and print the result.

    data        -- per-fold measure values handed unchanged to scikit_posthocs
    column_name -- algorithm names, applied to both the rows and the columns
                   of the returned table
    measure     -- measure name, used only in the printed heading

    The table is printed; nothing is returned.
    """
    pairwise = sp.posthoc_nemenyi_friedman(data)
    # the result is labelled on both axes with the algorithm names
    # (presumably a pairwise p-value matrix -- confirm in scikit_posthocs docs)
    pairwise.index = column_name
    pairwise.columns = column_name
    print('The Nemenyi post-hoc test result on ', measure)
    print(pairwise)
    
# run the whole procedure for a measure
def step_2_3_4(data, fold_index, column_name, column_size, is_time, measure):
    """Print the measure table, rank table, Friedman verdict and Nemenyi table.

    Parameters
    ----------
    data : list of rows (one row per fold, one value per algorithm) for the
        measure being analysed.
    fold_index : labels for the fold rows.
    column_name : algorithm names.
    column_size : number of algorithms (columns in ``data``).
    is_time : forwarded to ``generate_friedman_table``; ``0`` means larger
        values rank better, anything else means smaller values rank better.
    measure : name of the measure, used in the printed headings.

    All statistical steps operate on ``data`` itself, so each measure is
    tested on its own values. Everything is printed; nothing is returned.
    """
    # Step 2
    # (Table 12.4) generate measure performance table
    df = generate_measure_table(data, fold_index, column_name, column_size)
    print('------', measure, 'of', column_size, 'Algorithms for each fold ------ Table 12.4')
    print(df)
    print('\n')

    # Step 3
    # (Table 12.8) generate measure performance friedman rank table
    df = generate_friedman_table(data, fold_index, column_name, column_size, is_time)
    print('------ Friedman rank table on', measure, '------------- Table 12.8')
    print(df)
    print('\n')

    # Step 4
    # compare the Friedman statistic of this measure with the critical value
    # NOTE(review): 7.82 is hard-coded and matches the chi-square 0.05 critical
    # value for 3 degrees of freedom; confirm it is the intended threshold for
    # the number of algorithms being compared.
    stat = 7.82
    print('The friedman Test result on',measure)
    judge_stat_of_Friedman(data, stat, measure)
    print('\n')

    # use nemenyi post-hoc to test if there's significant difference between algorithm pairs
    generate_nemenyi_table(data, column_name, measure)
    print('\n')

In [3]:
# load the raw dataset; the file has no header row
data = pd.read_csv('spambase.data', header=None)

# split into features and labels:
#   X -- every column except the last (feature matrix)
#   Y -- the last column (class vector)
X, Y = np.array(data)[:, :-1], np.array(data)[:, -1]

In [4]:
# initialize classifiers

# Fixed seed for the learners that use randomness, so that re-running the
# notebook builds the same tree / forest for the same training data.
RANDOM_STATE = 42

# Bayes classifier
clf_bayes = GaussianNB()

# Decision Tree classifier
clf_dt = DecisionTreeClassifier(criterion="entropy", random_state=RANDOM_STATE)

# Random Forest classifier
clf_rf = RandomForestClassifier(criterion="gini", max_features="log2", n_estimators=100,
                                random_state=RANDOM_STATE)

In [5]:
# 10-fold cross-validation, stratified by the actual classification.
# The shuffle is seeded so the fold assignment is the same on every run.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Classifiers in the column order used by the result tables:
# Naive Bayes, Decision Tree, Random Forest.
classifiers = [clf_bayes, clf_dt, clf_rf]

# list matrices storing the measure results: one row per fold,
# one value per classifier (same order as `classifiers`).
t = []
acc = []
f1 = []

for train_index, test_index in skf.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    fold_acc = []
    fold_t = []
    fold_f1 = []

    for clf in classifiers:
        # training time only; perf_counter is a monotonic, high-resolution
        # clock, which matters for fits that take a few milliseconds
        start = time.perf_counter()
        clf.fit(X_train, Y_train)
        fold_t.append(time.perf_counter() - start)

        # accuracy and F1 on the held-out fold
        fold_acc.append(clf.score(X_test, Y_test))
        fold_f1.append(f1_score(Y_test, clf.predict(X_test)))

    acc.append(fold_acc)
    f1.append(fold_f1)
    t.append(fold_t)

In [6]:
# parameters shared by every measure comparison
column_name = ['Naive Bayes', 'Decision Tree','Random Forest']
column_size = 3

# fold labels D1 .. D10, one per cross-validation fold
fold_index = []
for i in range(1,11):
    idx = 'D{}'.format(i)
    fold_index.append(idx)

# show the results
# Accuracy comparison (is_time=0: the highest value in a fold gets rank 1)
step_2_3_4(acc, fold_index, column_name, column_size, 0, 'Accuracy')

------ Accuracy of 3 Algorithms for each fold ------ Table 12.4
         Naive Bayes Decision Tree Random Forest
D1          0.854664      0.926247      0.958785
D2          0.832972       0.94577      0.963124
D3          0.796095      0.908894      0.952278
D4          0.826087      0.934783      0.963043
D5              0.85      0.902174          0.95
D6          0.819565      0.930435      0.963043
D7           0.81087      0.934783      0.967391
D8          0.815217      0.921739      0.952174
D9          0.784314      0.915033      0.941176
D10         0.821351      0.908497      0.941176
--------    --------      --------      --------
avg         0.821113      0.922835      0.955219
std        0.0217248      0.014023    0.00934142


------ Friedman rank table on Accuracy ------------- Table 12.8
         Naive Bayes Decision Tree Random Forest
D1                 3             2             1
D2                 3             2             1
D3                 3             2   

In [7]:
# F1 measure comparison (is_time=0: the highest value in a fold gets rank 1)
step_2_3_4(f1, fold_index, column_name, column_size, 0, 'F1')

------ F1 of 3 Algorithms for each fold ------ Table 12.4
         Naive Bayes Decision Tree Random Forest
D1          0.834568      0.910053      0.947368
D2          0.818824      0.929972      0.952113
D3          0.788288      0.884615      0.938889
D4          0.816514      0.919355      0.953168
D5          0.834532       0.87395      0.935574
D6          0.808314      0.910112      0.952381
D7          0.802721      0.916201      0.957507
D8          0.803695      0.900552      0.939227
D9          0.771363      0.894879      0.923944
D10         0.809302      0.884615       0.92437
--------    --------      --------      --------
avg         0.808812       0.90243      0.942454
std         0.019356     0.0178207      0.012007


------ Friedman rank table on F1 ------------- Table 12.8
         Naive Bayes Decision Tree Random Forest
D1                 3             2             1
D2                 3             2             1
D3                 3             2             1


In [8]:
# Training time consumption comparison (is_time=1: the lowest value in a fold gets rank 1)
step_2_3_4(t, fold_index, column_name, column_size, 1, 'Time Consumption')

------ Time Consumption of 3 Algorithms for each fold ------ Table 12.4
          Naive Bayes Decision Tree Random Forest
D1         0.00511718     0.0457041      0.416781
D2         0.00302792     0.0386612      0.407837
D3         0.00308299     0.0472541       0.42256
D4         0.00382614     0.0394831      0.400536
D5         0.00339603      0.040956      0.405085
D6         0.00338936      0.040432      0.401904
D7          0.0034461       0.04088      0.408856
D8         0.00366378     0.0405681      0.398348
D9         0.00327206     0.0386629      0.404066
D10        0.00318098     0.0398679      0.397418
--------     --------      --------      --------
avg        0.00354025     0.0412469      0.406339
std       0.000606328    0.00289967    0.00805716


------ Friedman rank table on Time Consumption ------------- Table 12.8
         Naive Bayes Decision Tree Random Forest
D1                 1             2             3
D2                 1             2             3
D3     