In [54]:
import numpy as np
import pandas as pd
import time
import copy

# bayes classifier
from sklearn.naive_bayes import GaussianNB

# decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# random forest classifier
from sklearn.ensemble import RandomForestClassifier

# simple train/test dataset split function
from sklearn.model_selection import train_test_split

# K-fold cross-validation
from sklearn.model_selection import cross_val_score,StratifiedKFold

# f1 measure
from sklearn.metrics import f1_score

# scipy friedman test
from scipy.stats import friedmanchisquare

# nemenyi test function inside
import scikit_posthocs as sp

In [55]:
# ---------------------- tool functions , just run and skip it ----------------------

# generate the per-fold measure table (raw values plus mean / std summary rows)
def generate_measure_table(data, row_index, column_name, column_size):
    """Build a display table of per-fold measure values with summary rows.

    Parameters
    ----------
    data : list of lists
        One row per fold, one value per algorithm.
    row_index : list
        Labels for the fold rows.
    column_name : list
        Labels for the algorithm columns.
    column_size : int
        Number of leading columns over which mean and std are computed.

    Returns
    -------
    pandas.DataFrame
        The fold rows followed by a '--------' separator row, an
        'avg     ' row (column means) and a 'std' row (column sample
        standard deviations, pandas default ddof=1).
    """
    size_list = list(range(column_size))

    df = pd.DataFrame(data, index=row_index)

    # statistics are taken before the non-numeric separator row is appended
    mean = df.iloc[:, size_list].mean().values.tolist()
    std = df.iloc[:, size_list].std().values.tolist()

    # separator is sized from the actual number of columns instead of a
    # hard-coded three, so tables with any number of algorithms work
    df.loc['--------'] = ['--------'] * len(df.columns)
    df.loc['avg     '] = mean
    df.loc['std'] = std

    # add column name
    df.columns = column_name
    return df

# rank one fold's values; tied values share the mean of the ranks they span
def _average_ranks(values, ascending):
    """Return 1-based ranks for ``values``.

    ``ascending=True`` gives rank 1 to the smallest value, ``False`` to the
    largest. Equal values receive the average of the rank positions they
    occupy. Ranks are kept as ``int`` when they are whole numbers, so rows
    without ties look exactly like plain integer ranks.
    """
    order = sorted(range(len(values)), key=lambda i: values[i], reverse=not ascending)
    ranks = [0] * len(values)
    pos = 0
    while pos < len(order):
        # extend `end` over the run of values equal to the one at `pos`
        end = pos
        while end + 1 < len(order) and values[order[end + 1]] == values[order[pos]]:
            end += 1
        # 0-based positions pos..end correspond to ranks pos+1..end+1
        shared = (pos + end) / 2 + 1
        if shared.is_integer():
            shared = int(shared)
        for k in range(pos, end + 1):
            ranks[order[k]] = shared
        pos = end + 1
    return ranks


# generate friedman table
def generate_friedman_table(data, row_index, column_name, column_size, is_time):
    """Convert per-fold measure values into a Friedman rank table.

    Parameters
    ----------
    data : list of lists
        One row per fold, one value per algorithm. Not modified.
    row_index : list
        Labels for the fold rows.
    column_name : list
        Labels for the algorithm columns.
    column_size : int
        Number of leading columns over which the mean rank is computed.
    is_time : int
        0 -> larger values are better (largest value gets rank 1);
        anything else -> smaller values are better (smallest gets rank 1).

    Returns
    -------
    (pandas.DataFrame, list)
        The rank table (fold rows, a '--------' separator row and an
        'avg_rank' row) and the list of mean ranks per column.
    """
    size_list = list(range(column_size))
    ascending = is_time != 0

    # rank every fold independently; a fresh list is built so `data` is untouched
    data_rank = [_average_ranks(list(row), ascending) for row in data]

    # add index for rows
    df = pd.DataFrame(data_rank, index=row_index)

    # get mean rank row (before the non-numeric separator is appended)
    mean = df.iloc[:, size_list].mean().values.tolist()

    # separator sized from the actual column count rather than a fixed three
    df.loc['--------'] = ['--------'] * len(df.columns)
    df.loc['avg_rank'] = mean

    # add column name
    df.columns = column_name
    return df, mean

# run the friedman test and report it against a critical value
def judge_stat_of_Friedman(data, stat, perf_str):
    """Print whether the Friedman statistic exceeds a critical value.

    Parameters
    ----------
    data : list of lists
        One row per fold, one column per algorithm. Every column is passed
        to ``friedmanchisquare`` (which itself requires at least three).
    stat : float
        Critical value the Friedman statistic Q is compared against.
    perf_str : str
        Name of the measure, used only in the printed message.

    Returns
    -------
    None
        The verdict is printed; the p-value returned by scipy is not used.
    """
    data_np = np.array(data)
    # iterating the transpose yields one array per algorithm column, so the
    # test covers all algorithms instead of only the first three
    q, _ = friedmanchisquare(*data_np.T)
    if q < stat:
        print('Q = ',q , ' < ', stat)
        print('No significant difference between the',perf_str,'performance (fail to reject H0)')
    else:
        print('Q = ',q , ' > ', stat)
        print('Significant difference exists between the',perf_str,'performance (reject H0)')

# run nemenyi test between algorithm pairs
def generate_nemenyi_table(mean, measure, column_name, num_fold, freedom):
    """Print the Nemenyi critical difference and compare every algorithm pair.

    Parameters
    ----------
    mean : list of float
        Average Friedman rank of each algorithm.
    measure : str
        Name of the measure, used only in the printed messages.
    column_name : list of str
        Algorithm names, in the same order as ``mean``.
    num_fold : int
        Number of folds the ranks were averaged over.
    freedom : int
        Number of algorithms being compared; used both to look up q_alpha
        and as k in CD = q_alpha * sqrt(k * (k + 1) / (6 * num_fold)).

    Raises
    ------
    ValueError
        If no alpha = 0.05 critical value is tabulated for ``freedom``.
    """
    # Two-tailed Nemenyi critical values at alpha = 0.05, keyed by the number
    # of compared algorithms (Demsar 2006, Table 5a). 3 -> 2.343 is the value
    # this function previously used unconditionally.
    q_alpha_table = {
        2: 1.960, 3: 2.343, 4: 2.569, 5: 2.728, 6: 2.850,
        7: 2.949, 8: 3.031, 9: 3.102, 10: 3.164,
    }
    if freedom not in q_alpha_table:
        raise ValueError('no tabulated q_alpha (alpha = 0.05) for freedom = ' + str(freedom))
    q_alpha = q_alpha_table[freedom]

    print('With alpha = 0.05 and freedom = ' + str(freedom) + ', q_alpha = ', q_alpha)
    critical_difference = q_alpha * np.sqrt(freedom * (freedom + 1.) / (6. * num_fold))
    print('Calculated CD =', critical_difference)

    # every unordered pair (first < second); for three algorithms this is
    # (0, 1), (0, 2), (1, 2), the same order as before
    for first in range(len(mean)):
        for second in range(first + 1, len(mean)):
            print()
            dif = abs(mean[first] - mean[second])
            if dif >= critical_difference:
                print('Rankdiff(',column_name[first],',', column_name[second],') =',dif, '>',critical_difference)
                print('Significant difference between', column_name[first], 'and', column_name[second], 'on', measure)
            else:
                print('Rankdiff(',column_name[first],',', column_name[second],') =',dif, '<',critical_difference)
                print('No Significant difference between', column_name[first], 'and', column_name[second], 'on', measure)
    
# run the whole procedure for a measure
def step_2_3_4(data, fold_index, column_name, column_size, is_time, measure):
    """Print the measure table, Friedman rank table, Friedman test and Nemenyi test.

    Parameters
    ----------
    data : list of lists
        One row per fold, one value per algorithm, for a single measure.
    fold_index : list
        Labels for the fold rows.
    column_name : list
        Algorithm names (column labels).
    column_size : int
        Number of algorithm columns.
    is_time : int
        Passed to ``generate_friedman_table``: 0 ranks the largest value
        first, any other value ranks the smallest value first.
    measure : str
        Name of the measure, used in the printed headings.
    """

    # Step 2
    # (Table 12.4) generate measure performance table
    df = generate_measure_table(data, fold_index, column_name, column_size)
    print('------', measure, 'of 3 Algorithms for each fold ------ Table 12.4')
    print(df)
    print('\n')

    # Step 3
    # (Table 12.8) generate measure performance friedman rank table
    df, ranks_mean = generate_friedman_table(data, fold_index, column_name, column_size,is_time)
    print('------ Friedman rank table on', measure, '------------- Table 12.8')
    print(df)
    print('\n')

    # Step 4
    # compare the friedman statistic of this measure against the critical value
    # NOTE(review): 7.82 matches the chi-square 0.05 critical value for 3 degrees
    # of freedom; the Friedman statistic for k algorithms is normally compared
    # against k - 1 degrees of freedom (about 5.99 for k = 3) -- confirm.
    stat = 7.82
    print('The friedman Test result on',measure)
    # test the measure that was passed in, not the global accuracy table
    judge_stat_of_Friedman(data,stat,measure)
    print('\n')

    # use nemenyi post-hoc to test if there's signficant difference between algorithm pairs
    # fold count and algorithm count are taken from the inputs instead of fixed 10 / 3
    generate_nemenyi_table(ranks_mean, measure, column_name, len(data), column_size)
    print('\n')
    return

In [56]:
# load the dataset; header=None keeps the first row as data instead of column names
data = pd.read_csv('spambase.data', header=None)

# split into features and labels:
#   X -> every column except the last (feature matrix)
#   Y -> the last column (class vector)
X, Y = np.array(data)[:, :-1], np.array(data)[:, -1]

In [57]:
# initialize classifiers
# The tree-based models are randomised; a fixed random_state makes the
# reported tables reproducible across kernel restarts.

# Bayes classifier
clf_bayes = GaussianNB()

# Decision Tree classifier
clf_dt = DecisionTreeClassifier(criterion="entropy", random_state=42)

# Random Forest classifier
clf_rf = RandomForestClassifier(criterion="gini", max_features="log2", n_estimators=100, random_state=42)

In [58]:
# 10-Fold Stratified by the actual classification
# shuffle=True needs a fixed random_state, otherwise every run produces
# different folds and therefore different tables.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# list matrix to store the measure results of each algorithm and on each fold:
# one row per fold, one column per classifier.
t = []
acc = []
f1 = []

# column order of every result row: Naive Bayes, Decision Tree, Random Forest
classifiers = [clf_bayes, clf_dt, clf_rf]

for train_index, test_index in skf.split(X,Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    fold_acc = []
    fold_t = []
    fold_f1 = []

    for clf in classifiers:
        # training and time calculation; perf_counter is the clock intended
        # for measuring short durations (some fits here take a few milliseconds)
        start = time.perf_counter()
        clf.fit(X_train, Y_train)
        fold_t.append(time.perf_counter() - start)

        # measure accuracy
        fold_acc.append(clf.score(X_test, Y_test))

        # measure f1
        fold_f1.append(f1_score(Y_test, clf.predict(X_test)))

    # measure results recording
    acc.append(fold_acc)
    f1.append(fold_f1)
    t.append(fold_t)

In [59]:
# parameters shared by the step_2_3_4 calls
column_name = ['Naive Bayes', 'Decision Tree','Random Forest']
column_size = 3
# fold labels 'D1' .. 'D10'
fold_index = ['D' + str(i) for i in range(1, 11)]

# show the results
# Accuracy comparison
step_2_3_4(acc, fold_index, column_name, column_size, 0, 'Accuracy')

------ Accuracy of 3 Algorithms for each fold ------ Table 12.4
         Naive Bayes Decision Tree Random Forest
D1          0.813449       0.91974      0.943601
D2          0.815618      0.911063      0.963124
D3          0.826464       0.91974      0.954447
D4          0.843478      0.897826      0.936957
D5          0.773913      0.936957      0.969565
D6          0.867391      0.930435      0.971739
D7          0.830435      0.947826      0.967391
D8          0.826087      0.921739      0.945652
D9          0.825708      0.934641       0.95207
D10         0.788671      0.923747      0.962963
--------    --------      --------      --------
avg         0.821121      0.924371      0.956751
std        0.0261885     0.0140741     0.0120019


------ Friedman rank table on Accuracy ------------- Table 12.8
         Naive Bayes Decision Tree Random Forest
D1                 3             2             1
D2                 3             2             1
D3                 3             2   

In [60]:
# F1 measure comparison: run steps 2-4 on the per-fold F1 scores
# (is_time=0, so the largest value in each fold receives rank 1)
step_2_3_4(f1, fold_index, column_name, column_size, 0, 'F1')

------ F1 of 3 Algorithms for each fold ------ Table 12.4
         Naive Bayes Decision Tree Random Forest
D1           0.80543       0.89863      0.927778
D2          0.803695      0.891821      0.953425
D3          0.813953      0.897507      0.941504
D4          0.825243       0.86533      0.917379
D5          0.763636      0.917379      0.960227
D6          0.853012      0.910112      0.962963
D7          0.818605      0.933702      0.958678
D8          0.813084      0.903743      0.931129
D9          0.813953      0.917127      0.939227
D10         0.780045      0.904632      0.952646
--------    --------      --------      --------
avg         0.809066      0.903998      0.944496
std        0.0242578      0.018215     0.0155134


------ Friedman rank table on F1 ------------- Table 12.8
         Naive Bayes Decision Tree Random Forest
D1                 3             2             1
D2                 3             2             1
D3                 3             2             1


In [61]:
# Training time consumption comparison: run steps 2-4 on the per-fold fit times
# (is_time=1, so the smallest value in each fold receives rank 1)
step_2_3_4(t, fold_index, column_name, column_size, 1, 'Time Consumption')

------ Time Consumption of 3 Algorithms for each fold ------ Table 12.4
          Naive Bayes Decision Tree Random Forest
D1         0.00484419      0.053426      0.445061
D2         0.00320697     0.0432839      0.463669
D3         0.00343132     0.0450399      0.477523
D4         0.00391722     0.0423079      0.443334
D5         0.00428534     0.0477152      0.461341
D6         0.00321293     0.0426018      0.466308
D7         0.00393009     0.0470779      0.488025
D8         0.00292301      0.046092      0.471991
D9           0.003582     0.0630438      0.446426
D10        0.00307417     0.0453687      0.434279
--------     --------      --------      --------
avg        0.00364072     0.0475957      0.459796
std       0.000603562    0.00630841     0.0171192


------ Friedman rank table on Time Consumption ------------- Table 12.8
         Naive Bayes Decision Tree Random Forest
D1                 1             2             3
D2                 1             2             3
D3     