In [36]:
import numpy as np
import pandas as pd
import scikit_posthocs as sp
import time
import copy
from tools import *

# bayes classifier
from sklearn.naive_bayes import GaussianNB

# decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# random forest classifier
from sklearn.ensemble import RandomForestClassifier

# normal simple dataset division fuction, 
from sklearn.model_selection import train_test_split

# K-fold cross-validation
from sklearn.model_selection import cross_val_score,StratifiedKFold

# normalization function provided by sklearn
from sklearn import preprocessing

# f1 measure
from sklearn.metrics import f1_score

# kfold function provided by sklearn
from sklearn.model_selection import cross_val_score

# scipy friedman test
from scipy.stats import friedmanchisquare

In [37]:
# read data
data = pd.read_csv('spambase.data', header=None)

# divide X and Y
# X is the features matrix of the dataset
X = np.array(data)[:, :-1]
# Y is the class vector of the dataset
Y = np.array(data)[:, -1]

In [38]:
# initialize classifiers

# Bayes classifier
clf_bayes = GaussianNB()

# Decision Tree classifier
clf_dt = DecisionTreeClassifier(criterion="entropy")

# Random Forest classifier
clf_rf = RandomForestClassifier(criterion="gini", max_features="log2", n_estimators=100)

In [39]:
# 10-Fold Stratified by the actual classification
skf = StratifiedKFold(n_splits=10,shuffle=True)

# list matrix to store the measure results of each algorithm and on each fold.
t = []
acc = []
f1 = []


for train_index, test_index in skf.split(X,Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    fold_acc = []
    fold_t = []
    fold_f1 = []

#     print('Train size: %d | test size: %d' % (len(train), len(test)))
#     tr_result = pd.value_counts(Y_train)
#     ts_result = pd.value_counts(Y_test)
#     print(tr_result)
#     print(ts_result)
    
    # training and computational performance calculation
    # bayes
    start = time.time()
    clf_bayes.fit(X_train, Y_train)
    end = time.time()
    t_bayes = end - start
    
    # decision tree
    start = time.time()
    clf_dt.fit(X_train, Y_train)
    end = time.time()
    t_dt = end - start
    
    # random forest
    start = time.time()
    clf_rf.fit(X_train, Y_train)
    end = time.time()
    t_rf = end - start
    
    # measure accuracy
    acc_bayes = clf_bayes.score(X_test, Y_test)
    acc_dt = clf_dt.score(X_test, Y_test)
    acc_rf = clf_rf.score(X_test, Y_test)
    
    # calculate the f1 of each model
    bayes_y_pred = clf_bayes.predict(X_test)
    dt_y_pred = clf_dt.predict(X_test)
    rf_y_pred = clf_rf.predict(X_test)
    f1_bayes = f1_score(Y_test, bayes_y_pred)
    f1_dt = f1_score(Y_test, dt_y_pred)
    f1_rf = f1_score(Y_test, rf_y_pred)
    
    # measure results recording
    fold_acc.append(acc_bayes)
    fold_acc.append(acc_dt)
    fold_acc.append(acc_rf)
    
    fold_f1.append(f1_bayes)
    fold_f1.append(f1_dt)
    fold_f1.append(f1_rf)
    
    fold_t.append(t_bayes)
    fold_t.append(t_dt)
    fold_t.append(t_rf)
    
    acc.append(fold_acc)
    f1.append(fold_f1)
    t.append(fold_t)

In [41]:
# initialize the parameters for the procedure
column_name = ['Naive Bayes', 'Decision Tree','Random Forest']
column_size = 3
fold_index = []
for i in range(1,11):
    idx = 'D'+ str(i)
    fold_index.append(idx)

# run the whole procedure for a measure
def step_2_3_4(data, fold_index, column_name, column_size, is_time, measure):
    
    # Step 2
    # (Table 12.4) generate measure performance table               
    df = generate_measure_table(data, fold_index, column_name, column_size)
    print('------', measure, 'of 3 Algorithms for each fold ------ Table 12.8')
    print(df)
    print('\n')
    
    # Step 3
    # (Table 12.8) generate measure performance friedman rank table 
    df = generate_friedman_table(data, fold_index, column_name, column_size,0)
    print('------ Friedman rank table on', measure, '------------- Table 12.4')
    print(df)
    print('\n')
    
    # Step 4
    # use the avg rank data of each algorithm to calculate p-value of friedman test 
    alpha = 0.05
    print('The friedman Test result on',measure)
    judge_p_of_Friedman(acc,alpha,measure)
    print('\n')
    
    # use nemenyi post-hoc to test if there's signficant difference between algorithm pairs
    generate_nemenyi_table(acc, column_name, 'Accuracy')
    print('\n')
    return

# show the results
# Accuracy
step_2_3_4(acc, fold_index, column_name, column_size, 0, 'Accuracy')

------ Accuracy of 3 Algorithms for each fold ------ Table 12.8
         Naive Bayes Decision Tree Random Forest
D1           0.81128      0.937093        0.9718
D2          0.826464      0.921909      0.956616
D3          0.824295      0.921909      0.954447
D4          0.826087      0.919565      0.954348
D5          0.819565      0.934783      0.978261
D6          0.815217      0.926087      0.954348
D7          0.813043      0.936957      0.965217
D8           0.83913      0.919565      0.952174
D9          0.816993      0.901961      0.945534
D10         0.810458      0.906318      0.932462
--------    --------      --------      --------
avg         0.820253      0.922615      0.956521
std       0.00888326     0.0119591     0.0129697


------ Friedman rank table on Accuracy ------------- Table 12.4
         Naive Bayes Decision Tree Random Forest
D1                 3             2             1
D2                 3             2             1
D3                 3             2   

In [20]:
# F1 measure
step_2_3_4(f1, fold_index, column_name, column_size, 0, 'F1')

------  F1  of 3 Algorithms for each fold ------
         Naive Bayes Decision Tree Random Forest
D1          0.798165      0.913043      0.944134
D2           0.82381      0.903955      0.931818
D3          0.782609      0.879121      0.938889
D4          0.820513      0.908108      0.963989
D5          0.801843      0.895604      0.935211
D6          0.811189      0.895604      0.943503
D7          0.789954      0.936986      0.954802
D8          0.823256      0.914127      0.923944
D9          0.810427      0.870027      0.960894
D10         0.812207      0.923077      0.942149
--------    --------      --------      --------
avg         0.807397      0.903965      0.943933
std        0.0140411     0.0199092     0.0127337


------ Friedman test on  F1  ------
         Naive Bayes Decision Tree Random Forest
D1                 3             2             1
D2                 3             2             1
D3                 3             2             1
D4                 3           

In [21]:
# Training Time Consumption
step_2_3_4(f1, fold_index, column_name, column_size, 0, 'Time Consumption')

------  Time Consumption  of 3 Algorithms for each fold ------
         Naive Bayes Decision Tree Random Forest
D1          0.798165      0.913043      0.944134
D2           0.82381      0.903955      0.931818
D3          0.782609      0.879121      0.938889
D4          0.820513      0.908108      0.963989
D5          0.801843      0.895604      0.935211
D6          0.811189      0.895604      0.943503
D7          0.789954      0.936986      0.954802
D8          0.823256      0.914127      0.923944
D9          0.810427      0.870027      0.960894
D10         0.812207      0.923077      0.942149
--------    --------      --------      --------
avg         0.807397      0.903965      0.943933
std        0.0140411     0.0199092     0.0127337


------ Friedman test on  Time Consumption  ------
         Naive Bayes Decision Tree Random Forest
D1                 3             2             1
D2                 3             2             1
D3                 3             2             1
D4 

In [None]:
# ---------------------- tool functions ----------------------

# generate friedman table
def generate_measure_table(data, row_index, column_name, column_size):
    size_list = list(range(0,column_size))
    df = pd.DataFrame(data)
    # get mean and std of the column
    mean = df.iloc[:,size_list].mean().values.tolist()
    std = df.iloc[:,size_list].std().values.tolist()
    
    # add index for rows
    df = pd.DataFrame(data, index=row_index)
    
    # add division for mean and std
    df.loc['--------'] = ['--------','--------','--------']
    df.loc['avg     '] = mean
    df.loc['std'] = std
    
    # add column name
    df.columns = column_name
    return df

# generate friedman table
def generate_friedman_table(data, row_index, column_name, column_size, is_time):
    size_list = list(range(0,column_size))
    
    # create rank matrix 
    # generate friedman test table
    data_rank = copy.deepcopy(data)
    
    # turn the value into rank number
    if is_time == 0:
        for row in data_rank:
            s_row = sorted(enumerate(row), key=lambda x: x[1])
            idx = [i[0] for i in s_row]
            for index, or_index in enumerate(idx):
                row[or_index] = len(idx) - index
    else:
        for row in data_rank:
            s_row = sorted(enumerate(row), key=lambda x: x[1])
            idx = [i[0] for i in s_row]
            for index, or_index in enumerate(idx):
                row[or_index] = index + 1

    # get mean rank row
    df = pd.DataFrame(data_rank)
    mean = df.iloc[:,size_list].mean().values.tolist()

    # add index for rows
    df = pd.DataFrame(data_rank, index=row_index)
    
    # add division for mean and std
    df.loc['--------'] = ['--------','--------','--------']
    df.loc['avg_rank'] = mean
    
    # add column name
    df.columns = column_name
    return df

# calculate friedman p-value 
def judge_p_of_Friedman(data, alpha, perf_str):
    data_np = np.array(data)
    stat, p = friedmanchisquare(data_np[:,0], data_np[:,1], data_np[:,2])
    if p > alpha:
        print('p = ',p , ' > ', alpha)
        print('No significance difference between the',perf_str,'performance (fail to reject H0)')
    else:
        print('p = ',p , ' < ', alpha)
        print('Significance difference exists between the',perf_str,'performance (reject H0)')

# run nemenyi test between algorithm pairs
def generate_nemenyi_table(data, column_name, measure):
    df = sp.posthoc_nemenyi_friedman(data)
    df.columns = column_name
    df.index = column_name
    print('The Nemenyi post-hoc test result on ', measure)
    print(df)