In [95]:
import numpy as np
import pandas as pd
import scikit_posthocs as sp
import time

# bayes classifier
from sklearn.naive_bayes import GaussianNB

# decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# random forest classifier
from sklearn.ensemble import RandomForestClassifier

# normal simple dataset division fuction, 
from sklearn.model_selection import train_test_split

# K-fold cross-validation
from sklearn.model_selection import cross_val_score,StratifiedKFold

# normalization function provided by sklearn
from sklearn import preprocessing

# f1 measure
from sklearn.metrics import f1_score

# kfold function provided by sklearn
from sklearn.model_selection import cross_val_score

# scipy friedman test
from scipy.stats import friedmanchisquare

In [96]:
# read data
data = pd.read_csv('spambase.data', header=None)

# divide X and Y
# X is the features matrix of the dataset
X = np.array(data)[:, :-1]
# Y is the label vector of the dataset
Y = np.array(data)[:, -1]

In [113]:
# initialize classifiers
clf_bayes = GaussianNB()
# TODO
clf_dt = DecisionTreeClassifier(criterion="entropy")
# TODO
clf_rf = RandomForestClassifier(criterion="gini", max_features="log2", n_estimators=73)

In [114]:
# 10-Fold Stratified by the actual classification
skf = StratifiedKFold(n_splits=10,shuffle=True)

# accuracy scores array
bayes_acc = []
dt_acc = []
rf_acc = []

# matrix provided for friedman and nemenyi post-hoc
t = []
acc = []
f1 = []


for train_index, test_index in skf.split(X,Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    fold_acc = []
    fold_t = []
    fold_f1 = []
    
#     # data normalization for training set
#     min_max_scaler = preprocessing.MinMaxScaler()
#     X_train_minmax = min_max_scaler.fit_transform(X_train)

#     # data normalization for testing set
#     # use the transformer(metric) generated by training set to normalize testing set
#     X_test_minmax = min_max_scaler.transform(X_test)

#     print('Train size: %d | test size: %d' % (len(train), len(test)))
#     tr_result = pd.value_counts(Y_train)
#     ts_result = pd.value_counts(Y_test)
#     print(tr_result)
#     print(ts_result)
    
    # training and computational performance calculation
    # bayes
    start = time.time()
    clf_bayes.fit(X_train, Y_train)
    end = time.time()
    t_bayes = end - start
    
    # decision tree
    start = time.time()
    clf_dt.fit(X_train, Y_train)
    end = time.time()
    t_dt = end - start
    
    # random forest
    start = time.time()
    clf_rf.fit(X_train, Y_train)
    end = time.time()
    t_rf = end - start
    
    # calculate the f1 of each model
    bayes_y_pred = clf_bayes.predict(X_test)
    dt_y_pred = clf_dt.predict(X_test)
    rf_y_pred = clf_rf.predict(X_test)
    f1_bayes = f1_score(Y_test, bayes_y_pred)
    f1_dt = f1_score(Y_test, dt_y_pred)
    f1_rf = f1_score(Y_test, rf_y_pred)
    
    # get accuracy of each model
    acc_bayes = clf_bayes.score(X_test, Y_test)
    acc_dt = clf_dt.score(X_test, Y_test)
    acc_rf = clf_rf.score(X_test, Y_test)
    
    
    bayes_acc.append(acc_bayes)
    dt_acc.append(acc_dt)
    rf_acc.append(acc_rf)
    
    fold_acc.append(acc_bayes)
    fold_acc.append(acc_dt)
    fold_acc.append(acc_rf)
    
    fold_f1.append(f1_bayes)
    fold_f1.append(f1_dt)
    fold_f1.append(f1_rf)
    
    fold_t.append(t_bayes)
    fold_t.append(t_dt)
    fold_t.append(t_rf)
    
    acc.append(fold_acc)
    f1.append(fold_f1)
    t.append(fold_t)

In [115]:
# show the results
fold_index = []
for i in range(1,11):
    idx = 'D'+ str(i)
    fold_index.append(idx)

# generate result table
df_acc = pd.DataFrame(acc)
acc_mean = df_acc.iloc[:,[0,1,2]].mean().values.tolist()
acc_std = df_acc.iloc[:,[0,1,2]].std().values.tolist()
df_acc = pd.DataFrame(acc, index=fold_index)
df_acc.loc['---'] = ['--------','--------','--------']
df_acc.loc['avg'] = acc_mean
df_acc.loc['std'] = acc_std
df_acc.columns = ['Naive Bayes', 'Decision Tree','Random Forest']
print('------ Accuracy of 3 Algorithms for each fold ------')
print(df_acc)
print('\n')


df_f1 = pd.DataFrame(f1)
f1_mean = df_f1.iloc[:,[0,1,2]].mean().values.tolist()
f1_std = df_f1.iloc[:,[0,1,2]].std().values.tolist()
print('--------- F1 of 3 Algorithms for each fold ---------')
df_f1 = pd.DataFrame(f1, index=fold_index)
df_f1.loc['---'] = ['--------','--------','--------']
df_f1.loc['avg'] = f1_mean
df_f1.loc['std'] = f1_std
df_f1.columns = ['Naive Bayes', 'Decision Tree','Random Forest']
print(df_f1)
print('\n')


df_t = pd.DataFrame(t)
t_mean = df_t.iloc[:,[0,1,2]].mean().values.tolist()
t_std = df_t.iloc[:,[0,1,2]].std().values.tolist()
print('-- Time Consumption of 3 Algorithms for each fold --')
df_t = pd.DataFrame(t, index=fold_index)
df_t.loc['---'] = ['--------','--------','--------']
df_t.loc['avg'] = t_mean
df_t.loc['std'] = t_std
df_t.columns = ['Naive Bayes', 'Decision Tree','Random Forest']
print(df_t)

------ Accuracy of 3 Algorithms for each fold ------
    Naive Bayes Decision Tree Random Forest
D1     0.813449      0.921909      0.956616
D2     0.826464      0.915401      0.954447
D3     0.839479      0.900217      0.941432
D4     0.804348      0.902174      0.967391
D5     0.823913      0.913043          0.95
D6     0.830435      0.934783      0.958696
D7          0.8      0.908696      0.947826
D8     0.813043      0.921739      0.952174
D9     0.830065      0.925926       0.95207
D10    0.840959       0.91939       0.96732
---    --------      --------      --------
avg    0.822216      0.916328      0.954797
std   0.0140365     0.0107098    0.00815567


--------- F1 of 3 Algorithms for each fold ---------
    Naive Bayes Decision Tree Random Forest
D1      0.80543           0.9      0.944444
D2     0.816514      0.890756      0.940171
D3     0.826291      0.874317      0.926027
D4     0.791667      0.875346      0.957983
D5     0.811189      0.893617       0.93733
D6     0.813

In [119]:
# friedman test
# input: 3 measurement arrays of the algorithm
stat, p = friedmanchisquare(bayes_acc, rf_acc, dt_acc)
print(stat)
print(p)

alpha = 0.05
if p > alpha:
    print('No significance difference between the performance (fail to reject H0)')
else:
    print('Significance difference exists between the performance (reject H0)')

20.0
4.539992976248486e-05
Significance difference exists between the performance (reject H0)


In [122]:
# print(acc)
# print(f1)
# print(t)
print(sp.posthoc_nemenyi_friedman(acc))
print(sp.posthoc_nemenyi_friedman(f1))
print(sp.posthoc_nemenyi_friedman(t))

          0         1         2
0 -1.000000  0.065303  0.001000
1  0.065303 -1.000000  0.065303
2  0.001000  0.065303 -1.000000
          0         1         2
0 -1.000000  0.065303  0.001000
1  0.065303 -1.000000  0.065303
2  0.001000  0.065303 -1.000000
          0         1         2
0 -1.000000  0.065303  0.001000
1  0.065303 -1.000000  0.065303
2  0.001000  0.065303 -1.000000
