In [2]:
import numpy as np
from sklearn.ensemble import VotingClassifier, StackingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
import os

clfs = {
    'GNB': GaussianNB(),
    'SVM': SVC(probability=True),
    'DT': DecisionTreeClassifier()
}

filenames=[]
for filename in os.listdir('datasets'):
    filenames.append(filename)

filenames=filenames[:20]
print(len(filenames))

20


In [4]:
methods = []
base_stacking=[]

for clf_name, clf in clfs.items():
    base_stacking.append((clf_name, clone(clf)))

for clf_name, clf in clfs.items():
    methods.append(BaggingClassifier(base_estimator=clone(clf)))
    methods.append(GradientBoostingClassifier(init=clone(clf)))
    methods.append(AdaBoostClassifier(base_estimator=clone(clf)))
    methods.append(StackingClassifier(estimators=base_stacking, final_estimator=clone(clf)))
    
# methods.append(VotingClassifier(estimators=base_voting))

In [5]:
methods

[BaggingClassifier(base_estimator=GaussianNB()),
 GradientBoostingClassifier(init=GaussianNB()),
 AdaBoostClassifier(base_estimator=GaussianNB()),
 StackingClassifier(estimators=[('GNB', GaussianNB()),
                                ('SVM', SVC(probability=True)),
                                ('DT', DecisionTreeClassifier())],
                    final_estimator=GaussianNB()),
 BaggingClassifier(base_estimator=SVC(probability=True)),
 GradientBoostingClassifier(init=SVC(probability=True)),
 AdaBoostClassifier(base_estimator=SVC(probability=True)),
 StackingClassifier(estimators=[('GNB', GaussianNB()),
                                ('SVM', SVC(probability=True)),
                                ('DT', DecisionTreeClassifier())],
                    final_estimator=SVC(probability=True)),
 BaggingClassifier(base_estimator=DecisionTreeClassifier()),
 GradientBoostingClassifier(init=DecisionTreeClassifier()),
 AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
 StackingClas

In [6]:
results = np.zeros((10, len(filenames), len(methods)))

In [7]:
rskf = RepeatedStratifiedKFold(
    n_splits=5, 
    n_repeats=2, 
    random_state=1234
)
PYTHONIOENCODING="UTF-8"

for f_id, filename in enumerate(filenames):
    print(filename)
    
    if filename == ".DS_Store":
        continue

    data = np.loadtxt("%s/%s" % ('datasets', filename), delimiter=',')
    X = data[:, 0:-1]
    y = data[:, -1]
    print(X.shape, y.shape)
    
    for fold, (train_index, test_index) in enumerate(rskf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        for method_id, method in enumerate(methods):
            method_cloned = clone(method)
            
            method_cloned.fit(X_train, y_train)
            y_pred = method_cloned.predict(X_test)
            
            score = accuracy_score(y_test, y_pred)
            
            results[fold, f_id, method_id] = score
            
        
#         print(results[fold, f_id])
        print("fold:",fold,"/",10, "\t", "dataset:",f_id,"/",len(filenames))

    np.save('results', results)
    

breastcan.csv
(683, 9) (683,)
fold: 0 / 10 	 dataset: 0 / 20
fold: 1 / 10 	 dataset: 0 / 20
fold: 2 / 10 	 dataset: 0 / 20
fold: 3 / 10 	 dataset: 0 / 20
fold: 4 / 10 	 dataset: 0 / 20
fold: 5 / 10 	 dataset: 0 / 20
fold: 6 / 10 	 dataset: 0 / 20
fold: 7 / 10 	 dataset: 0 / 20
fold: 8 / 10 	 dataset: 0 / 20
fold: 9 / 10 	 dataset: 0 / 20
wisconsin.csv
(699, 9) (699,)
fold: 0 / 10 	 dataset: 1 / 20
fold: 1 / 10 	 dataset: 1 / 20
fold: 2 / 10 	 dataset: 1 / 20
fold: 3 / 10 	 dataset: 1 / 20
fold: 4 / 10 	 dataset: 1 / 20
fold: 5 / 10 	 dataset: 1 / 20
fold: 6 / 10 	 dataset: 1 / 20
fold: 7 / 10 	 dataset: 1 / 20
fold: 8 / 10 	 dataset: 1 / 20
fold: 9 / 10 	 dataset: 1 / 20
bupa.csv
(345, 6) (345,)
fold: 0 / 10 	 dataset: 2 / 20
fold: 1 / 10 	 dataset: 2 / 20
fold: 2 / 10 	 dataset: 2 / 20
fold: 3 / 10 	 dataset: 2 / 20
fold: 4 / 10 	 dataset: 2 / 20
fold: 5 / 10 	 dataset: 2 / 20
fold: 6 / 10 	 dataset: 2 / 20
fold: 7 / 10 	 dataset: 2 / 20
fold: 8 / 10 	 dataset: 2 / 20
fold: 9 / 10 	 d

In [8]:
### Wilcoxon -- trzeba usrednic wyniki ze zbiorow danych
import numpy as np

results = np.load('results.npy') # 10 x 20 x methods(10)

mean_results_for_datasets = np.mean(results, axis=0) # 20 x methods(10)

print(mean_results_for_datasets.shape)
print(mean_results_for_datasets)

(20, 12)
[[0.96340167 0.80529197 0.82268678 0.968538   0.97072778 0.96852726
  0.90562473 0.96999785 0.96265028 0.94582976 0.94366681 0.94657578]
 [0.95635663 0.83823227 0.80753854 0.96136691 0.96350976 0.95635663
  0.82626927 0.9577852  0.94918294 0.92989723 0.92987667 0.93491264]
 [0.56376812 0.71884058 0.55942029 0.71304348 0.72463768 0.7173913
  0.5826087  0.69275362 0.6942029  0.64492754 0.63188406 0.61884058]
 [0.88889336 0.88897384 0.81490946 0.93022133 0.95020121 0.93877264
  0.64525151 0.93301811 0.92018109 0.89028169 0.88022133 0.92881288]
 [0.98888889 1.         1.         1.         0.99       1.
  0.96666667 1.         0.97777778 0.98888889 0.97777778 1.        ]
 [0.65978261 0.72826087 0.66431159 0.66775362 0.53876812 0.75471014
  0.55615942 0.69836957 0.73695652 0.68134058 0.68115942 0.56992754]
 [0.68066202 0.79337979 0.63397213 0.78629501 0.80307782 0.8344367
  0.7952381  0.73809524 0.78362369 0.7018583  0.7043554  0.56962834]
 [0.904      0.5664     0.936      0.9256 

In [9]:
from scipy.stats import rankdata

ranks = []
for r in mean_results_for_datasets:
    ranks.append(rankdata(r).tolist())
    
ranks = np.array(ranks)
print(ranks)

# mean_ranks = np.mean(ranks, axis=0)
# print("Mean ranks:", mean_ranks)

[[ 8.   1.   2.  10.  12.   9.   3.  11.   7.   5.   4.   6. ]
 [ 9.   3.   1.  11.  12.   8.   2.  10.   7.   5.   4.   6. ]
 [ 2.  11.   1.   9.  12.  10.   3.   7.   8.   6.   5.   4. ]
 [ 4.   5.   2.   9.  12.  11.   1.  10.   7.   6.   3.   8. ]
 [ 4.5  9.5  9.5  9.5  6.   9.5  1.   9.5  2.5  4.5  2.5  9.5]
 [ 4.  10.   5.   6.   1.  12.   2.   9.  11.   8.   7.   3. ]
 [ 3.   9.   2.   8.  11.  12.  10.   6.   7.   4.   5.   1. ]
 [ 6.   1.  12.  11.  10.   8.   9.   5.   4.   3.   2.   7. ]
 [11.   7.   1.  12.   8.   6.   9.  10.   5.   3.   4.   2. ]
 [ 2.   8.   1.   9.  12.   7.   3.  10.   6.   4.   5.  11. ]
 [10.   9.   1.  12.   3.   8.   2.  11.   7.   6.   5.   4. ]
 [ 5.  10.   2.   8.   3.  12.   1.   9.  11.   6.   7.   4. ]
 [ 1.   6.   2.  10.  10.  10.   7.  10.   5.   4.   3.  10. ]
 [10.  11.5  1.   8.   6.  11.5  2.   9.   5.   7.   4.   3. ]
 [ 2.  11.   1.   9.  10.  12.   3.   8.   7.   5.   6.   4. ]
 [ 1.   8.5  4.   8.5  2.   8.5  3.   8.5  8.5  8.5  8.

In [10]:
from scipy.stats import ranksums

alpha = 0.05

w_statistic = np.zeros((len(methods), len(methods)))
p_value = np.zeros((len(methods), len(methods)))
advantage = np.zeros((len(methods), len(methods)))
significance = np.zeros((len(methods), len(methods)))

for i, i_clf in enumerate(methods):
    for j, j_clf in enumerate(methods):
        w_statistic[i, j], p_value[i, j] = ranksums(ranks.T[i], ranks.T[j])
        
        


In [14]:
from tabulate import tabulate

method_names = ['B_GNB', 'G_GNB', 'A_GNB', 'S_GNB',
               'B_SVC', 'G_SVC','A_SVC', 'S_SVC',
               'B_DT', 'G_DT','A_DT', 'S_DT']

headers = method_names
names_column = np.expand_dims(np.array(method_names), axis=1)

w_statistic_table = np.concatenate((names_column, w_statistic), axis=1)
w_statistic_table = tabulate(w_statistic_table, headers, floatfmt=".2f")
p_value_table = np.concatenate((names_column, p_value), axis=1)
p_value_table = tabulate(p_value_table, headers, floatfmt=".2f")

print('w statistic')
print(w_statistic_table)
print('p value')
print(p_value_table)


w statistic
         B_GNB    G_GNB    A_GNB    S_GNB    B_SVC    G_SVC    A_SVC    S_SVC    B_DT    G_DT    A_DT    S_DT
-----  -------  -------  -------  -------  -------  -------  -------  -------  ------  ------  ------  ------
B_GNB     0.00    -1.76     2.64    -3.49    -2.76    -3.94     0.70    -3.71   -2.39   -0.96   -0.24   -0.87
G_GNB     1.76     0.00     3.26    -1.28    -1.31    -2.14     2.22    -1.58    0.20    1.66    2.07    1.18
A_GNB    -2.64    -3.26     0.00    -4.50    -4.07    -4.68    -2.26    -4.58   -4.30   -3.96   -3.62   -3.54
S_GNB     3.49     1.28     4.50     0.00    -0.16    -0.92     3.73    -0.51    2.29    4.15    4.40    3.03
B_SVC     2.76     1.31     4.07     0.16     0.00    -0.72     3.07     0.03    1.22    2.43    2.85    2.00
G_SVC     3.94     2.14     4.68     0.92     0.72     0.00     4.07     0.62    2.91    4.46    4.68    3.45
A_SVC    -0.70    -2.22     2.26    -3.73    -3.07    -4.07     0.00    -3.91   -2.77   -2.02   -1.37   -1.7

In [15]:
advantage[w_statistic > 0] = 1
significance[p_value <= alpha] = 1

advantage_table = np.concatenate((names_column, advantage), axis=1)
advantage_table = tabulate(advantage_table, headers)
significance_table = np.concatenate((names_column, significance), axis=1)
significance_table = tabulate(significance_table, headers)

print('advantage')
print(advantage_table)

print('significance')
print(significance_table)


advantage
         B_GNB    G_GNB    A_GNB    S_GNB    B_SVC    G_SVC    A_SVC    S_SVC    B_DT    G_DT    A_DT    S_DT
-----  -------  -------  -------  -------  -------  -------  -------  -------  ------  ------  ------  ------
B_GNB        0        0        1        0        0        0        1        0       0       0       0       0
G_GNB        1        0        1        0        0        0        1        0       1       1       1       1
A_GNB        0        0        0        0        0        0        0        0       0       0       0       0
S_GNB        1        1        1        0        0        0        1        0       1       1       1       1
B_SVC        1        1        1        1        0        0        1        1       1       1       1       1
G_SVC        1        1        1        1        1        0        1        1       1       1       1       1
A_SVC        0        0        1        0        0        0        0        0       0       0       0       0


In [16]:
stat_better = significance * advantage

stat_better_table = np.concatenate((names_column, stat_better), axis=1)
stat_better_table = tabulate(stat_better_table, headers)

print('statistically significantly better')
print(stat_better_table)


statistically significantly better
         B_GNB    G_GNB    A_GNB    S_GNB    B_SVC    G_SVC    A_SVC    S_SVC    B_DT    G_DT    A_DT    S_DT
-----  -------  -------  -------  -------  -------  -------  -------  -------  ------  ------  ------  ------
B_GNB        0        0        1        0        0        0        0        0       0       0       0       0
G_GNB        0        0        1        0        0        0        1        0       0       0       1       0
A_GNB        0        0        0        0        0        0        0        0       0       0       0       0
S_GNB        1        0        1        0        0        0        1        0       1       1       1       1
B_SVC        1        0        1        0        0        0        1        0       0       1       1       1
G_SVC        1        1        1        0        0        0        1        0       1       1       1       1
A_SVC        0        0        1        0        0        0        0        0       0

In [17]:
# T-test -- dla każdego datasetu, ze wszystkich foldow
from scipy import stats

results = np.load('results.npy')
print(results.shape)

(10, 20, 12)


In [18]:
alpha = 0.05

for dataset_id in range(20):
    dataset_res = results[:,dataset_id]
    print('----dataset id----', dataset_id) # shape: 10(folds) x 10(methods)
    
    t_statistic = np.zeros((len(methods), len(methods)))
    p_value = np.zeros((len(methods), len(methods)))
    advantage = np.zeros((len(methods), len(methods)))
    significance = np.zeros((len(methods), len(methods)))
     
    for i, i_ds in enumerate(methods):
        for j, j_ds in enumerate(methods):
            t_statistic[i, j], p_value[i, j] = stats.ttest_rel(dataset_res[:,i], dataset_res[:,j])
            
    advantage[t_statistic > 0] = 1
    significance[p_value <= alpha] = 1
    stat_better = significance * advantage
    
    stat_better_table = np.concatenate((names_column, stat_better), axis=1)
    stat_better_table = tabulate(stat_better_table, headers)

    print('statistically significantly better')
    print(stat_better_table)
    

----dataset id---- 0
statistically significantly better
         B_GNB    G_GNB    A_GNB    S_GNB    B_SVC    G_SVC    A_SVC    S_SVC    B_DT    G_DT    A_DT    S_DT
-----  -------  -------  -------  -------  -------  -------  -------  -------  ------  ------  ------  ------
B_GNB        0        1        1        0        0        0        1        0       0       1       1       1
G_GNB        0        0        0        0        0        0        0        0       0       0       0       0
A_GNB        0        0        0        0        0        0        0        0       0       0       0       0
S_GNB        1        1        1        0        0        0        1        0       0       1       1       1
B_SVC        0        1        1        0        0        0        1        0       0       1       1       1
G_SVC        0        1        1        0        0        0        1        0       0       1       1       1
A_SVC        0        0        1        0        0        0     