In [14]:
import numpy as np
from sklearn.ensemble import VotingClassifier, StackingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
import os

# Base learners used to build every ensemble in this notebook. The keys are the
# short labels under which each learner is registered in the voting/stacking
# estimator lists.
clfs = {
    'GNB': GaussianNB(),
    'SVM': SVC(probability=True),  # enable probability estimates for the SVM
    'DT': DecisionTreeClassifier()
}

# os.listdir() returns directory entries in arbitrary, platform-dependent order,
# so sort the names before slicing: this makes both the subset of datasets that
# is kept and their order (the dataset index used downstream) reproducible.
filenames = sorted(os.listdir('datasets'))[:20]
print(len(filenames))

20


In [15]:
# Two independent lists of (label, fresh clone) pairs: one handed to the voting
# ensemble and one shared by the stacking ensembles.
base_voting = [(clf_name, clone(clf)) for clf_name, clf in clfs.items()]
base_stacking = [(clf_name, clone(clf)) for clf_name, clf in clfs.items()]

# For every base learner add, in this order: bagging, AdaBoost, and a stacking
# ensemble that uses the learner as its final estimator.
methods = []
for clf in clfs.values():
    methods.extend([
        BaggingClassifier(base_estimator=clone(clf)),
        AdaBoostClassifier(base_estimator=clone(clf)),
        StackingClassifier(estimators=base_stacking, final_estimator=clone(clf)),
    ])

# A single voting ensemble over all base learners closes the list.
methods.append(VotingClassifier(estimators=base_voting))

In [16]:
# Bare expression: the notebook echoes the list so the assembled ensemble
# configurations can be inspected.
methods

[BaggingClassifier(base_estimator=GaussianNB()),
 AdaBoostClassifier(base_estimator=GaussianNB()),
 StackingClassifier(estimators=[('GNB', GaussianNB()),
                                ('SVM', SVC(probability=True)),
                                ('DT', DecisionTreeClassifier())],
                    final_estimator=GaussianNB()),
 BaggingClassifier(base_estimator=SVC(probability=True)),
 AdaBoostClassifier(base_estimator=SVC(probability=True)),
 StackingClassifier(estimators=[('GNB', GaussianNB()),
                                ('SVM', SVC(probability=True)),
                                ('DT', DecisionTreeClassifier())],
                    final_estimator=SVC(probability=True)),
 BaggingClassifier(base_estimator=DecisionTreeClassifier()),
 AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
 StackingClassifier(estimators=[('GNB', GaussianNB()),
                                ('SVM', SVC(probability=True)),
                                ('DT', DecisionTreeClassifi

In [4]:
# results = np.zeros((10, len(datasets), len(methods)))

In [5]:
# rskf = RepeatedStratifiedKFold(
#     n_splits=5, 
#     n_repeats=2, 
#     random_state=1234
# )

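# FIXME: THIS PART IS WRONG -- e.g. the loop below writes to results[fold, data_id, ...],
# but the dataset loop variable is `f_id`; `data_id` is not defined in the visible cells.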

# for f_id, filename in enumerate(filenames):
    
#     data = np.loadtxt("%s/%s" % ('datasets', filename), delimiter=',')
#     X = data[:, 0:-1]
#     y = data[:, -1]
#     print(X.shape, y.shape)
    
#     for fold, (train_index, test_index) in enumerate(rskf.split(X, y)):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]
        
#         for method_id, method in enumerate(methods):
#             method_cloned = clone(method)
            
#             method_cloned.fit(X_train, y_train)
#             y_pred = method_cloned.predict(X_test)
            
#             score = accuracy_score(y_test, y_pred)
            
#             results[fold, data_id, method_id] = score
#         print(results[fold, data_id])
#         print("fold:",fold,"/",10, "\t", "dataset:",f_id,"/",len(filenames))

#     np.save('results', results)
    

In [42]:
### Wilcoxon -- the results have to be averaged per dataset first
import numpy as np

# Saved per-fold accuracies, shape (folds=10, datasets=20, methods=10).
results = np.load('results.npy')

# Collapse the fold axis so each dataset keeps one mean accuracy per method,
# giving shape (datasets=20, methods=10).
mean_results_for_datasets = results.mean(axis=0)

print(mean_results_for_datasets.shape)
print(mean_results_for_datasets)

(20, 10)
[[0.96340167 0.82268678 0.96780271 0.97291756 0.92241305 0.968538
  0.96485616 0.944386   0.94728424 0.97218763]
 [0.95563207 0.80753854 0.96136691 0.96495375 0.85692189 0.95706578
  0.95204522 0.93561151 0.92561151 0.96208119]
 [0.54782609 0.55942029 0.70289855 0.71304348 0.58115942 0.69130435
  0.66956522 0.63478261 0.6173913  0.70434783]
 [0.89744467 0.81490946 0.93303823 0.95018109 0.65102616 0.93732394
  0.92297787 0.88030181 0.92740443 0.92736419]
 [1.         1.         1.         1.         0.93333333 1.
  1.         0.98888889 1.         0.98888889]
 [0.62047101 0.66431159 0.69800725 0.54293478 0.55181159 0.65869565
  0.71503623 0.67246377 0.6115942  0.71539855]
 [0.68304297 0.63397213 0.79105691 0.78397213 0.84883856 0.71637631
  0.78362369 0.69953542 0.59117305 0.79349593]
 [0.9016     0.936      0.9184     0.9064     0.912      0.9024
  0.8008     0.772      0.9072     0.9032    ]
 [0.75163934 0.55597567 0.74836066 0.73532522 0.73368588 0.733633
  0.70272343 0.6617

In [43]:
from scipy.stats import rankdata

# Rank the methods within each dataset row (a higher mean accuracy gets a higher
# rank; ties share the average rank), then stack the rows into one 2-D array.
ranks = np.array([rankdata(row) for row in mean_results_for_datasets])
print(ranks)

# mean_ranks = np.mean(ranks, axis=0)
# print("Mean ranks:", mean_ranks)

[[ 5.   1.   7.  10.   2.   8.   6.   3.   4.   9. ]
 [ 6.   1.   8.  10.   2.   7.   5.   4.   3.   9. ]
 [ 1.   2.   8.  10.   3.   7.   6.   5.   4.   9. ]
 [ 4.   2.   8.  10.   1.   9.   5.   3.   7.   6. ]
 [ 7.   7.   7.   7.   1.   7.   7.   2.5  7.   2.5]
 [ 4.   6.   8.   1.   2.   5.   9.   7.   3.  10. ]
 [ 3.   2.   8.   7.  10.   5.   6.   4.   1.   9. ]
 [ 3.  10.   9.   6.   8.   4.   2.   1.   7.   5. ]
 [10.   1.   9.   7.   6.   5.   4.   3.   2.   8. ]
 [ 2.   1.   4.  10.   3.   7.   6.   5.   8.   9. ]
 [ 8.   1.  10.   3.   2.   9.   6.   5.   4.   7. ]
 [ 5.   2.   8.   3.   1.   9.  10.   7.   6.   4. ]
 [ 1.   2.   8.5  8.5  6.   8.5  4.   3.   8.5  5. ]
 [ 8.   1.   7.   5.   2.  10.   6.   4.   3.   9. ]
 [ 2.   1.  10.   9.   3.   8.   7.   5.   4.   6. ]
 [ 1.   5.   8.   2.5  4.   8.   8.   8.   8.   2.5]
 [ 6.   1.  10.   5.   4.   7.   8.   3.   2.   9. ]
 [ 3.5  1.   5.5 10.   9.   3.5  8.   2.   7.   5.5]
 [ 1.   2.   4.   8.   8.   8.  10.   5.   3. 

In [44]:
from scipy.stats import ranksums

# Significance threshold applied to the pairwise p-values in later cells.
alpha = 0.05

# Square method-by-method matrices; entry [i, j] refers to method i vs. method j.
w_statistic = np.zeros((len(methods), len(methods)))
p_value = np.zeros_like(w_statistic)
advantage = np.zeros_like(w_statistic)
significance = np.zeros_like(w_statistic)

# Rank-sum test on the per-dataset ranks of every ordered pair of methods
# (column i of `ranks` holds the ranks of method i across datasets).
for i in range(len(methods)):
    for j in range(len(methods)):
        w_statistic[i, j], p_value[i, j] = ranksums(ranks[:, i], ranks[:, j])
        
        


In [45]:
from tabulate import tabulate

# Short labels in the same order as `methods`:
# B = bagging, A = AdaBoost, S = stacking, V = voting.
method_names = [
    'B_GNB', 'A_GNB', 'S_GNB',
    'B_SVC', 'A_SVC', 'S_SVC',
    'B_DT', 'A_DT', 'S_DT',
    'V',
]

headers = method_names
# Column vector of labels that is glued to the left of every result matrix.
names_column = np.array(method_names).reshape(-1, 1)

# Prefix each matrix with the label column and render it as a text table.
w_statistic_table = tabulate(
    np.concatenate((names_column, w_statistic), axis=1),
    headers,
    floatfmt=".2f",
)
p_value_table = tabulate(
    np.concatenate((names_column, p_value), axis=1),
    headers,
    floatfmt=".2f",
)

print('w statistic')
print(w_statistic_table)
print('p value')
print(p_value_table)


w statistic
         B_GNB    A_GNB    S_GNB    B_SVC    A_SVC    S_SVC    B_DT    A_DT    S_DT      V
-----  -------  -------  -------  -------  -------  -------  ------  ------  ------  -----
B_GNB     0.00     2.33    -3.73    -2.75     0.41    -3.49   -2.89   -0.47   -1.04  -2.92
A_GNB    -2.33     0.00    -4.57    -4.07    -2.29    -4.46   -4.17   -3.23   -3.41  -4.25
S_GNB     3.73     4.57     0.00     0.47     3.61     0.55    1.50    4.02    3.44   0.74
B_SVC     2.75     4.07    -0.47     0.00     2.92    -0.09    0.66    2.79    2.15   0.50
A_SVC    -0.41     2.29    -3.61    -2.92     0.00    -3.45   -2.95   -1.19   -1.47  -3.08
S_SVC     3.49     4.46    -0.55     0.09     3.45     0.00    1.03    3.76    3.06   0.38
B_DT      2.89     4.17    -1.50    -0.66     2.95    -1.03    0.00    3.10    2.00  -0.26
A_DT      0.47     3.23    -4.02    -2.79     1.19    -3.76   -3.10    0.00   -0.61  -3.11
S_DT      1.04     3.41    -3.44    -2.15     1.47    -3.06   -2.00    0.61   

In [46]:
# Flag, in place, the pairs where the row method has a positive statistic and
# the pairs whose p-value does not exceed alpha.
np.putmask(advantage, w_statistic > 0, 1)
np.putmask(significance, p_value <= alpha, 1)

# Render both indicator matrices with the method labels as the first column.
advantage_table = tabulate(
    np.concatenate((names_column, advantage), axis=1), headers
)
significance_table = tabulate(
    np.concatenate((names_column, significance), axis=1), headers
)

print('advantage')
print(advantage_table)

print('significance')
print(significance_table)


advantage
         B_GNB    A_GNB    S_GNB    B_SVC    A_SVC    S_SVC    B_DT    A_DT    S_DT    V
-----  -------  -------  -------  -------  -------  -------  ------  ------  ------  ---
B_GNB        0        1        0        0        1        0       0       0       0    0
A_GNB        0        0        0        0        0        0       0       0       0    0
S_GNB        1        1        0        1        1        1       1       1       1    1
B_SVC        1        1        0        0        1        0       1       1       1    1
A_SVC        0        1        0        0        0        0       0       0       0    0
S_SVC        1        1        0        1        1        0       1       1       1    1
B_DT         1        1        0        0        1        0       0       1       1    0
A_DT         1        1        0        0        1        0       0       0       0    0
S_DT         1        1        0        0        1        0       0       1       0    0
V          

In [47]:
# A pair counts only when it is both significant and in favour of the row
# method: the element-wise product of the two 0/1 indicator matrices.
stat_better = np.multiply(significance, advantage)

stat_better_table = tabulate(
    np.concatenate((names_column, stat_better), axis=1), headers
)

print('statistically significantly better')
print(stat_better_table)


statistically significantly better
         B_GNB    A_GNB    S_GNB    B_SVC    A_SVC    S_SVC    B_DT    A_DT    S_DT    V
-----  -------  -------  -------  -------  -------  -------  ------  ------  ------  ---
B_GNB        0        1        0        0        0        0       0       0       0    0
A_GNB        0        0        0        0        0        0       0       0       0    0
S_GNB        1        1        0        0        1        0       0       1       1    0
B_SVC        1        1        0        0        1        0       0       1       1    0
A_SVC        0        1        0        0        0        0       0       0       0    0
S_SVC        1        1        0        0        1        0       0       1       1    0
B_DT         1        1        0        0        1        0       0       1       1    0
A_DT         0        1        0        0        0        0       0       0       0    0
S_DT         0        1        0        0        0        0       0       0

In [48]:
# T-test -- for each dataset, using the scores from all folds
from scipy import stats

# Reload the saved score array for the per-dataset t-tests below and print its
# shape as a sanity check.
results = np.load('results.npy')
print(results.shape)

(10, 20, 10)


In [49]:
# Significance threshold for the paired t-tests.
alpha = 0.05

# Take the number of datasets from the loaded array instead of hard-coding 20,
# so the cell keeps working when the experiment is rerun on a different number
# of datasets.
n_datasets = results.shape[1]

# For every dataset, run a paired t-test between each pair of methods on that
# dataset's per-fold scores and print which methods are significantly better.
for dataset_id in range(n_datasets):
    dataset_res = results[:, dataset_id]  # shape: folds x methods
    print('----dataset id----', dataset_id)

    # NaN means "no test performed". It compares False against both thresholds
    # below, exactly like the NaN that ttest_rel returns for a zero-variance
    # difference, so the diagonal never shows up as an advantage.
    t_statistic = np.full((len(methods), len(methods)), np.nan)
    p_value = np.full((len(methods), len(methods)), np.nan)
    advantage = np.zeros((len(methods), len(methods)))
    significance = np.zeros((len(methods), len(methods)))

    for i, i_ds in enumerate(methods):
        for j, j_ds in enumerate(methods):
            if i == j:
                # Comparing a method with itself is degenerate (all paired
                # differences are zero); skip it rather than divide by zero.
                continue
            t_statistic[i, j], p_value[i, j] = stats.ttest_rel(dataset_res[:, i], dataset_res[:, j])

    advantage[t_statistic > 0] = 1
    significance[p_value <= alpha] = 1
    # 1 only where the row method is both better (t > 0) and significantly so.
    stat_better = significance * advantage

    stat_better_table = np.concatenate((names_column, stat_better), axis=1)
    stat_better_table = tabulate(stat_better_table, headers)

    print('statistically significantly better')
    print(stat_better_table)
    

----dataset id---- 0
statistically significantly better
         B_GNB    A_GNB    S_GNB    B_SVC    A_SVC    S_SVC    B_DT    A_DT    S_DT    V
-----  -------  -------  -------  -------  -------  -------  ------  ------  ------  ---
B_GNB        0        1        0        0        1        0       0       1       1    0
A_GNB        0        0        0        0        0        0       0       0       0    0
S_GNB        0        1        0        0        1        0       0       1       1    0
B_SVC        1        1        0        0        1        0       1       1       1    0
A_SVC        0        1        0        0        0        0       0       0       0    0
S_SVC        0        1        0        0        1        0       0       1       1    0
B_DT         0        1        0        0        1        0       0       1       1    0
A_DT         0        1        0        0        0        0       0       0       0    0
S_DT         0        1        0        0        0    

statistically significantly better
         B_GNB    A_GNB    S_GNB    B_SVC    A_SVC    S_SVC    B_DT    A_DT    S_DT    V
-----  -------  -------  -------  -------  -------  -------  ------  ------  ------  ---
B_GNB        0        0        0        0        0        0       0       0       0    0
A_GNB        1        0        0        1        0        0       0       0       0    1
S_GNB        1        0        0        1        0        0       0       0       0    1
B_SVC        1        0        0        0        0        0       0       0       0    0
A_SVC        1        0        0        1        0        0       0       0       0    1
S_SVC        1        0        0        1        0        0       0       0       0    1
B_DT         1        0        0        1        0        0       0       0       0    1
A_DT         1        0        0        1        0        0       0       0       0    1
S_DT         1        0        0        1        0        0       0       0