In [106]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
from scipy.stats import wilcoxon
from tabulate import tabulate
from typing import List
from scipy.stats import rankdata
from scipy.stats import ranksums
pd.set_option('display.max_columns', 100)

In [107]:
def fix_arrays(data: pd.DataFrame):
    def str_to_arr(s: str) -> str:
        s = s[1:-1]
        return np.fromstring(s, sep=',')
    means_columns = ['Unreduced_acc','Reduced_acc', 'Clf_unreducted_time', 'Clf_reducted_time','Reduction_time']
    for column in means_columns:
        data[column] = data[column].apply(lambda x: str_to_arr(x))
    data['Name'] = data['Reduction_method'] + '_' + data['Classificator']
    # data['Unreduced_acc'] = data['Unreduced_acc'].apply(lambda x: x/100)
    # data['Reduced_acc'] = data['Reduced_acc'].apply(lambda x: x*100)
    return data

In [108]:
iris = fix_arrays(pd.read_csv('../data/results/iris_results.csv'))
ibm = fix_arrays(pd.read_csv('../data/results/IBM_results.csv'))
d21 = fix_arrays(pd.read_csv('../data/results/d21_results.csv'))
wine = fix_arrays(pd.read_csv('../data/results/wine_results.csv'))
d10 = fix_arrays(pd.read_csv('../data/results/d10 - student_alchohol_results.csv'))
d16 = fix_arrays(pd.read_csv('../data/results/d16 - restaurant_results.csv'))
sets = [iris, ibm, d21, wine, d10, d16]
sets_d16 = [d16, d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16,d16]
sets_iris = [iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris,iris]

In [109]:
def tt_test(data: pd.DataFrame, set_name: str):

    alfa = .05
    t_statistic = np.zeros((data.shape[0], data.shape[0]))
    p_value = np.zeros((data.shape[0], data.shape[0]))

    for i in range(data.shape[0]):
        for j in range(data.shape[0]):
            t_statistic[i, j], p_value[i, j] = ttest_rel(data['Reduced_acc'][i], data['Reduced_acc'][j])

    names_column = data['Name'].values.reshape(-1, 1)
    t_statistic_table = np.concatenate((names_column, t_statistic), axis=1)
    t_statistic_table = tabulate(t_statistic_table, data['Name'].values, floatfmt=".2f")
    p_value_table = np.concatenate((names_column, p_value), axis=1)
    p_value_table = tabulate(p_value_table, data['Name'].values, floatfmt=".2f")

    advantage = np.zeros((data.shape[0], data.shape[0]))
    advantage[t_statistic > 0] = 1
    advantage_table = tabulate(np.concatenate(
        (names_column, advantage), axis=1), data['Name'].values)

    significance = np.zeros((data.shape[0], data.shape[0]))
    significance[p_value <= alfa] = 1
    significance_table = tabulate(np.concatenate(
        (names_column, significance), axis=1), data['Name'].values)

    stat_better = significance * advantage
    stat_better_table = tabulate(np.concatenate(
        (names_column, stat_better), axis=1), data['Name'].values)
    # print("Statistically significantly better:\n", stat_better_table)
    for i in range(stat_better.shape[0]):
        for j in range(stat_better.shape[1]):
            if stat_better[i][j]:
                print(f"{data['Name'].values[i]} better than {data['Name'].values[j]}")

    text_file = open(f"{set_name}.txt", "w")
    text_file.write(stat_better_table)
    text_file.close()

In [110]:
tt_test(iris, 'iris')

LDA_Naive Bayes better than PCA_Random Forest
LDA_Random Forest better than KPCA_Nearest Neighbors
LDA_Random Forest better than PCA_Nearest Neighbors
LDA_Random Forest better than KPCA_RBF SVM
LDA_Random Forest better than KPCA_Random Forest
LDA_Random Forest better than PCA_Random Forest


In [111]:
tt_test(ibm, 'ibm')

KPCA_Naive Bayes better than KPCA_Nearest Neighbors
KPCA_Naive Bayes better than LDA_Nearest Neighbors
KPCA_Naive Bayes better than PCA_Nearest Neighbors
KPCA_Naive Bayes better than UMAP_Nearest Neighbors
KPCA_Naive Bayes better than KPCA_RBF SVM
KPCA_Naive Bayes better than LDA_RBF SVM
KPCA_Naive Bayes better than PCA_RBF SVM
KPCA_Naive Bayes better than UMAP_RBF SVM
KPCA_Naive Bayes better than KPCA_Random Forest
KPCA_Naive Bayes better than LDA_Random Forest
KPCA_Naive Bayes better than UMAP_Random Forest
LDA_Naive Bayes better than KPCA_Nearest Neighbors
LDA_Naive Bayes better than LDA_Nearest Neighbors
LDA_Naive Bayes better than PCA_Nearest Neighbors
LDA_Naive Bayes better than KPCA_RBF SVM
LDA_Naive Bayes better than PCA_RBF SVM
LDA_Naive Bayes better than UMAP_RBF SVM
LDA_Naive Bayes better than KPCA_Random Forest
LDA_Naive Bayes better than LDA_Random Forest
LDA_Naive Bayes better than UMAP_Random Forest
PCA_Naive Bayes better than KPCA_Nearest Neighbors
PCA_Naive Bayes bette

In [112]:
def wilx(sets: List[pd.DataFrame]):
    scores = np.zeros((len(sets), sets[0].shape[0]))
    for i, s in enumerate(sets):
        scores[i] = s['Reduced_acc'].apply(lambda x: x.mean()).values
    from scipy.stats import rankdata
    ranks = []
    for ms in scores:
        ranks.append(rankdata(ms).tolist())
    ranks = np.array(ranks)
    mean_ranks = np.mean(ranks, axis=0)
    alfa = .05
    w_statistic = np.zeros((sets[0].shape[0], sets[0].shape[0]))
    p_value = np.zeros((sets[0].shape[0], sets[0].shape[0]))

    for i in range(sets[0].shape[0]):
        for j in range(sets[0].shape[0]):
            w_statistic[i, j], p_value[i, j] = ranksums(ranks.T[i], ranks.T[j])

    headers = list(sets[0]['Name'].values)
    names_column = np.expand_dims(np.array(list(sets[0]['Name'].values)), axis=1)
    w_statistic_table = np.concatenate((names_column, w_statistic), axis=1)
    w_statistic_table = tabulate(w_statistic_table, headers, floatfmt=".2f")
    p_value_table = np.concatenate((names_column, p_value), axis=1)
    p_value_table = tabulate(p_value_table, headers, floatfmt=".2f")
    advantage = np.zeros((sets[0].shape[0], sets[0].shape[0]))
    advantage[w_statistic > 0] = 1
    advantage_table = tabulate(np.concatenate(
        (names_column, advantage), axis=1), headers)
    significance = np.zeros((sets[0].shape[0], sets[0].shape[0]))
    significance[p_value <= alfa] = 1
    significance_table = tabulate(np.concatenate(
        (names_column, significance), axis=1), headers)

    text_file = open(f"wilcoxon.txt", "w")
    text_file.write(significance_table)
    text_file.close()
    return significance


In [113]:
# best
res = wilx(sets)
b_n = np.array([iris['Reduction_method'].values, iris['Classificator'].values, res.sum(axis=1)])
best_clf_red = pd.DataFrame(data=b_n.T, columns=['R', 'C', 'V'])

In [114]:
best_clf_red

Unnamed: 0,R,C,V
0,KPCA,Naive Bayes,3.0
1,LDA,Naive Bayes,4.0
2,PCA,Naive Bayes,3.0
3,UMAP,Naive Bayes,2.0
4,KPCA,Nearest Neighbors,3.0
5,LDA,Nearest Neighbors,1.0
6,PCA,Nearest Neighbors,3.0
7,UMAP,Nearest Neighbors,4.0
8,KPCA,RBF SVM,6.0
9,LDA,RBF SVM,3.0


In [115]:
best_clf_red.groupby('R').sum().drop(columns=['C'])

Unnamed: 0_level_0,V
R,Unnamed: 1_level_1
KPCA,15.0
LDA,18.0
PCA,19.0
UMAP,22.0


In [116]:
best_clf_red.groupby('C').sum().drop(columns=['R'])

Unnamed: 0_level_0,V
C,Unnamed: 1_level_1
Naive Bayes,12.0
Nearest Neighbors,11.0
RBF SVM,28.0
Random Forest,23.0


In [117]:
def wilcoxon_test_for_column_pair(sets, cols):
    names = cols
    scores_reduced = np.zeros((len(sets), sets[0].shape[0]))
    scores_unreduced = np.zeros((len(sets), sets[0].shape[0]))
    for i, s in enumerate(sets):
        scores_reduced[i] = s[cols[0]].apply(lambda x: x.mean()).values
        scores_unreduced[i] = s[cols[1]].apply(lambda x: x.mean()).values
    
    wilcoxons = []
    for i in range(scores_reduced.shape[1]):
        wilcoxons.append(np.concatenate(([scores_unreduced[:,i]], [scores_reduced[:, i]]), axis=0).T)
    
    ranks_set = []
    for mean_scores in wilcoxons:
        print(mean_scores)
        ranks = []
        for ms in mean_scores:
            ranks.append(rankdata(ms).tolist())
        ranks = np.array(ranks)
        alfa = .05
        w_statistic = np.zeros((2, 2))
        p_value = np.zeros((2, 2))
        for i in range(2):
            for j in range(2):
                w_statistic[i, j], p_value[i, j] = ranksums(ranks.T[i], ranks.T[j])
        
        headers = list(names)
        names_column = np.expand_dims(np.array(list(names)), axis=1)
        w_statistic_table = np.concatenate((names_column, w_statistic), axis=1)
        w_statistic_table = tabulate(w_statistic_table, headers, floatfmt=".2f")
        p_value_table = np.concatenate((names_column, p_value), axis=1)
        p_value_table = tabulate(p_value_table, headers, floatfmt=".2f")
        print(w_statistic_table)
        print(p_value_table)
        # print("\nw-statistic:\n", w_statistic_table, "\n\np-value:\n", p_value_table)

        advantage = np.zeros((2, 2))
        advantage[w_statistic > 0] = 1
        advantage_table = tabulate(np.concatenate(
            (names_column, advantage), axis=1), headers)
        significance = np.zeros((2, 2))
        significance[p_value <= alfa] = 1
        significance_table = tabulate(np.concatenate(
            (names_column, significance), axis=1), headers)
        print(significance)


In [118]:
wilcoxon_test_for_column_pair(sets, ['Reduced_acc','Unreduced_acc'])

[[0.95333333 0.95333333]
 [0.30068027 0.31496599]
 [0.3550098  0.39337743]
 [0.54843553 0.46781053]
 [0.05826923 0.10608974]
 [0.32017766 0.36488335]]
                 Reduced_acc    Unreduced_acc
-------------  -------------  ---------------
Reduced_acc             0.00            -1.92
Unreduced_acc           1.92             0.00
                 Reduced_acc    Unreduced_acc
-------------  -------------  ---------------
Reduced_acc             1.00             0.05
Unreduced_acc           0.05             1.00
[[0. 0.]
 [0. 0.]]
[[0.95333333 0.98      ]
 [0.30068027 0.30408163]
 [0.3550098  0.35684305]
 [0.54843553 0.59783412]
 [0.05826923 0.10628205]
 [0.32017766 0.50340391]]
                 Reduced_acc    Unreduced_acc
-------------  -------------  ---------------
Reduced_acc             0.00            -2.88
Unreduced_acc           2.88             0.00
                 Reduced_acc    Unreduced_acc
-------------  -------------  ---------------
Reduced_acc             1.00       

In [119]:
wilcoxon_test_for_column_pair(sets_d16, ['Clf_reducted_time','Clf_unreducted_time'])

99794 0.00449343]
 [0.00799794 0.00449343]
 [0.00799794 0.00449343]
 [0.00799794 0.00449343]
 [0.00799794 0.00449343]]
                       Clf_reducted_time    Clf_unreducted_time
-------------------  -------------------  ---------------------
Clf_reducted_time                   0.00                   6.65
Clf_unreducted_time                -6.65                   0.00
                       Clf_reducted_time    Clf_unreducted_time
-------------------  -------------------  ---------------------
Clf_reducted_time                   1.00                   0.00
Clf_unreducted_time                 0.00                   1.00
[[0. 1.]
 [1. 0.]]
[[0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
 [0.01506302 0.01144695]
