In [88]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
from tabulate import tabulate
pd.set_option('display.max_columns', 100)

In [89]:
def fix_arrays(data: pd.DataFrame) -> pd.DataFrame:
    """Decode the stringified array columns of a results table into NumPy arrays.

    The listed columns are expected to hold text such as ``"[0.91, 0.88]"``
    (one bracketed, comma-separated list per cell). Each such cell is replaced
    by a 1-D float array.

    Args:
        data: Results table containing the columns listed in ``means_columns``.

    Returns:
        The same ``data`` object; the columns are converted in place.
        Calling the function again on an already-converted frame is a no-op.
    """
    def str_to_arr(s: "str | np.ndarray") -> np.ndarray:
        # Already decoded (e.g. the function was run twice on the same frame):
        # keep the array instead of failing on it.
        if isinstance(s, np.ndarray):
            return s
        # Drop the surrounding brackets, tolerating whitespace around them.
        s = s.strip()[1:-1]
        return np.fromstring(s, sep=',')

    means_columns = ['Unreduced_acc', 'Reduced_acc', 'Clf_unreducted_time', 'Clf_reducted_time', 'Reduction_time']
    for column in means_columns:
        data[column] = data[column].apply(str_to_arr)
    return data

In [90]:
# Load each dataset's results table and decode its stringified array columns.
iris = pd.read_csv('../data/iris.csv').pipe(fix_arrays)
ibm = pd.read_csv('../data/IBM.csv').pipe(fix_arrays)

In [91]:
def tt_test(data: pd.DataFrame, set_name: str, alpha: float = .05) -> None:
    """Compare every pair of models with a paired t-test on their reduced accuracies.

    For each ordered pair (i, j) the per-run accuracies in ``Reduced_acc`` are
    compared with ``scipy.stats.ttest_rel``. Model i is reported as better than
    model j when the t statistic is positive and the p-value is at most
    ``alpha``. Each such pair is printed, and the full 0/1 matrix is written as
    a text table to ``f"{set_name}.txt"``.

    Args:
        data: Results table with ``Reduction_method``, ``Classificator`` and
            ``Reduced_acc`` columns; every ``Reduced_acc`` cell must hold an
            array of the same length (a requirement of the paired test).
        set_name: Path or stem of the output file; ``.txt`` is appended.
        alpha: Significance level for the p-value threshold.

    Side effects:
        Adds (or overwrites) a ``Name`` column on ``data``, prints the
        significant pairs and writes ``f"{set_name}.txt"``.
    """
    # Kept on the caller's frame on purpose: other cells may read this column.
    data['Name'] = data['Reduction_method'] + '_' + data['Classificator']

    names = data['Name'].values
    # Iterate the column positionally so frames whose index is not 0..n-1
    # (e.g. after filtering) are handled too.
    samples = list(data['Reduced_acc'])
    n_models = len(samples)

    # The diagonal stays NaN: a model is never tested against itself, which
    # would only yield a degenerate 0/0 statistic and runtime warnings.
    t_statistic = np.full((n_models, n_models), np.nan)
    p_value = np.full((n_models, n_models), np.nan)
    for i in range(n_models):
        for j in range(n_models):
            if i != j:
                t_statistic[i, j], p_value[i, j] = ttest_rel(samples[i], samples[j])

    # Comparisons against NaN are False, so undefined results count as
    # "no advantage" and "not significant".
    advantage = (t_statistic > 0).astype(float)
    significance = (p_value <= alpha).astype(float)
    stat_better = significance * advantage

    stat_better_table = tabulate(
        np.concatenate((names.reshape(-1, 1), stat_better), axis=1), names)

    # np.nonzero walks the matrix row by row, i.e. in the same order as a
    # nested i/j loop.
    for i, j in zip(*np.nonzero(stat_better)):
        print(f"{names[i]} better than {names[j]}")

    with open(f"{set_name}.txt", "w") as text_file:
        text_file.write(stat_better_table)

In [92]:
# Pairwise paired t-tests on the iris results; the table is written to iris.txt.
tt_test(data=iris, set_name='iris')

LDA_Naive Bayes better than PCA_Random Forest
LDA_Random Forest better than KPCA_Nearest Neighbors
LDA_Random Forest better than PCA_Nearest Neighbors
LDA_Random Forest better than KPCA_RBF SVM
LDA_Random Forest better than KPCA_Random Forest
LDA_Random Forest better than PCA_Random Forest


In [93]:
# Pairwise paired t-tests on the IBM results; the table is written to ibm.txt.
tt_test(data=ibm, set_name='ibm')

KPCA_Naive Bayes better than KPCA_Nearest Neighbors
KPCA_Naive Bayes better than LDA_Nearest Neighbors
KPCA_Naive Bayes better than PCA_Nearest Neighbors
KPCA_Naive Bayes better than UMAP_Nearest Neighbors
KPCA_Naive Bayes better than KPCA_RBF SVM
KPCA_Naive Bayes better than LDA_RBF SVM
KPCA_Naive Bayes better than PCA_RBF SVM
KPCA_Naive Bayes better than UMAP_RBF SVM
KPCA_Naive Bayes better than KPCA_Random Forest
KPCA_Naive Bayes better than LDA_Random Forest
KPCA_Naive Bayes better than UMAP_Random Forest
LDA_Naive Bayes better than KPCA_Nearest Neighbors
LDA_Naive Bayes better than LDA_Nearest Neighbors
LDA_Naive Bayes better than PCA_Nearest Neighbors
LDA_Naive Bayes better than KPCA_RBF SVM
LDA_Naive Bayes better than PCA_RBF SVM
LDA_Naive Bayes better than UMAP_RBF SVM
LDA_Naive Bayes better than KPCA_Random Forest
LDA_Naive Bayes better than LDA_Random Forest
LDA_Naive Bayes better than UMAP_Random Forest
PCA_Naive Bayes better than KPCA_Nearest Neighbors
PCA_Naive Bayes bette