In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd


In [2]:
def load_result(file_name):
    """Read ``results/results_<file_name>.txt`` as a comma-delimited numeric array.

    Parameters
    ----------
    file_name : str
        Suffix of the results file (the part between ``results_`` and ``.txt``).

    Returns
    -------
    numpy.ndarray
        The parsed contents of the file, as returned by ``np.loadtxt``.
    """
    return np.loadtxt(f"results/results_{file_name}.txt", delimiter=",")

# Load the saved results file of each of the six model variants, in one pass.
(base_data, baseSoft_data,
 paste_data, pasteSoft_data,
 pix_data, pixSoft_data) = map(
    load_result,
    ("base", "baseSoft", "paste", "pasteSoft", "pix", "pixSoft"),
)


Pairwise independent two-sample t-tests (each model against every other)

In [16]:
# Pairwise independent two-sample t-tests on metric column k for every ordered
# pair of models. final_results[row, col] == 1 means the row model scored
# significantly higher than the column model (t > 0 and p < 0.05).
k = 3  # 0 - accuracy, 1 - precision, 2 - recall, 3 - balanced accuracy

model = [base_data, paste_data, pix_data, baseSoft_data, pasteSoft_data, pixSoft_data]
model_names = ['base', 'paste', 'pix', 'baseSoft', 'pasteSoft', 'pixSoft']
t_stat = pd.DataFrame(index=model_names, columns=model_names, dtype=float)
p_values = pd.DataFrame(index=model_names, columns=model_names, dtype=float)
for i, row_name in enumerate(model_names):
    for j, col_name in enumerate(model_names):
        # ttest_ind_from_stats expects the corrected sample standard deviation
        # (ddof=1); numpy's default ddof=0 underestimates it and inflates |t|.
        t, p = stats.ttest_ind_from_stats(
            mean1=model[i].mean(axis=0)[k],
            std1=model[i].std(axis=0, ddof=1)[k],
            nobs1=len(model[i]),
            mean2=model[j].mean(axis=0)[k],
            std2=model[j].std(axis=0, ddof=1)[k],
            nobs2=len(model[j]),
        )
        # Every ordered pair is visited, so the mirrored cell is filled when
        # the loop reaches (j, i); no symmetric write is needed here.
        t_stat.loc[row_name, col_name] = t
        p_values.loc[row_name, col_name] = p

# Vectorised 0/1 indicators (replaces the deprecated DataFrame.applymap).
t_stat_binary = (t_stat > 0).astype(int)
p_values_binary = (p_values < 0.05).astype(int)
final_results = t_stat_binary * p_values_binary

print("t>0, p<0.05")
final_results

t>0, p<0.05


Unnamed: 0,base,paste,pix,baseSoft,pasteSoft,pixSoft
base,0,0,0,0,0,0
paste,1,0,0,0,0,0
pix,1,0,0,0,0,0
baseSoft,1,1,0,0,0,0
pasteSoft,1,1,1,1,0,0
pixSoft,1,1,1,1,1,0


Friedman test

In [17]:
from scipy.stats import friedmanchisquare

# Friedman test across all six models, using metric column k of each result array.
statistic, p_value = friedmanchisquare(
    *(
        scores[:, k]
        for scores in (base_data, paste_data, pix_data,
                       baseSoft_data, pasteSoft_data, pixSoft_data)
    )
)

print("Test statistics:", statistic)
print("Value p:", p_value)

Test statistics: 31.85185185185183
Value p: 6.356361774331756e-06


The p-value is far below 0.05, so there is a significant difference between at least one pair of groups.

Tukey test

In [18]:
# Flatten metric column k of every model into one observation list, with a
# parallel list naming the model each observation came from.
tukey_groups = [
    ('base', base_data), ('paste', paste_data), ('pix', pix_data),
    ('baseSoft', baseSoft_data), ('pasteSoft', pasteSoft_data), ('pixSoft', pixSoft_data),
]
data = [value for _, scores in tukey_groups for value in scores[:, k]]
labels = [name for name, scores in tukey_groups for _ in range(len(scores))]

# Tukey HSD over all model pairs at a family-wise error rate of 0.05.
tukey_results = pairwise_tukeyhsd(data, labels, 0.05)
print(tukey_results)

   Multiple Comparison of Means - Tukey HSD, FWER=0.05    
  group1    group2  meandiff p-adj   lower   upper  reject
----------------------------------------------------------
     base  baseSoft     0.27    0.0  0.2235  0.3165   True
     base     paste   0.2317    0.0  0.1852  0.2782   True
     base pasteSoft   0.3423    0.0  0.2958  0.3888   True
     base       pix   0.2596    0.0  0.2131  0.3061   True
     base   pixSoft   0.3961    0.0  0.3496  0.4426   True
 baseSoft     paste  -0.0383 0.1569 -0.0848  0.0082  False
 baseSoft pasteSoft   0.0723 0.0005  0.0258  0.1188   True
 baseSoft       pix  -0.0105 0.9834 -0.0569   0.036  False
 baseSoft   pixSoft   0.1261    0.0  0.0796  0.1726   True
    paste pasteSoft   0.1106    0.0  0.0641  0.1571   True
    paste       pix   0.0279  0.476 -0.0186  0.0744  False
    paste   pixSoft   0.1644    0.0  0.1179  0.2109   True
pasteSoft       pix  -0.0827 0.0001 -0.1292 -0.0363   True
pasteSoft   pixSoft   0.0538 0.0155  0.0073  0.1003   Tr

One difference from the Student's t-test results:  
Tukey's test gives no reason to reject the null hypothesis H0 of no difference between baseSoft and paste.