In [38]:
from util import load_user_data
import pandas as pd
import numpy as np
from scipy.stats import kruskal, mannwhitneyu

def post_hoc_comparison(data_list_1, data_list_2, name1, name2, threshold=0.05 / 4):
    """Print the significant one-sided Mann-Whitney U results between two groups.

    The test is run twice, once with ``alternative='greater'`` and once with
    ``alternative='less'``. A line is printed for every direction whose
    p-value is strictly below ``threshold``; nothing is printed otherwise.

    Args:
        data_list_1: Observations of the first group.
        data_list_2: Observations of the second group.
        name1: Label used for the first group in the printed line.
        name2: Label used for the second group in the printed line.
        threshold: Significance level for each one-sided test. Defaults to
            ``0.05 / 4``, the value that used to be hard-coded here
            (presumably a correction for four comparisons -- confirm).

    Returns:
        None.
    """
    # "greater" is evaluated before "less" so the print order is unchanged.
    for alternative, symbol in (("greater", ">"), ("less", "<")):
        statistic, pvalue = mannwhitneyu(data_list_1, data_list_2, alternative=alternative)
        if pvalue < threshold:
            print(
                "Alternative {} {} {},".format(name1, symbol, name2),
                "pvalue %.4f" % pvalue,
                "statistic %.4f" % statistic,
            )

In [39]:
# Load the study data; the banner lines shown in the cell output come from the loader.
valid_users, tp_data = load_user_data(folder_name="../anonymized_data", reserved_users=None)

dimensions = ["recall", "acc_strict", "acc_relaxed", "acc_execution"]
all_conditions = ["AP-AE", "AP-UE", "UP-AE", "UP-UE"]

# variable_dict[condition][dimension] -> list with one "avg" score per user.
variable_dict = {"condition": []}
for condition in all_conditions:
    variable_dict[condition] = {dimension: [] for dimension in dimensions}

user2condition = tp_data['user2condition']
task_performance = tp_data["task_performance"]
for user in valid_users:
    tp_condition = user2condition[user]
    if tp_condition in all_conditions:
        user_avg = task_performance[user]["avg"]
        for dimension in dimensions:
            variable_dict[tp_condition][dimension].append(user_avg[dimension])

print("-" * 34)

248 complete the NASA-TLX
248 complete the whole study
----------------------------------


In [8]:
print("For all participants, compare with experimental conditions")
for dimension in ["acc_strict", "acc_execution"]:
    print(dimension)
    # Pool both execution variants of each planning mode.
    ap_scores = variable_dict["AP-AE"][dimension] + variable_dict["AP-UE"][dimension]
    up_scores = variable_dict["UP-AE"][dimension] + variable_dict["UP-UE"][dimension]
    print(len(ap_scores), len(up_scores))

    statistic, pvalue = kruskal(ap_scores, up_scores)
    print("AP vs UP; kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue))
    # Prefix of a LaTeX table row; it is built but not printed in this cell.
    tp_str = "{}&{:.2f} & {:.3f}& ".format(dimension, statistic, pvalue)
    print("AP performance: {:.2f}".format(np.mean(ap_scores)))
    print("UP performance: {:.2f}".format(np.mean(up_scores)))

    if pvalue < 0.05 / 4:
        # Pooled comparison first, then every unordered pair of conditions.
        post_hoc_comparison(ap_scores, up_scores, 'AP', 'UP')
        n_conditions = len(all_conditions)
        condition_pairs = [
            (all_conditions[a], all_conditions[b])
            for a in range(n_conditions - 1)
            for b in range(a + 1, n_conditions)
        ]
        for group_1, group_2 in condition_pairs:
            post_hoc_comparison(
                variable_dict[group_1][dimension],
                variable_dict[group_2][dimension],
                group_1,
                group_2,
            )
    print("-" * 17)

For all participants, compare with experimental conditions
acc_strict
127 121
AP vs UP; kruskal test result: H:1.85, p:0.173
AP performance: 0.50
UP performance: 0.47
-----------------
acc_execution
127 121
AP vs UP; kruskal test result: H:0.57, p:0.450
AP performance: 0.53
UP performance: 0.51
-----------------


In [5]:
print("For all participants, compare with experimental conditions")
for dimension in ["acc_strict", "acc_execution"]:
    print(dimension)
    # Scores of every condition for the current dimension.
    scores = {condition: variable_dict[condition][dimension] for condition in all_conditions}

    # Planning effect with automatic execution.
    statistic, pvalue_1 = kruskal(scores["AP-AE"], scores["UP-AE"])
    print("AP-AE vs UP-AE; kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue_1))
    tp_str = "{}&{:.2f} & {:.3f}& ".format(dimension, statistic, pvalue_1)

    # Planning effect with user-involved execution.
    statistic, pvalue_2 = kruskal(scores["AP-UE"], scores["UP-UE"])
    print("AP-UE vs UP-UE; kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue_2))

    for condition in all_conditions:
        values = scores[condition]
        mean_value, sd_value = np.mean(values), np.std(values)
        print("{}, Mean: M({}):{:.2f}, SD({}):{:.2f}".format(len(values), condition, mean_value, condition, sd_value))
        tp_str += "${:.2f} \\pm {:.2f}$ &".format(mean_value, sd_value)
    print(tp_str)

    # Pairwise follow-up when either comparison is below the threshold.
    if any(p < 0.05 / 4 for p in (pvalue_1, pvalue_2)):
        n_conditions = len(all_conditions)
        for a in range(n_conditions - 1):
            for b in range(a + 1, n_conditions):
                group_1, group_2 = all_conditions[a], all_conditions[b]
                post_hoc_comparison(scores[group_1], scores[group_2], group_1, group_2)
    print("-" * 17)

For all participants, compare with experimental conditions
acc_strict
AP-AE vs UP-AE; kruskal test result: H:6.25, p:0.012
AP-UE vs UP-UE; kruskal test result: H:0.29, p:0.593
63, Mean: M(AP-AE):0.53, SD(AP-AE):0.12
64, Mean: M(AP-UE):0.46, SD(AP-UE):0.17
61, Mean: M(UP-AE):0.46, SD(UP-AE):0.19
60, Mean: M(UP-UE):0.48, SD(UP-UE):0.19
acc_strict&6.25 & 0.012& $0.53 \pm 0.12$ &$0.46 \pm 0.17$ &$0.46 \pm 0.19$ &$0.48 \pm 0.19$ &
Alternative AP-AE > AP-UE, pvalue 0.0104 statistic 2465.0000
Alternative AP-AE > UP-AE, pvalue 0.0063 statistic 2396.0000
-----------------
acc_execution
AP-AE vs UP-AE; kruskal test result: H:4.31, p:0.038
AP-UE vs UP-UE; kruskal test result: H:0.57, p:0.449
63, Mean: M(AP-AE):0.54, SD(AP-AE):0.12
64, Mean: M(AP-UE):0.53, SD(AP-UE):0.20
61, Mean: M(UP-AE):0.47, SD(UP-AE):0.19
60, Mean: M(UP-UE):0.56, SD(UP-UE):0.20
acc_execution&4.31 & 0.038& $0.54 \pm 0.12$ &$0.53 \pm 0.20$ &$0.47 \pm 0.19$ &$0.56 \pm 0.20$ &
-----------------


## Task-specific Analysis

In [40]:
from util import task_order
dimensions = ["recall", "acc_strict", "acc_relaxed", "acc_execution"]
all_conditions = ["AP-AE", "AP-UE", "UP-AE", "UP-UE"]

# variable_dict[condition][dimension][task_id] -> one score per user;
# the extra "avg" key holds each user's mean over all tasks in task_order.
variable_dict = {"condition": []}
for condition in all_conditions:
    variable_dict[condition] = {}
    for dimension in dimensions:
        per_task = {task_id: [] for task_id in task_order}
        per_task["avg"] = []
        variable_dict[condition][dimension] = per_task

user2condition = tp_data['user2condition']
task_performance = tp_data["task_performance"]
for user in valid_users:
    tp_condition = user2condition[user]
    if tp_condition not in all_conditions:
        continue
    for dimension in dimensions:
        bucket = variable_dict[tp_condition][dimension]
        tp_list = [task_performance[user][task_id][dimension] for task_id in task_order]
        for task_id, score in zip(task_order, tp_list):
            bucket[task_id].append(score)
        bucket["avg"].append(np.mean(tp_list))
print("-" * 34)

----------------------------------


In [45]:
# One summary row per task (plus "avg"), extended once per analysed dimension.
str_dict = {task_id: "" for task_id in task_order + ['avg']}
for dimension in ["acc_strict", "acc_execution"]:
    print("-" * 32)
    print(dimension)
    for task_id in task_order + ['avg']:
        print(task_id)
        by_condition = {
            condition: variable_dict[condition][dimension][task_id]
            for condition in all_conditions
        }
        AP_performance = by_condition["AP-AE"] + by_condition["AP-UE"]
        UP_performance = by_condition["UP-AE"] + by_condition["UP-UE"]
        print(len(AP_performance), len(UP_performance))

        statistic, pvalue = kruskal(AP_performance, UP_performance)
        print("AP vs UP; kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue))
        ap_mean = np.mean(AP_performance)
        up_mean = np.mean(UP_performance)
        print("AP performance: {:.2f}".format(ap_mean))
        print("UP performance: {:.2f}".format(up_mean))

        significant = pvalue < 0.05 / 4
        if significant:
            post_hoc_comparison(AP_performance, UP_performance, 'AP', 'UP')

        for condition in all_conditions:
            values = by_condition[condition]
            print("{}, Mean: M({}):{:.2f}, SD({}):{:.2f}".format(len(values), condition, np.mean(values), condition, np.std(values)))
            str_dict[task_id] += "{:.2f} & ".format(np.mean(values))

        # The direction marker is chosen from the pooled means, not from the test.
        if not significant:
            marker = "- & "
        elif ap_mean > up_mean:
            marker = "AP > UP & "
        else:
            marker = "AP < UP & "
        str_dict[task_id] += marker

for task_id in ['avg'] + task_order:
    print(task_id, str_dict[task_id])

--------------------------------
acc_strict
test-149
127 121
AP vs UP; kruskal test result: H:14.34, p:0.000
AP performance: 0.00
UP performance: 0.11
Alternative AP < UP, pvalue 0.0001 statistic 6858.0000
63, Mean: M(AP-AE):0.00, SD(AP-AE):0.00
64, Mean: M(AP-UE):0.00, SD(AP-UE):0.00
61, Mean: M(UP-AE):0.10, SD(UP-AE):0.30
60, Mean: M(UP-UE):0.12, SD(UP-UE):0.32
test-200
127 121
AP vs UP; kruskal test result: H:4.03, p:0.045
AP performance: 0.71
UP performance: 0.59
63, Mean: M(AP-AE):0.78, SD(AP-AE):0.42
64, Mean: M(AP-UE):0.64, SD(AP-UE):0.48
61, Mean: M(UP-AE):0.61, SD(UP-AE):0.49
60, Mean: M(UP-UE):0.57, SD(UP-UE):0.50
test-859
127 121
AP vs UP; kruskal test result: H:0.44, p:0.506
AP performance: 0.28
UP performance: 0.32
63, Mean: M(AP-AE):0.44, SD(AP-AE):0.50
64, Mean: M(AP-UE):0.12, SD(AP-UE):0.33
61, Mean: M(UP-AE):0.36, SD(UP-AE):0.48
60, Mean: M(UP-UE):0.28, SD(UP-UE):0.45
test-388
127 121
AP vs UP; kruskal test result: H:9.22, p:0.002
AP performance: 0.92
UP performance: 0

In [24]:
# Per-task comparison of all four conditions, followed by the two
# within-execution planning comparisons.
# variable_dict is built as variable_dict[condition][dimension][task_id] by the
# task-level setup cell, so the dimension key must come before the task key.
# The previous [task_id][dimension] order raises KeyError on a fresh kernel.
print("For all participants, compare with experimental conditions")
for task_id in task_order:
    print("-" * 32)
    print(task_id)
    for dimension in ["acc_strict", "acc_execution"]:
        print(dimension)
        # Scores of every condition for this (dimension, task) pair.
        scores = {
            condition: variable_dict[condition][dimension][task_id]
            for condition in all_conditions
        }

        statistic, pvalue = kruskal(*[scores[condition] for condition in all_conditions])
        print("all conditions; kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue))

        statistic, pvalue_1 = kruskal(scores["AP-AE"], scores["UP-AE"])
        print("AP-AE vs UP-AE; kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue_1))
        # LaTeX row prefix; it reports the AP-AE vs UP-AE statistic as before.
        tp_str = dimension + "&" + "{:.2f} & {:.3f}& ".format(statistic, pvalue_1)

        statistic, pvalue_2 = kruskal(scores["AP-UE"], scores["UP-UE"])
        print("AP-UE vs UP-UE; kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue_2))

        for condition in all_conditions:
            data_list_1 = scores[condition]
            print("{}, Mean: M({}):{:.2f}, SD({}):{:.2f}".format(len(data_list_1), condition, np.mean(data_list_1), condition, np.std(data_list_1)))
            tp_str += "${:.2f} \\pm {:.2f}$ &".format(np.mean(data_list_1), np.std(data_list_1))
        print(tp_str)

        # Pairwise follow-up when any of the three tests is below the threshold.
        if pvalue < 0.05 / 4 or pvalue_1 < 0.05 / 4 or pvalue_2 < 0.05 / 4:
            length = len(all_conditions)
            for i in range(length - 1):
                for j in range(i + 1, length):
                    group_1 = all_conditions[i]
                    group_2 = all_conditions[j]
                    post_hoc_comparison(scores[group_1], scores[group_2], group_1, group_2)
        print("-" * 17)

For all participants, compare with experimental conditions
--------------------------------
test-149
acc_strict
all conditions; kruskal test result: H:14.54, p:0.002
AP-AE vs UP-AE; kruskal test result: H:6.46, p:0.011
AP-UE vs UP-UE; kruskal test result: H:7.85, p:0.005
63, Mean: M(AP-AE):0.00, SD(AP-AE):0.00
64, Mean: M(AP-UE):0.00, SD(AP-UE):0.00
61, Mean: M(UP-AE):0.10, SD(UP-AE):0.30
60, Mean: M(UP-UE):0.12, SD(UP-UE):0.32
acc_strict&6.46 & 0.011& $0.00 \pm 0.00$ &$0.00 \pm 0.00$ &$0.10 \pm 0.30$ &$0.12 \pm 0.32$ &
Alternative AP-AE < UP-AE, pvalue 0.0056 statistic 1732.5000
Alternative AP-AE < UP-UE, pvalue 0.0028 statistic 1669.5000
Alternative AP-UE < UP-AE, pvalue 0.0053 statistic 1760.0000
Alternative AP-UE < UP-UE, pvalue 0.0026 statistic 1696.0000
-----------------
acc_execution
all conditions; kruskal test result: H:16.20, p:0.001
AP-AE vs UP-AE; kruskal test result: H:6.46, p:0.011
AP-UE vs UP-UE; kruskal test result: H:9.05, p:0.003
63, Mean: M(AP-AE):0.00, SD(AP-AE):0.0

In [34]:
# Per-task comparison of automatic planning (AP) vs user-involved planning (UP),
# pooling both execution variants of each.
# variable_dict is built as variable_dict[condition][dimension][task_id] by the
# task-level setup cell, so the dimension key must come before the task key.
# The previous [task_id][dimension] order raises KeyError on a fresh kernel.
print("For all participants, compare with experimental conditions")
for task_id in task_order:
    print("-" * 32)
    print(task_id)
    for dimension in ["acc_strict", "acc_execution"]:
        print(dimension)
        AP_performance = variable_dict["AP-AE"][dimension][task_id] + variable_dict["AP-UE"][dimension][task_id]
        UP_performance = variable_dict["UP-AE"][dimension][task_id] + variable_dict["UP-UE"][dimension][task_id]
        print(len(AP_performance), len(UP_performance))
        statistic, pvalue = kruskal(AP_performance, UP_performance)
        print("AP vs UP; kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue))
        # Row for a LaTeX table: test result followed by the four condition means.
        tp_str = dimension + "&" + "{:.2f} & {:.3f}& ".format(statistic, pvalue)
        print("AP performance: {:.2f}".format(np.mean(AP_performance)))
        print("UP performance: {:.2f}".format(np.mean(UP_performance)))

        if pvalue < 0.05 / 4:
            post_hoc_comparison(AP_performance, UP_performance, 'AP', 'UP')
        for condition in all_conditions:
            data_list_1 = variable_dict[condition][dimension][task_id]
            print("{}, Mean: M({}):{:.2f}, SD({}):{:.2f}".format(len(data_list_1), condition, np.mean(data_list_1), condition, np.std(data_list_1)))
            tp_str += "{:.2f} & ".format(np.mean(data_list_1))
        print(tp_str)
        # Pairwise condition-level follow-ups are intentionally not run here.
        print("-" * 17)

For all participants, compare with experimental conditions
--------------------------------
test-149
acc_strict
127 121
AP vs UP; kruskal test result: H:14.34, p:0.000
AP performance: 0.00
UP performance: 0.11
Alternative AP < UP, pvalue 0.0001 statistic 6858.0000
63, Mean: M(AP-AE):0.00, SD(AP-AE):0.00
64, Mean: M(AP-UE):0.00, SD(AP-UE):0.00
61, Mean: M(UP-AE):0.10, SD(UP-AE):0.30
60, Mean: M(UP-UE):0.12, SD(UP-UE):0.32
acc_strict&14.34 & 0.000& 0.00 & 0.00 & 0.10 & 0.12 & 
-----------------
acc_execution
127 121
AP vs UP; kruskal test result: H:15.51, p:0.000
AP performance: 0.00
UP performance: 0.12
Alternative AP < UP, pvalue 0.0000 statistic 6794.5000
63, Mean: M(AP-AE):0.00, SD(AP-AE):0.00
64, Mean: M(AP-UE):0.00, SD(AP-UE):0.00
61, Mean: M(UP-AE):0.10, SD(UP-AE):0.30
60, Mean: M(UP-UE):0.13, SD(UP-UE):0.34
acc_execution&15.51 & 0.000& 0.00 & 0.00 & 0.10 & 0.13 & 
-----------------
--------------------------------
test-200
acc_strict
127 121
AP vs UP; kruskal test result: H:4.03,

In [16]:
# Per-task comparison of automatic execution (AE) vs user-involved execution (UE),
# pooling both planning variants of each.
# variable_dict is built as variable_dict[condition][dimension][task_id] by the
# task-level setup cell, so the dimension key must come before the task key.
# The previous [task_id][dimension] order raises KeyError on a fresh kernel.
print("For all participants, compare with experimental conditions")
for task_id in task_order:
    print("-" * 32)
    print(task_id)
    for dimension in ["acc_strict", "acc_execution"]:
        print(dimension)
        AE_performance = variable_dict["AP-AE"][dimension][task_id] + variable_dict["UP-AE"][dimension][task_id]
        UE_performance = variable_dict["AP-UE"][dimension][task_id] + variable_dict["UP-UE"][dimension][task_id]
        print(len(AE_performance), len(UE_performance))
        statistic, pvalue = kruskal(AE_performance, UE_performance)
        print("AE vs UE; kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue))
        print("AE performance: {:.2f}".format(np.mean(AE_performance)))
        print("UE performance: {:.2f}".format(np.mean(UE_performance)))

        if pvalue < 0.05 / 4:
            post_hoc_comparison(AE_performance, UE_performance, 'AE', 'UE')

        # LaTeX table row (test result, then mean +/- SD per condition).
        # It is assembled for later inspection but not printed in this cell.
        tp_str = dimension + "&" + "{:.2f} & {:.3f}& ".format(statistic, pvalue)
        for condition in all_conditions:
            data_list_1 = variable_dict[condition][dimension][task_id]
            tp_str += "${:.2f} \\pm {:.2f}$ &".format(np.mean(data_list_1), np.std(data_list_1))
        # Pairwise condition-level follow-ups are intentionally not run here.
        print("-" * 17)

For all participants, compare with experimental conditions
--------------------------------
test-149
acc_strict
124 124
AE vs UE; kruskal test result: H:0.08, p:0.776
AE performance: 0.05
UE performance: 0.06
-----------------
acc_execution
124 124
AE vs UE; kruskal test result: H:0.30, p:0.583
AE performance: 0.05
UE performance: 0.06
-----------------
--------------------------------
test-200
acc_strict
124 124
AE vs UE; kruskal test result: H:2.13, p:0.144
AE performance: 0.69
UE performance: 0.60
-----------------
acc_execution
124 124
AE vs UE; kruskal test result: H:0.08, p:0.776
AE performance: 0.72
UE performance: 0.73
-----------------
--------------------------------
test-859
acc_strict
124 124
AE vs UE; kruskal test result: H:11.90, p:0.001
AE performance: 0.40
UE performance: 0.20
Alternative AE > UE, pvalue 0.0003 statistic 9238.0000
-----------------
acc_execution
124 124
AE vs UE; kruskal test result: H:1.05, p:0.307
AE performance: 0.40
UE performance: 0.47
------------

In [26]:
all_conditions = ["AP-AE", "AP-UE", "UP-AE", "UP-UE"]
plan_quality = tp_data['plan_quality']

# condition_dict[condition][task_id or "avg"] -> plan-quality values, one per user.
condition_dict = {}
for condition in all_conditions:
    condition_dict[condition] = {task_id: [] for task_id in task_order}
    condition_dict[condition]["avg"] = []

# Column-oriented records for the DataFrame. NOTE: this rebinds variable_dict,
# replacing the nested performance dictionary used by the cells above.
variable_dict = {"condition": [], "planning": [], "execution": [], "avg": []}
for task_id in task_order:
    variable_dict[task_id] = []

for user in valid_users:
    tp_condition = user2condition[user]
    if tp_condition not in all_conditions:
        # ignore pilot study
        continue
    variable_dict["condition"].append(tp_condition)
    # Split the condition code into its planning prefix and execution suffix.
    variable_dict["planning"].append("automatic" if tp_condition.startswith("AP") else "user-involved")
    variable_dict["execution"].append("automatic" if tp_condition.endswith("AE") else "user-involved")

    tp_list = []
    for task_id in task_order:
        quality = plan_quality[user][task_id]
        variable_dict[task_id].append(quality)
        condition_dict[tp_condition][task_id].append(quality)
        tp_list.append(quality)
    user_avg = np.mean(tp_list)
    variable_dict["avg"].append(user_avg)
    condition_dict[tp_condition]["avg"].append(user_avg)

df = pd.DataFrame(variable_dict)

In [37]:
from pingouin import ancova, anova
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# One-way ANOVA of plan quality on the planning factor, per task and for "avg".
print("Plan quality comparison")
for dimension in task_order + ["avg"]:
    print(dimension)
    aov = anova(dv=dimension, between=['planning'], data=df, effsize='n2')
    print(aov.round(3))
    # Tukey HSD follow-up only when the uncorrected p-value reaches 0.05 / 4.
    if aov['p-unc'].iloc[0] <= 0.05 / 4:
        tukey = pairwise_tukeyhsd(endog=df[dimension], groups=df['planning'], alpha=0.0125)
        print(tukey)
    # Select the column before aggregating: df also holds string columns
    # ("condition", "execution"), and averaging those raises TypeError on
    # pandas >= 2.0. The printed Series is the same as before.
    print(df.groupby('planning')[dimension].mean())
    # LaTeX table row: the task label followed by the mean of each condition.
    tp_str = dimension + "& "
    for condition in all_conditions:
        condition_mean = np.mean(condition_dict[condition][dimension])
        print(condition, condition_mean)
        tp_str += "{:.1f} & ".format(condition_mean)
    print(tp_str)
    print("-" * 17)

Plan quality comparison
test-149
     Source  ddof1  ddof2       F  p-unc    n2
0  planning      1    246  15.702    0.0  0.06
    Multiple Comparison of Means - Tukey HSD, FWER=0.01     
  group1      group2    meandiff p-adj  lower  upper  reject
------------------------------------------------------------
automatic user-involved   0.3719 0.0001 0.1357 0.6081   True
------------------------------------------------------------
planning
automatic        2.000000
user-involved    2.371901
Name: test-149, dtype: float64
AP-AE 2.0
AP-UE 2.0
UP-AE 2.3114754098360657
UP-UE 2.433333333333333
test-149& 2.0 & 2.0 & 2.3 & 2.4 & 
-----------------
test-200
     Source  ddof1  ddof2     F  p-unc     n2
0  planning      1    246  4.05  0.045  0.016
planning
automatic        3.000000
user-involved    2.884298
Name: test-200, dtype: float64
AP-AE 3.0
AP-UE 3.0
UP-AE 2.8524590163934427
UP-UE 2.9166666666666665
test-200& 3.0 & 3.0 & 2.9 & 2.9 & 
-----------------
test-859
     Source  ddof1  ddof2    