In [11]:
from util import load_user_data, task_order
import pandas as pd
import numpy as np

# Load the study data; `valid_users` is iterated below and `tp_data` is a
# mapping that holds (among other keys) the user -> condition assignment.
valid_users, tp_data = load_user_data(
    folder_name="../anonymized_data",
    reserved_users=None,
)
user2condition = tp_data['user2condition']

# Tally how many valid users fall into each experimental condition.
condition_count = {}
for user in valid_users:
    tp_condition = user2condition[user]
    condition_count[tp_condition] = condition_count.get(tp_condition, 0) + 1
print(condition_count)

248 complete the NASA-TLX
248 complete the whole study
{'UP-AE': 61, 'AP-AE': 63, 'AP-UE': 64, 'UP-UE': 60}


In [12]:
# Per-task quality threshold: a user's task is only counted when their plan
# quality for that task is at least this value (see the filter in the loop).
original_quality = {
    'test-149': 2,
    'test-200': 3,
    'test-859': 3,
    'test-388': 5,
    'test-497': 5,
    'test-675': 5
}
all_conditions = ["AP-AE", "AP-UE", "UP-AE", "UP-UE"]

# condition -> task (plus the per-user "avg" pseudo-task) -> metric -> values
condition_dict = {
    condition: {
        task_id: {"acc_strict": [], "acc_execution": []}
        for task_id in task_order + ["avg"]
    }
    for condition in all_conditions
}

plan_quality = tp_data['plan_quality']
task_performance = tp_data["task_performance"]

# task (plus "avg") -> parallel lists, one entry per retained observation
variable_dict = {
    task_id: {
        "condition": [],
        "planning": [],
        "execution": [],
        "acc_execution": [],
        "acc_strict": []
    }
    for task_id in task_order + ["avg"]
}

for user in valid_users:
    tp_condition = user2condition[user]
    # The condition name encodes both factors: an "AP" prefix is labelled as
    # automatic planning and an "AE" suffix as automatic execution; anything
    # else is labelled user-involved.
    planning_label = "automatic" if tp_condition.startswith("AP") else "user-involved"
    execution_label = "automatic" if tp_condition.endswith("AE") else "user-involved"

    # Values of the tasks this user keeps, used for the per-user average.
    tp_list = {"acc_strict": [], "acc_execution": []}
    for task_id in task_order:
        tp_plan_quality = plan_quality[user][task_id]
        # Drop tasks whose plan quality fell below the per-task threshold.
        if tp_plan_quality < original_quality[task_id]:
            continue
        task_record = variable_dict[task_id]
        for dimension in ["acc_strict", "acc_execution"]:
            tp_val = task_performance[user][task_id][dimension]
            task_record[dimension].append(tp_val)
            tp_list[dimension].append(tp_val)
            condition_dict[tp_condition][task_id][dimension].append(tp_val)
        task_record["condition"].append(tp_condition)
        task_record["planning"].append(planning_label)
        task_record["execution"].append(execution_label)

    if len(tp_list["acc_execution"]) != 0:
        # Record this user's mean over the retained tasks under "avg".
        avg_record = variable_dict["avg"]
        for dimension in ["acc_strict", "acc_execution"]:
            tp_val = np.mean(tp_list[dimension])
            avg_record[dimension].append(tp_val)
            condition_dict[tp_condition]["avg"][dimension].append(tp_val)
        avg_record["condition"].append(tp_condition)
        avg_record["planning"].append(planning_label)
        avg_record["execution"].append(execution_label)
    else:
        # Every task of this user was filtered out; stop the analysis.
        print(f"User {user} has no task where plan quality not decrease")
        assert False
    

In [13]:
from scipy.stats import kruskal, mannwhitneyu

def post_hoc_comparison(data_list_1, data_list_2, name1, name2, threshold=0.05 / 4):
    """Print which one-sided Mann-Whitney U comparisons are significant.

    Two one-sided tests are run on the same pair of samples: first
    ``data_list_1 > data_list_2`` and then ``data_list_1 < data_list_2``.
    For each test whose p-value is below ``threshold`` a line of the form
    ``Alternative <name1> > <name2>, pvalue ... statistic ...`` is printed.
    Nothing is printed when neither direction is significant.

    Args:
        data_list_1: Observations of the first group.
        data_list_2: Observations of the second group.
        name1: Label printed for the first group.
        name2: Label printed for the second group.
        threshold: Significance level each p-value is compared against.
            Defaults to ``0.05 / 4``, the value this function previously
            hard-coded.

    Returns:
        None. Results are reported on stdout only.
    """
    # (scipy `alternative` keyword, symbol shown in the printed message)
    directions = (("greater", ">"), ("less", "<"))
    for alternative, symbol in directions:
        statistic, pvalue = mannwhitneyu(data_list_1, data_list_2, alternative=alternative)
        if pvalue < threshold:
            print(
                "Alternative {} {} {},".format(name1, symbol, name2),
                "pvalue %.4f" % pvalue,
                "statistic %.4f" % statistic,
            )

In [15]:
# One "&"-separated summary row per task (plus "avg"); each metric appends the
# four per-condition means followed by a significance cell.
# NOTE(review): the "&" separators suggest LaTeX table rows; confirm.
str_dict = dict.fromkeys(task_order + ["avg"], "")
significance_level = 0.05 / 4

for dimension in ["acc_strict", "acc_execution"]:
    print(dimension)
    for task_id in task_order + ["avg"]:
        print(task_id)
        # Pool the two automatic-execution conditions against the two
        # user-involved-execution conditions.
        AE_performance = condition_dict["AP-AE"][task_id][dimension] + condition_dict["UP-AE"][task_id][dimension]
        UE_performance = condition_dict["AP-UE"][task_id][dimension] + condition_dict["UP-UE"][task_id][dimension]
        print(len(AE_performance), len(UE_performance))

        statistic, pvalue = kruskal(AE_performance, UE_performance)
        print("AE vs UE; kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue))
        AE_mean = np.mean(AE_performance)
        print("AE performance: {:.2f}".format(AE_mean))
        UE_mean = np.mean(UE_performance)
        print("UE performance: {:.2f}".format(UE_mean))

        is_significant = pvalue < significance_level
        if is_significant:
            post_hoc_comparison(AE_performance, UE_performance, 'AE', 'UE')

        # Per-condition means, in the order given by `all_conditions`.
        str_dict[task_id] += "".join(
            "{:.2f} & ".format(np.mean(condition_dict[condition][task_id][dimension]))
            for condition in all_conditions
        )

        # Significance cell: direction by comparing the pooled means, or "-".
        if not is_significant:
            str_dict[task_id] += "- & "
        elif AE_mean < UE_mean:
            str_dict[task_id] += "AE < UE & "
        else:
            str_dict[task_id] += "AE > UE & "
        print("-" * 16)
    print("-" * 32)

# Emit the rows with the "avg" row first.
for task_id in ["avg"] + task_order:
    print(task_id, str_dict[task_id])

acc_strict
test-149
122 121
AE vs UE; kruskal test result: H:0.09, p:0.764
AE performance: 0.05
UE performance: 0.06
----------------
test-200
118 119
AE vs UE; kruskal test result: H:2.63, p:0.105
AE performance: 0.73
UE performance: 0.63
----------------
test-859
113 122
AE vs UE; kruskal test result: H:14.16, p:0.000
AE performance: 0.43
UE performance: 0.20
Alternative AE > UE, pvalue 0.0001 statistic 8469.5000
----------------
test-388
111 120
AE vs UE; kruskal test result: H:2.85, p:0.092
AE performance: 0.95
UE performance: 0.88
----------------
test-497
116 118
AE vs UE; kruskal test result: H:4.53, p:0.033
AE performance: 0.99
UE performance: 0.94
----------------
test-675
116 116
AE vs UE; kruskal test result: H:16.18, p:0.000
AE performance: 0.03
UE performance: 0.21
Alternative AE < UE, pvalue 0.0000 statistic 5568.0000
----------------
avg
124 124
AE vs UE; kruskal test result: H:1.85, p:0.174
AE performance: 0.52
UE performance: 0.48
----------------
---------------------