In [1]:
from util import load_user_data, task_order
import pandas as pd
import numpy as np

# Load the valid users and the study data through the project helper.
valid_users, tp_data = load_user_data(
    folder_name="../anonymized_data",
    reserved_users=None,
)
user2condition = tp_data['user2condition']

# Tally how many valid users fall into each experimental condition.
# The dict keeps the conditions in first-seen order.
condition_count = {}
for user in valid_users:
    tp_condition = user2condition[user]
    condition_count[tp_condition] = condition_count.get(tp_condition, 0) + 1
print(condition_count)

248 complete the NASA-TLX
248 complete the whole study
{'UP-AE': 61, 'AP-UE': 64, 'AP-AE': 63, 'UP-UE': 60}


In [4]:
# Reference plan-quality score per task. A user's plan that scores below this
# value is treated as worse than the original and the task is skipped below.
original_quality = {
    'test-149': 2,
    'test-200': 3,
    'test-859': 3,
    'test-388': 5,
    'test-497': 5,
    'test-675': 5
}
all_conditions = ["AP-AE", "AP-UE", "UP-AE", "UP-UE"]

# Every real task plus the synthetic "avg" key (per-user mean over kept tasks).
task_keys = task_order + ["avg"]

# condition_dict[condition][task_key] -> list of calibrated-trust values.
condition_dict = {
    condition: {task_id: [] for task_id in task_keys}
    for condition in all_conditions
}

# variable_dict[task_key] -> column-oriented records, ready for pd.DataFrame.
# All four lists of one task key are appended in lockstep so rows stay aligned.
variable_dict = {
    task_id: {
        "condition": [],
        "planning": [],
        "execution": [],
        "calibrated_trust_execution": []
    }
    for task_id in task_keys
}

plan_quality = tp_data['plan_quality']
# calibrated_trust_execution = tp_data["calibrated_trust_planning"]
calibrated_trust_execution = tp_data["calibrated_trust_execution"]


def _factor_levels(condition):
    """Split a condition code such as "AP-UE" into (planning, execution) levels.

    A code starting with "AP" maps to "automatic" planning and a code ending
    with "AE" maps to "automatic" execution; anything else is "user-involved".
    """
    planning = "automatic" if condition.startswith("AP") else "user-involved"
    execution = "automatic" if condition.endswith("AE") else "user-involved"
    return planning, execution


def _record(task_id, condition, value):
    """Append one observation for `task_id` to variable_dict and condition_dict."""
    planning, execution = _factor_levels(condition)
    columns = variable_dict[task_id]
    columns["calibrated_trust_execution"].append(value)
    columns["condition"].append(condition)
    columns["planning"].append(planning)
    columns["execution"].append(execution)
    condition_dict[condition][task_id].append(value)


for user in valid_users:
    tp_condition = user2condition[user]
    tp_list = []
    for task_id in task_order:
        # skip tasks where plan quality worse than the original quality
        if plan_quality[user][task_id] < original_quality[task_id]:
            continue
        tp_ct_e = calibrated_trust_execution[user][task_id]
        _record(task_id, tp_condition, tp_ct_e)
        tp_list.append(tp_ct_e)
    if not tp_list:
        # Raise explicitly instead of `assert False`: asserts are removed under
        # `python -O`, which would silently drop this user from the "avg" rows.
        raise AssertionError(f"User {user} has no task where plan quality not decrease")
    # Per-user average over the tasks that survived the quality filter.
    _record("avg", tp_condition, np.mean(tp_list))
    

In [17]:
# Sanity check: how many observations were kept per condition for each task
# key, followed by the total number of rows per task key.
for condition, per_task in condition_dict.items():
    print(condition)
    for observations in per_task.values():
        print(len(observations))
for task_id, columns in variable_dict.items():
    print(task_id, len(columns["condition"]))

AP-AE
63
63
63
63
63
63
63
AP-UE
64
64
64
64
64
64
64
UP-AE
59
55
50
48
53
53
61
UP-UE
57
55
58
56
54
52
60
test-149 243
test-200 237
test-859 235
test-388 231
test-497 234
test-675 232
avg 248


In [5]:
from pingouin import ancova, anova
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Significance level used for both the ANOVA check and the Tukey follow-up.
# NOTE(review): 0.05 / 4 looks like a Bonferroni correction over four tests — confirm.
ALPHA = 0.05 / 4
dimension = "calibrated_trust_execution"

for task_id in task_order + ["avg"]:
    print(task_id)
    df = pd.DataFrame(variable_dict[task_id])
    # aov = anova(dv=dimension, between=['planning', 'execution'], data=df, effsize='n2')
    aov = anova(dv=dimension, between=['execution'], data=df, effsize='n2')
    # aov = ancova(dv=dimension, covar=["Propensity to Trust", "Familiarity", "llm_expertise", "assistant_expertise"], between='planning', data=df, effsize='n2')
    print(aov.round(3))

    # Evaluate the p-value once so the Tukey test and the table label agree.
    significant = aov.to_dict()['p-unc'][0] <= ALPHA
    if significant:
        tukey = pairwise_tukeyhsd(endog=df[dimension], groups=df['execution'], alpha=ALPHA)
        print(tukey)

    # One "&"-separated row: the four condition means, then the post-hoc label.
    cells = []
    for condition in all_conditions:
        condition_mean = np.mean(condition_dict[condition][task_id])
        print(condition, condition_mean)
        cells.append("{:.2f}".format(condition_mean))

    if significant:
        # Derive the direction from the data instead of hard-coding "AE < UE";
        # a significant effect can just as well have the AE group on top.
        execution_means = df.groupby("execution")[dimension].mean()
        ae_mean = execution_means["automatic"]
        ue_mean = execution_means["user-involved"]
        cells.append("AE < UE" if ae_mean < ue_mean else "AE > UE")
    else:
        cells.append("-")
    print(" & ".join(cells))
    print("-" * 17)

test-149
      Source  ddof1  ddof2      F  p-unc     n2
0  execution      1    241  0.203  0.653  0.001
AP-AE 0.47619047619047616
AP-UE 0.4375
UP-AE 0.5084745762711864
UP-UE 0.49122807017543857
0.48 & 0.44 & 0.51 & 0.49 & -
-----------------
test-200
      Source  ddof1  ddof2      F  p-unc     n2
0  execution      1    235  1.662  0.199  0.007
AP-AE 0.7777777777777778
AP-UE 0.828125
UP-AE 0.7090909090909091
UP-UE 0.8
0.78 & 0.83 & 0.71 & 0.80 & -
-----------------
test-859
      Source  ddof1  ddof2      F  p-unc     n2
0  execution      1    233  3.078  0.081  0.013
AP-AE 0.5079365079365079
AP-UE 0.40625
UP-AE 0.6
UP-UE 0.46551724137931033
0.51 & 0.41 & 0.60 & 0.47 & -
-----------------
test-388
      Source  ddof1  ddof2      F  p-unc     n2
0  execution      1    229  0.212  0.645  0.001
AP-AE 0.9365079365079365
AP-UE 0.921875
UP-AE 0.875
UP-UE 0.8571428571428571
0.94 & 0.92 & 0.88 & 0.86 & -
-----------------
test-497
      Source  ddof1  ddof2      F  p-unc   n2
0  execution    