In [1]:
import numpy as np
import pandas as pd
from pingouin import anova
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from util import load_user_data, task_order

# Load the study data and tally how many valid participants belong to each
# experimental condition.
valid_users, tp_data = load_user_data(folder_name="../anonymized_data", reserved_users=None)
user2condition = tp_data['user2condition']

condition_count = {}
for user in valid_users:
    tp_condition = user2condition[user]
    # dict.get supplies the starting zero the first time a condition is seen.
    condition_count[tp_condition] = condition_count.get(tp_condition, 0) + 1
print(condition_count)

248 complete the NASA-TLX
248 complete the whole study
{'AP-UE': 64, 'UP-UE': 60, 'AP-AE': 63, 'UP-AE': 61}


## Task-specific Analysis of Calibrated Trust
In our study, the tasks were selected with both plan correctness and risk level in mind. To check how each condition performs on each task, we conduct a task-specific analysis.

In [2]:
# Assemble the table used by the ANOVA cells: one row per participant holding the
# condition, its planning/execution factor levels, and one calibrated-trust
# (planning stage) column per task.
trust_dimensions = ["Reliability/Competence", "Understanding/Predictability",
                    "Intention of Developers", "Trust in Automation"]
all_conditions = ["AP-AE", "AP-UE", "UP-AE", "UP-UE"]

# condition -> {"avg": [], task_id: [values of that condition's participants]}
condition_dict = {}
for condition in all_conditions:
    per_condition = {"avg": []}
    for task_id in task_order:
        per_condition[task_id] = []
    condition_dict[condition] = per_condition

calibrated_trust_planning = tp_data["calibrated_trust_planning"]
calibrated_trust_execution = tp_data["calibrated_trust_execution"]

# Key order fixes the DataFrame column order: factor columns first, then tasks.
variable_dict = {"condition": [], "planning": [], "execution": []}
for task_id in task_order:
    variable_dict[task_id] = []

user_order = []
for user in valid_users:
    tp_condition = user2condition[user]
    if tp_condition in all_conditions:
        # Anything outside the four conditions (pilot study) is left out.
        planning_mode = "automatic" if tp_condition.startswith("AP") else "user-involved"
        execution_mode = "automatic" if tp_condition.endswith("AE") else "user-involved"
        variable_dict["condition"].append(tp_condition)
        variable_dict["planning"].append(planning_mode)
        variable_dict["execution"].append(execution_mode)
        user_order.append(user)

# user_order already contains only participants of the four conditions, so the
# values can be appended without filtering again.
for task_id in task_order:
    for user in user_order:
        tp_condition = user2condition[user]
        score = calibrated_trust_planning[user][task_id]
        variable_dict[task_id].append(score)
        condition_dict[tp_condition][task_id].append(score)
df = pd.DataFrame(variable_dict)

In [3]:
from pingouin import ancova, anova
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Per-task one-way ANOVA of the task columns in `df` on the planning factor.
# (The commented-out two-way ANOVA / ANCOVA alternatives are not part of this cell.)
# 0.05 split four ways; presumably a Bonferroni-style correction -- confirm.
alpha_level = 0.05 / 4
print("calibrated_trust_planning")
for task_id in task_order:
    dimension = task_id
    print(dimension)
    aov = anova(dv=dimension, between=['planning'], data=df, effsize='n2')
    print(aov.round(3))
    # Row 0 is the only effect row of a one-way table.
    p_value = aov.loc[0, 'p-unc']
    if p_value <= alpha_level:
        tukey = pairwise_tukeyhsd(endog=df[dimension], groups=df['planning'], alpha=alpha_level)
        print(tukey)
    # Per-condition means, printed individually and gathered into one
    # "&"-separated table row.
    row_cells = [dimension + " & "]
    for condition in all_conditions:
        avg_val = np.mean(condition_dict[condition][dimension])
        print(condition, avg_val)
        row_cells.append("{:.2f} & ".format(avg_val))
    tp_str = "".join(row_cells)
    print(tp_str)
    print("-" * 17)

calibrated_trust_planning
test-149
     Source  ddof1  ddof2      F  p-unc     n2
0  planning      1    246  0.705  0.402  0.003
AP-AE 0.1111111111111111
AP-UE 0.203125
UP-AE 0.13114754098360656
UP-UE 0.26666666666666666
test-149 & 0.11 & 0.20 & 0.13 & 0.27 & 
-----------------
test-200
     Source  ddof1  ddof2      F  p-unc     n2
0  planning      1    246  0.259  0.611  0.001
AP-AE 0.20634920634920634
AP-UE 0.109375
UP-AE 0.19672131147540983
UP-UE 0.16666666666666666
test-200 & 0.21 & 0.11 & 0.20 & 0.17 & 
-----------------
test-859
     Source  ddof1  ddof2      F  p-unc     n2
0  planning      1    246  0.353  0.553  0.001
AP-AE 0.09523809523809523
AP-UE 0.03125
UP-AE 0.09836065573770492
UP-UE 0.06666666666666667
test-859 & 0.10 & 0.03 & 0.10 & 0.07 & 
-----------------
test-388
     Source  ddof1  ddof2      F  p-unc     n2
0  planning      1    246  7.467  0.007  0.029
    Multiple Comparison of Means - Tukey HSD, FWER=0.01     
  group1      group2    meandiff p-adj  lower  upp

  return warn(


In [4]:
from pingouin import ancova, anova
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Same per-task report as for the planning factor, but the one-way ANOVA (and
# the follow-up Tukey HSD) is run on the execution factor.
# 0.05 split four ways; presumably a Bonferroni-style correction -- confirm.
alpha_level = 0.05 / 4
print("calibrated_trust_planning")
for task_id in task_order:
    dimension = task_id
    print(dimension)
    aov = anova(dv=dimension, between=['execution'], data=df, effsize='n2')
    print(aov.round(3))
    # A one-way table carries its single effect in row 0.
    effect_is_significant = aov.loc[0, 'p-unc'] <= alpha_level
    if effect_is_significant:
        tukey = pairwise_tukeyhsd(endog=df[dimension], groups=df['execution'], alpha=alpha_level)
        print(tukey)
    # Collect the per-condition means first, then print them and build the
    # "&"-separated table row from the same values.
    condition_means = [np.mean(condition_dict[condition][dimension]) for condition in all_conditions]
    tp_str = dimension + " & "
    for condition, avg_val in zip(all_conditions, condition_means):
        print(condition, avg_val)
        tp_str = tp_str + "{:.2f} & ".format(avg_val)
    print(tp_str)
    print("-" * 17)

calibrated_trust_planning
test-149
      Source  ddof1  ddof2      F  p-unc     n2
0  execution      1    246  5.492   0.02  0.022
AP-AE 0.1111111111111111
AP-UE 0.203125
UP-AE 0.13114754098360656
UP-UE 0.26666666666666666
test-149 & 0.11 & 0.20 & 0.13 & 0.27 & 
-----------------
test-200
      Source  ddof1  ddof2      F  p-unc     n2
0  execution      1    246  1.833  0.177  0.007
AP-AE 0.20634920634920634
AP-UE 0.109375
UP-AE 0.19672131147540983
UP-UE 0.16666666666666666
test-200 & 0.21 & 0.11 & 0.20 & 0.17 & 
-----------------
test-859
      Source  ddof1  ddof2      F  p-unc     n2
0  execution      1    246  2.158  0.143  0.009
AP-AE 0.09523809523809523
AP-UE 0.03125
UP-AE 0.09836065573770492
UP-UE 0.06666666666666667
test-859 & 0.10 & 0.03 & 0.10 & 0.07 & 
-----------------
test-388
      Source  ddof1  ddof2      F  p-unc     n2
0  execution      1    246  2.964  0.086  0.012
AP-AE 0.9365079365079365
AP-UE 0.96875
UP-AE 0.8032786885245902
UP-UE 0.9
test-388 & 0.94 & 0.97 & 0.80

In [12]:
# Rebuild `df`, `variable_dict` and `condition_dict` with the execution-stage
# calibrated-trust values: one row per participant, one column per task.
trust_dimensions = ["Reliability/Competence", "Understanding/Predictability",
                    "Intention of Developers", "Trust in Automation"]
all_conditions = ["AP-AE", "AP-UE", "UP-AE", "UP-UE"]

# condition -> {"avg": [], task_id: [values of that condition's participants]}
condition_dict = {}
for condition in all_conditions:
    task_lists = {"avg": []}
    for task_id in task_order:
        task_lists[task_id] = []
    condition_dict[condition] = task_lists

calibrated_trust_planning = tp_data["calibrated_trust_planning"]
calibrated_trust_execution = tp_data["calibrated_trust_execution"]

# Factor columns are inserted first so they lead the DataFrame columns.
variable_dict = {"condition": [], "planning": [], "execution": []}
for task_id in task_order:
    variable_dict[task_id] = []

# Factor level encoded by the condition name's prefix / suffix.
user_order = []
for user in valid_users:
    tp_condition = user2condition[user]
    if tp_condition in all_conditions:
        # Participants outside the four conditions (pilot study) are skipped.
        variable_dict["condition"].append(tp_condition)
        variable_dict["planning"].append(
            "automatic" if tp_condition.startswith("AP") else "user-involved")
        variable_dict["execution"].append(
            "automatic" if tp_condition.endswith("AE") else "user-involved")
        user_order.append(user)

# Every user in user_order passed the condition filter above.
for task_id in task_order:
    for user in user_order:
        tp_condition = user2condition[user]
        score = calibrated_trust_execution[user][task_id]
        variable_dict[task_id].append(score)
        condition_dict[tp_condition][task_id].append(score)
df = pd.DataFrame(variable_dict)

In [None]:
# Per-task one-way ANOVA of the task columns in `df` on the execution factor,
# followed by the per-condition means.
# 0.05 split four ways; presumably a Bonferroni-style correction -- confirm.
alpha_level = 0.05 / 4
print("calibrated_trust_execution")
for task_id in task_order:
    dimension = task_id
    print(dimension)
    aov = anova(dv=dimension, between=['execution'], data=df, effsize='n2')
    print(aov.round(3))
    # Row 0 is the single effect row of the one-way table.
    p_value = aov.loc[0, 'p-unc']
    if p_value <= alpha_level:
        # NOTE(review): the post-hoc test groups by the four-level `condition`
        # column, not by the `execution` factor tested above -- confirm intent.
        tukey = pairwise_tukeyhsd(endog=df[dimension], groups=df['condition'], alpha=alpha_level)
        print(tukey)
    for condition in all_conditions:
        condition_mean = np.mean(condition_dict[condition][dimension])
        print(condition, condition_mean)
    print("-" * 17)

In [8]:
task_specific_condition_dict = {}

# Group the valid participants by their condition.
condition_users = {}
for user in valid_users:
    tp_condition = user2condition[user]
    condition_users.setdefault(tp_condition, set()).add(user)

# Print one LaTeX table row per condition: for every task, the mean
# planning-stage value followed by the mean execution-stage value.
for condition in all_conditions:
    user_set = condition_users[condition]
    row_parts = [f"{condition}"]
    for task_id in task_order:
        list_1 = [calibrated_trust_planning[user][task_id] for user in user_set]
        list_2 = [calibrated_trust_execution[user][task_id] for user in user_set]
        row_parts.append(" & {:.2f} & {:.2f}".format(np.mean(list_1), np.mean(list_2)))
    # Two literal backslashes terminate the LaTeX row.
    tp_str = "".join(row_parts) + "\\\\"
    print(tp_str)

AP-AE & 0.11 & 0.48 & 0.21 & 0.78 & 0.10 & 0.51 & 0.94 & 0.94 & 0.87 & 0.89 & 0.81 & 0.37\\
AP-UE & 0.20 & 0.44 & 0.11 & 0.83 & 0.03 & 0.41 & 0.97 & 0.92 & 0.84 & 0.92 & 0.81 & 0.38\\
UP-AE & 0.13 & 0.49 & 0.20 & 0.67 & 0.10 & 0.56 & 0.80 & 0.77 & 0.90 & 0.90 & 0.85 & 0.30\\
UP-UE & 0.27 & 0.48 & 0.17 & 0.75 & 0.07 & 0.45 & 0.90 & 0.82 & 0.82 & 0.90 & 0.75 & 0.40\\


In [15]:
# Aggregate calibrated trust per participant over the high-risk and the low-risk
# tasks (planning and execution stage separately) and rebuild `df` from the four
# resulting measures.
high_risk_tasks = ['test-149', 'test-200', 'test-859']
low_risk_tasks = ['test-388', 'test-497', 'test-675']
risk_measures = ["high_risk_planning", "high_risk_execution", "low_risk_planning", "low_risk_execution"]

trust_dimensions = ["Reliability/Competence", "Understanding/Predictability",
                    "Intention of Developers", "Trust in Automation"]
all_conditions = ["AP-AE", "AP-UE", "UP-AE", "UP-UE"]

# condition -> {"avg": [], measure: [per-participant means]}
condition_dict = {}
for condition in all_conditions:
    measure_lists = {"avg": []}
    for name in risk_measures:
        measure_lists[name] = []
    condition_dict[condition] = measure_lists

calibrated_trust_planning = tp_data["calibrated_trust_planning"]
calibrated_trust_execution = tp_data["calibrated_trust_execution"]

# Factor columns first, then the four aggregate measures.
variable_dict = {"condition": [], "planning": [], "execution": []}
for task_id in risk_measures:
    variable_dict[task_id] = []

user_order = []
for user in valid_users:
    tp_condition = user2condition[user]
    if tp_condition in all_conditions:
        # Participants outside the four conditions (pilot study) are skipped.
        planning_mode = "automatic" if tp_condition.startswith("AP") else "user-involved"
        execution_mode = "automatic" if tp_condition.endswith("AE") else "user-involved"
        variable_dict["condition"].append(tp_condition)
        variable_dict["planning"].append(planning_mode)
        variable_dict["execution"].append(execution_mode)
        user_order.append(user)

# user_order was filtered above, so every user here belongs to one of the four conditions.
for user in user_order:
    tp_condition = user2condition[user]
    planning_scores = calibrated_trust_planning[user]
    execution_scores = calibrated_trust_execution[user]
    # Mean over the tasks of each risk group, keyed by the measure name.
    user_measures = {
        "high_risk_planning": np.mean([planning_scores[task_id] for task_id in high_risk_tasks]),
        "high_risk_execution": np.mean([execution_scores[task_id] for task_id in high_risk_tasks]),
        "low_risk_planning": np.mean([planning_scores[task_id] for task_id in low_risk_tasks]),
        "low_risk_execution": np.mean([execution_scores[task_id] for task_id in low_risk_tasks]),
    }
    for name, value in user_measures.items():
        variable_dict[name].append(value)
        condition_dict[tp_condition][name].append(value)
df = pd.DataFrame(variable_dict)

In [1]:
# One-way ANOVA of each risk-level aggregate in `df` on the execution factor,
# followed by the per-condition means of that aggregate.
# Requires `anova` (pingouin) and `pairwise_tukeyhsd` (statsmodels) to be imported
# and `df` / `condition_dict` to hold the risk-level aggregates.
# 0.05 split four ways; presumably a Bonferroni-style correction -- confirm.
alpha_level = 0.05 / 4
print("calibrated_trust_execution")
for task_id in ["high_risk_planning", "high_risk_execution", "low_risk_planning", "low_risk_execution"]:
    dimension = task_id
    print(dimension)
    aov = anova(dv=dimension, between=['execution'], data=df, effsize='n2')
    print(aov.round(3))
    # The one-way table has a single effect row (index 0). The previous check also
    # indexed rows 1 and 2, which only exist in a two-way table, and therefore raised
    # KeyError whenever row 0 was not significant. Testing every row that is present
    # works for both table shapes (a NaN p-value compares as False).
    if (aov['p-unc'] <= alpha_level).any():
        tukey = pairwise_tukeyhsd(endog=df[dimension], groups=df['condition'], alpha=alpha_level)
        print(tukey)
    for condition in all_conditions:
        print(condition, np.mean(condition_dict[condition][dimension]))
    print("-" * 17)

calibrated_trust_execution
high_risk_planning


NameError: name 'anova' is not defined