In [1]:
from util import load_user_data, task_ID_list_to_check
import pandas as pd
import numpy as np

# Load the study data through the project helper and pull out the two
# lookups used below: user -> experimental condition, and the per-user
# planning actions.
valid_users, tp_data = load_user_data(folder_name="../anonymized_data", reserved_users=None)
user2condition = tp_data['user2condition']
user_planning_actions = tp_data['user_planning_actions']

# Tally how many valid users ended up in each condition. Keys appear in
# the order the conditions are first encountered while walking valid_users.
condition_count = {}
for user in valid_users:
    tp_condition = user2condition[user]
    condition_count[tp_condition] = condition_count.get(tp_condition, 0) + 1
print(condition_count)

248 complete the NASA-TLX
248 complete the whole study
{'UP-UE': 60, 'AP-UE': 64, 'UP-AE': 61, 'AP-AE': 63}


In [2]:
# Build two views of the per-user measurements:
#   * variable_dict  - one list per column (one entry per retained user),
#                      turned into the DataFrame `df` at the end of the cell.
#   * condition_dict - the same trust measures, bucketed by condition.

# Column groups. The order of these lists (and of the concatenation used to
# seed variable_dict below) fixes the column order of `df`.
trust_dimensions = ["Reliability/Competence", "Understanding/Predictability",
                    "Intention of Developers", "Trust in Automation"]
covariates_1 = ["Propensity to Trust", "Familiarity"]
performance_dimensions = ["recall", "acc_strict", "acc_relaxed", "acc_execution"]
covariates = ["llm_expertise", "assistant_expertise"]
all_conditions = ["AP-AE", "AP-UE", "UP-AE", "UP-UE"]

# Per-condition buckets. "RAIR" and "RSR" are created empty here and are not
# filled anywhere in this cell.
per_condition_keys = trust_dimensions + [
    "calibrated_trust_planning", "calibrated_trust_execution", "RAIR", "RSR"
]
condition_dict = {}
for condition in all_conditions:
    condition_dict[condition] = {key: [] for key in per_condition_keys}

# Column-oriented storage; every list receives exactly one value per
# retained user, so all columns end up the same length.
variable_dict = {}
for dimension in performance_dimensions + trust_dimensions + covariates_1 + covariates:
    variable_dict[dimension] = []
for column in ["condition", "planning", "execution",
               "calibrated_trust_planning", "calibrated_trust_execution"]:
    variable_dict[column] = []

# Per-user lookup tables. They do not change while iterating, so all of them
# (including task_performance) are fetched once, outside the loop.
user_expertise = tp_data["user_expertise"]
trust = tp_data["trust_in_automation"]
calibrated_trust_planning = tp_data["calibrated_trust_planning"]
calibrated_trust_execution = tp_data["calibrated_trust_execution"]
task_performance = tp_data["task_performance"]

for user in valid_users:
    tp_condition = user2condition[user]
    if tp_condition not in all_conditions:
        # ignore pilot study
        continue
    variable_dict["condition"].append(tp_condition)

    # The condition label has the form "<planning>-<execution>": an "AP"
    # prefix / "AE" suffix is recorded as "automatic", anything else as
    # "user-involved".
    variable_dict["planning"].append(
        "automatic" if tp_condition.startswith("AP") else "user-involved")
    variable_dict["execution"].append(
        "automatic" if tp_condition.endswith("AE") else "user-involved")

    # Calibrated trust: the per-user "avg" entry goes into both views.
    planning_score = calibrated_trust_planning[user]["avg"]
    execution_score = calibrated_trust_execution[user]["avg"]
    variable_dict["calibrated_trust_planning"].append(planning_score)
    condition_dict[tp_condition]["calibrated_trust_planning"].append(planning_score)
    variable_dict["calibrated_trust_execution"].append(execution_score)
    condition_dict[tp_condition]["calibrated_trust_execution"].append(execution_score)

    # Trust questionnaire dimensions are tracked overall and per condition.
    for dimension in trust_dimensions:
        variable_dict[dimension].append(trust[user][dimension])
        condition_dict[tp_condition][dimension].append(trust[user][dimension])
    # Trust-related covariates come from the same questionnaire record;
    # covariates_1 is reused so this loop cannot drift from the declared list.
    for dimension in covariates_1:
        variable_dict[dimension].append(trust[user][dimension])
    for dimension in covariates:
        variable_dict[dimension].append(user_expertise[user][dimension])
    for dimension in performance_dimensions:
        variable_dict[dimension].append(task_performance[user]["avg"][dimension])

df = pd.DataFrame(variable_dict)

In [3]:
# Sanity check: print how many values were collected for each column.
for variable, column_values in variable_dict.items():
    print(variable, len(column_values))
# Rebuild the DataFrame from the column lists.
df = pd.DataFrame(data=variable_dict)

recall 248
acc_strict 248
acc_relaxed 248
acc_execution 248
Reliability/Competence 248
Understanding/Predictability 248
Intention of Developers 248
Trust in Automation 248
Propensity to Trust 248
Familiarity 248
llm_expertise 248
assistant_expertise 248
condition 248
planning 248
execution 248
calibrated_trust_planning 248
calibrated_trust_execution 248


In [4]:
# Performance overview: sample count, mean (M) and standard deviation (SD)
# across all retained users, first for the two calibrated-trust measures and
# then for every task-performance dimension.
# np.std is called with its default ddof, i.e. the population SD.
overview_dimensions = ["calibrated_trust_planning", "calibrated_trust_execution"] + performance_dimensions
for dimension in overview_dimensions:
    column_values = variable_dict[dimension]
    print("{}, {}".format(dimension, len(column_values)))
    print("M: {:.2f}, SD: {:.2f}".format(np.mean(column_values), np.std(column_values)))

calibrated_trust_planning, 248
M: 0.50, SD: 0.13
calibrated_trust_execution, 248
M: 0.64, SD: 0.19
recall, 248
M: 0.77, SD: 0.11
acc_strict, 248
M: 0.48, SD: 0.17
acc_relaxed, 248
M: 0.56, SD: 0.17
acc_execution, 248
M: 0.52, SD: 0.18


In [5]:
# Trust overview: sample count, mean (M) and standard deviation (SD) of each
# trust dimension across all retained users.
# np.std is called with its default ddof, i.e. the population SD.
for dimension in trust_dimensions:
    column_values = variable_dict[dimension]
    n_values = len(column_values)
    mean_value = np.mean(column_values)
    sd_value = np.std(column_values)
    print("{}, {}".format(dimension, n_values))
    print("M: {:.2f}, SD: {:.2f}".format(mean_value, sd_value))

Reliability/Competence, 248
M: 3.49, SD: 0.77
Understanding/Predictability, 248
M: 3.30, SD: 0.56
Intention of Developers, 248
M: 3.61, SD: 0.81
Trust in Automation, 248
M: 3.52, SD: 1.01


In [6]:
# Covariates overview: for the expertise covariates and then the
# trust-related covariates, print the sample count, mean (M), standard
# deviation (SD, numpy's default population form) and the frequency of each
# distinct value.
from collections import Counter

for dimension in covariates + covariates_1:
    column_values = variable_dict[dimension]
    print("{}, {}".format(dimension, len(column_values)))
    print("M: {:.1f}, SD: {:.1f}".format(np.mean(column_values), np.std(column_values)))
    print(Counter(column_values))

llm_expertise, 248
M: 3.6, SD: 1.0
Counter({4: 96, 3: 72, 5: 47, 2: 31, 1: 2})
assistant_expertise, 248
M: 3.4, SD: 1.1
Counter({3: 82, 4: 71, 5: 46, 2: 32, 1: 17})
Propensity to Trust, 248
M: 3.0, SD: 0.7
Counter({3.0: 56, 3.3333333333333335: 55, 2.6666666666666665: 30, 3.6666666666666665: 26, 2.3333333333333335: 23, 2.0: 17, 4.0: 12, 1.6666666666666667: 11, 4.333333333333333: 6, 1.3333333333333333: 5, 4.666666666666667: 4, 5.0: 2, 1.0: 1})
Familiarity, 248
M: 2.9, SD: 1.2
Counter({4.0: 47, 1.0: 37, 2.0: 37, 3.0: 34, 3.5: 27, 2.5: 25, 5.0: 19, 1.5: 12, 4.5: 10})
