In [1]:
from util import load_user_data, task_ID_list_to_check
import pandas as pd
import numpy as np

# Load the anonymized study data. `valid_users` is iterated below and
# `tp_data` is indexed by measure name.
valid_users, tp_data = load_user_data(
    folder_name="../anonymized_data",
    reserved_users=None,
)
user2condition = tp_data['user2condition']

# Tally how many valid users were assigned to each condition label.
condition_count = {}
for user in valid_users:
    tp_condition = user2condition[user]
    condition_count[tp_condition] = condition_count.get(tp_condition, 0) + 1
print(condition_count)

248 complete the NASA-TLX
248 complete the whole study
{'AP-AE': 63, 'UP-UE': 60, 'AP-UE': 64, 'UP-AE': 61}


In [2]:
# Collect one list per variable; every list receives exactly one value per
# valid user in the loop at the bottom of this cell.
variable_dict = {}
performance_dv = ["recall", "acc_strict", "acc_relaxed", "acc_execution"]
trust_dv = ["Reliability/Competence", "Understanding/Predictability", "Intention of Developers", "Trust in Automation"]
calibrated_trust_dv = ["calibrated_trust_planning", "calibrated_trust_execution"]
nasatlx_variable_names = ["mental_demand", "physical_demand", "temporal_demand", "performance", "effort", "frustration"]
covariates = ["llm_expertise", "assistant_expertise", "Familiarity", "Propensity to Trust"]

# Lookup tables pulled out of the loaded study data, all indexed by user first.
user_expertise = tp_data["user_expertise"]
plan_quality = tp_data['plan_quality']
user_cognitive_load = tp_data['cognitive_load']
task_perfromance = tp_data['task_performance']
user_TiA_scale = tp_data["trust_in_automation"]
risk_perception = tp_data['risk_perception']
user_trust = tp_data['user_trust']
trust_mapping = {
    "No": 0.0,
    "Yes": 1.0
}

# Register every key up front; insertion order determines the column order of
# any DataFrame built from this dict.
for dimension in (*performance_dv, *trust_dv, *calibrated_trust_dv, *nasatlx_variable_names, *covariates):
    variable_dict[dimension] = []
variable_dict.update({
    key: [] for key in ('risk_perception', 'plan_quality', 'trust_planning', 'trust_execution')
})

# correlation based on average on tasks
for user in valid_users:
    # Values that are already stored once per user.
    for variable in nasatlx_variable_names:
        variable_dict[variable].append(user_cognitive_load[user][variable])
    for variable in performance_dv:
        variable_dict[variable].append(task_perfromance[user]["avg"][variable])
    for variable in calibrated_trust_dv:
        variable_dict[variable].append(tp_data[variable][user]["avg"])
    for variable in ["llm_expertise", "assistant_expertise"]:
        variable_dict[variable].append(user_expertise[user][variable])
    # The trust subscales and the two trust-related covariates come from the
    # same per-user table.
    for variable in trust_dv + ["Familiarity", "Propensity to Trust"]:
        variable_dict[variable].append(user_TiA_scale[user][variable])

    # Values stored per task: gather them over the checked tasks and keep the mean.
    risk_scores = []
    quality_scores = []
    planning_trust = []
    execution_trust = []
    for task_id in task_ID_list_to_check:
        task_trust = user_trust[user][task_id]
        risk_scores.append(risk_perception[user][task_id])
        quality_scores.append(plan_quality[user][task_id])
        # "Yes"/"No" answers become 1.0/0.0 so the mean is the share of "Yes".
        planning_trust.append(trust_mapping[task_trust["planning"]])
        execution_trust.append(trust_mapping[task_trust["execution"]])
    variable_dict["risk_perception"].append(np.mean(risk_scores))
    variable_dict["plan_quality"].append(np.mean(quality_scores))
    variable_dict['trust_planning'].append(np.mean(planning_trust))
    variable_dict['trust_execution'].append(np.mean(execution_trust))

In [3]:
# User-level frame: one column per key of variable_dict, one row per entry of its lists.
df = pd.DataFrame(data=variable_dict)
df.head(5)

Unnamed: 0,recall,acc_strict,acc_relaxed,acc_execution,Reliability/Competence,Understanding/Predictability,Intention of Developers,Trust in Automation,calibrated_trust_planning,calibrated_trust_execution,...,effort,frustration,llm_expertise,assistant_expertise,Familiarity,Propensity to Trust,risk_perception,plan_quality,trust_planning,trust_execution
0,0.611111,0.166667,0.333333,0.166667,4.0,4.0,3.5,4.0,0.333333,0.833333,...,4,-6,4,4,2.5,3.333333,2.333333,2.833333,1.0,0.333333
1,0.722222,0.333333,0.666667,0.333333,2.0,3.25,3.0,2.0,0.5,0.666667,...,2,-1,3,4,2.5,2.666667,2.166667,3.833333,1.0,0.666667
2,0.722222,0.5,0.5,0.5,2.833333,3.0,3.0,2.5,0.5,0.666667,...,3,1,4,3,4.0,3.0,3.166667,3.833333,0.333333,0.166667
3,0.777778,0.5,0.666667,0.5,3.833333,3.25,4.0,4.0,0.5,0.5,...,7,-7,3,4,3.0,3.333333,2.166667,3.833333,1.0,1.0
4,0.569444,0.166667,0.5,0.166667,2.833333,3.5,4.0,3.0,0.166667,0.166667,...,3,4,4,4,2.5,2.666667,1.666667,2.833333,1.0,1.0


In [4]:
def wrap_pvalue(pvalue, correlation):
    """Format a correlation coefficient and its p-value as two LaTeX table cells.

    Args:
        pvalue: p-value of the test (a float; may be NaN).
        correlation: correlation coefficient printed in the first cell.

    Returns:
        A string starting with "& " that holds the coefficient and then the
        p-value, both with three decimals. A leading zero of the p-value is
        dropped. p < 0.0125 is typeset in bold with a double-dagger
        superscript, 0.0125 <= p < 0.05 gets a single dagger, and anything
        else (including NaN) is left plain.
    """
    corr_str = "%.3f" % correlation
    p_str = "%.3f" % pvalue
    # Strip the leading zero only when there is one. Slicing off the first
    # character unconditionally would print 1.000 as ".000" and nan as "an".
    if p_str.startswith("0."):
        p_str = p_str[1:]

    # 0.0125 is 0.05 / 4 -- presumably a Bonferroni-style correction over four
    # comparisons; confirm against the analysis plan.
    if pvalue < 0.0125:
        # There is deliberately no space before \textbf, so generated rows stay
        # byte-identical to previously produced tables.
        return "& " + corr_str + " &\\textbf{" + p_str + "}$^{\\dagger\\dagger}$"
    if pvalue < 0.05:
        return "& " + corr_str + " & " + p_str + "$^{\\dagger}$"
    return "& " + corr_str + " & " + p_str

In [5]:
from scipy.stats import spearmanr

# Print one LaTeX table row per dependent variable: its Spearman correlation
# and p-value against every covariate. Rows are printed group by group, with a
# dashed separator line after each group.
str_dict = {}
dv_groups = [
    nasatlx_variable_names,
    trust_dv,
    performance_dv,
    calibrated_trust_dv,
    ['plan_quality', 'trust_planning', 'trust_execution'],
]
for dv_group in dv_groups:
    for dv in dv_group:
        str_dict[dv] = "{} ".format(dv)
        for cov in covariates:
            correlation, pvalue = spearmanr(variable_dict[cov], variable_dict[dv])
            str_dict[dv] += wrap_pvalue(pvalue, correlation)
        # "\\\\" is two literal backslashes: the LaTeX end-of-row marker.
        str_dict[dv] += "\\\\"
        print(str_dict[dv])
    print("-" * 17)

mental_demand & -0.046 & .472& -0.003 & .968& -0.163 &\textbf{.010}$^{\dagger\dagger}$& -0.188 &\textbf{.003}$^{\dagger\dagger}$\\
physical_demand & 0.314 &\textbf{.000}$^{\dagger\dagger}$& 0.267 &\textbf{.000}$^{\dagger\dagger}$& 0.166 &\textbf{.009}$^{\dagger\dagger}$& 0.219 &\textbf{.001}$^{\dagger\dagger}$\\
temporal_demand & 0.031 & .623& 0.034 & .593& -0.077 & .225& -0.039 & .542\\
performance & -0.133 & .036$^{\dagger}$& -0.154 & .016$^{\dagger}$& -0.154 & .015$^{\dagger}$& -0.080 & .209\\
effort & 0.037 & .562& 0.109 & .087& -0.046 & .470& -0.013 & .839\\
frustration & -0.249 &\textbf{.000}$^{\dagger\dagger}$& -0.131 & .039$^{\dagger}$& -0.296 &\textbf{.000}$^{\dagger\dagger}$& -0.348 &\textbf{.000}$^{\dagger\dagger}$\\
-----------------
Reliability/Competence & 0.334 &\textbf{.000}$^{\dagger\dagger}$& 0.245 &\textbf{.000}$^{\dagger\dagger}$& 0.321 &\textbf{.000}$^{\dagger\dagger}$& 0.679 &\textbf{.000}$^{\dagger\dagger}$\\
Understanding/Predictability & 0.307 &\textbf{.000}$^{

In [6]:
# LaTeX table row for risk_perception: Spearman correlation and p-value
# against each covariate, stored in str_dict and printed.
dv = 'risk_perception'
row_cells = []
for cov in covariates:
    correlation, pvalue = spearmanr(variable_dict[cov], variable_dict[dv])
    row_cells.append(wrap_pvalue(pvalue, correlation))
str_dict[dv] = "{} ".format(dv) + "".join(row_cells) + "\\\\"
print(str_dict[dv])

risk_perception & -0.187 &\textbf{.003}$^{\dagger\dagger}$& -0.180 &\textbf{.004}$^{\dagger\dagger}$& -0.237 &\textbf{.000}$^{\dagger\dagger}$& -0.363 &\textbf{.000}$^{\dagger\dagger}$\\


## Task-specific Correlation Analysis

In [7]:
# Long-format table: every list below receives one value per (user, task) pair.
# The key order is kept because it becomes the column order of the DataFrame.
data_dict = {
    column: []
    for column in (
        "condition",
        "planning",
        "execution",
        "task_id",
        "trust_planning",
        "trust_execution",
        "plan_quality",
        "acc_execution",
        "acc_strict",
        "risk_perception",
        "confidence_planning",
        "confidence_execution",
        "calibrated_trust_planning",
        "calibrated_trust_execution",
    )
}
user_trust = tp_data['user_trust']
trust_mapping = {"No": 0.0, "Yes": 1.0}
user_confidence = tp_data['confidence']
calibrated_trust_planning = tp_data['calibrated_trust_planning']
calibrated_trust_execution = tp_data['calibrated_trust_execution']

for user in valid_users:
    tp_condition = user2condition[user]
    # The condition label has the form "<planning>-<execution>".
    planning, execution = tp_condition.split("-")
    for task_id in task_ID_list_to_check:
        task_trust = user_trust[user][task_id]
        task_confidence = user_confidence[user][task_id]
        task_scores = task_perfromance[user][task_id]
        record = {
            # condition
            "condition": tp_condition,
            "planning": planning,
            "execution": execution,
            "task_id": task_id,
            # feedback at planning stage
            "plan_quality": plan_quality[user][task_id],
            "risk_perception": risk_perception[user][task_id],
            "trust_planning": trust_mapping[task_trust['planning']],
            "confidence_planning": task_confidence['planning'],
            "calibrated_trust_planning": calibrated_trust_planning[user][task_id],
            # feedback at execution stage
            "acc_execution": task_scores["acc_execution"],
            "acc_strict": task_scores["acc_strict"],
            "trust_execution": trust_mapping[task_trust['execution']],
            "confidence_execution": task_confidence['execution'],
            "calibrated_trust_execution": calibrated_trust_execution[user][task_id],
        }
        for column, value in record.items():
            data_dict[column].append(value)

In [8]:
# Task-level frame: one column per key of data_dict, one row per entry of its lists.
df = pd.DataFrame(data=data_dict)

In [9]:
from pingouin import ancova, anova
from statsmodels.stats.multicomp import pairwise_tukeyhsd
for dimension in ["acc_execution", "acc_strict"]:
    # ANCOVA of the task-level outcome between conditions, with plan_quality
    # as the covariate.
    aov = ancova(dv=dimension, covar=["plan_quality"], between='condition', data=df, effsize='n2')
    print(aov.round(3))
    # Row 0 of the ANCOVA table is the `condition` effect. Post-hoc Tukey HSD
    # runs only when it passes the 0.05 / 4 threshold.
    if aov.to_dict()['p-unc'][0] <= 0.05 / 4:
        tukey = pairwise_tukeyhsd(endog=df[dimension], groups=df['condition'], alpha=0.0125)
        print(tukey)
    # Select the outcome column before averaging. Calling .mean() on the whole
    # grouped frame also tries to average the string columns (planning,
    # execution), which raises TypeError on pandas >= 2.0.
    print(df.groupby('condition')[dimension].mean())
    print("-" * 17)

  return warn(


         Source       SS    DF        F  p-unc     n2
0     condition    0.584     3    0.920   0.43  0.002
1  plan_quality   56.149     1  265.231   0.00  0.151
2      Residual  313.948  1483      NaN    NaN    NaN
condition
AP-AE    0.534392
AP-UE    0.526042
UP-AE    0.467213
UP-UE    0.547222
Name: acc_execution, dtype: float64
-----------------
         Source       SS    DF        F  p-unc     n2
0     condition    0.986     3    1.631   0.18  0.003
1  plan_quality   71.306     1  353.953   0.00  0.192
2      Residual  298.759  1483      NaN    NaN    NaN
condition
AP-AE    0.534392
AP-UE    0.463542
UP-AE    0.456284
UP-UE    0.472222
Name: acc_strict, dtype: float64
-----------------


In [25]:
# Count and mean acc_execution for every observed combination of task,
# risk_perception and plan_quality, written to CSV.
s = df.groupby(['task_id', 'risk_perception', 'plan_quality'])
# Aggregate only the acc_execution column. Calling .mean() on the whole grouped
# frame also tries to average the string columns (condition, planning,
# execution), which raises TypeError on pandas >= 2.0.
acc_by_cell = s['acc_execution']
# ignore_index=True on axis=1 relabels the columns: 0 = count, 1 = mean.
res = pd.concat([acc_by_cell.count(), acc_by_cell.mean()], axis=1, ignore_index=True)
res.to_csv("risk_perception-plan_quality.csv")

In [27]:
# Count and mean acc_execution per rating level, first by risk_perception and
# then by plan_quality.
for group_key in ['risk_perception', 'plan_quality']:
    s = df.groupby([group_key])
    # Aggregate only the acc_execution column. Calling .mean() on the whole
    # grouped frame also tries to average the string columns, which raises
    # TypeError on pandas >= 2.0.
    acc_by_level = s['acc_execution']
    # ignore_index=True on axis=1 relabels the columns: 0 = count, 1 = mean.
    res = pd.concat([acc_by_level.count(), acc_by_level.mean()], axis=1, ignore_index=True)
    print(res)

                   0         1
risk_perception               
1                485  0.606186
2                409  0.484108
3                259  0.482625
4                227  0.466960
5                108  0.453704
                0         1
plan_quality               
1              50  0.060000
2             231  0.008658
3             482  0.591286
4               8  0.500000
5             717  0.666667


In [2]:
# Count-weighted mean of acc_execution over plan_quality ratings 1 and 2,
# recomputed by hand from the printed table (50 rows at 0.060000, 231 rows at
# 0.008658; 50 + 231 = 281).
(50 * 0.060000 + 231 * 0.008658) / 281

0.017793587188612098

In [3]:
# Count-weighted mean of acc_execution over plan_quality ratings 3 and 4,
# recomputed by hand from the printed table: 482 rows at 0.591286 plus 8 rows
# at 0.5 (8 * 0.5 = 4), over 482 + 8 = 490 rows.
(0.591286 * 482 + 4) / 490

0.5897956163265305

In [1]:
# Scratch check: 248 users times 6 -- presumably the number of tasks per user,
# giving the expected row count of the task-level frame (TODO confirm).
248 * 6

1488

In [36]:
for dimension in ["calibrated_trust_planning", "calibrated_trust_execution"]:
    # ANCOVA of the task-level outcome between conditions, with plan_quality
    # as the covariate.
    aov = ancova(dv=dimension, covar=["plan_quality"], between='condition', data=df, effsize='n2')
    print(aov.round(3))
    # Row 0 of the ANCOVA table is the `condition` effect. Post-hoc Tukey HSD
    # runs only when it passes the 0.05 / 4 threshold.
    if aov.to_dict()['p-unc'][0] <= 0.05 / 4:
        tukey = pairwise_tukeyhsd(endog=df[dimension], groups=df['condition'], alpha=0.0125)
        print(tukey)
    # Select the outcome column before averaging. Calling .mean() on the whole
    # grouped frame also tries to average the string columns (planning,
    # execution), which raises TypeError on pandas >= 2.0.
    print(df.groupby('condition')[dimension].mean())
    print("-" * 17)

         Source       SS    DF         F  p-unc     n2
0     condition    1.202     3     3.334  0.019  0.003
1  plan_quality  193.756     1  1612.379  0.000  0.519
2      Residual  178.209  1483       NaN    NaN    NaN
condition
AP-AE    0.505291
AP-UE    0.494792
UP-AE    0.497268
UP-UE    0.494444
Name: calibrated_trust_planning, dtype: float64
-----------------
         Source       SS    DF       F  p-unc     n2
0     condition    0.136     3   0.206  0.892  0.000
1  plan_quality   16.894     1  76.495  0.000  0.049
2      Residual  327.513  1483     NaN    NaN    NaN
condition
AP-AE    0.656085
AP-UE    0.645833
UP-AE    0.612022
UP-UE    0.625000
Name: calibrated_trust_execution, dtype: float64
-----------------


In [22]:
from scipy.stats import spearmanr

# Print one LaTeX table row per task-level variable: its Spearman correlation
# and p-value against plan_quality and risk_perception. Rows are printed group
# by group, with a dashed separator line after each group.
str_dict = {}
task_performance_variables = ["acc_execution", "acc_strict"]
calibrated_trust_variables = ["calibrated_trust_planning", "calibrated_trust_execution"]
trust_variables = ["trust_planning", "trust_execution"]
confidence_variables = ["confidence_planning", "confidence_execution"]
dv_groups = [
    task_performance_variables,
    calibrated_trust_variables,
    trust_variables,
    confidence_variables,
]
for dv_group in dv_groups:
    for dv in dv_group:
        str_dict[dv] = "{} ".format(dv)
        for cov in ['plan_quality', 'risk_perception']:
            correlation, pvalue = spearmanr(data_dict[cov], data_dict[dv])
            str_dict[dv] += wrap_pvalue(pvalue, correlation)
        # "\\\\" is two literal backslashes: the LaTeX end-of-row marker.
        str_dict[dv] += "\\\\"
        print(str_dict[dv])
    print("-" * 17)

acc_execution & 0.400 &\textbf{.000}$^{\dagger\dagger}$& -0.110 &\textbf{.000}$^{\dagger\dagger}$\\
acc_strict & 0.446 &\textbf{.000}$^{\dagger\dagger}$& -0.096 &\textbf{.000}$^{\dagger\dagger}$\\
-----------------
calibrated_trust_planning & 0.723 &\textbf{.000}$^{\dagger\dagger}$& -0.102 &\textbf{.000}$^{\dagger\dagger}$\\
calibrated_trust_execution & 0.221 &\textbf{.000}$^{\dagger\dagger}$& 0.000 & .995\\
-----------------
trust_planning & 0.056 & .032$^{\dagger}$& -0.293 &\textbf{.000}$^{\dagger\dagger}$\\
trust_execution & 0.258 &\textbf{.000}$^{\dagger\dagger}$& -0.160 &\textbf{.000}$^{\dagger\dagger}$\\
-----------------
confidence_planning & 0.137 &\textbf{.000}$^{\dagger\dagger}$& -0.532 &\textbf{.000}$^{\dagger\dagger}$\\
confidence_execution & 0.225 &\textbf{.000}$^{\dagger\dagger}$& -0.271 &\textbf{.000}$^{\dagger\dagger}$\\
-----------------


In [23]:
# LaTeX table row for plan_quality against plan_quality (itself) and
# risk_perception, followed by a dashed separator line.
for dv in ['plan_quality']:
    row = "{} ".format(dv)
    for cov in ['plan_quality', 'risk_perception']:
        correlation, pvalue = spearmanr(data_dict[cov], data_dict[dv])
        row += wrap_pvalue(pvalue, correlation)
    str_dict[dv] = row + "\\\\"
    print(str_dict[dv])
print("-" * 17)

plan_quality & 1.000 &\textbf{.000}$^{\dagger\dagger}$& -0.141 &\textbf{.000}$^{\dagger\dagger}$\\
-----------------
