In [1]:
import os
import json
import numpy as np
import pandas as pd
import pingouin as pg

In [2]:
# Columns that are cast to float when the questionnaire CSVs are loaded.
METRIC_COLUMNS = ["Consistency", "Overconfidence"]

# Columns that are cast to int when the questionnaire CSVs are loaded.
# Laid out one questionnaire group per row; the order is unchanged.
LIKERT_COLUMNS = [
    "Cognitive Load", "Confidence",
    "C1", "C1.1", "C1.2", "C1.3",
    "C2", "C2.1", "C2.2", "C2.3",
    "C3.1", "C3.2", "C3.3", "C3.4", "C3.5",
]

In [3]:
# Direction of the alternative hypothesis used for each measure's
# Mann-Whitney U test ("less" or "greater").
# Insertion order is kept exactly as before, since dict order is observable.
METRIC_ALTERNATIVE_DICT = {
    # Timing and computed metrics
    "Total Time": "less",
    "Consistency": "greater", "Overconfidence": "less",
    # General self-report items
    "Cognitive Load": "less", "Confidence": "greater",
    # C1 group
    "C1": "greater", "C1.1": "greater", "C1.2": "greater", "C1.3": "greater",
    # C2 group
    "C2": "less", "C2.1": "less", "C2.2": "greater", "C2.3": "greater",
    # C3 group
    "C3.1": "greater", "C3.2": "less",
    "C3.3": "greater", "C3.4": "greater", "C3.5": "greater",
}

In [4]:
def _read_questionnaire(csv_path):
    """Read one questionnaire CSV and normalise its column dtypes.

    The columns named in METRIC_COLUMNS are cast to float and the columns
    named in LIKERT_COLUMNS are cast to int; any other column is left as
    parsed by pandas.
    """
    frame = pd.read_csv(csv_path)
    frame[METRIC_COLUMNS] = frame[METRIC_COLUMNS].astype(float)
    frame[LIKERT_COLUMNS] = frame[LIKERT_COLUMNS].astype(int)
    return frame


# Both conditions share the same schema, so they go through the same loader.
baseline_df = _read_questionnaire('questionnaire/baseline.csv')
experiment_df = _read_questionnaire('questionnaire/experiment.csv')

In [5]:
# Significance threshold applied to every Mann-Whitney U test in this cell.
ALPHA = 0.05
# Statistic columns copied by name from the pingouin result row.
STAT_COLUMNS = ["U-val", "alternative", "p-val", "RBC", "CLES"]

metrics = METRIC_COLUMNS + LIKERT_COLUMNS

# One row per tested measure. The index is taken directly from `metrics`
# (the keys the loop fills) instead of positionally slicing
# METRIC_ALTERNATIVE_DICT, which only matched as long as the untested
# "Total Time" entry happened to be the first key of that dict.
result_df = pd.DataFrame(
    index=metrics,
    columns=STAT_COLUMNS + ["significant"],
)

for key in metrics:
    # A measure missing from either CSV cannot be tested; its row stays NaN.
    if key not in baseline_df.columns or key not in experiment_df.columns:
        continue

    y = baseline_df[key].values
    x = experiment_df[key].values

    # x is the experiment group and y the baseline, so the configured
    # alternative describes the experiment group relative to the baseline.
    result = pg.mwu(x, y, alternative=METRIC_ALTERNATIVE_DICT[key])

    if not result["p-val"].values[0] < ALPHA:
        # NOTE(review): falling back to a two-sided test after the directional
        # test fails lets a measure be flagged in either direction, so the
        # combined false-positive rate of this procedure is above ALPHA.
        # Behaviour is kept as in the original analysis; consider reporting
        # the directional test only, and correcting for the number of
        # measures tested.
        result = pg.mwu(x, y, alternative="two-sided")

    # Copy statistics by column name so the row does not depend on the
    # column order pingouin happens to return.
    row = result.iloc[0]
    result_df.loc[key, STAT_COLUMNS] = [row[name] for name in STAT_COLUMNS]
    result_df.loc[key, "significant"] = bool(row["p-val"] < ALPHA)

In [6]:
# Load the lookup used to label result rows with their question wording.
with open("questionnaire/experiment_questions.json", "r") as questions_file:
    questions = json.load(questions_file)

# Index labels without an entry in the lookup keep the label itself as text.
result_df["question"] = [
    questions.get(label, label) for label in result_df.index
]

In [7]:
# Keep a JSON copy of the configured test directions alongside the results.
METRIC_ALTERNATIVE_PATH = "result/metric_alternative.json"

# Create the output directory first: opening the file for writing raises
# FileNotFoundError when "result/" does not exist yet.
os.makedirs(os.path.dirname(METRIC_ALTERNATIVE_PATH), exist_ok=True)

# Previously saved mapping, or None when nothing has been saved yet.
METRIC_ALTERNATIVE_DICT_FILE = None
if os.path.exists(METRIC_ALTERNATIVE_PATH):
    with open(METRIC_ALTERNATIVE_PATH, "r") as f:
        METRIC_ALTERNATIVE_DICT_FILE = json.load(f)

# Rewrite only when the file is missing or its content differs from the
# in-notebook mapping (dict equality ignores key order).
if METRIC_ALTERNATIVE_DICT != METRIC_ALTERNATIVE_DICT_FILE:
    with open(METRIC_ALTERNATIVE_PATH, "w") as f:
        json.dump(METRIC_ALTERNATIVE_DICT, f, indent=4)

In [8]:
# Write the complete results table, including the index, to disk.
full_results_path = "result/mann_whitney_u_test.csv"
result_df.to_csv(full_results_path)

In [9]:
# Questionnaire item groups, used to split the results into per-group tables.
C1 = ["C1", "C1.1", "C1.2", "C1.3"]
C2 = ["C2", "C2.1", "C2.2", "C2.3"]
C3 = ["C3.1", "C3.2", "C3.3", "C3.4", "C3.5"]

In [10]:
# Cast p-values to float before rounding (presumably the column is
# object-typed after the row-wise fill; verify if this cast is removed).
result_df["p-val"] = result_df["p-val"].astype(float)

# Columns kept in the per-group summary tables.
summary_columns = ["U-val", "alternative", "p-val"]

# Each item group is written to its own CSV with p-values rounded to 4 places.
group_exports = (
    (C1, "result/mann_whitney_u_test_c1.csv"),
    (C2, "result/mann_whitney_u_test_c2.csv"),
    (C3, "result/mann_whitney_u_test_c3.csv"),
)
for group_items, csv_path in group_exports:
    group_summary = result_df.loc[group_items, summary_columns].round({"p-val": 4})
    group_summary.to_csv(csv_path)

In [11]:
# Display the full results table.
result_df

Unnamed: 0,U-val,alternative,p-val,RBC,CLES,significant,question
Consistency,321.5,greater,0.004635,-0.45805,0.729025,True,Consistency
Overconfidence,192.0,two-sided,0.478883,0.129252,0.435374,False,Overconfidence
Cognitive Load,190.0,two-sided,0.437514,0.138322,0.430839,False,How mentally challenging was it for you to com...
Confidence,269.0,two-sided,0.202667,-0.219955,0.609977,False,How confident were you about your labels in ge...
C1,295.5,greater,0.024603,-0.340136,0.670068,True,I can easily establish comparison criteria in ...
C1.1,307.5,greater,0.012769,-0.394558,0.697279,True,My comparison criteria can cover all the new s...
C1.2,281.0,two-sided,0.11733,-0.274376,0.637188,False,I am clear about the set of features I rely on...
C1.3,275.0,two-sided,0.153264,-0.247166,0.623583,False,I am clear about the scope of each feature req...
C2,214.5,two-sided,0.887038,0.027211,0.486395,False,I feel that I may overlook some important deta...
C2.1,193.0,two-sided,0.490124,0.124717,0.437642,False,My knowledge of this robot arm task affects th...


In [12]:
# Keep only rows whose flag is exactly True; rows never filled (NaN) are
# excluded as well, which a plain truthiness check would not guarantee.
is_significant = result_df["significant"].eq(True)
sig_result = result_df.loc[is_significant]

In [13]:
# Display only the rows flagged as significant.
sig_result

Unnamed: 0,U-val,alternative,p-val,RBC,CLES,significant,question
Consistency,321.5,greater,0.004635,-0.45805,0.729025,True,Consistency
C1,295.5,greater,0.024603,-0.340136,0.670068,True,I can easily establish comparison criteria in ...
C1.1,307.5,greater,0.012769,-0.394558,0.697279,True,My comparison criteria can cover all the new s...
C2.2,349.0,greater,0.000453,-0.582766,0.791383,True,The system provides enough support for me to i...
C2.3,306.5,greater,0.012501,-0.390023,0.695011,True,The system provides enough support for me to c...
C3.2,152.5,less,0.041933,0.30839,0.654195,True,The preference labeling process is boring.
C3.3,288.0,greater,0.04236,-0.306122,0.653061,True,I receive encouragement in the preference labe...
C3.4,334.0,greater,0.001938,-0.514739,0.75737,True,I receive feedback on my performance in the pr...
C3.5,312.0,greater,0.010058,-0.414966,0.707483,True,I find the preference labeling process rewarding.
