RQ2 & RQ3

In [None]:
import pandas as pd
import numpy as np


# Maps each perceived-quality metric to the Qualtrics survey question columns
# that measure it. Every list holds one question per condition in
# variable_dict below.
metrics_dict = {
    "Accuracy": ["Q48_1", "Q47_1", "Q49_1", "Q18_1", "Q50_1"],
    "Understandability": ["Q53_1", "Q54_1", "Q20_1", "Q52_1", "Q51_1"],
    "Completeness": ["Q58_1", "Q57_1", "Q56_1", "Q45_1", "Q55_1"],
    "Relevance": ["Q60_1", "Q64_1", "QID63_1", "Q62_1", "Q61_1"]
}


# Maps each condition (LLM + prompting technique, or the ground-truth file)
# to its survey question columns. Each list holds one question per metric in
# metrics_dict, so intersecting a metric's list with a condition's list
# yields exactly one question.
variable_dict = {
    "Claude few-shot": ["Q48_1", "Q53_1", "Q58_1", "Q60_1"],
    "ChatGPT zero-shot": ["Q47_1", "Q54_1", "Q57_1", "Q64_1"],
    "Claude zero-shot": ["Q18_1", "Q52_1", "Q45_1", "Q62_1"],
    "ChatGPT few-shot": ["Q50_1", "Q51_1", "Q55_1", "Q61_1"],
    "Ground truth": ["Q49_1", "Q20_1", "Q56_1", "QID63_1"]
}

# Load the survey responses from CSV (path is relative to the working directory).
# NOTE(review): presumably one row per respondent and one column per question
# ID listed above, with binarised answers as the file name suggests - confirm.
df1 = pd.read_csv('sumtree_survey_binary.csv')



In [None]:
def compute_summary(df: pd.DataFrame, variables=None, metrics=None):
    """Summarise perceived-quality scores per condition and per metric.

    For every condition, the condition's question columns are averaged per
    row of ``df``; the mean and the population standard deviation (``ddof=0``)
    of those row averages are reported in the ``"average"`` column. The same
    statistics are computed per metric, using only the question(s) that
    belong to both the metric and the condition. The resulting table is
    printed and returned.

    Args:
        df: Survey responses; must contain a column for every question ID
            referenced by ``variables``.
        variables: Mapping of condition label -> list of question columns.
            Defaults to the module-level ``variable_dict``.
        metrics: Mapping of metric name -> list of question columns.
            Defaults to the module-level ``metrics_dict``.

    Returns:
        A DataFrame indexed by condition label with one column per metric
        plus an ``"average"`` column. Each cell is a string formatted as
        ``"mean (std)"`` with two decimals.

    Raises:
        KeyError: If a referenced question column is missing from ``df``, or
            if a metric shares no question with one of the conditions.
    """
    # Fall back to the notebook-level mappings only when none are supplied,
    # so existing calls such as compute_summary(df=df1) behave as before.
    if variables is None:
        variables = variable_dict
    if metrics is None:
        metrics = metrics_dict

    def _format(scores: pd.Series) -> str:
        # ddof=0 keeps the population standard deviation that np.std yields.
        return f"{scores.mean():.2f} ({scores.std(ddof=0):.2f})"

    table_RQ2 = {}

    # One column per metric: statistics for each condition, restricted to the
    # questions that the metric and the condition have in common.
    for metric, metric_questions in metrics.items():
        metric_set = set(metric_questions)
        column = {}
        for model, model_questions in variables.items():
            # Keep the condition's own question order (deterministic, unlike
            # iterating over a set intersection).
            shared = [q for q in model_questions if q in metric_set]
            if not shared:
                raise KeyError(
                    f"Metric {metric!r} has no question in common with "
                    f"condition {model!r}"
                )
            column[model] = _format(df[shared].mean(axis=1))
        table_RQ2[metric] = column

    # Final column: statistics of the per-row average over all of a
    # condition's questions.
    table_RQ2['average'] = {
        model: _format(df[questions].mean(axis=1))
        for model, questions in variables.items()
    }

    df_table_RQ2 = pd.DataFrame(table_RQ2)

    print(df_table_RQ2)

    # Return the table so callers can reuse it (previously nothing was returned).
    return df_table_RQ2



In [None]:
# Build the RQ2 summary table from the survey responses in df1;
# compute_summary prints the table itself.
results = compute_summary(df=df1)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



def creating_df_visualisation(df: pd.DataFrame):
    """Build a long-format score table and draw the grouped box plot.

    For every condition in the module-level ``variable_dict`` the condition's
    question columns are averaged per row of ``df``. Each row average becomes
    one record labelled with the condition's prompt technique and generation
    origin. The records are printed and then drawn as a horizontal seaborn
    box plot (score on the x-axis, prompt technique on the y-axis, coloured
    by generation origin).

    Args:
        df: Survey responses; must contain a column for every question ID
            listed in ``variable_dict``.

    Returns:
        The long-format DataFrame with the columns ``"Prompt technique"``,
        ``"Generation origin"`` and ``"Score"`` that was plotted.

    Raises:
        ValueError: If ``variable_dict`` contains a condition label for which
            no prompt technique / generation origin is known.
        KeyError: If a referenced question column is missing from ``df``.
    """
    # Condition label -> (prompt technique, generation origin).
    condition_groups = {
        "Claude few-shot": ("Few-shot", "Claude"),
        "ChatGPT few-shot": ("Few-shot", "ChatGPT"),
        "Claude zero-shot": ("Zero-shot", "Claude"),
        "ChatGPT zero-shot": ("Zero-shot", "ChatGPT"),
        "Ground truth": ("Ground truth", "Ground truth"),
    }

    store_df = []

    for model, questions in variable_dict.items():
        # Fail loudly on an unmapped label instead of silently reusing the
        # groups of the previous iteration.
        if model not in condition_groups:
            raise ValueError(f"Unknown condition label: {model!r}")
        group_technique, group_model = condition_groups[model]

        # Use the frame that was passed in, not the notebook-global df1.
        average_score_metrics = df[questions].mean(axis=1)

        store_df.extend(
            {
                "Prompt technique": group_technique,
                "Generation origin": group_model,
                "Score": score,
            }
            for score in average_score_metrics
        )

    df_box = pd.DataFrame(store_df)
    print(df_box)

    # The plot

    # For consistency using this order
    prompt_technique_order = ["Zero-shot", "Few-shot", "Ground truth"]
    llms_order = ["Claude", "ChatGPT", "Ground truth"]

    LLM_colors = {
        "ChatGPT": "#74AA9C",
        "Claude": "#DE7356",
        "Ground truth": "#A7A7A7",
    }

    boxplot = sns.boxplot(
        x="Score",
        y="Prompt technique",
        hue="Generation origin",
        data=df_box,
        palette=LLM_colors,
        orient='h',
        order=prompt_technique_order,
        hue_order=llms_order,
        showmeans=True,
        medianprops=dict(linewidth=1.5),
    )

    # Retrieve the legend entries and relabel "Ground truth" as
    # "Human-written" so the legend reads more naturally.
    handles, labels = boxplot.get_legend_handles_labels()
    labels = [label.replace("Ground truth", "Human-written") for label in labels]

    plt.title(label='Sumtree: Average perceived quality per condition')
    plt.xlabel("Average perceived quality score")

    plt.legend(handles, labels, title="Generation Origin", bbox_to_anchor=(1.05, 1), loc='upper left')
    # A single show() is enough; the second call in the original was redundant.
    plt.show()

    # Return the plotted data so callers can inspect or reuse it.
    return df_box

# Draw the box plot of average perceived-quality scores per condition for the
# survey responses in df1.
results = creating_df_visualisation(df=df1)
      

