# Setup and global variables

In [None]:
from pathlib import Path

import os
import json

from IPython.display import display, HTML

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import binomtest, chisquare, kruskal, mannwhitneyu


import src.ipython_loader as loader

In [None]:
# Select the configuration environment before the config module is imported.
# NOTE(review): presumably `config` reads CONFIG_ENV at import/load time — confirm.
os.environ["CONFIG_ENV"] = "debug"
# Manual toggle: change `False` to `True` to run against the production configuration.
if False:
    os.environ["CONFIG_ENV"] = "production"

# Imported only after CONFIG_ENV has been set above; keep this ordering.
from config import load_config
config = load_config()

DEBUG = config["DEBUG"]
RESOLUTION = config['DEFAULTS']['resolution']  # used as `dpi` in the savefig calls of this notebook

# Input data: CSV exports of the study results and the per-student assignment folders.
STUDY_RESULTS_PATH = config['PATHS']['student_study_results']
STUDENT_METADATA_PATH = config["PATHS"]["student_hold_out_set"] / 'student_study_submissions'

# Output data: directory that receives every figure saved by this notebook.
IMAGE_DIR = config['PATHS']['images'] / 'student_study_results'

# Create the output directory up front; no error if it already exists.
os.makedirs(IMAGE_DIR, exist_ok=True)

***

# Loading data

## Parsing utils

In [None]:
# Parsing utils
def _normalize_model_versions(x):
    if x == "Version A":
        return "model"
    elif x == "Version B":
        return "baseline"
    # neutral or undecided
    return "neutral"

def _normalize_preference_category(x):
    if x == "The ordering made more sense.":
        return "order"
    elif x == "The explanations were clearer.":
        return "explanation"
    elif x == "I understood one version more clearly.":
        return "clarity"
    return "other"

def _normalize_explanations_category(x):
    if x == "Not really.":
        return "no"
    elif x == "Somewhat.":
        return "somewhat"
    elif x == "Yes, definitely.":
        return "yes"
    raise ValueError(f"{x} not recognized.")
    
def _normalize_fix_category(x):
    if x == "Yes, it would help.":
        return "yes"
    elif x == "No, I would ignore it.":
        return "no"
    elif x == "Sometimes.":
        return "somewhat"
    raise ValueError(f"{x} not recognized.")
    
def _normalize_understood(x):
    """Set manually based on inspection of the data."""
    x = str(x).lower()
    if "yes" in x or "hope" in x or "mostly" in x:
        return "yes"
    if "no" in x:
        return "no"
    return "maybe"

## Data loading and cleaning

In [None]:
# Collect, for every student folder, which version was shown on the left per task.
assignment_rows = []

for folder in Path(STUDENT_METADATA_PATH).glob("student_*"):
    # Folder names are "student_<idx>"; the int round-trip normalises the id
    # before it is stored as a string below.
    student_idx = int(folder.name.split("_")[1])
    assignment_file = folder / "assignment.json"

    # JSON is UTF-8 encoded; do not rely on the platform's default encoding.
    with open(assignment_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for task_idx, version in data["left_versions"].items():
        assignment_rows.append({
            "student_id": str(student_idx),
            "question_id": str(task_idx),
            # 'A' is the model version; any other value is treated as the baseline.
            "left_version": 'model' if version == 'A' else 'baseline'
        })

# Explicit columns keep the merge keys present even when no folder matched.
assignment_df = pd.DataFrame(
    assignment_rows,
    columns=["student_id", "question_id", "left_version"],
)

In [None]:
original_final = pd.read_csv(STUDY_RESULTS_PATH / "final.csv")
original_submissions = pd.read_csv(STUDY_RESULTS_PATH / "per_submission.csv")

# Build every column while the frame still shares the row index of
# `original_submissions`; column assignment from a Series aligns on that index.
per_submission_df = pd.DataFrame()

# Timestamps that do not match the format become NaT instead of raising.
per_submission_df['time'] = pd.to_datetime(
    original_submissions["Timestamp"],
    format="%Y/%m/%d %I:%M:%S %p %Z",
    errors="coerce"
)

# Ids are compared as strings, matching the string ids in `assignment_df`.
per_submission_df['student_id'] = original_submissions["Student ID"].astype(str)
per_submission_df['question_id'] = original_submissions["Question ID"].astype(str)

per_submission_df["prefer_model"] = original_submissions["Which version did you prefer?"].apply(_normalize_model_versions)
per_submission_df["helps_first"] = original_submissions["Which version would help you decide what to fix first?"].apply(_normalize_model_versions)
per_submission_df["top_defect"] = original_submissions["Focusing only on the first defect, which version ranked it better?"].apply(_normalize_model_versions)

# Non-numeric confidence answers become NaN.
per_submission_df["confidence"] = pd.to_numeric(original_submissions["How confident you are in your choices?"], errors="coerce")

# add if the model was on the left
# The merge must come last: an inner join returns a fresh 0..n-1 index and
# drops submissions without an assignment, so any column assigned from
# `original_submissions` afterwards would land on the wrong rows.
per_submission_df = per_submission_df.merge(
    assignment_df,
    on=["student_id", "question_id"],
)

# Keep the column order the table had when the merge ran earlier.
per_submission_df = per_submission_df[[
    "time", "student_id", "question_id", "left_version",
    "prefer_model", "helps_first", "top_defect", "confidence",
]]

# Cleaned view of the end-of-study questionnaire: one normalised column per question.
final_timestamps = pd.to_datetime(
    original_final["Timestamp"],
    format="%Y/%m/%d %I:%M:%S %p %Z",
    errors="coerce"
)

final_df = pd.DataFrame({
    "Timestamp": final_timestamps,
    # Missing ids are replaced by the placeholder "Unknown".
    "student_id": original_final["Student ID"].fillna("Unknown"),
    "why_version": original_final["What made you prefer one version over the other?"].apply(_normalize_preference_category),
    "explanations_helped": original_final["Did the explanations help you understand why defects were ordered in that way?"].apply(_normalize_explanations_category),
    "ordering_effect": original_final["Did the ordering affect how you would approach fixing the code?"].apply(_normalize_fix_category),
    "confused_tasks": original_final["Were any tasks confusing?"].apply(_normalize_understood),
    "confused_defects": original_final["Did you understand all the defects?"].apply(_normalize_understood),
    # Free-text comments are kept verbatim.
    "comments": original_final["Do you have any additional comments?"],
})

In [None]:
# Print the column dtypes and non-null counts of the cleaned per-submission table.
per_submission_df.info()

In [None]:
# Per-student vote summary. Only submissions with a decided preference
# ("model" or "baseline") are kept, so students who answered neutral on every
# task do not appear, and all aggregates below ignore neutral submissions.
decided_submissions = per_submission_df.loc[
    per_submission_df["prefer_model"].isin(["model", "baseline"])
]

student_pref_df = (
    decided_submissions
    .groupby("student_id")
    .agg(
        # Number of decided submissions (model + baseline votes).
        n_tasks=("prefer_model", "count"),
        model_votes=("prefer_model", lambda x: (x == "model").sum()),
        baseline_votes=("prefer_model", lambda x: (x == "baseline").sum()),
        mean_confidence=("confidence", "mean"),
    )
)

# Share of a student's decided votes that went to the model.
student_pref_df["model_fraction"] = (
    student_pref_df["model_votes"] /
    (student_pref_df["model_votes"] + student_pref_df["baseline_votes"])
)

# Strict majority for the model; an exact 50/50 split is False.
student_pref_df["student_prefers_model"] = student_pref_df["model_fraction"] > 0.5

# Equal vote counts are reported as "tie" rather than being counted as a
# baseline preference.
student_pref_df["dominant_preference"] = np.select(
    [
        student_pref_df["model_votes"] > student_pref_df["baseline_votes"],
        student_pref_df["baseline_votes"] > student_pref_df["model_votes"],
    ],
    ["model", "baseline"],
    default="tie",
)

# Fraction of decided votes that went to the student's more frequent choice.
student_pref_df["preference_strength"] = (
    student_pref_df[["model_votes", "baseline_votes"]].max(axis=1)
    / student_pref_df["n_tasks"]
)


***

# Sanity check

In [None]:
# Show the first two rows of the cleaned final-questionnaire table.
final_df.head(2)

In [None]:
# Show the first two rows of the cleaned per-submission table.
per_submission_df.head(2)

In [None]:
# Show the first two rows of the per-student preference summary.
student_pref_df.head(2)

***

# Student-level


In [None]:
# student_pref_df has one row per student with at least one non-neutral preference.
print(f"Number of students: {len(student_pref_df)}")


In [None]:
# Number of students per dominant preference label.
print(student_pref_df["dominant_preference"].value_counts(), "\n")

In [None]:
# Summary statistics of the per-student share of decided votes given to the model.
print("Model preference:")
print(student_pref_df["model_fraction"].describe(), "\n")

In [None]:
# Row-normalise the vote counts so each student's two shares sum to 1.
proportional_preference = student_pref_df[["model_votes", "baseline_votes"]]
proportional_preference = proportional_preference.div(proportional_preference.sum(axis=1), axis=0)

# Draw on an explicit Axes: DataFrame.plot(figsize=...) without `ax` opens its
# own figure, which left the preceding plt.figure() call as a stray blank figure.
fig, ax = plt.subplots(figsize=(10, 5))

proportional_preference.sort_values(
    by="model_votes", ascending=False
).plot(
    kind="bar",
    stacked=True,
    ax=ax
)

ax.set_xlabel("Student")
ax.set_ylabel("Proportion of preferences")
ax.set_title("Student-level preference distribution (model vs baseline)")
ax.legend(title="Preferred version")
fig.tight_layout()
fig.savefig(IMAGE_DIR / "student_preference_stacked.png", dpi=RESOLUTION)
plt.show()


In [None]:
# Row-normalise the vote counts so each student's two shares sum to 1.
vote_counts = student_pref_df[["model_votes", "baseline_votes"]]
proportional_preference = vote_counts.div(vote_counts.sum(axis=1), axis=0)

In [None]:
# Histogram of how consistently each student voted for their more frequent choice.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(student_pref_df["preference_strength"], bins=10, ax=ax)
ax.set_xlabel("Fraction of consistent choices per student")
ax.set_ylabel("Number of students")
ax.set_title("Within-student consistency of preferences")
fig.tight_layout()
fig.savefig(IMAGE_DIR / "student_preference_consistency.png", dpi=RESOLUTION)
plt.show()


***

# Task-level

In [None]:
# Share of each answer ("baseline", "model", "neutral") per task.
task_pref = (
    per_submission_df
    .groupby("question_id")["prefer_model"]
    .value_counts(normalize=True)
    .unstack()
    # Guarantee all three answer columns exist even if nobody gave one of the
    # answers; otherwise the sort below raises KeyError. The order is the
    # alphabetical one that unstack() produces when all three are present.
    .reindex(columns=["baseline", "model", "neutral"])
    .fillna(0)
)

# Tasks with the largest model share first; ties broken by the neutral share.
task_pref = task_pref.sort_values(by=["model", "neutral"], ascending=False)

task_pref.plot(
    kind="bar",
    stacked=True,
    figsize=(10,5),
    colormap="viridis"
)

plt.title("Preference Proportions per Task (sorted by model preference)")
plt.xlabel("Task (question_id)")
plt.ylabel("Proportion")
plt.legend(title="Preferred version")
plt.tight_layout()
plt.savefig(IMAGE_DIR / "preference_by_task.png", dpi=RESOLUTION)
plt.show()


***

# Submission-level

In [None]:
print("=== Per-submission preference distribution ===\n")

# Absolute and relative frequency of each overall-preference answer.
pref_counts = per_submission_df["prefer_model"].value_counts()
pref_props = pref_counts.div(pref_counts.sum())

pref_summary = pd.DataFrame({
    "count": pref_counts,
    "proportion": pref_props.round(3)
})
display(pref_summary)


In [None]:
# Long format: one row per (submission, question) with the chosen version.
plot_df = per_submission_df.melt(
    value_vars=["prefer_model", "helps_first", "top_defect"],
    var_name="question",
    value_name="response"
)

# Human-readable labels
plot_df["question"] = plot_df["question"].map({
    "prefer_model": "Overall preference",
    "helps_first": "Helps decide what to fix",
    "top_defect": "Ranks first defect better"
})

# One count panel per question, sharing the y axis so panels are comparable.
g = sns.catplot(
    data=plot_df,
    x="response",
    col="question",
    kind="count",
    order=["model", "baseline", "neutral"],
    col_order=[
        "Overall preference",
        "Helps decide what to fix",
        "Ranks first defect better"
    ],
    height=4,
    aspect=0.9,
    sharey=True
)

g.set_axis_labels("Version chosen", "Number of responses")
g.set_titles("{col_name}")
g.fig.suptitle("Student judgments across three related questions", y=1.05)

plt.tight_layout()
# The suptitle sits above the figure canvas (y=1.05); bbox_inches="tight"
# grows the saved area to include it instead of clipping it in the PNG.
plt.savefig(
    IMAGE_DIR / "combined_student_judgments.png",
    dpi=RESOLUTION,
    bbox_inches="tight",
)
plt.show()


In [None]:
# How often do the three related per-submission answers coincide?
agreement_df = per_submission_df[["prefer_model", "helps_first", "top_defect"]]

compared_pairs = [
    ("preference vs helpfulness", "prefer_model", "helps_first"),
    ("preference vs top defect", "prefer_model", "top_defect"),
    ("helpfulness vs top defect", "helps_first", "top_defect"),
]
# Fraction of submissions on which the two answers of a pair are equal.
pairwise_agreement = {
    label: (agreement_df[first] == agreement_df[second]).mean()
    for label, first, second in compared_pairs
}

# Fraction of submissions on which all three answers are the same.
full_agreement = (agreement_df.nunique(axis=1) == 1).mean()

print("=== Agreement across related questions ===\n")
print(f"Full agreement: {full_agreement:.2f}\n")
for pair_label, agreement_rate in pairwise_agreement.items():
    print(f"{pair_label}: {agreement_rate:.2f}")


***

# Ordering vs explanation

In [None]:
print("=== Reasons for preferring one version ===\n")

# Absolute and relative frequency of each stated reason.
reason_counts = final_df["why_version"].value_counts()
reason_props = reason_counts.div(reason_counts.sum())

reason_summary = pd.DataFrame({
    "count": reason_counts,
    "proportion": reason_props.round(3)
})
display(reason_summary)


In [None]:
# Bar chart of the stated reasons, in a fixed category order.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=final_df,
    x="why_version",
    order=["order", "explanation", "clarity", "other"],
    ax=ax
)
ax.set_xlabel("Stated reason for preference")
ax.set_ylabel("Number of students")
ax.set_title("Why did students prefer one version?")
fig.tight_layout()
fig.savefig(IMAGE_DIR / "preference_reasons.png", dpi=RESOLUTION)
plt.show()


In [None]:
print("=== Did explanations help students understand the ordering? ===\n")

# Absolute and relative frequency of each answer.
expl_counts = final_df["explanations_helped"].value_counts()
expl_props = expl_counts.div(expl_counts.sum())

expl_summary = pd.DataFrame({
    "count": expl_counts,
    "proportion": expl_props.round(3)
})
display(expl_summary)


In [None]:
# Bar chart of the answers, ordered from "no" to "yes".
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=final_df,
    x="explanations_helped",
    order=["no", "somewhat", "yes"],
    ax=ax
)
ax.set_xlabel("Did explanations help?")
ax.set_ylabel("Number of students")
ax.set_title("Perceived usefulness of explanations")
fig.tight_layout()
fig.savefig(IMAGE_DIR / "explanations_helpfulness.png", dpi=RESOLUTION)
plt.show()

***

# Left and right placement influence

In [None]:
# Within each left-hand condition, the share of submissions per preferred version.
preference_shares = (
    per_submission_df
    .groupby("left_version")["prefer_model"]
    .value_counts(normalize=True)
)
left_right_df = preference_shares.rename("fraction_chosen").reset_index()

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    data=left_right_df,
    x="left_version",
    y="fraction_chosen",
    hue="prefer_model",
    ax=ax
)
ax.set_title("Does left/right placement affect preference?")
ax.set_xlabel("Version shown on the left (model or baseline)")
ax.set_ylabel("Fraction chosen")
# Fractions live in [0, 1]; fix the axis so the two groups are comparable.
ax.set_ylim(0, 1)
fig.tight_layout()
fig.savefig(IMAGE_DIR / "left_right_effect.png", dpi=RESOLUTION)
plt.show()

# Confidence

In [None]:
# Histogram of the confidence ratings over all submissions.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(per_submission_df["confidence"], bins=7, ax=ax)
ax.set_xlabel("Self-reported confidence")
ax.set_ylabel("Number of responses")
ax.set_title("Distribution of self-reported confidence")
fig.tight_layout()
fig.savefig(IMAGE_DIR / "confidence_distribution.png", dpi=RESOLUTION)
plt.show()


In [None]:
# Two box plots with the same layout: confidence grouped by the chosen version
# for the overall-preference question and for the helpfulness question.
confidence_boxplots = [
    (
        "prefer_model",
        "Preferred version",
        "Confidence by preferred version",
        "submission_confidence_by_preference.png",
    ),
    (
        "helps_first",
        "Version reported as more helpful",
        "Confidence by perceived helpfulness (descriptive)",
        "confidence_by_helpfulness.png",
    ),
]

for group_column, x_label, plot_title, file_name in confidence_boxplots:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(
        data=per_submission_df,
        x=group_column,
        y="confidence",
        order=["model", "baseline", "neutral"],
        ax=ax
    )
    ax.set_xlabel(x_label)
    ax.set_ylabel("Self-reported confidence")
    ax.set_title(plot_title)
    fig.tight_layout()
    fig.savefig(IMAGE_DIR / file_name, dpi=RESOLUTION)
    plt.show()

In [None]:
# Mean confidence per student over all of that student's submissions.
confidence_by_student = per_submission_df.groupby("student_id")["confidence"]
student_confidence = confidence_by_student.mean().rename("mean_confidence")

display(student_confidence.describe())


In [None]:
# Histogram of the per-student mean confidence.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(student_confidence, bins=10, ax=ax)
ax.set_xlabel("Mean confidence per student")
ax.set_ylabel("Number of students")
ax.set_title("Average confidence per student")
fig.tight_layout()
fig.savefig(IMAGE_DIR / "student_mean_confidence.png", dpi=RESOLUTION)
plt.show()


# Confusing tasks and defects

In [None]:
# Print the answer counts of both normalised comprehension questions.
comprehension_questions = [
    ("=== Were any tasks confusing? ===\n", "confused_tasks"),
    ("=== Did you understand all the defects? ===\n", "confused_defects"),
]
for header, column_name in comprehension_questions:
    print(header)
    print(final_df[column_name].value_counts(), "\n")

# Ordering

In [None]:
# Bar chart of the ordering-effect answers, ordered from "no" to "yes".
fig, ax = plt.subplots(figsize=(5, 4))

sns.countplot(
    data=final_df,
    x="ordering_effect",
    order=["no", "somewhat", "yes"],
    ax=ax
)

ax.set_xlabel("Student response")
ax.set_ylabel("Number of students")
ax.set_title("Does defect ordering affect how students approach fixing code?")
fig.tight_layout()

fig.savefig(IMAGE_DIR / "ordering_effect.png", dpi=RESOLUTION)
plt.show()


# Open-ended questions

In [None]:
# Print every raw answer together with its row label.
for row_label, answer in original_final['Were any tasks confusing?'].items():
    print(row_label, answer)

In [None]:
# Print every raw answer together with its row label.
for row_label, answer in original_final['Did you understand all the defects?'].items():
    print(row_label, answer)

In [None]:
# Print every raw comment together with its row label.
for row_label, comment in original_final['Do you have any additional comments?'].items():
    print(row_label, comment)