# Set up and global variables

In [None]:
from pathlib import Path

import os
import json

from IPython.display import display, HTML

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import binomtest, chisquare, kruskal, mannwhitneyu


import src.ipython_loader as loader

In [None]:
# Choose the configuration environment. This is done before `config` is
# imported — presumably load_config() reads CONFIG_ENV; verify in config.py.
os.environ["CONFIG_ENV"] = "debug"
# Manual toggle: change `False` to `True` to run against the production config.
if False:
    os.environ["CONFIG_ENV"] = "production"

from config import load_config
config = load_config()

DEBUG = config["DEBUG"]
RESOLUTION = config['DEFAULTS']['resolution']

# input data
STUDY_RESULTS_PATH = config['PATHS']['student_study_results']
# The `/` operator implies the configured paths are path-like objects.
STUDENT_METADATA_PATH = config["PATHS"]["student_hold_out_set"] / 'student_study_submissions'

# output data
IMAGE_DIR = config['PATHS']['images'] / 'student_study_preparation'

# Create the output directory (and any missing parents); no error if it exists.
os.makedirs(IMAGE_DIR, exist_ok=True)

***

# Loading data

## Parsing utils

In [None]:
# Parsing utils
def _normalize_model_versions(x):
    if x == "Version A":
        return "model"
    elif x == "Version B":
        return "baseline"
    # neutral or undecided
    return "neutral"

def _normalize_preference_category(x):
    if x == "The ordering made more sense.":
        return "order"
    elif x == "The explanations were clearer.":
        return "explanation"
    elif x == "I understood one version more clearly.":
        return "clarity"
    return "other"

def _normalize_explanations_category(x):
    if x == "Not really.":
        return "no"
    elif x == "Somewhat.":
        return "somewhat"
    elif x == "Yes, definitely.":
        return "yes"
    raise ValueError(f"{x} not recognized.")
    
def _normalize_fix_category(x):
    if x == "Yes, it would help.":
        return "yes"
    elif x == "No, I would ignore it.":
        return "no"
    elif x == "Sometimes.":
        return "somewhat"
    raise ValueError(f"{x} not recognized.")
    
def _normalize_understood(x):
    """Set manually based on inspection of the data."""
    x = str(x).lower()
    if "yes" in x or "hope" in x or "mostly" in x:
        return "yes"
    if "no" in x:
        return "no"
    return "maybe"

## Data loading and cleaning

In [None]:
# Collect one row per (student, task) recording which version was displayed
# on the left-hand side, read from each student's assignment.json.
assignment_rows = []

# Sorted so the row order does not depend on filesystem enumeration order.
for folder in sorted(Path(STUDENT_METADATA_PATH).glob("student_*")):
    # Folder names look like "student_<id>[_...]"; the id is the second token.
    sid = int(folder.name.split("_")[1])
    assignment_file = folder / "assignment.json"

    with open(assignment_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for task_idx, version in data["left_versions"].items():
        assignment_rows.append({
            "student_id": sid,
            "question_id": int(task_idx),
            # 'A' is the model; any other value is labelled 'baseline'.
            "left_version": 'model' if version == 'A' else 'baseline'
        })

# Explicit columns keep the frame mergeable even when no folder was found
# (an empty list would otherwise yield a DataFrame without any columns).
assignment_df = pd.DataFrame(
    assignment_rows,
    columns=["student_id", "question_id", "left_version"],
)

In [None]:
original_final = pd.read_csv(STUDY_RESULTS_PATH / "final.csv")
original_submissions = pd.read_csv(STUDY_RESULTS_PATH / "per_submission.csv")

# Build every column while the frame still shares its index with
# original_submissions. The merge below returns a frame with a fresh index
# and may drop or duplicate rows, so columns assigned from
# original_submissions after the merge would be paired with the wrong rows.
per_submission_df = pd.DataFrame()

per_submission_df['time'] = pd.to_datetime(
    original_submissions["Timestamp"],
    format="%Y/%m/%d %I:%M:%S %p %Z",
    errors="coerce"
)

# NOTE(review): a missing ID becomes the string "Unknown", which cannot match
# the integer keys of assignment_df — confirm such rows are meant to be lost.
per_submission_df['student_id'] = original_submissions["Student ID"].fillna("Unknown")
per_submission_df['question_id'] = original_submissions["Question ID"].fillna("Unknown")

per_submission_df["prefer_model"] = original_submissions["Which version did you prefer?"].apply(_normalize_model_versions)
per_submission_df["helps_first"] = original_submissions["Which version would help you decide what to fix first?"].apply(_normalize_model_versions)
per_submission_df["top_defect"] = original_submissions["Focusing only on the first defect, which version ranked it better?"].apply(_normalize_model_versions)

# Non-numeric confidence answers become NaN.
per_submission_df["confidence"] = pd.to_numeric(original_submissions["How confident you are in your choices?"], errors="coerce")

# add if the model was on the left
per_submission_df = per_submission_df.merge(
    assignment_df,
    on=["student_id", "question_id"],
    how="inner",
)

# Make row loss/duplication caused by the merge visible instead of silent.
if len(per_submission_df) != len(original_submissions):
    print(
        f"Warning: {len(original_submissions)} submissions were read but "
        f"{len(per_submission_df)} rows remain after merging with the assignments."
    )

# Keep the column layout the rest of the notebook has always seen.
per_submission_df = per_submission_df[[
    "time",
    "student_id",
    "question_id",
    "left_version",
    "prefer_model",
    "helps_first",
    "top_defect",
    "confidence",
]]

# Cleaned end-of-study survey: one row per student, with the fixed-choice and
# free-text answers normalised to short labels.
final_df = pd.DataFrame({
    "Timestamp": pd.to_datetime(
        original_final["Timestamp"],
        format="%Y/%m/%d %I:%M:%S %p %Z",
        errors="coerce",
    ),
    "Student ID": original_final["Student ID"].fillna("Unknown"),
    "why_version": original_final[
        "What made you prefer one version over the other?"
    ].apply(_normalize_preference_category),
    "explanations_helped": original_final[
        "Did the explanations help you understand why defects were ordered in that way?"
    ].apply(_normalize_explanations_category),
    "ordering_effect": original_final[
        "Did the ordering affect how you would approach fixing the code?"
    ].apply(_normalize_fix_category),
    "confused_tasks": original_final[
        "Were any tasks confusing?"
    ].apply(_normalize_understood),
    "confused_defects": original_final[
        "Did you understand all the defects?"
    ].apply(_normalize_understood),
    # Free-text comments are kept verbatim.
    "comments": original_final["Do you have any additional comments?"],
})

***

# Sanity check

In [None]:
# Show the first two rows of the cleaned final-survey table.
final_df.iloc[:2]

In [None]:
# Show the first two rows of the cleaned per-submission table.
per_submission_df.iloc[:2]

***

# Preference for model vs baseline

## Basic stats

In [None]:
# Share of all submissions that preferred each version (NaN kept as a category).
print("=== Overall version preference ===")
preference_counts = per_submission_df["prefer_model"].value_counts(dropna=False)
print(preference_counts / len(per_submission_df), "\n")

In [None]:
# Share of all submissions naming each version as more helpful for deciding
# what to fix first (NaN kept as a category).
print("=== Which version helps decide what to fix first ===")
helpful_counts = per_submission_df["helps_first"].value_counts(dropna=False)
print(helpful_counts / len(per_submission_df), "\n")

In [None]:
# Share of all submissions judging each version to rank the first defect
# better (NaN kept as a category).
print("=== Which version ranked the first defect better ===")
top_defect_counts = per_submission_df["top_defect"].value_counts(dropna=False)
print(top_defect_counts / len(per_submission_df), "\n")

## Significance of preference

TODO: take the reported confidence into account (e.g. weight or filter answers by it)?

In [None]:
# --- Model vs Baseline ---

# Two-sided binomial test on the decisive answers only: is "model" chosen
# more or less often than a fair coin would predict?
is_hard_choice = per_submission_df["prefer_model"].isin(["model", "baseline"])
binary_pref = per_submission_df[is_hard_choice]

n_total = len(binary_pref)
n_model = binary_pref["prefer_model"].eq("model").sum()

binom_result = binomtest(n_model, n_total, p=0.5, alternative='two-sided')

print("=== Binomial Test (model vs baseline only) ===")
print(f"Model chosen: {n_model}/{n_total}")
print("p-value:", binom_result.pvalue, "\n")


In [None]:
# --- Model vs Baseline vs Neutral ---

# Goodness-of-fit test against the three answers being equally likely.
preference_levels = ["model", "baseline", "neutral"]

preference_tally = per_submission_df["prefer_model"].value_counts()
obs = preference_tally.reindex(preference_levels, fill_value=0)
exp = [len(per_submission_df)/3 for _ in preference_levels]

chi_result = chisquare(f_obs=obs, f_exp=exp)

print("=== Chi-square Test (model, baseline, neutral) ===")
print("Observed:", obs.values)
print("Expected:", exp)
print("p-value:", chi_result.pvalue, "\n")


In [None]:
# --- Does left/right placement affect preference? ---

# Among the decisive answers, count how often the preferred version was the
# one displayed on the left. Without a position bias this should happen about
# half of the time (assuming model/baseline were shown on the left roughly
# equally often — TODO confirm against the assignment files).
decisive = per_submission_df[per_submission_df["prefer_model"].isin(["model", "baseline"])]

n_left = (decisive["prefer_model"] == decisive["left_version"]).sum()
n_total = len(decisive)

test = binomtest(n_left, n_total, p=0.5, alternative='two-sided')
print(f"Left-hand version chosen: {n_left}/{n_total}")
print("p =", test.pvalue)

## Consistency across questions

In [None]:
# How often do the three related questions receive the same answer?
agreement_df = per_submission_df[["prefer_model", "helps_first", "top_defect"]]

# A row agrees fully when it holds a single distinct answer across all three.
distinct_answers = agreement_df.nunique(axis=1)
full_agreement = (distinct_answers == 1).mean()

compared_columns = {
    "prefer vs helps": ("prefer_model", "helps_first"),
    "prefer vs top": ("prefer_model", "top_defect"),
    "helps vs top": ("helps_first", "top_defect"),
}
pair_agreements = {
    pair_name: (per_submission_df[first] == per_submission_df[second]).mean()
    for pair_name, (first, second) in compared_columns.items()
}

print("=== Agreement Across Related Questions ===")
print(f"Full agreement across all 3: {full_agreement:.2f}\n")

for name, val in pair_agreements.items():
    print(f"{name}: {val:.2f}")

## Preference across tasks

In [None]:
# Per-task share of each answer, shown as stacked bars ordered by how often
# the model was preferred (ties broken by the neutral share).
answers_by_task = per_submission_df.groupby("question_id")["prefer_model"]
task_pref = answers_by_task.value_counts(normalize=True).unstack().fillna(0)

task_pref = task_pref.sort_values(by=["model", "neutral"], ascending=False)

ax = task_pref.plot(
    kind="bar",
    stacked=True,
    figsize=(10,5),
    colormap="viridis",
)

ax.set_title("Preference Proportions per Task (sorted by model preference)")
ax.set_xlabel("Task (question_id)")
ax.set_ylabel("Proportion")
ax.legend(title="Preferred version")
ax.figure.tight_layout()
plt.show()


## Preference across students

In [None]:
# Per-student answer counts plus how strongly each student leaned towards
# one of the two real versions.
answers_by_student = per_submission_df.groupby("student_id")["prefer_model"]
student_pref = answers_by_student.value_counts().unstack().fillna(0)

# Only "model" and "baseline" compete for the dominant version; "neutral"
# answers still count towards the total.
hard_choice_counts = student_pref[["model", "baseline"]]

student_pref["total"] = student_pref.sum(axis=1)
student_pref["dominant_score"] = hard_choice_counts.max(axis=1) / student_pref["total"]
student_pref["dominant_preference"] = hard_choice_counts.idxmax(axis=1)

display(student_pref)


In [None]:
# Histogram of the per-student dominant-version share.
fig, ax = plt.subplots(figsize=(6,4))
sns.histplot(student_pref["dominant_score"], bins=10, ax=ax)
ax.set_title("How Consistent Were Students In Their Preferences?")
ax.set_xlabel("Proportion of times student chose their dominant version")
ax.set_ylabel("Number of students")
plt.show()

## Preference distribution

In [None]:
# Number of responses preferring each version.
fig, ax = plt.subplots(figsize=(5,4))
sns.countplot(
    data=per_submission_df,
    x="prefer_model",
    order=["model", "baseline", "neutral"],
    ax=ax,
)
ax.set_title("Version Preference Distribution")
ax.set_xlabel("Preferred Version")
ax.set_ylabel("Number of responses")
fig.tight_layout()
plt.show()


In [None]:
# Distribution of self-reported confidence for each preferred version.
fig, ax = plt.subplots(figsize=(6,4))
sns.boxplot(
    data=per_submission_df,
    x="prefer_model",
    y="confidence",
    order=["model", "baseline", "neutral"],
    ax=ax,
)
ax.set_title("Confidence vs Preferred Version")
ax.set_xlabel("Preferred Version")
ax.set_ylabel("Confidence (1–7)")
fig.tight_layout()
plt.show()


# Usefulness for fixing (Which defect to fix first?)

## Significance

In [None]:
# Proportion of submissions naming each version as the more helpful one.
print("=== Which version helps decide what to fix first? ===\n")

helpful_answers = per_submission_df["helps_first"]
help_prop = helpful_answers.value_counts(normalize=True)

print("Proportions:\n", help_prop, "\n")

In [None]:
# Two-sided binomial test on the decisive "more helpful" answers only.
is_decisive_help = per_submission_df["helps_first"].isin(["model", "baseline"])
binary_help = per_submission_df[is_decisive_help]

n_total = len(binary_help)
n_model = binary_help["helps_first"].eq("model").sum()

binom_result = binomtest(n_model, n_total, p=0.5, alternative='two-sided')

print("=== Binomial Test (usefulness: model vs baseline) ===")
print(f"Model chosen as more helpful: {n_model}/{n_total}")
print("p-value:", binom_result.pvalue, "\n")


In [None]:
# Goodness-of-fit test: are the three "more helpful" answers equally likely?
helpfulness_levels = ["model", "baseline", "neutral"]

helpfulness_tally = per_submission_df["helps_first"].value_counts()
obs = helpfulness_tally.reindex(helpfulness_levels, fill_value=0)

expected = [len(per_submission_df)/3 for _ in helpfulness_levels]

chi_result = chisquare(f_obs=obs.values, f_exp=expected)

print("=== Chi-square Test (usefulness: model/baseline/neutral) ===")
print("Observed:", obs.values)
print("Expected:", expected)
print("p-value:", chi_result.pvalue, "\n")


## Relationship with preference

In [None]:
# Fraction of submissions where the preferred version is also the one judged
# more helpful.
same_answer = per_submission_df["prefer_model"].eq(per_submission_df["helps_first"])
agreement = same_answer.mean()

print("=== Alignment of Preference with Helpfulness ===")
print(f"Proportion where preference == usefulness: {agreement:.2f}\n")

## Distribution

In [None]:
# Number of responses naming each version as more helpful.
fig, ax = plt.subplots(figsize=(5,4))
sns.countplot(
    data=per_submission_df,
    x="helps_first",
    order=["model", "baseline", "neutral"],
    ax=ax,
)
ax.set_title("Which Version Helps Decide What to Fix First?")
ax.set_xlabel("Version chosen as more helpful")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


In [None]:
# Distribution of self-reported confidence per "more helpful" answer.
fig, ax = plt.subplots(figsize=(6,4))
sns.boxplot(
    data=per_submission_df,
    x="helps_first",
    y="confidence",
    order=["model", "baseline", "neutral"],
    ax=ax,
)
ax.set_title("Confidence vs Helpful Version Choice")
ax.set_xlabel("Helpful Version")
ax.set_ylabel("Confidence (1–7)")
fig.tight_layout()
plt.show()


# Confidence

## Significance

In [None]:
# Descriptive statistics of confidence for each "more helpful" answer.
print("=== Confidence by helpful version ===\n")

confidence_by_choice = per_submission_df.groupby("helps_first")["confidence"]
conf_stats = confidence_by_choice.describe()
display(conf_stats)


In [None]:
# --- All three groups ---
# Confidence values (missing ones dropped) for each "more helpful" answer,
# in the order model, baseline, neutral.
groups = [
    per_submission_df.loc[per_submission_df["helps_first"] == answer, "confidence"].dropna()
    for answer in ("model", "baseline", "neutral")
]

kw = kruskal(*groups)

print("=== Kruskal-Wallis test: confidence differences across groups ===")
print("p-value:", kw.pvalue, "\n")


In [None]:
# --- Model vs baseline ---

# Confidence is parsed with errors="coerce" and may therefore contain NaN.
# Drop missing values (as the Kruskal-Wallis cell does) so a single NaN
# cannot turn the p-value into NaN.
conf_model = per_submission_df.loc[per_submission_df["helps_first"] == "model", "confidence"].dropna()
conf_base = per_submission_df.loc[per_submission_df["helps_first"] == "baseline", "confidence"].dropna()

u = mannwhitneyu(conf_model, conf_base, alternative="two-sided")

print("=== Mann-Whitney U test: model vs baseline confidence ===")
print("p-value:", u.pvalue, "\n")


## Distribution

In [None]:
# Box plot of confidence grouped by the version chosen as more helpful.
fig, ax = plt.subplots(figsize=(6,4))
sns.boxplot(
    data=per_submission_df,
    x="helps_first",
    y="confidence",
    order=["model", "baseline", "neutral"],
    ax=ax,
)
ax.set_title("Confidence vs Helpful Version Chosen")
ax.set_xlabel("Version chosen as more helpful")
ax.set_ylabel("Confidence (1–7)")
fig.tight_layout()
plt.show()


# Did explanations help students understand the ranking?

In [None]:
# Raw counts and percentage breakdown of the "did explanations help" answers.
print("=== Did explanations help? ===\n")
explanation_answers = final_df["explanations_helped"]
print(explanation_answers.value_counts(), "\n")

explanation_stats = explanation_answers.value_counts(normalize=True).mul(100)
print("Percentage breakdown:\n", explanation_stats.round(1))


In [None]:
# Number of students per "did explanations help" answer.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(
    data=final_df,
    x="explanations_helped",
    order=["no", "somewhat", "yes"],
    ax=ax,
)
ax.set_title("Did Explanations Help?")
ax.set_xlabel("Student response")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


# Did the ordering affect how students would approach fixing the code?

In [None]:
# Raw counts and percentage breakdown of the "did ordering affect fixing" answers.
print("=== Did ordering affect how students would fix the code? ===\n")
ordering_answers = final_df["ordering_effect"]
print(ordering_answers.value_counts(), "\n")

ordering_percentages = ordering_answers.value_counts(normalize=True).mul(100)
print("Percentage:\n", ordering_percentages.round(1))


In [None]:
# One-sided binomial test: do more than half of all students answer "yes"?
# The denominator is every student, so "somewhat" and "no" both count as
# not-"yes".
n_total = len(final_df)
n_yes = final_df["ordering_effect"].eq("yes").sum()

binom = binomtest(n_yes, n_total, p=0.5, alternative="greater")
print("Binomial test p-value:", binom.pvalue)


In [None]:
# Number of students per "did ordering affect fixing" answer.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(
    data=final_df,
    x="ordering_effect",
    order=["no", "somewhat", "yes"],
    ax=ax,
)
ax.set_title("Did Ordering Influence How Students Would Fix Code?")
ax.set_xlabel("Response")
fig.tight_layout()
plt.show()


# Were any tasks/defects confusing?

In [None]:
# Answer counts for the two free-text comprehension questions.
for header, column in (
    ("=== Were any tasks confusing? ===\n", "confused_tasks"),
    ("=== Did you understand all the defects? ===\n", "confused_defects"),
):
    print(header)
    print(final_df[column].value_counts(), "\n")


In [None]:
# Number of students per normalised "were any tasks confusing" answer.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(
    data=final_df,
    x="confused_tasks",
    order=["no", "maybe", "yes"],
    ax=ax,
)
ax.set_title("Were Any Tasks Confusing?")
fig.tight_layout()
plt.show()


In [None]:
# Number of students per normalised "did you understand all the defects" answer.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(
    data=final_df,
    x="confused_defects",
    order=["no", "maybe", "yes"],
    ax=ax,
)
ax.set_title("Did Students Understand All Defects?")
fig.tight_layout()
plt.show()


In [None]:
# Per-student share of each answer (rows sum to 1).
student_pref = (
    per_submission_df.groupby("student_id")["prefer_model"]
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)

# Derive both summaries from the answer-share columns only. Taking idxmax over
# the whole frame after "dominant_score" has been added would let that derived
# column take part in the comparison and be right only through tie-breaking.
# Unlike the count-based table built earlier, "neutral" can be dominant here.
share_columns = list(student_pref.columns)
student_pref["dominant_score"] = student_pref[share_columns].max(axis=1)
student_pref["dominant_preference"] = student_pref[share_columns].idxmax(axis=1)

## Task level

In [None]:
def _answer_counts(answers):
    """Return a {answer: count} dict for one group of answers."""
    return answers.value_counts().to_dict()


# One row per task: number of answers, answer tallies for the three
# comparison questions, and the mean confidence.
task_summary = (
    per_submission_df
    .groupby("question_id")
    .agg(
        n_students=("prefer_model", "count"),
        prefer_counts=("prefer_model", _answer_counts),
        helps_counts=("helps_first", _answer_counts),
        top_counts=("top_defect", _answer_counts),
        mean_confidence=("confidence", "mean"),
    )
    .reset_index()
)


# Other

In [None]:
# Distribution of self-reported confidence for each task.
fig, ax = plt.subplots(figsize=(10,4))
sns.boxplot(data=per_submission_df, x="question_id", y="confidence", ax=ax)
ax.set_title("Confidence per task")
fig.tight_layout()
plt.show()


# Left/Right influence

In [None]:
# Compute proportions
prop_df = (
    per_submission_df
    .groupby("left_version")["prefer_model"]
    .value_counts(normalize=True)
    .rename("fraction_chosen")
    .reset_index()
)

plt.figure(figsize=(6,4))
sns.barplot(
    data=prop_df,
    x="left_version",
    y="fraction_chosen",
    hue="prefer_model"
)
plt.title("Does left/right placement affect preference?")
plt.xlabel("Version shown on the left (model or baseline)")
plt.ylabel("Fraction chosen")
plt.ylim(0, 1)
plt.tight_layout()
plt.show()


In [None]:
# Chi-square test of independence between the version shown on the left and
# the answer given.
tbl = pd.crosstab(
    index=per_submission_df["left_version"],
    columns=per_submission_df["prefer_model"],
)
contingency_result = stats.chi2_contingency(tbl)
chi2, p, dof, expected = contingency_result

print(tbl)
print(f"\nChi-square p-value: {p:.4f} (reject = significant difference based on left/right placement)")
