# Setup and global variables

In [None]:
import os
import joblib
import json
import networkx as nx

from pathlib import Path
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from tqdm import tqdm
from scipy.stats import kendalltau
from sklearn.pipeline import Pipeline

from src.prioritization import *
from src.feature_engineering import select_features, build_pairwise_features
from src.ordering import rank_submission
from src.explanations import explain_submission, explain_baseline_submission

# Number of submissions drawn for the study (target length of `sampled_ids`).
SAMPLE_SIZE = 25
# Number of student packages (folders) that are generated.
N_STUDENTS = 10
# Number of sampled tasks assigned to each student as "main" tasks;
# the remaining sampled tasks become that student's "extra" tasks.
MAIN_TASKS_PER_STUDENT = 5
# NOTE(review): the next two constants are not referenced in the visible part
# of this notebook — presumably explanation settings; confirm or remove.
MIN_CONTRIBUTION = 1e-3
MAX_EXPLANATION_LENGTH = 3

# True: replay the log and recompute heuristic scores (then cache them as CSV).
# False: read the cached CSV files instead.
RETRAIN_MODELS = True

In [None]:
# CONFIG_ENV is set before `config` is imported.
# NOTE(review): presumably `load_config` reads this variable to choose the environment — confirm.
os.environ["CONFIG_ENV"] = "debug"
# Manual switch: this branch is dead as written; change the condition to run against production.
if False:
    os.environ["CONFIG_ENV"] = "production"

from config import load_config
config = load_config()

# Global settings taken from the loaded configuration
DEBUG = config["DEBUG"]
RESOLUTION = config['DEFAULTS']['resolution']
RANDOM_SEED = config['DEFAULTS']['random_seed']

# input data (the paths are combined with `/` below, so they are expected to be path-like objects)
STORAGE_PATH = config['PATHS']['storage']
HOLD_OUT_DATA_PATH = config['PATHS']['student_hold_out_set']
TRAINED_SCORING_MODELS_PATH = config['PATHS']['hold_out_trained_heuristics']
benchmark_path = config['PATHS']['benchmark_dataset']
FINAL_MODEL_PATH = benchmark_path / "final_teacher_model.pkl"
FINAL_FEATURES_PATH = benchmark_path / "final_selected_features.pkl"
FINAL_BASELINE_PATH = benchmark_path / "baseline_models"
FINAL_BASELINE_FEATURES_PATH = benchmark_path / "baseline_features"

# output data: generated student packages and the coverage figures
STUDY_OUTPUT_PATH = HOLD_OUT_DATA_PATH / 'student_study_submissions'
IMAGE_DIR = config['PATHS']['images'] / 'student_study_preparation'

# Create the output folders up front so later writes cannot fail on a missing directory
os.makedirs(STUDY_OUTPUT_PATH, exist_ok=True)
os.makedirs(IMAGE_DIR, exist_ok=True)

***

# Loading data

In [None]:
# Reference tables from shared storage: task catalogue and defect catalogue
items = pd.read_csv(STORAGE_PATH / "items.csv", index_col=0)
defects = pd.read_csv(STORAGE_PATH / "defects.csv", index_col=0)

# Hold-out submissions, with the submission timestamp parsed as a datetime
log = pd.read_csv(
    HOLD_OUT_DATA_PATH / "log.csv",
    index_col=0,
    parse_dates=["time"],
)

# Submission-by-defect table; CSV headers are read as strings, so cast the defect ids to int
defect_log = pd.read_csv(HOLD_OUT_DATA_PATH / "defect_log.csv", index_col=0)
defect_log.columns = defect_log.columns.astype(int)

In [None]:
# Heuristic model classes whose trained state is restored from disk
scoring_model_classes = (
    TaskCommonModel,
    TaskCharacteristicModel,
    StudentCommonModel,
    StudentCharacteristicModel,
    StudentEncounteredBeforeModel,
    DefectMultiplicityModel,
    SeverityModel,
)

# Each trained model is stored in a file named after the model
loaded_scoring_models = [
    model_cls.load(TRAINED_SCORING_MODELS_PATH / f'{model_cls.get_model_name()}.pkl')
    for model_cls in scoring_model_classes
]

# Look-up by model name; insertion order follows the class list above
scoring_models = {
    loaded.get_model_name(): loaded
    for loaded in loaded_scoring_models
}

In [None]:
# Final ordering model and the list of feature columns it is applied to.
# NOTE: joblib.load unpickles the file — only load artifacts produced by this project.
ordering_model = joblib.load(FINAL_MODEL_PATH)
model_columns = joblib.load(FINAL_FEATURES_PATH)

In [None]:
# Baseline: the primary model supplies the pairwise prediction and the secondary
# model supplies the tie-break probability; each has its own feature column list.
primary_model = joblib.load(FINAL_BASELINE_PATH / 'primary_model.joblib')
secondary_model = joblib.load(FINAL_BASELINE_PATH / 'secondary_model.joblib')
primary_cols = joblib.load(FINAL_BASELINE_FEATURES_PATH / 'primary_features.joblib')
secondary_cols = joblib.load(FINAL_BASELINE_FEATURES_PATH / 'secondary_features.joblib')

***

# Quality filtering

In [None]:
# A submission is non-trivial when at least two distinct defects occur in it
distinct_defects_per_submission = defect_log.gt(0).sum(axis=1)
is_non_trivial = distinct_defects_per_submission.gt(1)

***

# Generate all heuristic scores

In [None]:
if RETRAIN_MODELS:
    # One row per (submission, defect) pair, with one column per scoring model
    discrete_scores = []
    continuous_scores = []

    # Replay submissions chronologically: every model is updated after each submission
    log = log.sort_values(by='time')

    # DEBUG replays only the first 500 submissions; size the progress bar from the
    # rows that are actually iterated so it can reach 100 %
    replayed_log = log.iloc[:500] if DEBUG else log

    for idx, submission in tqdm(replayed_log.iterrows(), total=replayed_log.shape[0], desc="Calculating statistics for sampling"):
        defect_counts = defect_log.loc[idx]

        # Trivial submissions (fewer than two defects) are not scored, but they
        # still update the models below
        if is_non_trivial.loc[idx]:
            # Heuristic scores, computed BEFORE the models see this submission
            discrete_model_scores = {name: model.discretize(submission, defect_counts).dropna() for name, model in scoring_models.items()}
            continuous_model_scores = {name: model._calculate_scores(submission, defect_counts) for name, model in scoring_models.items()}

            # Save model scores for every defect present in the submission
            for defect in defect_counts[defect_counts > 0].index:
                discrete_row = {"submission id": idx, "defect id": defect}
                continuous_row = discrete_row.copy()
                for name, scores in discrete_model_scores.items():
                    discrete_row[name] = scores[defect]
                    continuous_row[name] = continuous_model_scores[name][defect]
                discrete_scores.append(discrete_row)
                continuous_scores.append(continuous_row)

        for model in scoring_models.values():
            model.update(submission, defect_counts)

    discrete_scores = pd.DataFrame(discrete_scores)
    continuous_scores = pd.DataFrame(continuous_scores)

    # Cache the scores so later runs can skip the replay (RETRAIN_MODELS = False)
    discrete_scores.to_csv(HOLD_OUT_DATA_PATH / 'student_holdout_discrete_scores.csv', index=False)
    continuous_scores.to_csv(HOLD_OUT_DATA_PATH / 'student_holdout_continuous_scores.csv', index=False)
else:
    discrete_scores = pd.read_csv(HOLD_OUT_DATA_PATH / 'student_holdout_discrete_scores.csv')
    continuous_scores = pd.read_csv(HOLD_OUT_DATA_PATH / 'student_holdout_continuous_scores.csv')

In [None]:
# Standardize scale: models discretized on the '-2-2' scale are shifted by +3
# NOTE: the shift is not idempotent — re-running this cell shifts the columns again
shifted_columns = [
    name
    for name, model in scoring_models.items()
    if model.get_discretization_scale() == '-2-2'
]
for column in shifted_columns:
    discrete_scores[column] = discrete_scores[column] + 3

***

# Construct dataset

In [None]:
# Build the pairwise dataset, starting from a long table:
# one row per (submission, defect), keeping only defects with a positive count
long_defects = (
    defect_log
    .melt(var_name='defect id', value_name='count', ignore_index=False)
    .reset_index(names=['submission id'])
)
long_defects = long_defects.loc[long_defects['count'].gt(0)]

def generate_defect_pairs(group):
    """Return every unordered pair of defects found in one submission.

    ``group`` is the slice of the long defect table that belongs to a single
    submission. The result has one row per pair, with columns ``left`` and
    ``right``; pairs follow the order of the ``'defect id'`` column.
    """
    defect_ids = group['defect id'].tolist()
    pair_rows = list(combinations(defect_ids, 2))
    return pd.DataFrame(pair_rows, columns=['left', 'right'])

# One frame of defect pairs per submission, stacked under a (submission id, row) index
pairs_by_submission = long_defects.groupby('submission id').apply(
    generate_defect_pairs, include_groups=False
)

# Drop the inner per-group row counter, then turn 'submission id' back into a column
all_pairs = pairs_by_submission.reset_index(level=1, drop=True).reset_index()

# add metadata: the task (item) each submission belongs to
submission_items = log.loc[all_pairs['submission id'], 'item']
all_pairs['item'] = submission_items.values

In [None]:
# add scores: attach heuristic scores and item/defect metadata to every defect pair;
# `catalog` is passed to `explain_submission` further below
df, catalog = build_pairwise_features(all_pairs, discrete_scores, continuous_scores, items, defects)
# Drop every pair that has a missing value in any column
df.dropna(inplace=True)

***

# Calculate predictions

In [None]:
# Only a Pipeline whose 'clf' step exposes `coef_` is supported here
if not isinstance(ordering_model, Pipeline) or not hasattr(ordering_model['clf'], 'coef_'):
    raise NotImplementedError

classifier = ordering_model['clf']
weights = classifier.coef_.ravel()
bias = classifier.intercept_[0]

In [None]:
# Ordering model: hard pairwise prediction plus the positive-class probability as tie-break
X_ordering_model = df[model_columns]
model_probabilities = ordering_model.predict_proba(X_ordering_model)
df['model_tiebreak'] = model_probabilities[:, 1]
df['model_prediction'] = ordering_model.predict(X_ordering_model)

# Baseline: the primary model decides, the secondary model's probability breaks ties
X_primary_heuristic = df[primary_cols]
X_secondary_heuristic = df[secondary_cols]
secondary_probabilities = secondary_model.predict_proba(X_secondary_heuristic)
df['baseline_tiebreak'] = secondary_probabilities[:, 1]
df['baseline_prediction'] = primary_model.predict(X_primary_heuristic)

In [None]:
# Per-submission results, keyed by submission id
agree_on_first = {}
model_rankings = {}
baseline_rankings = {}
model_explanations = {}
baseline_explanations = {}

for submission_id, pairs in df.groupby('submission id'):
    # Order the defects of this submission under each approach
    model_order = rank_submission(pairs, 'model_prediction', 'model_tiebreak')
    baseline_order = rank_submission(pairs, 'baseline_prediction', 'baseline_tiebreak')
    model_rankings[submission_id] = model_order
    baseline_rankings[submission_id] = baseline_order

    # Do both approaches put the same defect first?
    agree_on_first[submission_id] = model_order[0] == baseline_order[0]

    # Explanations for each ranking
    model_features = X_ordering_model.loc[pairs.index]
    model_explanations[submission_id] = explain_submission(
        pairs, model_order, model_features, weights, catalog
    )
    baseline_explanations[submission_id] = explain_baseline_submission(
        pairs, baseline_order, primary_cols, secondary_cols
    )

agree_on_first = pd.Series(agree_on_first)

***

# Sample for the study

In [None]:
# Only submissions where the model and the baseline disagree on the top-ranked
# defect are eligible for the study (agree_on_first is False for them).
eligible = agree_on_first[~agree_on_first].index

In [None]:
rng = np.random.default_rng(RANDOM_SEED)

# Stratify by topic
strata_labels = items['topic'].unique()
strata_counts = np.zeros(strata_labels.shape[0], dtype=int)
strata = log.loc[eligible, 'item'].map(items['topic'])

# Fail fast instead of looping forever: a submission can only be drawn when its
# topic is known, and each submission id is drawn at most once.
n_sampleable = strata[strata.notna()].index.nunique()
if n_sampleable < SAMPLE_SIZE:
    raise ValueError(
        f"Cannot sample {SAMPLE_SIZE} submissions: only {n_sampleable} eligible "
        "submissions with a known topic are available."
    )

# Greedy sampling: repeatedly draw from one of the least represented strata
sampled_ids = []
while len(sampled_ids) < SAMPLE_SIZE:
    # Least represented strata (ties broken at random)
    topic_idx = rng.choice(np.where(strata_counts == strata_counts.min())[0], size=1)
    # Collect eligible submissions of that topic which are not sampled yet
    in_strata = strata[strata == strata_labels[topic_idx][0]].index
    unassigned_ids_in_strata = in_strata.difference(sampled_ids)
    if len(unassigned_ids_in_strata) > 0:
        sampled_ids.append(rng.choice(unassigned_ids_in_strata, replace=False))
    # Counted even when the stratum was exhausted, so that another stratum
    # becomes the least represented one on the next iteration
    strata_counts[topic_idx] += 1

***

# Analysis

## Coverage

In [None]:
# Number of sampled submissions in which each defect occurs. Presence is counted
# (not the number of occurrences) so the values match the y-axis label.
defect_counts = (defect_log.loc[sampled_ids] > 0).sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(defect_counts.index.astype(str), defect_counts.values)
ax.set_xlabel("Defect ID")
ax.set_ylabel("Number of Submissions")
ax.set_title("Coverage of Defects in Hold-Out Partition")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(IMAGE_DIR / "defect_coverage.png", dpi=RESOLUTION)
plt.show()

In [None]:
# Number of sampled submissions per task. Tasks without any sampled submission
# get an explicit 0 instead of NaN so every bar has a well-defined height.
task_counts = (
    log.loc[sampled_ids, "item"]
    .value_counts()
    .reindex(items.index, fill_value=0)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(task_counts.index.astype(str), task_counts.values)
ax.set_xlabel("Task ID")
ax.set_ylabel("Number of Submissions")
ax.set_title("Coverage of Tasks in Hold-Out Partition")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(IMAGE_DIR / "task_coverage.png", dpi=RESOLUTION)
plt.show()


In [None]:
# Number of sampled submissions per topic. Topics without any sampled submission
# get an explicit 0 instead of NaN so every bar has a well-defined height.
topics = items['topic'].unique()
topic_counts = (
    items.loc[log.loc[sampled_ids, "item"], "topic"]
    .value_counts()
    .reindex(topics, fill_value=0)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(topic_counts.index.astype(str), topic_counts.values)
ax.set_xlabel("Topic")
ax.set_ylabel("Number of Submissions")
ax.set_title("Coverage of Topics in Hold-Out Partition")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(IMAGE_DIR / "topic_coverage.png", dpi=RESOLUTION)
plt.show()


***

# Generate survey files

## Page html

In [None]:
import numpy as np

def generate_study_html(
    submission_id,
    task_row,
    student_id,
    task_number,
    task_info,
    defects_df,
    ranking_A,
    ranking_B,
    explanations_A,
    explanations_B,
    output_path,
    rng=None
):
    """
    Generate a neutral HTML page comparing two versions of defect prioritization.

    Versions are labeled "Version A" and "Version B".
    Left/right assignment is randomized.

    Parameters
    ----------
    submission_id : identifier written into the page's ``submission-id`` meta tag.
    task_row : mapping-like row of the submission; ``task_row['answer']`` is the
        submitted code shown on the page.
    student_id : identifier shown on the page and written into a meta tag.
    task_number : task number shown on the page and used in the title.
    task_info : mapping-like task description with ``'name'`` and ``'instructions'``.
    defects_df : DataFrame indexed by defect id with columns ``'defect name'`` and
        ``'description'`` and, optionally, ``'code example'`` / ``'code fix example'``.
    ranking_A, ranking_B : ordered sequences of defect ids (best rank first).
    explanations_A, explanations_B : dicts mapping defect id to a list of
        explanation lines; defects without an entry get no explanation section.
    output_path : path of the HTML file to write (UTF-8).
    rng : numpy random Generator, optional. A fresh unseeded generator is used
        when omitted.

    Returns
    -------
    left_version : str - Which version appears on the left, ``"A"`` or ``"B"``.

    Notes
    -----
    Code snippets (the submission, defect examples and fixes) are HTML-escaped so
    that characters such as ``<``, ``>`` and ``&`` are displayed literally.
    Names, descriptions, instructions and explanation lines are inserted as-is.
    NOTE(review): those fields presumably may contain intentional markup —
    confirm, and escape them as well if they are plain text.
    """
    # Function-scope import keeps the cell self-contained
    from html import escape as _escape

    if rng is None:
        rng = np.random.default_rng()

    # Randomize which version appears on the left
    if rng.random() < 0.5:
        left_label, right_label = "Version A", "Version B"
        left_order, right_order = ranking_A, ranking_B
        left_expl, right_expl = explanations_A, explanations_B
        left_version = "A"
    else:
        left_label, right_label = "Version B", "Version A"
        left_order, right_order = ranking_B, ranking_A
        left_expl, right_expl = explanations_B, explanations_A
        left_version = "B"

    # --- Minimal CSS ---
    css = """
<style>
body {
    font-family: Arial, sans-serif;
    margin: 20px;
    color: #222;
}
.code-block {
    background: #f0f0f0;
    padding: 10px;
    border-radius: 4px;
    white-space: pre-wrap;
    font-family: monospace;
}
.section {
    margin-bottom: 30px;
}
.comparison-container {
    display: flex;
    gap: 20px;
}
.task-meta {
    background: #f0f0f0;
    padding: 10px;
    border-radius: 6px;
    margin-bottom: 15px;
    font-size: 0.9rem;
}
.task-meta p {
    margin: 2px 0;
}
@media (max-width: 900px) {
    .comparison-container {
        flex-direction: column;
    }
}
.column {
    flex: 1;
}
.column h3 {
    text-align: center;
}
.defect-card {
    border: 1px solid #ccc;
    padding: 12px;
    margin-bottom: 15px;
    border-radius: 4px;
    background: #fafafa;
}
.defect-card h4 {
    margin: 0 0 5px 0;
}
details summary {
    cursor: pointer;
    font-weight: bold;
}
</style>
"""

    # --- Defect card builder ---
    def build_defect_cards(order, explanations):
        """Render one card per defect, in ranking order."""
        total = len(order)
        cards = []

        for rank_idx, defect_id in enumerate(order, start=1):
            defect = defects_df.loc[defect_id]

            name = defect["defect name"]
            desc = defect["description"]
            example = defect.get("code example", "")
            fix = defect.get("code fix example", "")
            expl_lines = explanations.get(defect_id, [])

            pieces = [f"""
            <div class="defect-card">
                <h4>Rank {rank_idx} of {total}: {name}</h4>
                <p>{desc}</p>
            """]

            # Missing values are read as NaN (a float), hence the str check
            if isinstance(example, str) and example.strip():
                pieces.append(f"""
                <div><strong>Example:</strong>
                    <div class="code-block">{_escape(example, quote=False)}</div>
                </div>
                """)

            if isinstance(fix, str) and fix.strip():
                pieces.append(f"""
                <div><strong>Fix:</strong>
                    <div class="code-block">{_escape(fix, quote=False)}</div>
                </div>
                """)

            if expl_lines:
                pieces.append("<details><summary>Why this is here</summary><ul>")
                pieces.extend(f"<li>{line}</li>" for line in expl_lines)
                pieces.append("</ul></details>")

            pieces.append("</div>")
            cards.append("".join(pieces))

        return "\n".join(cards)

    left_cards = build_defect_cards(left_order, left_expl)
    right_cards = build_defect_cards(right_order, right_expl)

    # The submission is source code: escape it so comparison operators, generics,
    # include directives etc. are shown literally instead of being parsed as HTML
    answer_html = _escape(str(task_row['answer']), quote=False)

    # --- Final HTML ---
    page = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <meta name="student-id" content="{student_id}">
    <meta name="submission-id" content="{submission_id}">
    <meta name="task-number" content="{task_number}">
    <meta name="version-on-left" content="{left_version}">
    <title>Task {task_number}</title>
    {css}
</head>
<body>

<div class="task-meta">
    <p><strong>Student ID:</strong> {student_id}</p>
    <p><strong>Task Number:</strong> {task_number}</p>
</div>

<div class="section">
    <h2>Task: {task_info['name']}</h2>
    <p>{task_info['instructions']}</p>

    <h3>Student Submission</h3>
    <div class="code-block">{answer_html}</div>
</div>

<div class="comparison-container">

    <div class="column">
        <h3>{left_label}</h3>
        {left_cards}
    </div>

    <div class="column">
        <h3>{right_label}</h3>
        {right_cards}
    </div>

</div>

</body>
</html>
"""

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(page)

    return left_version

## Assign tasks to students

In [None]:
task_ids = list(range(SAMPLE_SIZE))  # internal indices 0..SAMPLE_SIZE-1

# How many students each task has been assigned to so far
task_usage = np.zeros(SAMPLE_SIZE, dtype=int)
student_main_tasks = {sid: [] for sid in range(N_STUDENTS)}

for sid in student_main_tasks:
    # Give each student the currently least-used tasks to balance the load
    least_used_first = np.argsort(task_usage)
    chosen = list(least_used_first[:MAIN_TASKS_PER_STUDENT])
    student_main_tasks[sid] = chosen
    task_usage[chosen] += 1

print("Task usage distribution:", task_usage)

## Generate student folders

In [None]:
all_students_metadata = {}

for sid in range(N_STUDENTS):
    student_number = sid + 1  # students are numbered from 1 in folders and files
    student_folder = STUDY_OUTPUT_PATH / f"student_{student_number:02d}"
    main_folder = student_folder / "main_tasks"
    extra_folder = student_folder / "extra_tasks"

    for folder in (student_folder, main_folder, extra_folder):
        folder.mkdir(exist_ok=True)

    # tasks for this student: assigned main tasks, everything else is extra
    main_tasks = student_main_tasks[sid]
    extra_tasks = [t for t in task_ids if t not in main_tasks]

    left_versions = {}

    # Main tasks are generated before extra tasks; this order fixes the sequence
    # of random left/right draws taken from the shared `rng`.
    for target_folder, task_indices in ((main_folder, main_tasks), (extra_folder, extra_tasks)):
        for task_index in task_indices:
            task_number = task_index + 1
            submission_id = sampled_ids[task_index]
            row = log.loc[submission_id]

            # Version A is the model ranking, version B the baseline ranking
            which_left = generate_study_html(
                submission_id=submission_id,
                task_row=row,
                student_id=student_number,
                task_number=task_number,
                task_info=items.loc[row["item"]],
                defects_df=defects,
                ranking_A=model_rankings[submission_id],
                ranking_B=baseline_rankings[submission_id],
                explanations_A=model_explanations[submission_id],
                explanations_B=baseline_explanations[submission_id],
                output_path=target_folder / f"task_{task_number}.html",
                rng=rng,
            )

            left_versions[int(task_number)] = which_left

    # Save assignment file
    # NOTE(review): "main_tasks"/"extra_tasks" hold 0-based task indices, while
    # "left_versions" keys and the HTML file names use 1-based task numbers.
    assignment = {
        "student_id": student_number,
        "main_tasks": [int(t) for t in main_tasks],
        "extra_tasks": [int(t) for t in extra_tasks],
        "left_versions": left_versions,   # robust mapping
    }

    with open(student_folder / "assignment.json", "w", encoding="utf-8") as f:
        json.dump(assignment, f, indent=2)

    all_students_metadata[student_number] = assignment

print("Finished generating all student packages!")