# Setup and global variables

In [None]:
import os

from pathlib import Path
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from tqdm import tqdm
from scipy.stats import kendalltau

from src.prioritization import *

# --- Dataset location ---
VERSION = '0.0.0'
DATA_PARTITION = 'hold_out'
DATASET_PATH = Path('data', 'datasets', f'ipython_{VERSION}')
MODEL_PATH = DATASET_PATH.joinpath('trained_heuristics')

# --- Outputs ---
OUTPUT_PATH = Path('labelling_app', 'survey_data', f'ipython_{VERSION}')
IMAGE_DIR = Path('images', 'survey_preparation')
RESOLUTION = 300  # passed as `dpi` whenever a figure is saved

# --- Sampling ---
RANDOM_SEED = 42  # seeds the random generator used for sampling
SAMPLE_SIZE = 70  # number of submissions to draw for the survey

# Make sure both output locations exist before anything is written to them
for _output_dir in (OUTPUT_PATH, IMAGE_DIR):
    os.makedirs(_output_dir, exist_ok=True)

***

# Loading data

In [None]:
# Dataset-level tables (stored directly in the dataset directory)
items = pd.read_csv(DATASET_PATH / 'items.csv', index_col=0)
defects = pd.read_csv(DATASET_PATH / 'defects.csv', index_col=0)

# Tables of the selected data partition
partition_dir = DATASET_PATH / DATA_PARTITION
log = pd.read_csv(partition_dir / 'log.csv', index_col=0, parse_dates=['time'])
defect_log = pd.read_csv(partition_dir / 'defect_log.csv', index_col=0)

# Column labels are cast to int; later cells use them to look up rows of
# `defects` (presumably they are defect ids -- verify against the dataset).
defect_log.columns = defect_log.columns.astype(int)

In [None]:
# Display name -> heuristic class; the display name doubles as the file stem
# of the stored model.
heuristic_classes = {
    "Task Common": TaskCommonModel,
    "Task Characteristic": TaskCharacteristicModel,
    "Student Frequency": StudentFrequencyModel,
    "Student Characteristic": StudentCharacteristicModel,
    "Student Encountered": StudentEncounteredBeforeModel,
    "Defect Multiplicity": DefectMultiplicityModel,
    "Severity Baseline": SeverityModel,
}

# Display name -> model loaded from MODEL_PATH / '<display name>.pkl'
# NOTE(review): `load` comes from the project package; presumably it unpickles
# the file, so only trusted model files should be placed in MODEL_PATH.
models = {
    name: heuristic_class.load(MODEL_PATH / f'{name}.pkl')
    for name, heuristic_class in heuristic_classes.items()
}

***

# Quality filtering

In [None]:
def has_close_pair(row, max_difference=2):
    """Check whether two distinct entries of ``row`` differ by at most ``max_difference``.

    Every entry is compared only with the *other* entries, never with itself,
    so a row with fewer than two usable values never qualifies. NaN entries
    are treated as missing and ignored.

    Args:
        row: pandas Series (any object exposing ``.values``) of numeric values.
        max_difference: Largest absolute difference that still counts as close.

    Returns:
        bool: True if at least one pair of distinct entries lies within
        ``max_difference`` of each other, False otherwise.
    """
    values = np.asarray(row.values, dtype=float)
    values = np.sort(values[~np.isnan(values)])
    # After sorting, the smallest pairwise gap is always between neighbours,
    # so adjacent differences are sufficient and no entry meets itself.
    return bool(np.any(np.diff(values) <= max_difference))

In [None]:
# keep only the submissions with a positive count in at least two columns
positive_columns = defect_log.gt(0).sum(axis=1)
filtered = defect_log.loc[positive_columns >= 2]

In [None]:
# at most difference of two in severity
# Only the severities of defects that actually occur in a submission are
# compared: absent defects are masked as NaN instead of being left at 0
# (two absent defects would otherwise always look like a close pair), and
# occurrence counts are not multiplied in, so the compared values are plain
# severities rather than count * severity.
is_present = filtered > 0
severity = defects.loc[filtered.columns, 'severity']
present_severity = is_present.astype(float).mul(severity, axis='columns').where(is_present)
filtered = filtered[present_severity.apply(has_close_pair, axis=1)]

In [None]:
# restrict both tables to the submissions that survived the quality filter
kept_ids = filtered.index
log, defect_log = log.loc[kept_ids], defect_log.loc[kept_ids]

***

# Sampling criteria

In [None]:
# Calculate sampling statistics and remember heuristic scores
stats = []
heuristic_scores = []

# NOTE(review): only the first 200 submissions are scored, so only those can
# be sampled afterwards -- confirm this cap is intended and not a debugging
# leftover.
scored_log = log.iloc[:200]

for idx, submission in tqdm(
    scored_log.iterrows(),
    # The total has to match the rows that are actually iterated, otherwise
    # the progress bar never reaches 100 %.
    total=scored_log.shape[0],
    desc="Calculating statistics for sampling",
):
    defect_counts = defect_log.loc[idx]

    # Heuristic scores: discretized output of every model, NaN entries dropped
    discrete_scores = {name: model.discretize(submission, defect_counts).dropna() for name, model in models.items()}

    # One record per defect with a positive count in this submission
    # (assumes every model still holds a score for such a defect after the
    # dropna above, otherwise the lookup raises KeyError -- TODO confirm)
    for defect in defect_counts[defect_counts > 0].index:
        row = {"submission id": idx, "defect id": defect}
        for name, scores in discrete_scores.items():
            row[name] = scores[defect]
        heuristic_scores.append(row)

    # Disagreement: 1 - Kendall's tau for every pair of models
    # (assumes all models return scores of equal length -- TODO confirm)
    kendall_distances = []
    for left, right in combinations(discrete_scores.values(), 2):
        tau, _ = kendalltau(left, right, nan_policy='raise')
        kendall_distances.append(1 - tau)

    # Average over the pairs with a defined distance; 0.0 if there is none
    disagreement = np.nanmean(kendall_distances) if any(~np.isnan(kendall_distances)) else 0.0

    # Signal strength: mean over the models of the score range (max - min)
    signal_strength = np.mean([np.ptp(values) for values in discrete_scores.values()])

    # The models are updated only after scoring, so the scores above were
    # computed before the models were updated with this submission.
    for model in models.values():
        model.update(submission, defect_counts)

    stats.append({
        "id": idx,
        "signal strength": signal_strength,
        "model disagreement": disagreement,
    })

stats = pd.DataFrame(stats).set_index("id")
heuristic_scores = pd.DataFrame(heuristic_scores)

***

# Survey sample

In [None]:
min_score = 0.1
rng = np.random.default_rng(RANDOM_SEED)

# Normalize each criterion to the [0, 1] range (min-max scaling)
scores = (stats - stats.min()) / (stats.max() - stats.min())

# Combine scores; the floor keeps a non-zero sampling weight for every submission
scores['total'] = scores.sum(axis=1)
scores['total'] = np.where(scores['total'] < min_score, min_score, scores['total'])

# Stratify by topic
strata_labels = items['topic'].unique()
strata_counts = np.zeros(strata_labels.shape[0], dtype=int)
scores['strata'] = log.loc[scores.index, 'item'].map(items['topic'])

# Never ask for more submissions than can be drawn: only submissions with a
# known topic are ever eligible, and without this cap the loop below would
# spin forever once every stratum is exhausted.
available = scores.index[scores['strata'].notna()].nunique()
target_size = min(SAMPLE_SIZE, available)
if target_size < SAMPLE_SIZE:
    print(f"Only {target_size} eligible submissions available; sampling all of them instead of {SAMPLE_SIZE}.")

# Greedy sampling
sampled_ids = []
while len(sampled_ids) < target_size:
    # Least represented strata (ties are broken at random)
    topic_idx = rng.choice(np.where(strata_counts == strata_counts.min())[0], size=1)
    # Collect eligible submissions
    eligible_ids = scores[scores['strata'] == strata_labels[topic_idx][0]].index.difference(sampled_ids)
    # Sample, skipping if there are no eligible submissions
    if len(eligible_ids) > 0:
        # Normalize scores into probabilities
        probabilities = scores.loc[eligible_ids, 'total'] / scores.loc[eligible_ids, 'total'].sum()
        sampled_ids.append(rng.choice(eligible_ids, replace=False, p=probabilities.values))
    # Counted even when nothing was drawn, so an exhausted stratum does not
    # stay the least represented one forever.
    strata_counts[topic_idx] += 1

***

# Analysis

In [None]:
# Sampling statistics of the sampled submissions only, in sampling order
sample_stats = stats.loc[sampled_ids]

## Validation plots

In [None]:
# Side-by-side histograms of the two sampling criteria within the sample
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# (column in sample_stats, axis label, panel title)
criteria = (
    ("signal strength", "Signal Strength", "Distribution of Signal Strength"),
    ("model disagreement", "Model Disagreement", "Distribution of Model Disagreement"),
)
for panel, (column, label, title) in zip(axes, criteria):
    panel.hist(sample_stats[column], bins=30, edgecolor="black")
    panel.set_title(title)
    panel.set_xlabel(label)
    panel.set_ylabel("Count")

plt.tight_layout()
plt.savefig(IMAGE_DIR / "criterion_distribution.png", dpi=RESOLUTION)
plt.show()

In [None]:
# Scatter plot of both criteria for the sampled submissions
# One integer code per task, used to color the points
task_codes = log.loc[sample_stats.index, "item"].astype("category").cat.codes

fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(
    sample_stats["signal strength"],
    sample_stats["model disagreement"],
    alpha=0.6,
    s=30,
    c=task_codes,
)
ax.set(
    xlabel="Signal Strength",
    ylabel="Model Disagreement",
    title="Signal Strength vs Model Disagreement",
)

plt.tight_layout()
plt.savefig(IMAGE_DIR / "signal_vs_disagreement.png", dpi=RESOLUTION)
plt.show()


## Coverage

In [None]:
# Number of sampled submissions per task, most frequent first.
# Tasks without any sampled submission get an explicit count of 0 (instead of
# NaN after the reindex), so they show up as empty bars with integer counts.
task_counts = (
    log.loc[sample_stats.index, "item"]
    .value_counts()
    .reindex(items.index, fill_value=0)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(task_counts.index.astype(str), task_counts.values)
ax.set_xlabel("Task ID")
ax.set_ylabel("Number of Submissions")
ax.set_title("Coverage of Tasks in Hold-Out Partition")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(IMAGE_DIR / "task_coverage.png", dpi=RESOLUTION)
plt.show()


In [None]:
# Number of sampled submissions per topic, most frequent first.
# Topics without any sampled submission get an explicit count of 0 (instead
# of NaN after the reindex), so they show up as empty bars with integer counts.
topics = items['topic'].unique()
sampled_topics = items.loc[log.loc[sample_stats.index, "item"], "topic"]
topic_counts = (
    sampled_topics
    .value_counts()
    .reindex(topics, fill_value=0)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(topic_counts.index.astype(str), topic_counts.values)
ax.set_xlabel("Topic")
ax.set_ylabel("Number of Submissions")
ax.set_title("Coverage of Topics in Hold-Out Partition")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(IMAGE_DIR / "topic_coverage.png", dpi=RESOLUTION)
plt.show()


In [None]:
# One histogram of recorded scores per heuristic, laid out three per row
n_cols = 3
n_rows = int(np.ceil(len(models) / n_cols))

fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(12, 3 * n_rows),
    constrained_layout=True
)
axes = axes.flatten()

for panel, (heuristic_name, heuristic) in zip(axes, models.items()):
    # Bin edges sit halfway between the integer scores of the model's scale
    if heuristic.get_discretization_scale() == '1-5':
        bin_edges = np.arange(0.5, 6.5, 1)
        ticks = [1, 2, 3, 4, 5]
    else:
        bin_edges = np.arange(-2.5, 3.5, 1)
        ticks = [-2, -1, 0, 1, 2]

    panel.hist(heuristic_scores[heuristic_name].dropna(), bins=bin_edges, edgecolor="black", rwidth=0.8)
    panel.set_title(heuristic_name)
    panel.set_xticks(ticks)
    panel.set_xlabel("Score")
    panel.set_ylabel("Count")

# Hide the grid cells that have no heuristic assigned
for unused_panel in axes[len(models):]:
    unused_panel.axis("off")

plt.suptitle("Distribution of Heuristic Scores (All Defects)", fontsize=14, y=1.02)
plt.savefig(IMAGE_DIR / "heuristic_value_distributions.png", dpi=RESOLUTION, bbox_inches="tight")
plt.show()


## Storytelling

In [None]:
# High and low agreement / signal strength examples

***

# Combine to a final dataframe

In [None]:
# --- Submissions ---
# Sampled submissions joined with the name and instructions of their task
submission_df = (
    log.loc[sampled_ids]
    .merge(items[['name', 'instructions']], left_on='item', right_index=True)
    .rename(columns={
        'answer': 'submission',
        'name': 'task name',
    })
    [['submission', 'task name', 'instructions']]
)

# --- Defects ---
# Long format: one row per (submission, defect) pair.
# Both axes are named before stacking, so the resulting id columns do not
# depend on how the index happened to be labelled when the CSV was read.
melted_defects = (
    defect_log.loc[sampled_ids]
    .rename_axis(index='submission id', columns='defect id')
    .stack()
    .rename('count')
    .reset_index()
)
# Keep only the defects that actually occur in the submission
melted_defects = melted_defects[melted_defects['count'] > 0]

# Add metadata
defect_df = (
    melted_defects
    .merge(defects, left_on='defect id', right_index=True)
    [['submission id', 'defect id', 'defect name', 'description', 'code example', 'code fix example']]
    .rename(columns={'defect name': 'name'})
)

# Add heuristic scores (left join: defects without recorded scores are kept)
defect_df = defect_df.merge(
    heuristic_scores,
    on=['submission id', 'defect id'],
    how='left'
)

# --- Heuristics ---
# One row per heuristic with its description and discretization scale
heuristics_df = pd.DataFrame([
    {
        'name': name,
        'description': model.get_model_description(),
        'scale': model.get_discretization_scale(),
    }
    for name, model in models.items()
]).set_index('name')

In [None]:
# Write the three survey tables as semicolon-separated CSV files
# (table, file name, label of the index column)
exports = (
    (submission_df, 'submissions.csv', 'index'),
    (defect_df, 'defects.csv', 'index'),
    (heuristics_df, 'heuristics.csv', 'name'),
)
for table, filename, index_label in exports:
    table.to_csv(OUTPUT_PATH / filename, sep=';', index_label=index_label)