# Setup and global variables

In [None]:
import os
import joblib
import json
import networkx as nx

from pathlib import Path
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from tqdm import tqdm
from scipy.stats import kendalltau

from src.prioritization import *
from src.feature_engineering import add_heuristic_scores, add_feature_sets

# Target number of submissions to draw in the stratified sampling step below.
SAMPLE_SIZE = 30

In [None]:
# Select the configuration profile before `config` is imported (presumably
# load_config() reads CONFIG_ENV - confirm). Flip the flag to use production.
_use_production_config = False
os.environ["CONFIG_ENV"] = "production" if _use_production_config else "debug"

from config import load_config

config = load_config()
_defaults = config['DEFAULTS']
_paths = config['PATHS']

DEBUG = config["DEBUG"]
RESOLUTION = _defaults['resolution']
RANDOM_SEED = _defaults['random_seed']

# --- Inputs ---------------------------------------------------------------
STORAGE_PATH = _paths['storage']
HOLD_OUT_DATA_PATH = _paths['student_hold_out_set']
TRAINED_SCORING_MODELS_PATH = _paths['hold_out_trained_heuristics']
benchmark_path = _paths['benchmark_dataset']
FINAL_MODEL_PATH = benchmark_path / "final_teacher_model.pkl"
FINAL_FEATURES_PATH = benchmark_path / "final_selected_features.pkl"
# NOTE(review): this file carries a .pkl suffix but is later read with
# json.load - confirm the on-disk format matches.
FINAL_HEURISTIC_HYPERPARAMETERS_PATH = benchmark_path / "final_heuristic_selected_features.pkl"

# --- Outputs --------------------------------------------------------------
STUDY_OUTPUT_PATH = HOLD_OUT_DATA_PATH / 'student_study_submissions'
IMAGE_DIR = _paths['images'] / 'student_study_preparation'

# Create both output directories up front; existing directories are left alone.
for _output_dir in (STUDY_OUTPUT_PATH, IMAGE_DIR):
    os.makedirs(_output_dir, exist_ok=True)

***

# Loading data

In [None]:
# Reference tables read from the shared storage directory.
items = pd.read_csv(STORAGE_PATH / 'items.csv', index_col=0)
defects = pd.read_csv(STORAGE_PATH / 'defects.csv', index_col=0)

# Hold-out partition: the submission log (with parsed timestamps) and the
# per-submission defect counts, both indexed by their first CSV column.
log = pd.read_csv(HOLD_OUT_DATA_PATH / 'log.csv', index_col=0, parse_dates=['time'])
defect_log = pd.read_csv(HOLD_OUT_DATA_PATH / 'defect_log.csv', index_col=0)
# CSV headers are read as strings; cast them so the defect-id column labels are integers.
defect_log.columns = defect_log.columns.astype(int)

In [None]:
# Heuristic scoring model classes whose trained state is restored from disk.
scoring_model_classes = [
    TaskCommonModel,
    TaskCharacteristicModel,
    StudentCommonModel,
    StudentCharacteristicModel,
    StudentEncounteredBeforeModel,
    DefectMultiplicityModel,
    SeverityModel,
]

# Each class loads its instance from "<model name>.pkl" in the trained-models directory.
loaded_scoring_models = [
    model_cls.load(TRAINED_SCORING_MODELS_PATH / f'{model_cls.get_model_name()}.pkl')
    for model_cls in scoring_model_classes
]

# Final lookup: model name -> loaded model instance.
scoring_models = {
    loaded_model.get_model_name(): loaded_model
    for loaded_model in loaded_scoring_models
}

In [None]:
# Restore the final ordering model and the feature columns it is applied to
# (the feature matrix is restricted to `model_columns` before prediction).
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
ordering_model = joblib.load(FINAL_MODEL_PATH)
model_columns = joblib.load(FINAL_FEATURES_PATH)

In [None]:
# Read the baseline heuristic configuration. The JSON object must hold exactly
# two entries, unpacked in file order as (primary, secondary); each entry is
# later indexed with the keys 'heuristic' and 'value_type'.
# A context manager is used so the file handle is closed deterministically.
with open(FINAL_HEURISTIC_HYPERPARAMETERS_PATH, "r", encoding="utf-8") as heuristics_file:
    primary_heuristic, secondary_heuristic = json.load(heuristics_file).values()

In [None]:
# Notebook display: show the loaded primary heuristic configuration.
primary_heuristic

In [None]:
def make_baseline_predict(primary_col, secondary_col):
    """Build the baseline pairwise ordering predictor.

    The predictor compares the primary heuristic of the left and right
    defect; only when neither side is strictly greater does it fall back
    to the secondary heuristic.

    Args:
        primary_col: Dict with keys 'heuristic' and 'value_type' describing
            the primary heuristic column.
        secondary_col: Dict with the same keys describing the secondary
            (tie-breaking) heuristic column.

    Returns:
        A function that takes a single row of the pairwise dataset and
        returns 1 when the left defect wins, otherwise 0. A tie on both
        heuristics yields 0.
    """
    primary_name, primary_type = primary_col['heuristic'], primary_col['value_type']
    secondary_name, secondary_type = secondary_col['heuristic'], secondary_col['value_type']

    # Column labels are fixed per predictor, so build them once.
    primary_left = f'{primary_name} (Left {primary_type})'
    primary_right = f'{primary_name} (Right {primary_type})'
    secondary_left = f'{secondary_name} (Left {secondary_type})'
    secondary_right = f'{secondary_name} (Right {secondary_type})'

    def predict(row):
        left_score = row[primary_left]
        right_score = row[primary_right]

        if left_score > right_score:
            return 1
        if right_score > left_score:
            return 0

        # Primary heuristic is undecided -> the secondary heuristic decides.
        return 1 if row[secondary_left] > row[secondary_right] else 0

    return predict


def make_baseline_tiebreak(secondary_col):
    """Build the tiebreak scorer used with the baseline ordering heuristic.

    Args:
        secondary_col: Dict with keys 'heuristic' and 'value_type' describing
            the secondary heuristic column.

    Returns:
        A function that takes a single row of the pairwise dataset and
        returns the secondary heuristic of the left defect minus that of the
        right defect.
    """
    heuristic_name, value_type = secondary_col['heuristic'], secondary_col['value_type']
    left_label = f'{heuristic_name} (Left {value_type})'
    right_label = f'{heuristic_name} (Right {value_type})'

    def tiebreak_score(row):
        # Positive when the left defect scores higher on the secondary heuristic.
        return row[left_label] - row[right_label]

    return tiebreak_score


***

# Quality filtering

In [None]:
# A submission is non-trivial when at least two different defects occur in it.
distinct_defect_counts = (defect_log > 0).sum(axis=1)
is_non_trivial = distinct_defect_counts >= 2

***

# Generate all heuristic scores

In [None]:
# Replay the log chronologically, recording every scoring model's discrete and
# continuous score for each defect present in a non-trivial submission.
discrete_scores = []
continuous_scores = []

# Ensure log is sorted by time
log = log.sort_values(by='time')

# Debug runs replay only the first 500 submissions; the progress-bar total is
# taken from the frame actually iterated so it is correct in both modes.
submissions_to_score = log.iloc[:500] if DEBUG else log

for idx, submission in tqdm(
    submissions_to_score.iterrows(),
    total=submissions_to_score.shape[0],
    desc="Calculating statistics for sampling",
):
    defect_counts = defect_log.loc[idx]

    if is_non_trivial.loc[idx]:
        # Heuristic scores, computed before the models see this submission.
        # NOTE(review): .dropna() can remove entries that are indexed by
        # defect id below - confirm discretize never yields NaN for defects
        # that are present in the submission.
        discrete_model_scores = {
            name: model.discretize(submission, defect_counts).dropna()
            for name, model in scoring_models.items()
        }
        continuous_model_scores = {
            name: model._calculate_scores(submission, defect_counts)
            for name, model in scoring_models.items()
        }

        # One output row per defect that occurs in the submission.
        for defect in defect_counts[defect_counts > 0].index:
            discrete_row = {"submission id": idx, "defect id": defect}
            continuous_row = discrete_row.copy()
            for name, scores in discrete_model_scores.items():
                discrete_row[name] = scores[defect]
                continuous_row[name] = continuous_model_scores[name][defect]
            discrete_scores.append(discrete_row)
            continuous_scores.append(continuous_row)

    # Every submission (trivial or not) updates the models after scoring.
    for model in scoring_models.values():
        model.update(submission, defect_counts)

discrete_scores = pd.DataFrame(discrete_scores)
continuous_scores = pd.DataFrame(continuous_scores)

***

# Construct dataset

In [None]:
# Reshape the wide defect table into one row per (submission, defect) and keep
# only the defects that actually occur in the submission.
long_defects = (
    defect_log
    .melt(var_name='defect id', value_name='count', ignore_index=False)
    .reset_index(names=['submission id'])
    .loc[lambda frame: frame['count'] > 0]
)

def generate_defect_pairs(group):
    """Return every unordered pair of defects found in one submission.

    Args:
        group: Rows of a single submission; must contain a 'defect id' column.

    Returns:
        DataFrame with columns 'left' and 'right', one row per pair, in the
        order produced by itertools.combinations. Empty when the submission
        has fewer than two defects.
    """
    defect_ids = group['defect id'].tolist()
    pair_rows = list(combinations(defect_ids, 2))
    return pd.DataFrame(pair_rows, columns=['left', 'right'])

# Build the defect pairs of every submission.
pairs_by_submission = long_defects.groupby('submission id').apply(
    generate_defect_pairs, include_groups=False
)

# Drop the per-group positional level, then turn 'submission id' into a column.
all_pairs = pairs_by_submission.reset_index(level=1, drop=True).reset_index()

# add metadata: the item each submission belongs to
all_pairs['item'] = log.loc[all_pairs['submission id'], 'item'].values

In [None]:
# Attach the heuristic scores to every defect pair, then discard pairs with
# any missing value.
# NOTE(review): dropna is done in place; if add_heuristic_scores returns its
# input object rather than a copy, `all_pairs` is modified as well - confirm.
df = add_heuristic_scores(all_pairs, discrete_scores, continuous_scores)
df.dropna(inplace=True)

In [None]:
# Build the feature matrix for the ordering model.

# Only the first element of the returned pair is used; the second is discarded.
X, _ = add_feature_sets(df, items, defects)

# Restrict (and order) the columns to those loaded alongside the final model.
X = X[model_columns]

***

# Calculate predictions

In [None]:
def rank_submission(submission_df: pd.DataFrame, prediction_col: str, tiebreak_col: str):
    """Turn the pairwise predictions of one submission into a full ranking.

    Every row contributes one directed edge winner -> loser ('left' wins when
    the prediction equals 1, otherwise 'right' wins). Nodes are then removed
    one at a time: as long as some node has no incoming edge, the one with
    the largest sum of outgoing tiebreak scores is taken; if every remaining
    node has an incoming edge (a cycle), the node with the largest
    "outgoing minus incoming" tiebreak sum is taken instead.

    NOTE(review): the tiebreak value is stored on the edge unchanged,
    whichever way the edge points - confirm that is the intended weight for
    edges where 'right' wins.

    Args:
        submission_df: All rows corresponding to a single submission; needs
            the columns 'left', 'right', prediction_col and tiebreak_col.
        prediction_col: Name of column containing pairwise predictions.
        tiebreak_col: Name of column containing pairwise tiebreak scores.

    Returns:
        List of nodes in ranked order (highest rank first).
    """
    graph = nx.DiGraph()
    graph.add_nodes_from(pd.unique(submission_df[['left', 'right']].values.ravel()))

    # One edge per pairwise decision, pointing from the preferred defect.
    decisions = zip(
        submission_df['left'],
        submission_df['right'],
        submission_df[prediction_col],
        submission_df[tiebreak_col],
    )
    for left, right, pred, tiebreak in decisions:
        winner, loser = (left, right) if pred == 1 else (right, left)
        graph.add_edge(winner, loser, tiebreak=tiebreak)

    def outgoing_weight(node):
        return sum(weight for _, _, weight in graph.out_edges(node, data='tiebreak'))

    def incoming_weight(node):
        return sum(weight for _, _, weight in graph.in_edges(node, data='tiebreak'))

    def dominance(node):
        # Higher means the node beats others more strongly than it is beaten.
        return outgoing_weight(node) - incoming_weight(node)

    ranking = []
    while len(graph) > 0:
        sources = [node for node, degree in graph.in_degree() if degree == 0]

        if sources:
            # Unbeaten nodes exist: prefer the one with the strongest wins.
            chosen = max(sources, key=outgoing_weight)
        else:
            # Every node is beaten by someone (cycle): take the most dominant.
            chosen = max(graph.nodes(), key=dominance)

        ranking.append(chosen)
        graph.remove_node(chosen)

    return ranking


In [None]:
# Model outputs per defect pair: column 1 of predict_proba is stored as the
# probability (used later as the model's tiebreak score), plus the hard prediction.
df['model_probability'] = ordering_model.predict_proba(X)[:, 1]
df['model_prediction'] = ordering_model.predict(X)

# Baseline outputs per defect pair, computed row by row from the loaded
# primary/secondary heuristic configuration.
df['baseline_tiebreak'] = df.apply(make_baseline_tiebreak(secondary_heuristic), axis=1)
df['baseline_prediction'] = df.apply(make_baseline_predict(primary_heuristic, secondary_heuristic), axis=1)

In [None]:
# Rank the defects of every submission twice: once from the model's pairwise
# predictions and once from the baseline heuristic's.
model_rankings = {}
baseline_rankings = {}

for submission_id, submission_df in df.groupby('submission id'):
    model_rankings[submission_id] = rank_submission(
        submission_df, 'model_prediction', 'model_probability'
    )
    baseline_rankings[submission_id] = rank_submission(
        submission_df, 'baseline_prediction', 'baseline_tiebreak'
    )

# True where both rankings put the same defect in first place.
agree_on_first = pd.Series({
    submission_id: model_rankings[submission_id][0] == baseline_rankings[submission_id][0]
    for submission_id in model_rankings
})

In [None]:
# Submission ids for which the model and the baseline agree on the top-ranked defect.
eligible = agree_on_first[agree_on_first].index

***

# Sample for the study

In [None]:
import warnings

rng = np.random.default_rng(RANDOM_SEED)

# Stratify by topic
strata_labels = items['topic'].unique()
strata_counts = np.zeros(strata_labels.shape[0], dtype=int)
strata = log.loc[eligible, 'item'].map(items['topic'])

# Strata found to have no unsampled eligible submission left. The pool of
# unsampled submissions only shrinks, so an exhausted stratum stays exhausted.
strata_exhausted = np.zeros(strata_labels.shape[0], dtype=bool)

# Greedy sampling: repeatedly draw one submission from a least-served stratum.
sampled_ids = []
while len(sampled_ids) < SAMPLE_SIZE:
    if strata_exhausted.all():
        # Nothing left to draw anywhere; stop instead of looping forever.
        warnings.warn(
            f"Only {len(sampled_ids)} of {SAMPLE_SIZE} requested submissions "
            "could be sampled: every stratum is exhausted."
        )
        break

    # Least represented strata (ties broken at random)
    topic_idx = rng.choice(np.where(strata_counts == strata_counts.min())[0], size=1)
    # Collect eligible submissions of that stratum that were not drawn yet
    in_strata = strata[strata == strata_labels[topic_idx][0]].index
    unassigned_ids_in_strata = eligible.intersection(in_strata).difference(sampled_ids)

    if len(unassigned_ids_in_strata) > 0:
        sampled_ids.append(rng.choice(unassigned_ids_in_strata, replace=False))
    else:
        strata_exhausted[topic_idx] = True

    # The stratum's counter advances even when it was empty, so the loop moves
    # on to other strata rather than retrying the same one.
    strata_counts[topic_idx] += 1

***

# Analysis

## Coverage

In [None]:
# Per defect, count the sampled submissions in which it occurs at least once.
# Summing the raw counts would tally defect occurrences, which does not match
# the "Number of Submissions" axis whenever a defect occurs more than once in
# a submission.
defect_counts = (defect_log.loc[sampled_ids] > 0).sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(defect_counts.index.astype(str), defect_counts.values)
ax.set_xlabel("Defect ID")
ax.set_ylabel("Number of Submissions")
ax.set_title("Coverage of Defects in Hold-Out Partition")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(IMAGE_DIR / "defect_coverage.png", dpi=RESOLUTION)
plt.show()

In [None]:
# Sampled submissions per task. Reindexing on items.index keeps every task on
# the axis; tasks without a sampled submission have no value (NaN).
sampled_task_ids = log.loc[sampled_ids, "item"]
task_counts = (
    sampled_task_ids
    .value_counts()
    .reindex(items.index)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(task_counts.index.astype(str), task_counts.values)
ax.set(
    xlabel="Task ID",
    ylabel="Number of Submissions",
    title="Coverage of Tasks in Hold-Out Partition",
)
plt.xticks(rotation=90)
fig.tight_layout()
fig.savefig(IMAGE_DIR / "task_coverage.png", dpi=RESOLUTION)
plt.show()


In [None]:
# Sampled submissions per topic: map each sampled submission to its item's
# topic, then count. Reindexing on all topics keeps unsampled topics on the axis.
topics = items['topic'].unique()
sampled_topics = items.loc[log.loc[sampled_ids, "item"], "topic"]
topic_counts = (
    sampled_topics
    .value_counts()
    .reindex(topics)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(topic_counts.index.astype(str), topic_counts.values)
ax.set(
    xlabel="Topic",
    ylabel="Number of Submissions",
    title="Coverage of Topics in Hold-Out Partition",
)
plt.xticks(rotation=90)
fig.tight_layout()
fig.savefig(IMAGE_DIR / "topic_coverage.png", dpi=RESOLUTION)
plt.show()


***

# Generate explanations

***

# Generate survey files

***

# Export