# Setup and global variables

In [None]:
import os

from pathlib import Path
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from tqdm import tqdm
from scipy.stats import kendalltau

from src.prioritization import *

# Dataset release to analyse; it selects the `ipython_<VERSION>` directory.
VERSION = "0.0.0"
DATASET_PATH = Path("data", "datasets", f"ipython_{VERSION}")
MODEL_PATH = DATASET_PATH.joinpath("trained_heuristics")

# Sub-directory of the dataset from which the logs are read.
DATA_PARTITION = "hold_out"

# Seed of the survey sampling and upper bound on the number of sampled
# submissions.
RANDOM_SEED = 42
SAMPLE_SIZE = 70

# NOTE(review): presumably the DPI used when saving figures — confirm.
RESOLUTION = 300

# Output directory for images; created eagerly, including missing parents.
IMAGE_DIR = Path("images", "heuristics")
IMAGE_DIR.mkdir(parents=True, exist_ok=True)

***

# Loading data

In [None]:
# Dataset-wide tables; the first CSV column becomes the index of each frame.
items = pd.read_csv(DATASET_PATH / "items.csv", index_col=0)
defects = pd.read_csv(DATASET_PATH / "defects.csv", index_col=0)

# Partition-specific tables: the submission log (with `time` parsed as
# datetimes) and the submission-by-defect matrix.
log = pd.read_csv(
    DATASET_PATH / DATA_PARTITION / "log.csv",
    index_col=0,
    parse_dates=["time"],
)
defect_log = pd.read_csv(
    DATASET_PATH / DATA_PARTITION / "defect_log.csv",
    index_col=0,
)
# CSV headers are read as strings; cast them to int so the columns can be
# used as labels into `defects`.
defect_log.columns = defect_log.columns.astype(int)

In [None]:
# Display name -> trained heuristic. Each heuristic class restores its own
# state from `<MODEL_PATH>/<display name>.pkl`, so the display names below
# double as file stems and must match the files on disk.
models = {
    name: model_cls.load(MODEL_PATH / f'{name}.pkl')
    for name, model_cls in (
        ("Task Common", TaskCommonModel),
        ("Task Characteristic", TaskCharacteristicModel),
        ("Student Frequency", StudentFrequencyModel),
        ("Student Characteristic", StudentCharacteristicModel),
        ("Student Encountered", StudentEncounteredBeforeModel),
        ("Defect Multiplicity", DefectMultiplicityModel),
        ("Severity Baseline", SeverityModel),
    )
}

***

# Quality filtering

In [None]:
def has_close_pair(row, max_diff=2):
    """Return whether two distinct present entries of ``row`` are close.

    Only pairs of *different* positions are compared. Comparing every value
    against itself (difference 0) would make the check succeed for any
    non-empty row and turn the filter into a no-op.

    Zero and NaN entries are ignored. The filtering cell passes rows of
    per-defect values in which 0 marks a defect that is absent from the
    submission, so two absent defects must not count as a close pair.
    NOTE(review): this assumes no real value of interest is exactly 0.

    Args:
        row: One-dimensional numeric values (e.g. a ``pandas.Series``).
        max_diff: Largest absolute difference still considered "close".

    Returns:
        ``True`` if at least two present entries differ by ``max_diff`` or
        less, ``False`` otherwise (including rows with fewer than two
        present entries).
    """
    values = np.asarray(row, dtype=float).ravel()
    present = values[(values != 0) & ~np.isnan(values)]
    if present.size < 2:
        return False

    # After sorting, the closest pair is always adjacent, so the smallest
    # neighbouring gap equals the smallest pairwise difference.
    smallest_gap = np.diff(np.sort(present)).min()
    return bool(smallest_gap <= max_diff)

In [None]:
# at least two distinct defects: count the defect types present in a
# submission instead of summing occurrences, so that a single defect detected
# several times does not qualify on its own
filtered = defect_log[(defect_log > 0).sum(axis=1) > 1]

In [None]:
# Keep submissions that contain a pair of defects with similar severity.
# Each present defect is replaced by its severity (0 where the defect is
# absent). The presence mask is used instead of the raw counts so that a
# defect occurring several times is not assigned count * severity.
# NOTE(review): this comment used to say "difference of one", while
# has_close_pair compares against 2 — confirm the intended threshold.
defect_severity = defects.loc[filtered.columns, 'severity']
filtered = filtered.gt(0).astype(int).mul(defect_severity, axis='columns')
filtered = filtered[filtered.apply(has_close_pair, axis=1)]

In [None]:
# Restrict both tables to the rows that survived the quality filter; `.loc`
# selects them by the index labels of `filtered`.
log = log.loc[filtered.index]
defect_log = defect_log.loc[filtered.index]

***

# Sampling criteria

In [None]:
# Calculate sampling statistics and remember heuristic scores
# NOTE(review): only the first 200 submissions are processed; confirm that
# this cap is intentional and not a debugging leftover.
submissions = log.iloc[:200]

stats = []
heuristic_scores = []
# `total` is taken from the subset that is actually iterated, so the progress
# bar reaches 100 % instead of stopping at 200 / len(log).
for idx, submission in tqdm(
    submissions.iterrows(),
    total=len(submissions),
    desc="Calculating statistics for sampling",
):
    defect_counts = defect_log.loc[idx]

    # Heuristic scores: discretized output of every model, computed before
    # the models are updated with this submission
    discrete_scores = {
        name: model.discretize(submission, defect_counts).dropna()
        for name, model in models.items()
    }

    # One row per defect present in the submission
    for defect in defect_counts[defect_counts > 0].index:
        row = {"submission id": idx, "defect id": defect}
        for name, scores in discrete_scores.items():
            row[name] = scores[defect]
        heuristic_scores.append(row)

    # Disagreement: mean Kendall distance (1 - tau) over all model pairs
    # NOTE(review): kendalltau needs equally long inputs; this relies on
    # dropna() removing the same entries for every model — confirm.
    kendall_distances = []
    for left, right in combinations(discrete_scores.values(), 2):
        tau, _ = kendalltau(left, right, nan_policy='raise')
        kendall_distances.append(1 - tau)

    # tau may be NaN; such pairs are skipped, and if no pair yields a usable
    # value the disagreement falls back to 0.0
    disagreement = np.nanmean(kendall_distances) if any(~np.isnan(kendall_distances)) else 0.0

    # Signal strength: mean range (max - min) of the scores of each model
    signal_strength = np.mean([np.ptp(values) for values in discrete_scores.values()])

    # Only now let the models see the submission
    for model in models.values():
        model.update(submission, defect_counts)

    stats.append({
        "id": idx,
        "signal strength": signal_strength,
        "model disagreement": disagreement,
    })

stats = pd.DataFrame(stats).set_index("id")
heuristic_scores = pd.DataFrame(heuristic_scores)

***

# Survey sample

In [None]:
# Floor for the combined score, so that every submission keeps a non-zero
# probability of being drawn.
min_score = 0.1
rng = np.random.default_rng(RANDOM_SEED)

# Min-max normalise every statistic column
scores = (stats - stats.min()).div(stats.max() - stats.min())

# Combine the normalised statistics and apply the floor
scores['total'] = scores.sum(axis=1).clip(lower=min_score)

# Turn the totals into sampling probabilities that sum to one
scores['probability'] = scores['total'].div(scores['total'].sum())

# Weighted draw without replacement, capped by the number of candidates
sampled_ids = rng.choice(
    scores.index,
    size=min(SAMPLE_SIZE, len(scores)),
    replace=False,
    p=scores['probability'].to_numpy(),
)


***

# Combine to a final dataframe

In [None]:
# --- Submissions ---
# Look up the name and instructions of each sampled submission's task through
# its `item` column, then keep only the renamed survey columns.
submission_df = pd.merge(
    log.loc[sampled_ids],
    items[['name', 'instructions']],
    how='inner',
    left_on='item',
    right_index=True,
).rename(columns={'answer': 'submission', 'name': 'task name'})
submission_df = submission_df[['submission', 'task name', 'instructions']]

# --- Defects ---
# Long format: one row per (submission, defect) pair.
# The axes are named before stacking so the resulting columns do not depend
# on how the index column of the CSV happened to be called. Renaming the
# automatic 'index' / 'level_1' labels afterwards only works when the index
# is literally named 'index'.
melted_defects = (
    defect_log.loc[sampled_ids]
    .rename_axis(index='submission id', columns='defect id')
    .stack()
    .rename('count')
    .reset_index()
)
# Keep only the defects that actually occur in a submission
melted_defects = melted_defects[melted_defects['count'] > 0]

# Add metadata
defect_df = (
    melted_defects
    .merge(defects, left_on='defect id', right_index=True)
    [['submission id', 'defect id', 'defect name', 'description', 'code example', 'code fix example']]
    .rename(columns={'defect name': 'name'})
)

# Add heuristic scores; a left join keeps defects without a recorded score
defect_df = defect_df.merge(
    heuristic_scores,
    on=['submission id', 'defect id'],
    how='left'
)

# --- Heuristics ---
# One row per heuristic, indexed by its display name, holding the model's own
# description and discretization scale.
heuristics_df = pd.DataFrame(
    [
        {
            'name': name,
            'description': model.get_model_description(),
            'scale': model.get_discretization_scale(),
        }
        for name, model in models.items()
    ]
).set_index('name')