# Setup and global variables

In [None]:
import os

from pathlib import Path
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from tqdm import tqdm
from scipy.stats import kendalltau

from src.prioritization import *

# Number of submissions to draw for the survey sample
SAMPLE_SIZE = 30

In [None]:
# Set CONFIG_ENV before the config is imported and loaded.
# Change the constant condition to True to select the production profile.
os.environ["CONFIG_ENV"] = "production" if False else "debug"

from config import load_config
config = load_config()

DEBUG = config["DEBUG"]

# Sections of the configuration that are read several times below
_defaults = config['DEFAULTS']
_paths = config['PATHS']

RESOLUTION = _defaults['resolution']
RANDOM_SEED = _defaults['random_seed']

# input data
STORAGE_PATH = _paths['storage']
HOLD_OUT_DATA_PATH = _paths['student_hold_out_set']
TRAINED_MODELS_PATH = _paths['hold_out_trained_heuristics']
FINAL_MODEL_PATH = None  # TODO

# output data
STUDY_OUTPUT_PATH = HOLD_OUT_DATA_PATH / 'student_study_submissions'
IMAGE_DIR = _paths['images'] / 'student_study_preparation'

# Create both output directories up front; existing directories are left untouched
for _output_dir in (STUDY_OUTPUT_PATH, IMAGE_DIR):
    os.makedirs(_output_dir, exist_ok=True)

***

# Loading data

In [None]:
# Static tables from the storage directory; the first CSV column becomes the index
items = pd.read_csv(STORAGE_PATH / 'items.csv', index_col=0)
defects = pd.read_csv(STORAGE_PATH / 'defects.csv', index_col=0)

# Hold-out submissions; the 'time' column is parsed into datetimes
log = pd.read_csv(HOLD_OUT_DATA_PATH / 'log.csv', index_col=0, parse_dates=['time'])
# Per-submission defect table (rows: submissions, columns: defects)
defect_log = pd.read_csv(HOLD_OUT_DATA_PATH / 'defect_log.csv', index_col=0)
# CSV headers are read as strings; cast them to int so the column labels can be
# used to look up rows in `defects`
defect_log.columns = defect_log.columns.astype(int)

In [None]:
# Heuristic classes whose trained state is restored from TRAINED_MODELS_PATH
models = (
    TaskCommonModel,
    TaskCharacteristicModel,
    StudentCommonModel,
    StudentCharacteristicModel,
    StudentEncounteredBeforeModel,
    DefectMultiplicityModel,
    SeverityModel,
)

# Load every class from '<model name>.pkl' and key the loaded object by the
# name it reports, keeping the order of the tuple above
models = {
    loaded.get_model_name(): loaded
    for loaded in (
        model_cls.load(TRAINED_MODELS_PATH / f'{model_cls.get_model_name()}.pkl')
        for model_cls in models
    )
}

***

# Quality filtering

In [None]:
# at least two defects
# A defect counts as present when its entry is positive; keep submissions
# with two or more distinct defects present
is_non_trivial = defect_log.gt(0).sum(axis=1).ge(2)

In [None]:
# severity is decisive
# Severity of every defect per submission; defects that are absent contribute 0
severities = (defect_log > 0).astype(int) * defects.loc[defect_log.columns]['severity']
# Compare only the defects that are actually present: absent ones are masked to
# NaN so that their placeholder 0 cannot create a spurious severity spread.
present_severities = severities.where(defect_log > 0)
# Decisive = the present defects do not all share the same severity.
# Rows without any defect yield NaN - NaN, which compares as False.
is_decisive = (present_severities.max(axis=1) - present_severities.min(axis=1)) > 0
is_non_trivial = is_non_trivial & is_decisive

In [None]:
# Share of submissions with at least two defects whose severities are tied.
# The mask is recomputed here because is_non_trivial has already been combined
# with is_decisive, which would make the statistic trivially constant.
has_multiple_defects = (defect_log > 0).sum(axis=1) > 1
print(f"Pct of tied severity results: {1 - is_decisive[has_multiple_defects].mean():.2%}")

***

# Generate heuristic scores

In [None]:
# Calculate sampling statistics and remember heuristic scores
discrete_scores = []
continuous_scores = []

# Ensure log is sorted by time
log = log.sort_values(by='time')

# In debug mode only the first 500 submissions are processed; the progress bar
# total is taken from the same frame so that it matches the iteration count
submissions = log.iloc[:500] if DEBUG else log

for idx, submission in tqdm(submissions.iterrows(), total=submissions.shape[0], desc="Calculating statistics for sampling"):
    defect_counts = defect_log.loc[idx]

    if not is_non_trivial.loc[idx]:
        # Filtered-out submissions are not scored, they are only fed to the models
        for model in models.values():
            model.update(submission, defect_counts)
        continue

    # NOTE(review): submissions that are scored below are never passed to
    # model.update() in this loop — confirm that this is intended.

    # Heuristic scores
    discrete_model_scores = {name: model.discretize(submission, defect_counts).dropna() for name, model in models.items()}
    continuous_model_scores = {name: model._calculate_scores(submission, defect_counts) for name, model in models.items()}

    # Save model scores: one row per defect present in the submission,
    # with one column per model
    for defect in defect_counts[defect_counts > 0].index:
        discrete_row = {"submission id": idx, "defect id": defect}
        continuous_row = discrete_row.copy()
        for name, scores in discrete_model_scores.items():
            discrete_row[name] = scores[defect]
            continuous_row[name] = continuous_model_scores[name][defect]
        discrete_scores.append(discrete_row)
        continuous_scores.append(continuous_row)

discrete_scores = pd.DataFrame(discrete_scores)
continuous_scores = pd.DataFrame(continuous_scores)

***

# Construct dataset

## Encode as pairs

## Add features

***

# Filter for cases where model and baseline disagree

In [None]:
# Keep only the submissions that passed the quality filters (rows where is_non_trivial is True).
# NOTE(review): no model-vs-baseline disagreement criterion is applied here yet — confirm whether one is still to be added.
eligible = log.loc[is_non_trivial]

***

# Survey sample

In [None]:
rng = np.random.default_rng(RANDOM_SEED)

# Stratify by topic
strata_labels = items['topic'].unique()
strata_counts = np.zeros(strata_labels.shape[0], dtype=int)
strata = log.loc[eligible.index, 'item'].map(items['topic'])
# Strata found to contain no eligible submission that is still unsampled
strata_exhausted = np.zeros(strata_labels.shape[0], dtype=bool)

# Greedy sampling
sampled_ids = []
# Stop once every stratum is exhausted; otherwise the loop would never end
# when fewer than SAMPLE_SIZE submissions can be sampled
while len(sampled_ids) < SAMPLE_SIZE and not strata_exhausted.all():
    # Least represented strata (ties are broken at random)
    topic_idx = rng.choice(np.where(strata_counts == strata_counts.min())[0], size=1)
    # Collect eligible submissions
    unassigned_ids_in_strata = eligible[strata == strata_labels[topic_idx][0]].index.difference(sampled_ids)
    if len(unassigned_ids_in_strata) > 0:
        sampled_ids.append(rng.choice(unassigned_ids_in_strata, replace=False))
    else:
        strata_exhausted[topic_idx] = True
    # The attempt is counted even when nothing was drawn, so that an empty
    # stratum is not the least represented one again on the next iteration
    strata_counts[topic_idx] += 1

if len(sampled_ids) < SAMPLE_SIZE:
    print(f"Only {len(sampled_ids)} of {SAMPLE_SIZE} requested submissions could be sampled")

***

# Analysis

## Coverage

In [None]:
# Number of sampled submissions in which each defect occurs at least once
# (presence is counted, not the number of occurrences, to match the axis label)
defect_counts = (defect_log.loc[sampled_ids] > 0).sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(defect_counts.index.astype(str), defect_counts.values)
ax.set_xlabel("Defect ID")
ax.set_ylabel("Number of Submissions")
ax.set_title("Coverage of Defects in Hold-Out Partition")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(IMAGE_DIR / "defect_coverage.png", dpi=RESOLUTION)
plt.show()

In [None]:
# Number of sampled submissions per task; tasks without any sampled submission
# are listed with a count of 0 instead of NaN
task_counts = (
    log.loc[sampled_ids, "item"]
    .value_counts()
    .reindex(items.index, fill_value=0)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(task_counts.index.astype(str), task_counts.values)
ax.set_xlabel("Task ID")
ax.set_ylabel("Number of Submissions")
ax.set_title("Coverage of Tasks in Hold-Out Partition")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(IMAGE_DIR / "task_coverage.png", dpi=RESOLUTION)
plt.show()


In [None]:
topics = items['topic'].unique()
# Number of sampled submissions per topic (looked up through the submission's
# item); topics without any sampled submission get a count of 0 instead of NaN
topic_counts = (
    items.loc[log.loc[sampled_ids, "item"], "topic"]
    .value_counts()
    .reindex(topics, fill_value=0)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(topic_counts.index.astype(str), topic_counts.values)
ax.set_xlabel("Topic")
ax.set_ylabel("Number of Submissions")
ax.set_title("Coverage of Topics in Hold-Out Partition")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(IMAGE_DIR / "topic_coverage.png", dpi=RESOLUTION)
plt.show()


***

# Generate survey files

***

# Save the survey files