# Set up and global variables

In [None]:
from pathlib import Path
from collections import defaultdict

import os
import matplotlib.pyplot as plt
import re
import seaborn as sns
import pandas as pd
import numpy as np
import random
import json
from matplotlib.colors import ListedColormap
from IPython.display import display, HTML
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, cross_val_score, LeaveOneGroupOut, train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import src.ipython_loader as loader
from src.prioritization import PrioritizationModel, DummyTaskPrioritizer

# Settings shared by every figure and artefact produced in this notebook.
RESOLUTION = 300  # dpi handed to savefig
VERSION = '0.0.0'  # dataset version; embedded in input and output file names
DATASET_PATH = Path('data', 'datasets', f'ipython_{VERSION}')
OUTPUT_PATH = DATASET_PATH.joinpath('heuristics')
BINARY_CMAP = ListedColormap(['red', 'green'])

IMAGE_DIR = Path('images', 'heuristics')

# Make sure both output locations exist before any cell writes to them.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
IMAGE_DIR.mkdir(parents=True, exist_ok=True)

# Task-defect pairs seen in fewer submissions than this are treated as
# having insufficient support.
MIN_TASK_DEFECT_SUBMISSIONS = 5

## Plotting utils

In [None]:
def truncate_text(text: str, max_length: int=20):
    """Truncate text if it exceeds the maximum length."""
    if len(text) > max_length:
        return text[:max_length - 3] + '...'
    return text

def abbreviate_and_truncate_text(text: str, ordered_abbreviations: dict | None=None, max_length: int=20):
    """Shorten text by applying a list of abbreviations and truncating if necessary."""
    current_text = text

    if ordered_abbreviations:
        for full_word, abbr in ordered_abbreviations.items():
            # Use regex with word boundaries to ensure we replace full words only
            pattern = re.escape(full_word)
            current_text = re.sub(pattern, abbr, current_text, flags=re.IGNORECASE)

            if len(current_text) <= max_length:
                return current_text
    
    return truncate_text(current_text)

# Phrase -> abbreviation pairs used to shorten defect names for plot labels.
# The order matters: abbreviate_and_truncate_text walks the mapping in
# insertion order and stops as soon as the text is short enough.
# NOTE(review): 'necces' looks like a misspelling of an abbreviation for
# 'necessary' - confirm before changing, since it alters the rendered labels.
ordered_abbreviations = dict([
    ('whitespace', 'ws'),
    ('constant', 'const'),
    ('variable', 'var'),
    ('function', 'func'),
    ('parameter', 'param'),
    ('expression', 'expr'),
    ('argument', 'arg'),
    ('operator', 'op'),
    ('operation', 'op'),
    ('augmentable', 'aug'),
    ('assignment', 'assign'),
    ('container', 'cont'),
    ('statement', 'stmt'),
    ('arithmetic', 'arith'),
    ('condition', 'cond'),
    ('identifier', 'identif'),
    ('multiple', 'multi'),
    ('redundant', 'redun'),
    ('necessary', 'necces'),
    ('comparison', 'compar'),
    ('negated', 'neg'),
    ('unreachable', 'unreach'),
    ('inappropriate', 'inapp'),
    ('parenthesis', '()'),
])

In [None]:
def task_and_defect_description(task, defect, items, defects, log, defect_log):
    """Generate an HTML snippet describing a task, a defect and one example submission.

    Keyword Arguments:
        task -- Index label of the task in ``items``.
        defect -- Index label of the defect in ``defects`` and column label in ``defect_log``.
        items -- Task table; the columns "name", "instructions" and "solution" are read.
        defects -- Defect table; the columns "defect name", "defect type", "severity",
            "description", "code example" and "code fix example" are read.
        log -- Submission table; the columns "item" and "answer" are read.
        defect_log -- Per-submission defect table aligned with ``log`` by index. Both
            boolean presence flags and occurrence counts are accepted.

    Returns the HTML as a string. The example is picked at random from the
    submissions of ``task`` that contain ``defect``; a placeholder text is shown
    when there are none.
    """
    from html import escape  # local import: the notebook's import cell does not provide it

    task_row = items.loc[task]
    defect_row = defects.loc[defect]
    # "> 0" yields a boolean mask for both presence flags and occurrence counts.
    has_defect = defect_log[defect] > 0
    submissions = log[(log["item"] == task) & has_defect]

    if len(submissions):
        example_submission = submissions["answer"].iloc[random.randrange(len(submissions))]
    else:
        example_submission = 'No submissions found'

    # Source code routinely contains '<', '>' and '&', which would otherwise be
    # parsed as markup inside <pre>. Instructions and descriptions are inserted
    # as-is because they may contain intentional markup (NOTE(review): confirm).
    solution_code = escape(str(task_row["solution"]))
    defect_code = escape(str(defect_row["code example"]))
    defect_fix_code = escape(str(defect_row["code fix example"]))
    submission_code = escape(str(example_submission))

    return f"""
    <div style="display: flex; justify-content: space-between; gap: 20px;">
        
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{task_row["name"]}</h3>
            <div><strong>Instructions:</strong><br>{task_row["instructions"]}</div>
            <div><strong>Solution:</strong><br>
                <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{solution_code}</pre>
            </div>
        </div>
        
        
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{defect_row["defect name"]}</h3>
            <div><strong>Defect Type:</strong> {defect_row["defect type"]}</div>
            <div><strong>Severity:</strong> {defect_row["severity"]}</div>
            <div><strong>Description:</strong><br>{defect_row["description"]}</div>
            
            <div style="display: flex; justify-content: space-between; margin-top: 20px;">
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Example:</strong><br>
                    <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{defect_code}</pre>
                </div>
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Fix Example:</strong><br>
                    <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{defect_fix_code}</pre>
                </div>
            </div>
        </div>
    </div>
    
    
    <div style="border: 1px solid #ccc; padding: 10px; margin-top: 20px; border-radius: 5px;">
        <strong>Example Submission:</strong><br>
        <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{submission_code}</pre>
    </div>
    """

***

# Loading data

In [None]:
# Load the versioned dataset tables; each CSV uses its first column as the index.
items = pd.read_csv(DATASET_PATH / f'items_{VERSION}.csv', index_col=0)
log = pd.read_csv(DATASET_PATH / f'log_{VERSION}.csv', index_col=0, parse_dates=['time'])
defects = pd.read_csv(DATASET_PATH / f'defects_{VERSION}.csv', index_col=0)
defect_log = pd.read_csv(DATASET_PATH / f'defect_log_{VERSION}.csv', index_col=0)
# CSV headers are read as strings; cast them to int so they can be used as
# labels into `defects`.
defect_log.columns = defect_log.columns.astype(int)
# Use a context manager so the file handle is closed deterministically.
with open(DATASET_PATH / f'code_to_defect_id_{VERSION}.json', "r", encoding="utf-8") as mapping_file:
    code_to_defect_id = json.load(mapping_file)
# Boolean view of the defect log: True where the defect occurs at least once.
defect_presence = defect_log > 0
# Shortened labels used on plot axes.
defects['display name'] = defects['defect name'].apply(lambda x: abbreviate_and_truncate_text(x, ordered_abbreviations))
items['display name'] = items['name'].apply(truncate_text)

***

# Task filtering

## Task-Defect Pairs Without Minimum Support

In [None]:
# For every task, count the submissions in which each defect is present and
# flag the task-defect pairs whose count is below the minimum support.
insufficient_support = (
    defect_presence
    .groupby(log["item"])
    .sum()
    .lt(MIN_TASK_DEFECT_SUBMISSIONS)
)

***

# Task-Context Heuristics

## Rare Task-Defect Pairs

Based on the relative frequency of a defect appearing in a given task.

In [None]:
class TaskRarityPrioritizer(DummyTaskPrioritizer):
    """Weights each defect by how seldom it shows up in submissions of a given task."""

    def train(self, log, defect_log, defects, items):
        """Fit the task-by-defect rarity weights and return the trained model.

        The weight of a task-defect pair is one minus the share of the task's
        submissions in which the defect occurs at least once. Rows follow
        ``items.index`` and columns follow ``defects.index``.
        """
        super().train(log, defect_log, defects, items)

        occurs = defect_log.gt(0)
        task_frequency = occurs.groupby(log["item"]).mean()
        self.task_weights = 1 - task_frequency.loc[items.index, defects.index]

        return self

In [None]:
# Fit the rarity model on the full dataset and persist it with the other heuristics.
task_rarity_prioritizer = TaskRarityPrioritizer()
task_rarity_prioritizer.train(log, defect_log, defects, items)  # train() returns the same instance
task_rarity_prioritizer.save(OUTPUT_PATH / f'task_rarity_prioritizer_{VERSION}.pkl')

### Analysis

In [None]:
# Pool every task-defect rarity into one flat array. Only values strictly
# below 1 are kept, i.e. pairs where the defect was observed in the task.
rarity_values = task_rarity_prioritizer.task_weights.to_numpy().flatten()
rarity_values = rarity_values[rarity_values < 1]

fig, ax = plt.subplots(figsize=(12, 8), layout="constrained")
sns.histplot(data=rarity_values, bins=80, shrink=0.8, ax=ax)

# Log-scaled counts keep the sparsely populated bins visible.
ax.set_yscale('log')
ax.set(
    xlabel='Rarity (1 - Task-Relative Defect Frequency)',
    ylabel='Frequency (Log Scale)',
    title='Distribution of Task-Defect Rarity (Inverse Task-Relative Defect Frequency)',
)

fig.savefig(IMAGE_DIR / 'task-defect_rarity_distribution.png', dpi=RESOLUTION)
plt.close(fig)

## Surprising/Characteristic Task-Defect Pairs

Based on the z-score of a task-defect pair's frequency compared to the overall defect frequency.

In [None]:
class TaskCharacteristicPrioritizer(DummyTaskPrioritizer):
    """Weights each defect by how strongly its frequency in a task deviates from its overall frequency."""

    def train(self, log, defect_log, defects, items):
        """Fit the task-by-defect z-score weights and return the trained model.

        For every task the share of submissions containing a defect is
        compared with the defect's share over all submissions, and the
        difference is divided by the standard deviation of the defect's
        presence flag over all submissions.
        """
        super().train(log, defect_log, defects, items)

        occurs = defect_log.gt(0)
        per_task_rate = occurs.groupby(log['item']).mean()
        self.task_weights = per_task_rate.sub(occurs.mean()).div(occurs.std())

        return self

In [None]:
# Fit the characteristic-defect model on the full dataset and persist it.
task_characteristic_prioritizer = TaskCharacteristicPrioritizer()
task_characteristic_prioritizer.train(log, defect_log, defects, items)  # train() returns the same instance
task_characteristic_prioritizer.save(OUTPUT_PATH / f'task_characteristic_prioritizer_{VERSION}.pkl')

In [None]:
# Task x defect z-score matrix, plus a flat series of its defined entries
# for the distribution plot and the colour-scale limits.
task_defect_z_scores = task_characteristic_prioritizer.task_weights
z_score_values = pd.Series(task_defect_z_scores.to_numpy().flatten()).dropna()

### Analysis

In [None]:
# Blank out task-defect pairs with too few supporting submissions so they
# show up as empty cells in the heatmap.
masked_z_scores = task_defect_z_scores.copy()
masked_z_scores[insufficient_support] = None

defect_names = defects['display name'].loc[task_defect_z_scores.columns]
task_names = items['display name'].loc[task_defect_z_scores.index]

fig, ax = plt.subplots(figsize=(12, 17), layout="constrained")

# Symmetric colour limits keep a z-score of zero at the centre of the diverging palette.
lim = max(abs(z_score_values.max()), abs(z_score_values.min()))

sns.heatmap(
    masked_z_scores,
    xticklabels=defect_names,
    yticklabels=task_names,
    cbar=True,
    cmap='coolwarm',
    vmin=-lim,
    vmax=lim,
    ax=ax,
)
ax.tick_params(axis='x', labelsize=7)
ax.tick_params(axis='y', labelsize=8)
ax.set_title('Characteristic Task-Defect Pairs (Z-Scores of Task-Relative Defect Frequencies with Minimum Support)')
ax.set_xlabel("Defects")
ax.set_ylabel("Tasks")

fig.savefig(IMAGE_DIR / 'task-defect_characteristic_heatmap.png', dpi=RESOLUTION)
plt.close(fig)

In [None]:
# Histogram of all defined task-defect z-scores.
plt.figure(figsize=(12, 6), layout="constrained")
plt.hist(z_score_values, bins=100, color='skyblue', edgecolor='black')

plt.gca().set(
    title='Distribution Task-Defect Pair Characteristic Scores (Z-Scores of Task-Relative Defect Frequencies)',
    xlabel='Z-Score',
    ylabel='Frequency',
)
plt.grid(True)

plt.savefig(IMAGE_DIR / 'task-defect_characteristic_distribution.png', dpi=RESOLUTION)
plt.close()

## Currently Taught Topic

An LLM-based heuristic on whether a defect is related to a topic currently being taught.

In [None]:
class CurrentlyTaughtPrioritizer(DummyTaskPrioritizer):
    """Prioritizes defects based on LLM judgements on whether they relate to the currently taught concepts."""

    def __init__(self, data_path: Path | str, *args, **kwargs):
        """Initialize the model.

        Keyword Arguments:
            data_path -- Path to the LLM judgements, a '|'-separated table.
                ``train`` reads its "Task Name" and "Defect ID" columns.
        """
        super().__init__(*args, **kwargs)
        self.data = pd.read_csv(data_path, sep='|', index_col=False)

    def train(self, log, defect_log, defects, items):
        """Build the binary task-by-defect weight matrix and return the trained model.

        A pair gets weight 1 when the judgement table lists the defect for the
        task and 0 otherwise. Rows follow ``items.index`` and columns follow
        ``defects.index``. A "Task ID" column is added to ``self.data``.
        """
        super().train(log, defect_log, defects, items)

        # Build the name -> id lookup straight from the index, so it does not
        # depend on the index of `items` being labelled 'id'.
        task_name_to_id = pd.Series(items.index.to_numpy(), index=items['name'].to_numpy())
        self.data['Task ID'] = self.data['Task Name'].map(task_name_to_id)

        # crosstab counts the judgements per pair; collapse the counts to 0/1.
        self.task_weights = pd.crosstab(self.data['Task ID'], self.data['Defect ID']).astype(bool).astype(int)
        self.task_weights = self.task_weights.reindex(index=items.index, columns=defects.index, fill_value=0)

        return self

In [None]:
# Fit the currently-taught model from the stored LLM judgements and persist it.
currently_taught_prioritizer = CurrentlyTaughtPrioritizer('data/currently_taught.txt')
currently_taught_prioritizer.train(log, defect_log, defects, items)  # train() returns the same instance
currently_taught_prioritizer.save(OUTPUT_PATH / f'currently_taught_prioritizer_{VERSION}.pkl')

### Analysis

In [None]:
# Binary task x defect matrix: 1 where the defect relates to what the task teaches.
currently_taught = currently_taught_prioritizer.task_weights

defect_names = defects['display name'].loc[currently_taught.columns]
task_names = items['display name'].loc[currently_taught.index]

fig, ax = plt.subplots(figsize=(12, 17), layout="constrained")

sns.heatmap(currently_taught, xticklabels=defect_names, yticklabels=task_names, ax=ax)

ax.tick_params(axis='x', labelsize=7)
ax.tick_params(axis='y', labelsize=8)
ax.set_title('Defects Related to Concepts Currently Taught in the Task')
ax.set_xlabel("Defects")
ax.set_ylabel("Tasks")

fig.savefig(IMAGE_DIR / 'task-defect_currently_taught.png', dpi=RESOLUTION)
plt.close(fig)


***

# Student-Context Heuristics

## Student-Specific Frequency (Accuracy)

Calculates, for each student and each defect, the running share of opportunities in which the student avoided the defect, updated after every submission.

### Calculation

In [None]:
# Running per-student, per-defect accuracy: after every submission, the share
# of the student's opportunities so far in which the defect was absent.
df = log.merge(defect_log, left_index=True, right_index=True)
df["time"] = pd.to_datetime(df["time"])
df = df.sort_values(by=['user', 'time'])
accuracy_log = []

# The original mask name (`rare`) is not defined anywhere in this notebook.
# `insufficient_support` (task x defect, True when too few submissions of the
# task contain the defect) is used instead.
# NOTE(review): confirm this is the intended mask.
reasonable_pairs = ~insufficient_support

for user_id, history in tqdm(df.groupby('user')):
    correct_count = dict.fromkeys(defect_log.columns, 0)
    encounter_count = dict.fromkeys(defect_log.columns, 0)

    for idx, row in history.iterrows():
        accuracy_row = {}
        is_reasonable_mask = reasonable_pairs.loc[row['item']]

        for defect in defect_log.columns:
            # The defect log stores occurrence counts, so any positive value
            # means the defect is present (not only exactly 1).
            has_defect = row[defect] > 0

            if not (has_defect or is_reasonable_mask[defect]):
                # Neither made nor reasonably expected in this task: no opportunity.
                accuracy_row[defect] = np.nan
                continue

            encounter_count[defect] += 1
            if not has_defect:
                correct_count[defect] += 1
            accuracy_row[defect] = correct_count[defect] / encounter_count[defect]

        accuracy_row['submission id'] = idx
        accuracy_log.append(accuracy_row)

accuracy_log = pd.DataFrame(accuracy_log).set_index('submission id')


### Analysis and Visualization

In [None]:
# NOTE(review): this cell used `small_figsize`, `long_figsize`, `save`,
# `plot_histogram` and `unreasonable`, none of which is defined in this
# notebook. It now follows the conventions of the task-context cells:
# explicit figure sizes, unconditional saving into IMAGE_DIR, and
# `insufficient_support` as the task-defect mask. Confirm sizes and mask.

# Standardise every defect's accuracy column and pool the defined values.
student_specific_frequency = (accuracy_log - accuracy_log.mean()) / accuracy_log.std()
values = pd.Series(student_specific_frequency.values.flatten()).dropna().values

upper_quantile = 0.90
student_upper_threshold = np.quantile(values, upper_quantile)
lower_quantile = 0.20
student_lower_threshold = np.quantile(values, lower_quantile)

plt.figure(figsize=(12, 6), layout="constrained")
plt.hist(values, bins=200, edgecolor='black')
plt.axvline(student_lower_threshold, color='red', linestyle='--', linewidth=2, label=f'{int(lower_quantile*100)}% threshold')
plt.axvline(student_upper_threshold, color='green', linestyle='--', linewidth=2, label=f'{int(upper_quantile*100)}% threshold')
plt.title('Distribution of Student-Specific Frequency Z-Scores')
plt.xlabel('Z-Score')
plt.ylabel('Count')
plt.legend()
plt.grid(True)
plt.savefig(IMAGE_DIR / 'student_specific_frequency_histogram.png', dpi=RESOLUTION)
plt.show()

# Keep only accuracies below 1, i.e. the defect was made at least once so far.
accuracy_at_least_once = accuracy_log[accuracy_log < 1]

accuracy_histograms = [
    (
        accuracy_at_least_once.values.flatten(),
        'Distribution of User-Defect Accuracy',
        'user_defect_accuracy_distribution.png',
    ),
    (
        accuracy_at_least_once.groupby(log['user']).mean().values.flatten(),
        'Distribution of User-Defect Accuracy (User Averages)',
        'user_defect_accuracy_user_averages.png',
    ),
]
for histogram_values, histogram_title, file_name in accuracy_histograms:
    plt.figure(figsize=(12, 6), layout="constrained")
    # Undefined entries must be dropped: plt.hist cannot bin NaN.
    plt.hist(histogram_values[~np.isnan(histogram_values)], bins=10, edgecolor='black')
    plt.title(histogram_title)
    plt.xlabel('Accuracy')
    plt.ylabel('Count')
    plt.savefig(IMAGE_DIR / file_name, dpi=RESOLUTION)
    plt.show()

accuracy_means = accuracy_log.mean().sort_values(ascending=False)
plt.figure(figsize=(12, 6), layout="constrained")
plt.bar(np.arange(len(accuracy_means)), accuracy_means)
plt.title("Average Accuracy per Defect")
plt.ylabel("Average Accuracy")
plt.xlabel("Defect")
plt.xticks(np.arange(len(accuracy_means)), defects['display name'].loc[accuracy_means.index].tolist(), rotation=90)
plt.grid(axis='y')
plt.savefig(IMAGE_DIR / 'average_accuracy_per_defect.png', dpi=RESOLUTION)
plt.show()

# Number of tasks per defect with sufficient support, in the same defect
# order as the accuracy chart above.
has_reasonable = (~insufficient_support).sum()
has_reasonable = has_reasonable.loc[accuracy_means.index]

plt.figure(figsize=(12, 6), layout="constrained")
plt.bar(np.arange(len(has_reasonable)), has_reasonable)
plt.title("Number of Reasonable Opportunities per Defect")
plt.ylabel("Number of Tasks")
plt.xlabel("Defect")
plt.xticks(np.arange(len(has_reasonable)), defects['display name'].loc[has_reasonable.index].tolist(), rotation=90)
plt.grid(axis='y')
plt.show()

## Defect Multiplicity

Examines how often a defect appears multiple times in a single submission.

### Calculation and Analysis

In [None]:
# NOTE(review): this cell used `long_figsize`, `small_figsize`, `save` and
# `plot_histogram`, none of which is defined in this notebook. It now uses
# explicit figure sizes and saves unconditionally into IMAGE_DIR, like the
# task-context cells. Confirm the sizes.

# Cap the per-submission occurrence count at 10 to limit the effect of outliers.
multiplicity_log = defect_log.clip(upper=10)
# NaN where the defect is absent, so the statistics cover occurrences only.
occurring = multiplicity_log[multiplicity_log > 0]

means = occurring.mean().sort_values(ascending=False)
plt.figure(figsize=(12, 6), layout="constrained")
plt.bar(np.arange(len(means)), means.values)
plt.xticks(np.arange(len(means)), defects['display name'].loc[means.index].tolist(), rotation=90)
plt.title("Defects by Mean Multiplicity (When Occuring)")
plt.xlabel("Defect")
plt.ylabel("Mean Multiplicity")
plt.savefig(IMAGE_DIR / 'defects_by_mean_multiplicity.png', dpi=RESOLUTION)
plt.show()

# Named `variances` rather than `vars` to avoid shadowing the builtin.
variances = occurring.var().sort_values(ascending=False)
plt.figure(figsize=(12, 6), layout="constrained")
plt.bar(np.arange(len(variances)), variances.values)
plt.xticks(np.arange(len(variances)), defects['display name'].loc[variances.index].tolist(), rotation=90)
plt.title("Multiplicity Variance")
plt.xlabel("Defect")
plt.ylabel("Multiplicity Variance")
plt.savefig(IMAGE_DIR / 'multiplicity_variance.png', dpi=RESOLUTION)
plt.show()

multiplicity_values = occurring.values.flatten()
plt.figure(figsize=(12, 6), layout="constrained")
# Absent defects are NaN here and must be dropped: plt.hist cannot bin NaN.
plt.hist(multiplicity_values[~np.isnan(multiplicity_values)], bins=10, edgecolor='black')
plt.title('Distribution of Multiplicity')
plt.xlabel('Multiplicity')
plt.ylabel('Count')
plt.savefig(IMAGE_DIR / 'multiplicity_distribution.png', dpi=RESOLUTION)
plt.show()


## Student Recently Fixed

Examines the recency of a defect being fixed by a student.

### Calculation

In [None]:
# For every submission and defect, record how many of the student's
# submissions have passed since the defect was last marked as fixed.
np.random.seed(42)  # the "fixed" draw below is random; seed it for reproducibility
df = log.merge(defect_log, left_index=True, right_index=True)
df["time"] = pd.to_datetime(df["time"])
df = df.sort_values(by=['user', 'time'])

recently_fixed_log = []
for user_id, history in tqdm(df.groupby('user')):
    last_fixed = dict.fromkeys(defect_log.columns)
    for i, (idx, row) in enumerate(history.iterrows()):
        recency_row = {}
        for defect in defect_log.columns:
            # The defect log stores occurrence counts, so any positive value
            # means the defect is present (not only exactly 1).
            if row[defect] > 0:
                # A present defect is treated as fixed with probability 0.8.
                if np.random.rand() < 0.8:
                    recency_row[defect] = 0
                    last_fixed[defect] = i
                else:
                    recency_row[defect] = np.nan
            elif last_fixed[defect] is not None:
                recency_row[defect] = i - last_fixed[defect]
            else:
                # Never fixed so far: recency is undefined.
                recency_row[defect] = np.nan
        recency_row['submission id'] = idx
        recently_fixed_log.append(recency_row)

recently_fixed_log = pd.DataFrame(recently_fixed_log).set_index('submission id')


### Analysis and Visualization

In [None]:
# NOTE(review): this cell used `long_figsize` and `save`, neither of which is
# defined in this notebook. It now uses explicit figure sizes and saves
# unconditionally into IMAGE_DIR, like the task-context cells. Confirm sizes.

# Share of defined recency values that are exactly 0, per defect.
first_time_rate = (recently_fixed_log == 0)[~recently_fixed_log.isna()].mean().sort_values(ascending=True)
defect_labels = defects['display name'].loc[first_time_rate.index].tolist()

plt.figure(figsize=(12, 6), layout="constrained")
plt.bar(np.arange(len(first_time_rate)), first_time_rate)
plt.title("Percentage of First-Time Occurrences per Defect")
plt.ylabel("First-Time Rate")
plt.xlabel("Defect")
plt.xticks(np.arange(len(first_time_rate)), defect_labels, rotation=90)
plt.grid(axis='y')
plt.savefig(IMAGE_DIR / 'percentage_of_first_time_occurrences_per_defect.png', dpi=RESOLUTION)
plt.show()

# Bin the positive recencies; pd.cut intervals are right-inclusive, so a
# recency of 0 falls outside the first bin and is not counted.
bins = [0, 1, 2, 4, 9, 14, 19, 24, 29, np.inf]
bin_labels = ['1', '2', '3-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30+']
recency_binned = recently_fixed_log.apply(lambda col: pd.cut(col, bins=bins, labels=bin_labels))
recency_counts = recency_binned.apply(lambda col: col.value_counts(sort=False)).fillna(0).astype(int)
# Pin the row order to the bin order so the heatmap's y-axis is monotonic.
recency_counts = recency_counts.reindex(bin_labels)
recency_scaled = recency_counts.div(recency_counts.sum(axis=0), axis=1)
recency_scaled = recency_scaled.loc[:, first_time_rate.index]

plt.figure(figsize=(12, 6), layout="constrained")
sns.heatmap(recency_scaled, cmap='viridis', cbar_kws={'label': 'frequency'}, xticklabels=defect_labels)
plt.title("Number of Sessions Before Fix Reoccurrence")
plt.xlabel("Defect")
plt.ylabel("Recency Bin")
plt.gca().invert_yaxis()
plt.savefig(IMAGE_DIR / 'heatmap_of_recency_bins_per_defect.png', dpi=RESOLUTION)
plt.show()

# Median of the positive recencies per defect, in the same defect order as above.
median_recency = recently_fixed_log.replace(0, np.nan).median()
median_recency = median_recency.loc[first_time_rate.index]

plt.figure(figsize=(12, 6), layout="constrained")
plt.bar(np.arange(len(median_recency)), median_recency)
plt.title("Median Recency (# of Submissions Since Last Seen) per Defect")
plt.ylabel("Median Recency")
plt.xlabel("Defect")
plt.xticks(np.arange(len(median_recency)), defect_labels, rotation=90)
plt.grid(axis='y')
plt.savefig(IMAGE_DIR / 'average_recency_per_defect.png', dpi=RESOLUTION)
plt.show()

***

# Baseline Heuristics

## Severity

Analyzes the severity of defects as a prioritization baseline.

### Calculation

In [None]:
# Multiply every defect column by that defect's severity; entries that end up
# undefined are set to 0.
defect_severity = defects.loc[defect_log.columns, 'severity']
severity_log = defect_log.mul(defect_severity, axis='columns').fillna(0)

### Analysis and Visualization

In [None]:
# NOTE(review): this cell used `save` and `small_figsize`, neither of which is
# defined in this notebook. It now uses explicit figure sizes and saves
# unconditionally, like the task-context cells. The difference histogram was
# written to 'images/' while every other figure goes to IMAGE_DIR; it now
# goes to IMAGE_DIR as well. Confirm sizes and target folder.

# Treat a severity of 0 (defect absent) as missing.
df_severity = severity_log.where(severity_log != 0)

# Per task: the highest observed severity of each defect, averaged over defects.
means = df_severity.groupby(log['item']).max().mean(axis=1).sort_values()
plt.figure(figsize=(13, 4), layout="constrained")
plt.bar(np.arange(len(means)), means.values)
plt.xticks(np.arange(len(means)), [items['name'][idx].split(' ')[0] for idx in means.index], rotation=90)
plt.title("Mean Severity for Each Task")
plt.xlabel("Task")
plt.ylabel("Severity")
plt.tick_params(axis='x', labelsize=7)
plt.savefig(IMAGE_DIR / 'mean_severity.png', dpi=RESOLUTION)
plt.show()


def _top_two_gap(row):
    """Return the gap between the two largest severities in a submission, or NaN if fewer than two."""
    top = row.nlargest(2)
    return top.iloc[0] - top.iloc[1] if len(top) > 1 else np.nan


# value_counts drops the NaN gaps of submissions with fewer than two defects.
differences = df_severity.apply(_top_two_gap, axis=1).value_counts().sort_index()

plt.figure(figsize=(12, 6), layout="constrained")
plt.bar(differences.index.astype(int), differences.values, edgecolor='black')
plt.title('Histogram of Differences in Severity')
plt.xlabel('Difference')
plt.ylabel('Count')
plt.savefig(IMAGE_DIR / 'severity_differences.png', dpi=RESOLUTION)
plt.show()


***

# Rejected Heuristics

## Time Spent on Task

A heuristic based on the time a student spends on a task, with outliers clipped.

### Calculation

In [None]:
# Response time relative to the mean response time of the same task, capped
# at `clip_threshold` times that mean.
clip_threshold = 5
log['relative_time_spent'] = log['responseTime'].div(log.groupby('item')['responseTime'].transform('mean'))
print(f'Clipping all submissions to {clip_threshold} times the task mean: {(log["relative_time_spent"] > clip_threshold).mean():.2%} changed.')
log['relative_time_spent'] = log['relative_time_spent'].clip(upper=clip_threshold)
# 90th percentile of the clipped values; drawn as a marker in the plot below.
time_spent_threshold = log['relative_time_spent'].quantile(0.90)

#### Visualization

In [None]:
# NOTE(review): this cell used `small_figsize` and `save`, neither of which is
# defined in this notebook. It now uses an explicit figure size and saves
# unconditionally into IMAGE_DIR, like the task-context cells. Confirm the size.
plt.figure(figsize=(12, 6), layout="constrained")
sns.histplot(log['relative_time_spent'], bins=1000, kde=True)
plt.axvline(time_spent_threshold, color='red', linestyle='--', label=f'{time_spent_threshold:.2f} (90th Percentile)')
plt.title("Distribution of Relative Time Spent per Task (Values Over 5 Clipped)")
plt.xlabel("Relative Time Spent")
plt.ylabel("Count")
plt.legend()
plt.savefig(IMAGE_DIR / 'distribution_of_relative_time_spent_per_task.png', dpi=RESOLUTION)
plt.show()

## Associated with Poor Performance (Locally)

A heuristic based on the precision of a defect in predicting task failures.

### Calculation

In [None]:
from sklearn.metrics import precision_score

# Per task and defect: precision of "defect present" as a predictor of an
# incorrect submission, computed on the log that includes incorrect submissions.
_, not_log, _, not_defect_log, _ = loader.load(DATASET_PATH, DATASET_PATH, only_correct=False)
df = not_log[['item', 'correct']].merge(not_defect_log, left_index=True, right_index=True)
correlations = {}
for task_id, task_df in df.groupby('item'):
    # astype(bool) keeps "~" a logical negation even if 'correct' is stored
    # as 0/1 integers (NOTE(review): confirm the column's dtype).
    incorrect = ~task_df['correct'].astype(bool)
    corr_dict = {}
    for defect in defect_log.columns:
        # Binarise the occurrence counts: precision_score expects binary labels.
        # A local name is used so the notebook-level `defect_presence` frame
        # is not overwritten.
        has_defect = task_df[defect] > 0
        if has_defect.nunique() > 1 and incorrect.nunique() > 1:
            corr_dict[defect] = precision_score(incorrect, has_defect)
        else:
            # Precision is not informative when either side is constant.
            corr_dict[defect] = np.nan
    correlations[task_id] = corr_dict
performance = pd.DataFrame.from_dict(correlations, orient='index')
# The original mask name (`unreasonable`) is not defined in this notebook;
# `insufficient_support` is used instead. NOTE(review): confirm the mask.
performance[insufficient_support] = np.nan

### Visualization

In [None]:
# NOTE(review): this cell used `abbreviate_text`, `save` and `small_figsize`,
# none of which is defined in this notebook. It now uses the precomputed
# 'display name' labels, explicit figure sizes and unconditional saving into
# IMAGE_DIR, like the task-context cells. Confirm labels and sizes.
defect_names = defects['display name'].loc[performance.columns]
task_names = items['display name'].loc[performance.index]

fig, ax = plt.subplots(figsize=(12, 17), layout="constrained")
sns.heatmap(performance, xticklabels=defect_names, yticklabels=task_names, cbar_kws={'label': 'Precision'}, ax=ax)
ax.tick_params(axis='x', labelsize=7)
ax.tick_params(axis='y', labelsize=8)
plt.title('Defect-Failure Precision')
plt.xlabel("Defects")
plt.ylabel("Tasks")
plt.savefig(IMAGE_DIR / 'task-defect_precision_heatmap.png', dpi=RESOLUTION)
plt.show()

# Flatten to the defined precision values only.
performances = performance.stack().dropna()

plt.figure(figsize=(12, 6), layout="constrained")
plt.hist(performances, bins=100, color='skyblue', edgecolor='black')
plt.title('Histogram of Task-Defect Pair Precision in Predicting Failures')
plt.xlabel('Precision')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig(IMAGE_DIR / 'histogram_of_task-defect_pair_precision.png', dpi=RESOLUTION)
plt.show()


## Future Opportunity Likelihood

A heuristic based on the probability of a student encountering a specific defect in a future task.

### Calculation

In [None]:
# For every submission: the weighted average, over the tasks the student has
# not attempted yet, of each defect's task-relative frequency.
all_tasks = items.index
opportunity_log = []
# `relative_frequencies` is not defined anywhere in this notebook. It is
# rebuilt here as the share of each task's submissions that contain the defect.
# NOTE(review): confirm this matches the original definition.
relative_frequencies = (defect_log > 0).groupby(log['item']).mean()
df = log.merge(defect_log, left_index=True, right_index=True)
df["time"] = pd.to_datetime(df["time"])
df = df.sort_values(by=['user', 'time'])

for user_id, history in tqdm(df.groupby('user')):
    completed_tasks = set()
    for submission_id, row in history.iterrows():
        current_task = row['item']
        completed_tasks.add(current_task)
        remaining_tasks = [t for t in all_tasks if t not in completed_tasks]
        if not remaining_tasks:
            # Nothing left to attempt: no future opportunity for any defect.
            opportunity_log.append({'submission id': submission_id, **dict.fromkeys(relative_frequencies.columns, 0.0)})
            continue

        # Tasks with a larger id than the current one count double
        # (presumably they come later in the course - TODO confirm).
        weights = np.array([2.0 if t > current_task else 1.0 for t in remaining_tasks])
        weighted_avg = (relative_frequencies.loc[remaining_tasks].T @ weights) / weights.sum()
        opportunity_log.append({'submission id': submission_id, **weighted_avg.to_dict()})

opportunity_log = pd.DataFrame(opportunity_log).set_index('submission id')

### Analysis and Visualization

In [None]:
# NOTE(review): this cell used `plot_histogram`, `long_figsize`, `save` and
# `abbreviate_text`, none of which is defined in this notebook. It now uses
# explicit figure sizes, the precomputed 'display name' labels and
# unconditional saving into IMAGE_DIR, like the task-context cells.
# Confirm sizes, labels and the histogram's bin count (matplotlib default).
opportunity_values = opportunity_log.values.flatten()
plt.figure(figsize=(12, 6), layout="constrained")
# Drop NaN defensively: plt.hist cannot bin NaN.
plt.hist(opportunity_values[~np.isnan(opportunity_values)], edgecolor='black')
plt.title('Histogram of Future Opportunities')
plt.xlabel('Opportunity')
plt.ylabel('Count')
plt.savefig(IMAGE_DIR / 'histogram_of_future_opportunities.png', dpi=RESOLUTION)
plt.show()

means = opportunity_log.mean().sort_values(ascending=False)
plt.figure(figsize=(12, 6), layout="constrained")
bar_width = 0.4
ticks = np.arange(len(means)) + 0.4

# Paired bars per defect: mean future opportunity next to the defect-log mean.
plt.bar(ticks - bar_width / 2, means.values, label='Opportunity', width=bar_width)
plt.bar(ticks + bar_width / 2, defect_log[means.index].mean().values, label='Frequency', width=bar_width)

plt.xticks(ticks, defects['display name'].loc[means.index].tolist(), rotation=90)
plt.title("Defects by Mean Opportunity vs Frequency")
plt.xlabel("Defect")
plt.ylabel("Mean Opportunity")
plt.legend()
plt.savefig(IMAGE_DIR / 'defects_by_mean_opportunity.png', dpi=RESOLUTION)
plt.show()

# Mean opportunity per task, centred per defect. The values are centred only
# and not divided by a standard deviation, although the labels say "Z-Score".
task_opportunities = opportunity_log.groupby(log['item']).mean()
task_opportunities = (task_opportunities - task_opportunities.mean(axis=0))

defect_names = defects['display name'].loc[task_opportunities.columns]
task_names = items['display name'].loc[task_opportunities.index]

fig, ax = plt.subplots(figsize=(12, 17), layout="constrained")
sns.heatmap(task_opportunities, xticklabels=defect_names, yticklabels=task_names, cmap='coolwarm', cbar_kws={'label': 'Z-Score'}, ax=ax)
ax.tick_params(axis='x', labelsize=7)
ax.tick_params(axis='y', labelsize=8)
plt.title('Future Opportunity Z-Score by Task-Defect Pair')
plt.xlabel("Defects")
plt.ylabel("Tasks")
plt.savefig(IMAGE_DIR / 'task-defect_opportunity_heatmap.png', dpi=RESOLUTION)
plt.show()