# Setup and global variables

In [None]:
from pathlib import Path
from collections import defaultdict

import os
import matplotlib.pyplot as plt
import re
import seaborn as sns
import pandas as pd
import numpy as np
import random
import json
from matplotlib.colors import ListedColormap
from IPython.display import display, HTML
from tqdm import tqdm
from scipy.stats import spearmanr

import src.ipython_loader as loader
from src.prioritization import *

# --- Output settings -------------------------------------------------------
RESOLUTION = 300  # passed as `dpi` to every savefig call in this notebook
VERSION = '0.0.0'  # dataset version; part of the dataset folder and file names

# --- Locations ---------------------------------------------------------------
DATASET_PATH = Path('data', 'datasets', f'ipython_{VERSION}')
OUTPUT_PATH = DATASET_PATH.joinpath('heuristics')
IMAGE_DIR = Path('images', "heuristics")

# Two-colour map (red, green)
BINARY_CMAP = ListedColormap(['red', 'green'])

# Create the output folders (including missing parents) if they do not exist yet
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
IMAGE_DIR.mkdir(parents=True, exist_ok=True)

# Task-defect pairs observed in fewer submissions than this are treated as
# having insufficient support
MIN_TASK_DEFECT_SUBMISSIONS = 5

## Utils

In [None]:
def truncate_text(text: str, max_length: int=20):
    """Truncate text so that it never exceeds ``max_length`` characters.

    Text that already fits is returned unchanged. Longer text is cut and
    terminated with ``'...'`` so that the result is exactly ``max_length``
    characters long. When ``max_length`` is too small to hold the ellipsis
    (less than 3), the text is hard-cut to ``max_length`` characters instead.

    Args:
        text: The text to shorten.
        max_length: Maximum allowed length of the returned string.

    Returns:
        The (possibly shortened) text.
    """
    if len(text) <= max_length:
        return text

    ellipsis = '...'
    if max_length < len(ellipsis):
        # A negative slice end would keep almost the whole text and then append
        # the ellipsis, producing a result longer than max_length.
        return text[:max(max_length, 0)]

    return text[:max_length - len(ellipsis)] + ellipsis

def abbreviate_and_truncate_text(text: str, ordered_abbreviations: dict | None=None, max_length: int=20):
    """Shorten text by applying a list of abbreviations and truncating if necessary."""
    current_text = text

    if ordered_abbreviations:
        for full_word, abbr in ordered_abbreviations.items():
            # Use regex with word boundaries to ensure we replace full words only
            pattern = re.escape(full_word)
            current_text = re.sub(pattern, abbr, current_text, flags=re.IGNORECASE)

            if len(current_text) <= max_length:
                return current_text
    
    return truncate_text(current_text)

# Full word -> abbreviation used to shorten defect names for plot labels.
# The insertion order matters: abbreviate_and_truncate_text applies the
# entries one by one and stops as soon as the text is short enough.
ordered_abbreviations = {
    'whitespace': 'ws',
    'constant': 'const',
    'variable': 'var',
    'function': 'func',
    'parameter': 'param',
    'expression': 'expr',
    'argument': 'arg',
    'operator': 'op',
    'operation': 'op',
    'augmentable': 'aug',
    'assignment': 'assign',
    'container': 'cont',
    'statement': 'stmt',
    'arithmetic': 'arith',
    'condition': 'cond',
    'identifier': 'identif',
    'multiple': 'multi',
    'redundant': 'redun',
    'necessary': 'necess',  # was misspelled 'necces'
    'comparison': 'compar',
    'negated': 'neg',
    'unreachable': 'unreach',
    'inappropriate': 'inapp',
    'parenthesis': '()',
}

In [None]:
def task_and_defect_description(task, defect, items, defects, log, defect_log):
    """Generate an HTML description of a specific task and defect.

    The markup shows the task (name, instructions, solution), the defect
    (name, type, severity, description, code example and fix) and one randomly
    chosen submission to the task in which the defect was detected.

    Args:
        task: Index label of the task in ``items``.
        defect: Index label of the defect in ``defects``; also the column of
            ``defect_log`` holding the detections of that defect.
        items: Task table with "name", "instructions" and "solution" columns.
        defects: Defect table with "defect name", "defect type", "severity",
            "description", "code example" and "code fix example" columns.
        log: Submission log with "item" and "answer" columns.
        defect_log: Per-submission defect table, row-aligned with ``log``.
            Both detection counts and boolean presence flags are accepted.

    Returns:
        The HTML markup as a string.
    """
    from html import escape

    task_row = items.loc[task]
    defect_row = defects.loc[defect]

    # Compare against zero explicitly: `bool & count` is a *bitwise* AND, so a
    # detection count of 2 (1 & 2 == 0) would wrongly drop the submission.
    has_defect = defect_log[defect] > 0
    submissions = log[(log["item"] == task) & has_defect]

    if len(submissions):
        example_submission = submissions["answer"].iloc[random.randrange(len(submissions))]
    else:
        example_submission = 'No submissions found'

    # Source code regularly contains '<', '>' and '&'; escape everything that is
    # rendered verbatim inside <pre> so the browser does not parse it as markup.
    # The remaining text fields are inserted as-is (they may contain intended
    # markup -- TODO confirm against the dataset).
    solution_html = escape(str(task_row["solution"]))
    code_example_html = escape(str(defect_row["code example"]))
    code_fix_html = escape(str(defect_row["code fix example"]))
    submission_html = escape(str(example_submission))

    return f"""
    <div style="display: flex; justify-content: space-between; gap: 20px;">
        
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{task_row["name"]}</h3>
            <div><strong>Instructions:</strong><br>{task_row["instructions"]}</div>
            <div><strong>Solution:</strong><br>
                <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{solution_html}</pre>
            </div>
        </div>
        
        
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{defect_row["defect name"]}</h3>
            <div><strong>Defect Type:</strong> {defect_row["defect type"]}</div>
            <div><strong>Severity:</strong> {defect_row["severity"]}</div>
            <div><strong>Description:</strong><br>{defect_row["description"]}</div>
            
            <div style="display: flex; justify-content: space-between; margin-top: 20px;">
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Example:</strong><br>
                    <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{code_example_html}</pre>
                </div>
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Fix Example:</strong><br>
                    <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{code_fix_html}</pre>
                </div>
            </div>
        </div>
    </div>
    
    
    <div style="border: 1px solid #ccc; padding: 10px; margin-top: 20px; border-radius: 5px;">
        <strong>Example Submission:</strong><br>
        <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{submission_html}</pre>
    </div>
    """

In [None]:
def plot_model_weight_histogram(values, x_label, title, save_path, bins=80):
    """Draw a histogram of ``values`` with a log-scaled y axis and save it.

    The figure is written to ``IMAGE_DIR / save_path`` at ``RESOLUTION`` dpi
    and closed afterwards.

    Args:
        values: Array of values to plot (its ``size`` attribute is read).
        x_label: Label of the x axis.
        title: Title of the plot.
        save_path: File name of the image, relative to ``IMAGE_DIR``.
        bins: Number of histogram bins.
    """
    fig, ax = plt.subplots(figsize=(12, 8), layout="constrained")

    # With at most two values, one bin per distinct value is used instead of `bins`.
    # NOTE(review): the condition checks the number of values, not the number of
    # distinct values -- confirm this is the intended fallback.
    distinct_count = len(np.unique(values))
    bin_count = distinct_count if values.size <= 2 else bins

    sns.histplot(data=values, bins=bin_count, shrink=0.8, ax=ax)

    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency (Log Scale)')
    ax.set_yscale('log')

    fig.savefig(IMAGE_DIR / save_path, dpi=RESOLUTION)
    plt.close(fig)

In [None]:
def plot_task_weight_heatmap(model_weights, defects, items, title, save_path, mask=None, cmap='viridis'):
    """Draw a task x defect heatmap of model weights and save it.

    Rows are labelled with the tasks' and columns with the defects'
    'display name'. The figure is written to ``IMAGE_DIR / save_path`` at
    ``RESOLUTION`` dpi and closed afterwards.

    Args:
        model_weights: DataFrame of weights (index: tasks, columns: defects).
        defects: Defect table providing the 'display name' column.
        items: Task table providing the 'display name' column.
        title: Title of the plot.
        save_path: File name of the image, relative to ``IMAGE_DIR``.
        mask: Optional mask forwarded to ``sns.heatmap``.
        cmap: Colormap name; 'coolwarm' additionally gets a colour range that
            is symmetric around zero.
    """
    column_labels = defects['display name'].loc[model_weights.columns]
    row_labels = items['display name'].loc[model_weights.index]

    fig, ax = plt.subplots(figsize=(12, 17), layout="constrained")

    heatmap_options = dict(
        xticklabels=column_labels,
        yticklabels=row_labels,
        cbar=True,
        cmap=cmap,
        mask=mask,
    )
    if cmap == 'coolwarm':
        # Symmetric limits put zero at the centre of the diverging colormap
        bound = np.abs(model_weights.values).max()
        heatmap_options.update(vmin=-bound, vmax=bound)

    sns.heatmap(model_weights, **heatmap_options)

    ax.tick_params(axis='x', labelsize=7)
    ax.tick_params(axis='y', labelsize=8)
    plt.title(title)
    plt.xlabel("Defects")
    plt.ylabel("Tasks")
    fig.savefig(IMAGE_DIR / save_path, dpi=RESOLUTION)
    plt.close(fig)

In [None]:
def get_train_test_context(log, defect_log, defects, items, pct=0.8):
    """Define a time-based train/test split for task-context models.

    The "time" of the row at position ``int(len(log) * pct)`` is the pivot:
    submissions strictly before it form the training context, submissions at
    or after it form the test context.

    Args:
        log: Submission log with a "time" column.
            NOTE(review): the positional pivot only corresponds to a ``pct``
            share of the rows if ``log`` is ordered by time -- confirm.
        defect_log: Per-submission defect table, filtered with the same masks
            as ``log``.
        defects: Defect table, passed to both contexts unchanged.
        items: Task table, passed to both contexts unchanged.
        pct: Relative position of the pivot row in ``log``.

    Returns:
        Tuple ``(train_context, test_context)`` of ``ContextProvider`` objects.
    """
    train_pivot = log.iloc[int(log.shape[0] * pct)]["time"]

    is_train = log["time"] < train_pivot
    # `>=` instead of `>`: otherwise the pivot row itself, and every other row
    # sharing its timestamp, would end up in neither split.
    is_test = log["time"] >= train_pivot

    train_context = ContextProvider(log[is_train], defect_log[is_train], defects, items)
    test_context = ContextProvider(log[is_test], defect_log[is_test], defects, items)

    return train_context, test_context

In [None]:
def gini(array):
    """Compute the Gini coefficient of an array of values.

    The input does not need to be sorted or one-dimensional; it is flattened,
    copied and sorted internally, so the caller's data is never modified.
    If negative values are present, all values are shifted so that the minimum
    becomes zero before the coefficient is computed.

    Args:
        array: Array-like of numeric values.

    Returns:
        The Gini coefficient; 0.0 for empty input and for input whose
        (shifted) values sum to zero.
    """
    values = np.asarray(array, dtype=float).flatten()
    if values.size == 0:
        return 0.0

    # ensure all values are non-negative
    lowest = values.min()
    if lowest < 0:
        values = values - lowest

    # avoid zero division; checked *after* the shift, because the raw sum can be
    # zero for unequal values (e.g. [-1, 1]) and the shifted sum can be zero for
    # equal negative values (e.g. [-2, -2])
    total = values.sum()
    if total == 0:
        return 0.0

    # order values for computation
    values = np.sort(values)
    n = values.shape[0]
    index = np.arange(1, n + 1)
    # gini formula
    return np.sum((2 * index - n - 1) * values) / (n * total)

***

# Loading data

In [None]:
# Dataset tables; the first CSV column is the index of each table
items = pd.read_csv(DATASET_PATH / f'items_{VERSION}.csv', index_col=0)
log = pd.read_csv(DATASET_PATH / f'log_{VERSION}.csv', index_col=0, parse_dates=['time'])
defects = pd.read_csv(DATASET_PATH / f'defects_{VERSION}.csv', index_col=0)
defect_log = pd.read_csv(DATASET_PATH / f'defect_log_{VERSION}.csv', index_col=0)
# Column labels are read from the CSV header as strings; convert them to int
defect_log.columns = defect_log.columns.astype(int)

# Context manager so the file handle is closed deterministically
# (json.load(open(...)) left it open until garbage collection)
with open(DATASET_PATH / f'code_to_defect_id_{VERSION}.json', "r", encoding="utf-8") as mapping_file:
    code_to_defect_id = json.load(mapping_file)

# Boolean view of the defect log: True where the logged value is positive
defect_presence = defect_log > 0

full_context = ContextProvider(log, defect_log, defects, items)

# Shortened names used as axis labels in the plots
defects['display name'] = defects['defect name'].apply(lambda x: abbreviate_and_truncate_text(x, ordered_abbreviations))
items['display name'] = items['name'].apply(truncate_text)

***

# Task filtering

## Task-defect pairs without minimal support

In [None]:
# For every task (row) and defect (column): True when the defect was present in
# fewer than MIN_TASK_DEFECT_SUBMISSIONS submissions of that task.
insufficient_support = (
    defect_presence
    .groupby(log["item"])
    .sum()
    .lt(MIN_TASK_DEFECT_SUBMISSIONS)
)

***

# Task-Context Heuristics

In [None]:
# Split the data into a training and a test context
train_context, test_context = get_train_test_context(log, defect_log, defects, items, pct=0.8)

# Task-context prioritization models, each trained on the training context.
# The insertion order is the order in which the models appear in later plots.
task_models = {}
task_models["Task Common"] = TaskCommonModel().train(train_context)
task_models["Task Characteristic"] = TaskCharacteristicModel().train(train_context)
task_models["Currently Taught"] = CurrentlyTaughtPrioritizer('data/currently_taught.txt').train(train_context)
task_models["Severity Baseline"] = SeverityModel().train(train_context)

## Submissions meaningful for prioritization

In [None]:
# Only submissions with more than one present defect need prioritization
test_defects = test_context.get_defect_log() > 0
has_multiple_defects = test_defects.sum(axis=1) > 1
print("Fraction of Submissions with Multiple Defects:", has_multiple_defects.mean())
print("Total Number", has_multiple_defects.sum())

## Weight Histograms

In [None]:
# Histogram of all "Task Common" model weights (flattened over tasks and defects)
plot_model_weight_histogram(
    values=task_models["Task Common"].get_model_weights().values.flatten(),
    x_label="Task-Relative Defect Frequency",
    title="Distribution of Task-Defect Commonality",
    save_path="task_common_distribution.png",
    bins=80,
)

In [None]:
# Histogram of all "Task Characteristic" model weights (flattened over tasks and defects)
plot_model_weight_histogram(
    values=task_models["Task Characteristic"].get_model_weights().values.flatten(),
    x_label="Z-Score (Absolute Value)",
    title="Distribution of Characteristic Task-Defect Scores",
    save_path="task_characteristic_distribution.png",
    bins=80,
)

In [None]:
# Histogram of all "Currently Taught" model weights; two bins for the 0/1 judgments
plot_model_weight_histogram(
    values=task_models["Currently Taught"].get_model_weights().values.flatten(),
    x_label="LLM Judgment (0 = No, 1 = Yes)",
    title="Distribution of LLM Judgments on Currently Taught Defects",
    save_path="currently_taught_distribution.png",
    bins=2,
)

## Task-Defect Weight Maps

In [None]:
# Unmasked task x defect heatmap of the "Task Common" weights
plot_task_weight_heatmap(
    model_weights=task_models["Task Common"].get_model_weights(),
    defects=defects,
    items=items,
    title="Task-Defect Commonality",
    save_path="task_common_heatmap.png",
    mask=None,
)

In [None]:
# Unmasked task x defect heatmap of the "Task Characteristic" weights;
# 'coolwarm' makes the plotting helper use a colour range symmetric around zero
plot_task_weight_heatmap(
    model_weights=task_models["Task Characteristic"].get_model_weights(),
    defects=defects,
    items=items,
    title="Characteristic Task-Defect Scores",
    save_path="task_characteristic_heatmap.png",
    mask=None,
    cmap='coolwarm',
)

In [None]:
# Unmasked task x defect heatmap of the "Currently Taught" weights
plot_task_weight_heatmap(
    model_weights=task_models["Currently Taught"].get_model_weights(),
    defects=defects,
    items=items,
    title="LLM Judgments on Currently Taught Defects",
    save_path="currently_taught_heatmap.png",
    mask=None,
)

## Masked Task-Defect Weight Maps

In [None]:
# "Task Common" heatmap with the insufficient-support mask passed to the plot
plot_task_weight_heatmap(
    model_weights=task_models["Task Common"].get_model_weights(),
    defects=defects,
    items=items,
    title="Task-Defect Commonality",
    save_path="task_common_masked_heatmap.png",
    mask=insufficient_support,
)

In [None]:
# "Task Characteristic" heatmap with the insufficient-support mask passed to the plot
plot_task_weight_heatmap(
    model_weights=task_models["Task Characteristic"].get_model_weights(),
    defects=defects,
    items=items,
    title="Characteristic Task-Defect Scores",
    save_path="task_characteristic_masked_heatmap.png",
    mask=insufficient_support,
    cmap='coolwarm',
)

In [None]:
# "Currently Taught" heatmap with the insufficient-support mask passed to the plot
plot_task_weight_heatmap(
    model_weights=task_models["Currently Taught"].get_model_weights(),
    defects=defects,
    items=items,
    title="LLM Judgments on Currently Taught Defects",
    save_path="currently_taught_masked_heatmap.png",
    mask=insufficient_support,
)

## Quantitative analysis

In [None]:
CLOSE_TIE_THRESHOLD = 0.05  # margin for close ties as a fraction of |max score|

# Per-model lists with one entry per evaluated test submission:
#   exact_ties / close_ties -> bool, whether the top score is (nearly) shared
#   gini                    -> Gini coefficient of the scores
metrics = {
    'exact_ties': {name: [] for name in task_models.keys()},
    'close_ties': {name: [] for name in task_models.keys()},
    'gini': {name: [] for name in task_models.keys()}
}

# model_correlations[a][b]: per-submission Spearman's rho between the scores of
# models a and b (stored symmetrically; NaN when either score vector is constant)
model_correlations = {
    name1: {name2: [] for name2 in task_models.keys()}
    for name1 in task_models.keys()
}

test_log = test_context.get_log()
# Fetched once instead of once per submission inside the loop
test_defect_log = test_context.get_defect_log()
model_names = list(task_models.keys())

for _, submission_log in tqdm(test_log.iterrows(), total=test_log.shape[0]):
    defect_counts = test_defect_log.loc[submission_log.name]
    present_defects = defect_counts[defect_counts > 0]

    # Skip submissions with too few issues to prioritize
    if len(present_defects) < 2:
        continue

    prioritizations = []

    for name, model in task_models.items():
        scores = model.prioritize(submission_log, present_defects)

        # --- Exact Ties ---
        # Does more than one defect share the maximum score?
        max_score = scores.max()
        exact_ties = (scores == max_score).sum() > 1
        metrics['exact_ties'][name].append(exact_ties)

        # --- Close Ties ---
        # Does more than one defect lie within a relative margin of the maximum?
        # abs() keeps the margin non-negative: with a negative maximum the signed
        # product would move the cut-off *above* the maximum, so not even the
        # top-scored defect would count and exact ties would not be close ties.
        close_tie_threshold_value = abs(max_score) * CLOSE_TIE_THRESHOLD
        close_ties = (scores >= (max_score - close_tie_threshold_value)).sum() > 1
        metrics['close_ties'][name].append(close_ties)

        # --- Gini Coefficient ---
        metrics['gini'][name].append(gini(scores.values))

        # Store the prioritization to calculate inter-model agreement.
        # NOTE(review): assumes every model returns its scores in the same defect
        # order; if `prioritize` returns a sorted result the rank correlation
        # below compares mismatched defects -- confirm.
        prioritizations.append(scores.values)

    # --- Agreement ---
    # Every unordered pair of models exactly once
    for i, first in enumerate(model_names):
        for j in range(i + 1, len(model_names)):
            second = model_names[j]
            first_score = prioritizations[i]
            second_score = prioritizations[j]
            # Spearman's rho is undefined for constant arrays
            if np.all(first_score == first_score[0]) or np.all(second_score == second_score[0]):
                rho = np.nan
            else:
                rho, _ = spearmanr(first_score, second_score)
            model_correlations[first][second].append(rho)
            model_correlations[second][first].append(rho)

# --- Final Aggregation and Formatting ---
# Mean per model; for the two boolean tie metrics this is the fraction of
# evaluated submissions in which a tie occurred
results = {}
for metric_name, data in metrics.items():
    avg_values = {name: np.mean(values) for name, values in data.items()}
    results[f'avg_{metric_name}'] = pd.Series(avg_values)

print(f"Average Exact Ties: \n{results['avg_exact_ties']}")
print("-" * 20)
print(f"Average Close Ties: \n{results['avg_close_ties']}")
print("-" * 20)
print(f"Average Gini Coefficients: \n{results['avg_gini']}")

### Decisiveness - Ties

In [None]:
# Stacked bars per model: the fraction of submissions with an exact tie, and on
# top of it the additional fraction with only a close tie (every exact tie is
# also a close tie, hence the difference).
plot_data = pd.DataFrame({
    'Exact Ties': results['avg_exact_ties'],
    'Close Ties': results['avg_close_ties'] - results['avg_exact_ties']
}).T.reset_index()

plot_data = plot_data.rename(columns={'index': 'Tie Type'})

# (the unused melted copy of plot_data was removed)
ax = plot_data.set_index('Tie Type').T.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 8),
    colormap='tab10'
)

# The underlying per-submission values are booleans ("is there a tie?"), so the
# averages are fractions of submissions, not tie counts; label them as such.
plt.title("Fraction of Submissions with Exact and Close Ties per Model", fontsize=16)
plt.ylabel("Fraction of Submissions", fontsize=12)
plt.xlabel("Prioritization Model", fontsize=12)
plt.xticks(rotation=45, ha="right")

plt.legend(title='Tie Type')
plt.tight_layout()
plt.savefig(IMAGE_DIR / 'task_models_tie_bar_plot.png', dpi=RESOLUTION)
plt.close()

### Decisiveness - Gini

In [None]:
# Long-format table: one row per (model, evaluated submission) Gini coefficient
plot_data = pd.DataFrame([
    {'Model': model_name, 'Gini Coefficient': coefficient}
    for model_name, coefficients in metrics['gini'].items()
    for coefficient in coefficients
])

plt.figure(figsize=(12, 8))

# One notched box per model
boxplot_options = dict(
    x='Model',
    y='Gini Coefficient',
    data=plot_data,
    notch=True,
    palette='viridis',
    hue='Model',
    legend=False,
)
sns.boxplot(**boxplot_options)

plt.title("Distribution of Gini Coefficients by Prioritization Model", fontsize=16)
plt.xlabel("Prioritization Model", fontsize=12)
plt.ylabel("Gini Coefficient", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.ylim(0, 1)
plt.tight_layout()

plt.savefig(IMAGE_DIR / 'task_models_gini_box_plot.png', dpi=RESOLUTION)
plt.close()

### Inter-Model Agreement

In [None]:
# Square matrix of mean pairwise correlations; the diagonal stays NaN
model_names = list(task_models.keys())
correlation_matrix = np.full((len(model_names), len(model_names)), np.nan)

for i, first in enumerate(model_names):
    for j, second in enumerate(model_names):
        if j <= i:
            continue
        # NaN entries (pairs with a constant score vector) are ignored by nanmean
        mean_rho = np.nanmean(model_correlations[first][second])
        correlation_matrix[i, j] = mean_rho
        correlation_matrix[j, i] = mean_rho

correlation_matrix = pd.DataFrame(correlation_matrix, index=model_names, columns=model_names)

plt.figure(figsize=(10, 8))

heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
    cbar_kws={'label': "Average Spearman's Correlation"},
    vmin=-1,
    vmax=1,
)
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title("Inter-Model Prioritization Agreement (Spearman's Rho)", fontsize=16)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()

plt.savefig(IMAGE_DIR / 'task_models_correlation_heatmap.png', dpi=RESOLUTION)
plt.close()

### Agreement with baseline

In [None]:
# Long-format table: one row per per-submission correlation between the
# "Severity Baseline" and each model listed under it in model_correlations
plot_data = pd.DataFrame([
    {'Model': model, 'Spearman_Rho': corr}
    for model, corr_list in model_correlations['Severity Baseline'].items()
    for corr in corr_list
])

plt.figure(figsize=(10, 8))

# One notched box per model
boxplot_options = dict(
    x='Model',
    y='Spearman_Rho',
    data=plot_data,
    palette='rocket',
    hue='Model',
    legend=False,
    notch=True,
)
sns.boxplot(**boxplot_options)

plt.title("Distribution of Spearman's Rho by Prioritization Model", fontsize=16)
plt.xlabel("Prioritization Model", fontsize=12)
plt.ylabel("Spearman's Rho", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

plt.savefig(IMAGE_DIR / 'task_models_baseline_correlation_box_plot.png', dpi=RESOLUTION)
plt.close()

### Sensitivity

In [None]:
# Train every task model on the full context (all submissions).
# NOTE(review): `train` is called on the objects stored in `task_models`; if it
# trains in place and returns the same object, `task_models` is retrained on the
# full data as well -- confirm before re-running earlier cells.
full_task_models = dict(
    (name, model.train(full_context))
    for name, model in task_models.items()
)

In [None]:
# For every model: task x task matrix of Spearman correlations between the
# weight rows of two tasks.
matrices = {}
for name, model in tqdm(full_task_models.items(), desc="Analyzing Task Models"):
    weights = model.get_model_weights()

    if weights is None:
        # No weights available: the matrix is filled with ones off the diagonal
        # and zeros on the diagonal.
        matrix_values = np.ones((len(items.index), len(items.index)))
        np.fill_diagonal(matrix_values, 0)
        matrices[name] = pd.DataFrame(matrix_values, index=items.index, columns=items.index)
        continue

    # Float frame pre-filled with NaN (instead of an object-dtype frame), so
    # later numeric operations work directly; the diagonal stays NaN.
    correlation_matrix = pd.DataFrame(np.nan, index=items.index, columns=items.index, dtype=float)

    # Hoisted out of the pair loop: each task's weight row and whether it is
    # constant (Spearman's rho is undefined for constant input).
    task_weights = {task: weights.loc[task] for task in items.index}
    is_constant = {
        task: bool(np.all(row == row.iloc[0]))
        for task, row in task_weights.items()
    }

    for first in items.index:
        for second in items.index:
            # Each unordered pair once; the result is mirrored below
            if first >= second:
                continue

            if is_constant[first] or is_constant[second]:
                rho = np.nan
            else:
                rho, _ = spearmanr(task_weights[first], task_weights[second])

            correlation_matrix.loc[first, second] = rho
            correlation_matrix.loc[second, first] = rho
    matrices[name] = correlation_matrix

In [None]:
# 2 x 2 grid of task-to-task correlation heatmaps, one panel per model
fig, axes = plt.subplots(
    nrows=2, ncols=2, figsize=(20, 18),
    sharex=True, sharey=True,
    gridspec_kw={'wspace': 0.05, 'hspace': 0.05}  # Fine-tune the spacing
)
axes = axes.flatten()

models = ["Task Common", "Task Characteristic", "Currently Taught", "Severity Baseline"]
cmaps = ['coolwarm'] * len(models)

for i, (ax, name) in enumerate(zip(axes, models)):
    correlation_matrix = matrices[name]

    sns.heatmap(
        correlation_matrix.astype(float),
        ax=ax,
        cmap=cmaps[i],
        vmin=-1,
        vmax=1,
        linewidths=.5,
        cbar=False,
    )

    ax.set_title(f"Task-to-Task Agreement for {name}", fontsize=18)

    # Axis labels only on the outer edge of the grid:
    # x on the bottom row (panels 2, 3), y on the left column (panels 0, 2)
    on_bottom_row = i >= 2
    on_left_column = i % 2 == 0

    if on_bottom_row:
        ax.set_xlabel('Task ID', fontsize=12)
    else:
        ax.set_xlabel('')
        ax.set_xticklabels([])

    if on_left_column:
        ax.set_ylabel('Task ID', fontsize=12)
    else:
        ax.set_ylabel('')
        ax.set_yticklabels([])

# Create a single color bar for the entire figure, in a dedicated axes on the
# right; all panels use the same colormap and limits, so the last one serves
fig.subplots_adjust(right=0.85)
cbar_ax = fig.add_axes([0.88, 0.15, 0.02, 0.7])
last_heatmap = axes[-1].collections[0]
cbar = fig.colorbar(last_heatmap, cax=cbar_ax)
cbar.set_label("Spearman's Correlation", fontsize=16)

plt.savefig(IMAGE_DIR / 'task_to_task_correlation_heatmaps.png', dpi=RESOLUTION)
plt.close()

In [None]:
# Per model: standard deviation of the task-to-task correlations, taken over
# the strict upper triangle (each task pair once, diagonal excluded).
std_devs = {}
for name, correlation_matrix in matrices.items():
    upper_triangle_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
    # Index the numeric array directly instead of where(...).stack(): that
    # relied on stack() silently dropping the masked and undefined (NaN) cells,
    # which newer pandas versions no longer do, and it could hand an
    # object-dtype array to std(). nanstd ignores the undefined correlations.
    off_diagonal_values = correlation_matrix.to_numpy(dtype=float)[upper_triangle_mask]
    std_devs[name] = np.nanstd(off_diagonal_values)

std_df = pd.DataFrame(list(std_devs.items()), columns=['Model', 'Standard Deviation'])

plt.figure(figsize=(10, 6))
sns.barplot(x='Model', y='Standard Deviation', data=std_df, palette='viridis', hue='Model', legend=False)

plt.title("Standard Deviation of Inter-Task Correlation", fontsize=16)
plt.ylabel("Standard Deviation of Spearman's Rho", fontsize=12)
plt.xlabel("Prioritization Model", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

plt.savefig(IMAGE_DIR / 'task_models_sensitivity_bar_plot.png', dpi=RESOLUTION)
plt.close()

***

# Student-Context Heuristics

In [None]:
# Student-context prioritization models.
# NOTE(review): only the severity baseline is trained here; the other four are
# used as constructed -- confirm they need no `train` call before
# `get_model_weights` is used on them below.
student_models = {}
student_models["Student Frequency"] = StudentFrequencyModel()
student_models["Student Characteristic"] = StudentCharacteristicModel()
student_models["Student Encountered"] = StudentEncounteredBeforeModel()
student_models["Defect Multiplicity"] = DefectMultiplicityModel()
student_models["Severity Baseline"] = SeverityModel().train(full_context)

## Weight Histograms

In [None]:
# Histogram of all "Student Frequency" model weights (flattened)
plot_model_weight_histogram(
    values=student_models["Student Frequency"].get_model_weights().values.flatten(),
    x_label="Frequency",
    title="Distribution of Student-Specific Frequency",
    save_path="student_frequency_distribution.png",
    bins=80,
)

In [None]:
# Histogram of all "Student Characteristic" model weights (flattened)
plot_model_weight_histogram(
    values=student_models["Student Characteristic"].get_model_weights().values.flatten(),
    x_label="Z-Score (Absolute Value)",
    title="Distribution of Characteristic Student-Defect Scores",
    save_path="student_characteristic_distribution.png",
    bins=80,
)

# Pilot Testing

# Output