# set up

In [None]:
import math
import random
import os
import json

from pathlib import Path
from itertools import combinations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors as mcolors
import seaborn as sns

from tqdm import tqdm
from scipy.stats import chi2_contingency
from matplotlib.colors import ListedColormap
from IPython.display import HTML

# Output settings shared by every figure written by this notebook.
RESOLUTION = 300 # dpi
VERSION = '0.0.0'
DATASET_PATH = Path('data') / 'datasets' / f'ipython_{VERSION}'

# Figures are grouped into one sub-directory per analysis level.
image_dir = Path('images') / 'exploratory'
defect_images = image_dir / "defect_level"
task_images = image_dir / "task_level"
task_defect_images = image_dir / "task_and_defect"

# Create the output directories up front; existing ones are left untouched.
for output_dir in (defect_images, task_images, task_defect_images):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
def get_regression_line(x: pd.Series, y: pd.Series) -> np.ndarray:
    """Evaluate the least-squares straight line through (x, y) at every x.

    A degree-1 polynomial is fitted with ``np.polyfit`` and the fitted
    values ``slope * x + intercept`` are returned, one per element of x.
    """
    coefficients = np.polyfit(x, y, deg=1)
    slope = coefficients[0]
    intercept = coefficients[1]
    fitted = slope * x + intercept
    return fitted

# loading data

In [None]:
# Every table lives in one CSV per dataset version; the first CSV column is used as the index.
items = pd.read_csv(DATASET_PATH / f'items_{VERSION}.csv', index_col=0)
log = pd.read_csv(DATASET_PATH / f'log_{VERSION}.csv', index_col=0, parse_dates=['time'])
defects = pd.read_csv(DATASET_PATH / f'defects_{VERSION}.csv', index_col=0)
defect_log = pd.read_csv(DATASET_PATH / f'defect_log_{VERSION}.csv', index_col=0)

# CSV headers are read as strings; cast the column labels to int so they can
# be used for label lookups in `defects` (e.g. defects.loc[<columns>, ...]).
defect_log.columns = defect_log.columns.astype(int)

# Context manager so the JSON file handle is closed right after reading.
with open(DATASET_PATH / f'code_to_defect_id_{VERSION}.json', "r", encoding="utf-8") as mapping_file:
    code_to_defect_id = json.load(mapping_file)

# Boolean view of defect_log: True where the stored value is greater than zero.
defect_presence = defect_log > 0

# grouping by defect

## defect frequency

In [None]:
# Share of submissions in which each defect is present, most frequent first.
frequencies = defect_presence.mean(axis=0).sort_values(ascending=False)

defect_names = defects.loc[frequencies.index, 'defect name']
defect_types = defects.loc[frequencies.index, 'defect type']

# Sorted so that a defect type gets the same colour on every run; iterating a
# set of strings has no stable order across interpreter sessions.
unique_types = sorted(set(defects['defect type']))
palette = plt.get_cmap('Paired')
# max(..., 1) avoids dividing by zero when only one defect type exists.
color_span = max(len(unique_types) - 1, 1)
color_map = {dtype: palette(i / color_span) for i, dtype in enumerate(unique_types)}

plt.figure(figsize=(12, 8))
plt.bar(
    x=defect_names,
    height=frequencies,
    color=[color_map[dtype] for dtype in defect_types]
)

plt.title('Relative Frequency of Defects by Category', fontsize=16)
plt.xlabel('Defect Name', fontsize=12)
plt.ylabel('Relative Frequency', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# The bars carry no labels, so build one proxy rectangle per type for the legend.
handles = [plt.Rectangle((0, 0), 1, 1, color=color_map[dtype]) for dtype in unique_types]
plt.legend(handles, unique_types, title='Defect Type', loc='upper right')

plt.savefig(defect_images / 'defect_frequency.png', dpi=RESOLUTION)
plt.close()

### defect frequencies without poor formatting

In [None]:
# Share of submissions in which each defect is present, most frequent first.
frequencies = defect_presence.mean(axis=0).sort_values(ascending=False)

# Filter out poor formatting. The mask is looked up in the order of
# `frequencies`, so both share exactly the same index.
is_formatting = defects.loc[frequencies.index, 'defect type'] == 'poor formatting'
frequencies = frequencies[~is_formatting]

defect_names = defects.loc[frequencies.index, 'defect name']
defect_types = defects.loc[frequencies.index, 'defect type']

# Sorted so that a defect type gets the same colour on every run; iterating a
# set of strings has no stable order across interpreter sessions.
unique_types = sorted(set(defects['defect type']))
palette = plt.get_cmap('Paired')
# max(..., 1) avoids dividing by zero when only one defect type exists.
color_span = max(len(unique_types) - 1, 1)
color_map = {dtype: palette(i / color_span) for i, dtype in enumerate(unique_types)}

plt.figure(figsize=(12, 8))
plt.bar(
    x=defect_names,
    height=frequencies,
    color=[color_map[dtype] for dtype in defect_types]
)

plt.title('Relative Frequency of Defects by Category', fontsize=16)
plt.xlabel('Defect Name', fontsize=12)
plt.ylabel('Relative Frequency', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# The bars carry no labels, so build one proxy rectangle per type for the legend.
handles = [plt.Rectangle((0, 0), 1, 1, color=color_map[dtype]) for dtype in unique_types]
plt.legend(handles, unique_types, title='Defect Type', loc='upper right')

plt.savefig(defect_images / 'defect_frequency_without_formatting.png', dpi=RESOLUTION)
plt.close()

### defect frequencies by severity

In [None]:
# Sorted so that a defect type gets the same colour on every run; iterating a
# set of strings has no stable order across interpreter sessions.
unique_types = sorted(set(defects['defect type']))
palette = plt.get_cmap('Paired')
# max(..., 1) avoids dividing by zero when only one defect type exists.
color_span = max(len(unique_types) - 1, 1)
color_map = {dtype: palette(i / color_span) for i, dtype in enumerate(unique_types)}

unique_severities = sorted(defects['severity'].unique())

# Size the grid from the data instead of assuming exactly four severity
# levels; a fixed 2x2 grid raises IndexError as soon as a fifth level exists.
cols = 2
rows = max(math.ceil(len(unique_severities) / cols), 1)

fig, axes = plt.subplots(rows, cols, figsize=(18, 6 * rows), squeeze=False)
axes = axes.flatten()

for ax, severity_level in zip(axes, unique_severities):
    # Filter defects by the current severity level
    filtered_defects = defects[defects['severity'] == severity_level].index

    # Relative frequency of each of those defects, most frequent first
    frequencies = defect_presence.loc[:, filtered_defects].mean(axis=0).sort_values(ascending=False)

    # Filter out poor formatting (mask built in the order of `frequencies`)
    is_formatting = defects.loc[frequencies.index, 'defect type'] == 'poor formatting'
    frequencies = frequencies[~is_formatting]

    defect_names = defects.loc[frequencies.index, 'defect name']
    defect_types = defects.loc[frequencies.index, 'defect type']

    # Plot on the current subplot
    ax.bar(
        defect_names,
        frequencies,
        color=[color_map[dtype] for dtype in defect_types]
    )

    ax.set_title(f'Severity: {severity_level}', fontsize=14)
    ax.set_xlabel('Defect Name')
    ax.set_ylabel('Relative Frequency')

    ax.set_xticks(range(len(defect_names)))
    ax.set_xticklabels(defect_names, rotation=45, ha='right')

# Hide grid cells that have no severity level assigned
for unused_ax in axes[len(unique_severities):]:
    unused_ax.set_visible(False)

# Add a single legend for the entire figure
handles = [plt.Rectangle((0, 0), 1, 1, color=color_map[dtype]) for dtype in unique_types]
fig.legend(handles, unique_types, title='Defect Type', loc='upper right')

# Leave the right 10 % of the figure free for the legend
plt.tight_layout(rect=[0, 0, 0.9, 1])
plt.savefig(defect_images / 'defect_frequency_by_severity.png', dpi=RESOLUTION)
plt.close()

### defect frequencies by type

In [None]:
# One bar chart per defect type, laid out on a two-column grid.
unique_types = sorted(set(defects['defect type']))
num_types = len(unique_types)

cols = 2
rows = math.ceil(num_types / cols)

# Spread the types evenly over the 'Paired' colormap.
paired = plt.get_cmap('Paired')
color_map = {
    dtype: paired(position / (num_types - 1))
    for position, dtype in enumerate(unique_types)
}

fig, axes = plt.subplots(rows, cols, figsize=(18, 6 * rows))
axes = axes.flatten()

for ax, defect_type in zip(axes, unique_types):
    # Ids of the defects that belong to the current type
    filtered_defects_ids = defects[defects['defect type'] == defect_type].index
    if filtered_defects_ids.empty:
        # Nothing to draw for this type; its subplot stays blank.
        continue

    # Relative frequency of each of those defects, most frequent first
    frequencies = (
        defect_presence.loc[:, filtered_defects_ids]
        .mean(axis=0)
        .sort_values(ascending=False)
    )
    defect_names = defects.loc[frequencies.index, 'defect name']

    ax.bar(defect_names, frequencies, color=color_map[defect_type])

    ax.set_title(f'Defect Type: {defect_type}', fontsize=14)
    ax.set_xlabel('Defect Name')
    ax.set_ylabel('Relative Frequency')

    ax.set_xticks(range(len(defect_names)))
    ax.set_xticklabels(defect_names, rotation=45, ha='right')

# Grid cells past the last defect type are surplus; hide them.
for spare_ax in axes[num_types:]:
    spare_ax.set_visible(False)

# One shared legend for the whole figure, built from proxy rectangles.
handles = [plt.Rectangle((0, 0), 1, 1, color=color_map[dtype]) for dtype in unique_types]
fig.legend(handles, unique_types, title='Defect Type', loc='upper right')

# Keep the right 10 % of the figure free for the legend.
plt.tight_layout(rect=[0, 0, 0.9, 1])
plt.savefig(defect_images / 'defect_frequency_by_type.png', dpi=RESOLUTION)
plt.close()

### defect frequency by severity and type

In [None]:
# Number of distinct defects for every (severity, defect type) combination.
df = pd.crosstab(defects['severity'], defects['defect type'])

# Severity levels 1-5 that have no defect at all still get an all-zero row.
for level in range(1, 6):
    if level in df.index:
        continue
    df.loc[level] = np.zeros(len(df.columns), dtype=int)
df = df.sort_index()

fig, ax = plt.subplots(figsize=(10, 8), layout="constrained")
sns.heatmap(df, annot=True, cmap="Blues", fmt="d", ax=ax)

# Red cross per defect type: the count-weighted mean severity. Both
# coordinates are shifted by half a cell to match the heatmap's cell grid.
for position, column in enumerate(df.columns):
    weighted_mean_severity = (df[column] * df.index).sum() / df[column].sum()
    ax.scatter(position + 0.5, weighted_mean_severity - 0.5, marker='X', color='red', s=100)

# Flip the y-axis so that severity grows upwards.
ax.invert_yaxis()

colorbar = ax.collections[0].colorbar
colorbar.set_label('Number of Unique Defects', rotation=90, labelpad=15)
plt.title('Distribution of Defects by Severity and Type')

fig.savefig(defect_images / 'defect_severity_type_heatmap.png', dpi=RESOLUTION)
plt.close()

In [None]:
# Number of submissions containing each defect, formatting defects excluded.
defect_counts = defect_presence.loc[:, defects[defects['defect type'] != 'poor formatting'].index].sum()

counts_df = defect_counts.to_frame('count').join(defects[['severity', 'defect type']])

heatmap_data = pd.pivot_table(
    counts_df,
    values='count',
    index='severity',
    columns='defect type',
    aggfunc="sum",
    fill_value=0
).sort_index()

# Pad the rows to the severity levels 1-5, as the unique-defect heatmap
# does. The mean marker below is placed at `mean severity - 0.5`, which only
# hits the right row when row k of the heatmap is severity k + 1.
all_levels = heatmap_data.index.union(pd.Index(range(1, 6)))
heatmap_data = heatmap_data.reindex(all_levels, fill_value=0).rename_axis(index='severity')

fig, ax = plt.subplots(figsize=(10, 8), layout="constrained")
sns.heatmap(heatmap_data, annot=True, cmap="Blues", fmt="d", ax=ax)

# Red cross per defect type: the count-weighted mean severity.
for position, col in enumerate(heatmap_data.columns):
    column_total = heatmap_data[col].sum()
    if column_total == 0:
        # No occurrences at all: there is no mean severity to mark.
        continue
    mean_severity = (heatmap_data[col] * heatmap_data.index).sum() / column_total
    ax.scatter(position + 0.5, mean_severity - 0.5, marker='X', color='red', s=100)

# Flip the y-axis so that severity grows upwards.
ax.invert_yaxis()
colorbar = ax.collections[0].colorbar
colorbar.set_label('Total Defect Count', rotation=90, labelpad=15)
plt.title('Distribution of Defects by Severity and Type (Total Counts)')
plt.savefig(defect_images / 'defect_severity_type_heatmap_total_counts.png', dpi=RESOLUTION)
plt.close()

## Defect occurrences per submission

In [None]:
# Number of distinct defects present in each submission. Presence flags are
# used because the chart is about unique defects, not repeated instances.
unique_defects_per_submission = defect_presence.sum(axis=1)

# value_counts() orders by frequency, so sort by the defect count and look
# every bucket up by label rather than by position.
counts = unique_defects_per_submission.value_counts().sort_index()

num_of_submissions = [str(i) for i in range(5)] + ['>=5']
defect_counts = [int(counts.get(i, 0)) for i in range(5)] + [int(counts[counts.index >= 5].sum())]
defect_percentage = [count / len(defect_log) for count in defect_counts]

fig, ax = plt.subplots(layout="constrained", figsize=(10, 6))

p = ax.bar(num_of_submissions, defect_percentage)

# Bar height is the relative frequency; the label on top is the absolute count.
ax.bar_label(p, labels=defect_counts, label_type='edge')

ax.set_ylabel('Relative Frequency')
ax.set_xlabel('Unique Defect Occurances per Submission')
ax.set_title('Submissions by the Number of Unique Defects (Labeled by Absolute Count)')


plt.savefig(defect_images / 'unique_defect_counts.png', dpi=RESOLUTION)
plt.close()

## Number of individual instances per occurrence

In [None]:
# Long format: one row per (submission, defect) cell, keeping only cells
# with at least one instance of the defect.
df_long = defect_log.melt(var_name='defect_id', value_name='count')
df_long = df_long.loc[df_long['count'] > 0]

# Attach the name and type of every defect.
# NOTE(review): this needs an 'id' column in `defects` -- confirm it exists
# alongside the index used for lookups elsewhere.
defect_labels = defects.set_index('id')[['defect name', 'defect type']]
df_long = df_long.merge(defect_labels, left_on='defect_id', right_index=True)

# Defects ordered by their mean instance count, largest first.
mean_order = (
    df_long.groupby('defect name')['count']
    .mean()
    .sort_values(ascending=False)
    .index
)

# One horizontal box per defect. `color_map` is whatever an earlier cell left
# behind; it has to be keyed by defect type here.
plt.figure(figsize=(10, 20))
sns.boxplot(
    data=df_long,
    x='count',
    y='defect name',
    hue='defect type',
    palette=color_map,
    order=mean_order,
    orient='h',
    notch=True,
    showfliers=False,
)

plt.xlabel('Number of Defects')
plt.ylabel('Defect Name')
plt.title('Distribution of Defect Count When Present per Defect (Outliers Removed)')
plt.tight_layout()
plt.legend(title='Defect Type', loc='center right')
plt.savefig(defect_images / 'defect_counts_when_present.png', dpi=RESOLUTION)
plt.close()

## Defect combinations

### correlation

Some defects are (weakly) correlated, which sometimes suggests a causal link. This could be an impulse for handling the combination differently from the individual defects.

In [None]:
plt.figure(figsize=(20, 20), layout="constrained")

# Pairwise Pearson correlation of the presence flags. An explicit copy keeps
# the in-place edit below off the DataFrame's own (possibly read-only) buffer.
correlations = defect_presence.corr().to_numpy(copy=True)

# The diagonal is always 1 and carries no information; blank it out.
np.fill_diagonal(correlations, np.nan)

defect_names = defects['defect name'].loc[defect_presence.columns]

# x labels are cut to 20 characters so that the rotated ticks stay readable.
sns.heatmap(correlations, xticklabels=defect_names.str[:20], yticklabels=defect_names, vmin=-1, vmax=1, cmap="vlag")

plt.xticks(rotation=90, ha='right')
plt.yticks(rotation=0, ha='right')

plt.title('Correlation Matrix of Defect Presence (Pearson)')
plt.savefig(defect_images / 'defect_correlation.png', dpi=RESOLUTION)
plt.close()

In [None]:
# Calculate the correlation matrix
correlations = defect_presence.corr()

# Get unique pairs: keeping only defect1 < defect2 drops the diagonal and the
# mirrored duplicate of every pair.
correlation_pairs = correlations.unstack().reset_index()
correlation_pairs.columns = ['defect1', 'defect2', 'correlation']
correlation_pairs = correlation_pairs[correlation_pairs['defect1'] < correlation_pairs['defect2']]

# Get the top 10 strongest correlations. Rank by magnitude but keep the
# signed values: taking abs() of the frame itself would turn every
# correlation positive and make the red/green colouring below meaningless.
top_10_correlations = (
    correlation_pairs
    .sort_values(by='correlation', key=lambda column: column.abs(), ascending=False)
    .head(10)
    .copy()  # own copy, so the column assignments below do not touch a slice
)

# Map defect ids to names
defect_id_to_name = defects['defect name'].to_dict()
top_10_correlations['defect1_name'] = top_10_correlations['defect1'].map(defect_id_to_name)
top_10_correlations['defect2_name'] = top_10_correlations['defect2'].map(defect_id_to_name)

# Create a combined label for the bar chart
top_10_correlations['pair'] = top_10_correlations['defect1_name'] + ' & ' + top_10_correlations['defect2_name']

# Green for positive, red for negative (or zero) correlation
top_10_correlations['color'] = np.where(top_10_correlations['correlation'] > 0, 'green', 'red')

plt.figure(figsize=(15, 8), layout="constrained")
sns.barplot(
    data=top_10_correlations,
    x='correlation',
    y='pair',
    hue='color',
    palette={'green': 'green', 'red': 'red'},
    legend=False
)

plt.title('Top 10 Strongest Defect Correlations (Pearson)')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Defect Pair')
plt.xlim(-1, 1)

plt.savefig(defect_images / 'top_10_defect_correlations.png', dpi=RESOLUTION)
plt.close()

In [None]:
def compare_defects(
        defect1_id: int, defect2_id: int, items: pd.DataFrame, defects: pd.DataFrame,
        log: pd.DataFrame, defect_log: pd.DataFrame, task: int | None = None
    ) -> str:
    """Generate an HTML string to display and compare two defects side-by-side.

    Arguments:
        defect1_id -- The ID of the first defect to compare.
        defect2_id -- The ID of the second defect to compare.
        items -- DataFrame with task metadata.
        defects -- DataFrame with defect metadata.
        log -- DataFrame with submission logs.
        defect_log -- DataFrame indicating defect presence.
        task -- (Optional) The ID of the specific task to examine.

    Returns:
        An HTML-formatted string.
    """
    try:
        defect1_info = defects.loc[defect1_id]
        defect2_info = defects.loc[defect2_id]
        
        # If no specific task is provided, find one where the defects co-occur
        if task is None:
            co_occurring_submissions = defect_log[
                (defect_log[defect1_id] > 0)
                & (defect_log[defect2_id] > 0)
            ].index
            
            if not co_occurring_submissions.empty:
                task = log.loc[random.choice(co_occurring_submissions), 'item']
            else:
                return "Error: No tasks found where both defects co-occur."

        task_row = items.loc[task]

        # Find a random submission for the task where both defects are present
        submissions = log[
            (log["item"] == task)
            & (defect_log[defect1_id] > 0)
            & (defect_log[defect2_id] > 0)
        ]        
        if len(submissions):
            example_submission_text = submissions["answer"].iloc[random.randint(0, len(submissions) - 1)]
        else:
            example_submission_text = "No submissions found for this specific co-occurrence."
    except KeyError as e:
        return f"Error: The provided ID ({e}) is not valid."

    return f"""
    <div style="display: flex; justify-content: space-between; gap: 20px;">
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>Task: {task_row["name"]}</h3>
            <div><strong>Instructions:</strong><br>{task_row["instructions"]}</div>
            <div><strong>Solution:</strong><br>
                <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{task_row["solution"]}</pre>
            </div>
        </div>
        
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>Defect Comparison</h3>
            <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;">
                <div style="border-right: 1px solid #ccc; padding-right: 10px;">
                    <h4>{defect1_info["defect name"]}</h4>
                    <div><strong>Type:</strong> {defect1_info["defect type"]}</div>
                    <div><strong>Severity:</strong> {defect1_info["severity"]}</div>
                    <div><strong>Description:</strong><br>{defect1_info["description"]}</div>
                    <div style="margin-top: 10px;"><strong>Code Example:</strong><br><pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 5px; border-radius: 5px;">{defect1_info["code example"]}</pre></div>
                    <div style="margin-top: 10px;"><strong>Code Fix:</strong><br><pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 5px; border-radius: 5px;">{defect1_info["code fix example"]}</pre></div>
                </div>
                
                <div>
                    <h4>{defect2_info["defect name"]}</h4>
                    <div><strong>Type:</strong> {defect2_info["defect type"]}</div>
                    <div><strong>Severity:</strong> {defect2_info["severity"]}</div>
                    <div><strong>Description:</strong><br>{defect2_info["description"]}</div>
                    <div style="margin-top: 10px;"><strong>Code Example:</strong><br><pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 5px; border-radius: 5px;">{defect2_info["code example"]}</pre></div>
                    <div style="margin-top: 10px;"><strong>Code Fix:</strong><br><pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 5px; border-radius: 5px;">{defect2_info["code fix example"]}</pre></div>
                </div>
            </div>
        </div>
    </div>
    
    <div style="border: 1px solid #ccc; padding: 10px; margin-top: 20px; border-radius: 5px;">
        <strong>Example Submission:</strong><br>
        <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{example_submission_text}</pre>
    </div>
    """

In [None]:
# Render the comparison for the pair in the second row of the top-10 table.
second_pair = top_10_correlations[['defect1', 'defect2']].iloc[1]
HTML(compare_defects(*second_pair, items, defects, log, defect_log))

### co-occurrence

Most defect pairs are exceedingly rare. No specific rules can be designed based on them.

In [None]:
# Co-occurrence = number of submissions in which both defects of a pair are
# present. Presence flags are used rather than the raw values of defect_log,
# whose products would weight a submission by how often each defect appears.
presence_flags = defect_presence.astype(int)
co_occurances = presence_flags.T.dot(presence_flags).to_numpy()
defect_names = [defects['defect name'].loc[idx] for idx in defect_log.columns]

# Bucket the counts: 0 -> < 100, 1 -> 100-249, 2 -> 250-999, 3 -> >= 1000.
thresholded = np.digitize(co_occurances, [100, 250, 1000]).astype('float')

# Every defect trivially co-occurs with itself; blank out the diagonal.
np.fill_diagonal(thresholded, np.nan)

# One discrete colour per bucket
cmap = ListedColormap(sns.color_palette("Blues", 4))

fig, ax = plt.subplots(figsize=(20, 20), layout="constrained")

p = ax.imshow(thresholded, cmap=cmap, vmin=0, vmax=3)

# Label both axes; the matrix is symmetric, so they share the same names.
ax.set_xticks(np.arange(len(defect_names)))
ax.set_xticklabels(defect_names, rotation=90)
ax.set_yticks(np.arange(len(defect_names)))
ax.set_yticklabels(defect_names)

ax.set_title('Co-occurrence of defects thresholded at different levels')

cbar = fig.colorbar(p, ax=ax, ticks=[0, 1, 2, 3], shrink=0.8)
cbar.set_label('Thresholded Co-occurrence Levels')
cbar.set_ticklabels(['< 100', '100 - 250', '250 - 1000', '>= 1000'])

plt.savefig(defect_images / 'defect_co_occurrence.png', dpi=RESOLUTION)
plt.close()


# grouping by task

## task frequency

In [None]:
# Submissions per task joined with the task metadata, least-attempted first.
submission_counts = log['item'].value_counts().rename('submission_count')
merged_df = pd.merge(submission_counts, items, left_index=True, right_on='id')
merged_df = merged_df.sort_values(by='submission_count')

# Topics in their 'topic order'.
ordered_topics = (
    merged_df[['topic', 'topic order']]
    .drop_duplicates()
    .sort_values(by='topic order')
)
topic_list = ordered_topics['topic'].tolist()

# Spread the topics evenly over the 'tab20' colormap.
tab20 = mpl.colormaps['tab20']
color_map = {
    topic: tab20(position / (len(topic_list) - 1))
    for position, topic in enumerate(topic_list)
}

bar_colors = [color_map[topic] for topic in merged_df['topic']]

fig, ax = plt.subplots(layout="constrained", figsize=(10, 20))
ax.barh(merged_df['name'], merged_df['submission_count'], color=bar_colors)

# Legend in the lower right corner, listing the topics in topic order.
handles = [plt.Rectangle((0, 0), 1, 1, color=color_map[topic]) for topic in topic_list]
ax.legend(handles, topic_list, title='Topic', loc='lower right')

ax.set_xlabel('Number of Submissions')
ax.set_ylabel('Task')
ax.set_title('Distribution of Submissions Over Items by Topic')

plt.savefig(task_images / 'task_frequency.png', dpi=RESOLUTION)
plt.close()

## topic frequency

In [None]:
# Submissions per task as a two-column table.
counts = log['item'].value_counts().reset_index()
counts.columns = ['item_id', 'submission_count']

merged_df = pd.merge(counts, items, left_on='item_id', right_on='id')

# Total submissions per topic, with the topics in their 'topic order'.
topic_submissions = (
    merged_df.groupby('topic')
    .agg({'submission_count': 'sum', 'topic order': 'first'})
    .sort_values(by='topic order')
)

fig, ax = plt.subplots(figsize=(15, 8))
ax.bar(topic_submissions.index, topic_submissions['submission_count'])

ax.set_title('Total Submissions by Topic')
ax.set_xlabel('Topic')
ax.set_ylabel('Total Number of Submissions')

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(task_images / 'topic_frequency.png', dpi=RESOLUTION)
plt.close()

In [None]:
# Submissions per task as a two-column table keyed by task id.
item_counts = log['item'].value_counts().reset_index()
item_counts.columns = ['id', 'submission_count']

merged_df = pd.merge(item_counts, items, on='id')

# One row per topic, one column per task; tasks outside a topic count as 0.
pivot_df = merged_df.pivot_table(
    index=['topic', 'topic order'],
    columns='name',
    values='submission_count',
    fill_value=0
)

# Order the rows by 'topic order', then drop that helper level again.
pivot_df = pivot_df.reset_index()
pivot_df = pivot_df.sort_values(by='topic order')
pivot_df = pivot_df.drop(columns='topic order').set_index('topic')

fig, ax = plt.subplots(figsize=(15, 8))
pivot_df.plot(kind='bar', stacked=True, ax=ax)

ax.set_title('Distribution of Submissions by Topic and Task')
ax.set_xlabel('Topic')
ax.set_ylabel('Number of Submissions')
# The legend would hold one entry per task; remove it.
ax.get_legend().remove()

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(task_images / 'topic_frequency_stacked.png', dpi=RESOLUTION)
plt.close()

# Grouping by task and defect

## Task-defect pairs with very few defects

Over 80 % of all task-defect pairs have very few defect occurrences. It is very hard to make any assertions about such pairs, so they will often be left out of the analysis so as not to distort the results.

In [None]:
# Minimum number of occurrences below which a task-defect pair is flagged.
lower_limit = 10
# Per task (rows) and defect (columns): number of submissions with the defect.
counts_matrix = defect_presence.groupby(log['item']).sum()
# True where a task-defect pair has fewer than `lower_limit` occurrences.
few_submissions = counts_matrix.lt(lower_limit)

### absolute counts

In [None]:
# Tick labels are cut to 20 characters to keep the heatmap axes compact.
defect_names = [defects.at[defect_id, 'defect name'][:20] for defect_id in counts_matrix.columns]
task_names = [items.at[task_id, 'name'][:20] for task_id in counts_matrix.index]

fig, ax = plt.subplots(figsize=(12, 17), layout="constrained")

sns.heatmap(counts_matrix, xticklabels=defect_names, yticklabels=task_names, cbar=True)

# Small fonts, since there is one tick per defect and per task.
ax.tick_params(axis='x', labelsize=7)
ax.tick_params(axis='y', labelsize=8)

plt.title('Defect Occurances per Task-Defect Pair')
# Clear the axis labels that were derived from the index and column names.
plt.xlabel("")
plt.ylabel("")

plt.savefig(task_defect_images / 'task_defect_counts.png', dpi=RESOLUTION)
plt.close()

In [None]:
# Flatten the matrix into one count per task-defect pair (zeros are kept).
all_task_defect_counts = counts_matrix.values.flatten()
percentile_90 = np.quantile(all_task_defect_counts, 0.90)

# Empirical CDF of the counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.ecdfplot(data=all_task_defect_counts, ax=ax)

# Mark the 90th percentile with a horizontal and a vertical dotted line
ax.axhline(0.90, color='green', linestyle=':', label=f'90th Percentile ({percentile_90})')
ax.axvline(percentile_90, color='green', linestyle=':')
ax.legend()

plt.title('Distribution of Defect Occurances per Task-Defect Pair')
plt.xlabel('Number of Defects')
plt.ylabel('Cumulative Frequency')
plt.tight_layout()

plt.savefig(task_defect_images / 'task_defect_counts_cdf.png', dpi=RESOLUTION)
plt.close()

### discrete categories

In [None]:
# Left-closed bins: [-1, 5), [5, 10), [10, 25), [25, 50), [50, inf).
bins = [-1, 5, 10, 25, 50, np.inf]
labels = ['< 5', '5-9', '10-24', '25-49', '>= 50']

# Categorise every task-defect count, then restore the task x defect layout.
stacked_counts = counts_matrix.stack()
categorized_counts = pd.cut(stacked_counts, bins=bins, labels=labels, right=False).unstack()

# The heatmap needs numbers: replace each label by its position in `labels`.
label_codes = {label: code for code, label in enumerate(labels)}
numeric_counts = categorized_counts.apply(lambda column: column.map(label_codes)).astype(int)

# One discrete colour per category, with boundaries half-way between codes.
colors = sns.color_palette("Set2", len(labels))
cmap = mcolors.ListedColormap(colors)
category_edges = np.arange(len(labels) + 1) - 0.5
norm = mcolors.BoundaryNorm(category_edges, cmap.N, clip=True)

# Tick labels are cut to 20 characters to keep the axes compact.
defect_names = [defects['defect name'].loc[idx][:20] for idx in counts_matrix.columns]
task_names = [items['name'].loc[idx][:20] for idx in counts_matrix.index]

fig, ax = plt.subplots(figsize=(12, 17), layout="constrained")

sns.heatmap(
    numeric_counts,
    ax=ax,
    cmap=cmap,
    norm=norm,
    cbar=False,
    xticklabels=defect_names,
    yticklabels=task_names
)

# Hand-built colorbar so that the ticks show the category labels.
colorbar = fig.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=ax,
    boundaries=np.arange(len(labels) + 1) - 0.5
)
colorbar.set_ticks(np.arange(len(labels)))
colorbar.set_ticklabels(labels)
colorbar.set_label('Defect Count Category')

ax.tick_params(axis='x', labelsize=7)
ax.tick_params(axis='y', labelsize=8)
plt.title('Defect Occurances per Task-Defect Pair (Categorized by Count)')
plt.xlabel("")
plt.ylabel("")

plt.savefig(task_defect_images / 'task_defect_counts_categorized.png', dpi=RESOLUTION)
plt.close()

In [None]:
# Number of task-defect pairs per category, in the order of `labels`.
absolute_counts = categorized_counts.stack().value_counts().reindex(labels)
relative_frequencies = absolute_counts / absolute_counts.sum()

plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=relative_frequencies.index,
    y=relative_frequencies.values,
    hue=relative_frequencies.index,
    palette=sns.color_palette("Set2", len(labels))
)

# Write the absolute count slightly above the top of each bar.
for position, (count, frequency) in enumerate(zip(absolute_counts, relative_frequencies)):
    ax.text(
        position,
        frequency + 0.01,
        f"{count}",
        ha='center',
        va='bottom',
        fontsize=10
    )

plt.title('Relative Frequency of Categorized Defect Occurances per Task (Labeled by Absolute Count)')
plt.xlabel('Defect Occurances per Task-Defect Pair (Categorized)')
plt.ylabel('Relative Frequency')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.savefig(task_defect_images / 'task_defect_counts_categorized_relative_frequency.png', dpi=RESOLUTION)
plt.close()

## task-defect z-score anomalies

### all

In [None]:
# Z-score per task and defect: how far the task's defect rate is from the
# overall rate, in units of the overall standard deviation of that defect.
task_defect_rates = defect_presence.groupby(log["item"]).mean()
overall_rates = defect_presence.mean(axis=0)
overall_spread = defect_presence.std(axis=0)
common_defects = (task_defect_rates - overall_rates) / overall_spread

defect_names = [defects['defect name'].loc[idx] for idx in defect_presence.columns]
task_names = [items['name'].loc[idx] for idx in common_defects.index]

plt.figure(figsize=(25, 10), layout="constrained")

# Symmetric colour limits keep zero at the centre of the diverging colormap.
biggest_value = common_defects.abs().values.max()

sns.heatmap(
    common_defects.T,
    xticklabels=task_names,
    yticklabels=defect_names,
    cmap="vlag",
    cbar=True,
    vmin=-biggest_value,
    vmax=biggest_value
)

plt.title("Task-Level Defect Anomalies (Z-scores) for All Tasks")
plt.xlabel("Task")
plt.ylabel("Defect Name")
plt.xticks(rotation=90, ha='center')

plt.savefig(task_defect_images / 'task_anomalies_all.png', dpi=RESOLUTION)
plt.close()

### min support

In [None]:
# Minimum number of submissions with the defect for a pair to be shown.
min_threshold = 10

# Z-score per task and defect: how far the task's defect rate is from the
# overall rate, in units of the overall standard deviation of that defect.
common_defects = (defect_presence.groupby(log["item"]).mean() - defect_presence.mean(axis=0)) / defect_presence.std(axis=0)

# Blank out task-defect pairs in which the defect is present in fewer than
# `min_threshold` submissions (the threshold is no longer hard-coded here).
count_mask = defect_presence.groupby(log["item"]).sum() >= min_threshold
common_defects = common_defects.where(count_mask)

defect_names = [defects['defect name'].loc[idx] for idx in defect_presence.columns]
task_names = [items['name'].loc[idx] for idx in common_defects.index]

plt.figure(figsize=(25, 10), layout="constrained")

# The masked cells are NaN, and a plain max() over them is NaN as well, which
# would leave the colour scale undefined; nanmax ignores them.
biggest_value = np.nanmax(common_defects.abs().to_numpy())

sns.heatmap(common_defects.T, xticklabels=task_names, yticklabels=defect_names, cmap="Blues", cbar=True, vmin=-biggest_value, vmax=biggest_value)

plt.title("Task-Defect Z-scores for All Pairs With At Least {} Submissions".format(min_threshold))
plt.xlabel("Task")
plt.ylabel("Defect Name")
plt.xticks(rotation=90, ha='center')

plt.savefig(task_defect_images / 'task_anomalies_min_threshold.png', dpi=RESOLUTION)
plt.close()

### high variance

In [None]:
# Z-score per task and defect: how far the task's defect rate is from the
# overall rate, in units of the overall standard deviation of that defect.
task_defect_rates = defect_presence.groupby(log["item"]).mean()
common_defects = (task_defect_rates - defect_presence.mean(axis=0)) / defect_presence.std(axis=0)

# The 30 tasks whose z-scores vary the most across defects.
variance_per_task = common_defects.var(axis=1).sort_values(ascending=False)
highest_variance_tasks = variance_per_task.head(30).index

defect_names = [defects['defect name'].loc[idx] for idx in defect_presence.columns]
task_names = [items['name'].loc[idx] for idx in highest_variance_tasks]

plt.figure(figsize=(10, 10), layout="constrained")

# Colour limits come from all tasks, not only from the 30 shown.
biggest_value = common_defects.abs().values.max()

sns.heatmap(
    common_defects.loc[highest_variance_tasks].T,
    xticklabels=task_names,
    yticklabels=defect_names,
    cmap="vlag",
    cbar=True,
    vmin=-biggest_value,
    vmax=biggest_value
)
plt.title("Task-Level Defect Anomalies (Z-scores, Tasks With Highest Variance)")
plt.xlabel("")
plt.ylabel("")

plt.savefig(task_defect_images / 'task_anomalies_highest_variance.png', dpi=RESOLUTION)
plt.close()

### defects in task templates

Some anomalies can be explained by defects that are already present in the task templates (example solutions and demo code).

In [None]:
import pandas as pd
import subprocess
import shutil
import re
from pathlib import Path
from tempfile import mkdtemp
from typing import List, Tuple, Dict, Any

# The expected format for a linter message:
#   <path>:[<line>:<column>:] <CODE>[:] <message>
# where <CODE> is one capital letter followed by three or four digits.
LINTER_MESSAGE_PATTERN = re.compile(
    r"^(.*?):(?:\d+:\d+:)?\s*([A-Z]\d{3,4}):?\s*(.*)$"
)

def _parse_linter_output(
    output: str, temp_file_map: Dict[str, Any]
) -> List[Tuple[Any, Tuple[str, str]]]:
    """Parse linter output to extract messages and their associated code.

    Lines that are empty, start with a run of asterisks, do not match
    LINTER_MESSAGE_PATTERN or name a file missing from `temp_file_map` are
    skipped, as are all W292 messages.

    Arguments:
        output -- The stdout string from the linter subprocess.
        temp_file_map -- A dictionary mapping temporary file paths to original IDs.

    Returns:
        A list of tuples: (original_id, (defect_code, message_body))
    """
    parsed_messages = []

    for message in output.split("\n"):
        if not message:
            continue

        # Lines starting with asterisks are section banners, not messages
        if message.startswith("*********"):
            continue

        match = LINTER_MESSAGE_PATTERN.match(message.strip())
        if not match:
            continue

        file_path_str, code, msg_body = match.groups()

        original_id = temp_file_map.get(file_path_str)

        # Compare against None explicitly: a truthiness test would also drop
        # every message that belongs to the perfectly valid ID 0.
        if original_id is None:
            continue

        if code == "W292":  # Ignore newline errors
            continue

        parsed_messages.append((original_id, (code, msg_body)))

    return parsed_messages

def generate_linter_messages(code_series: pd.Series) -> Dict[Any, List[Tuple[str, str]]]:
    """Generate linter messages for a batch of code strings from a pandas Series.

    Every string is written to its own file in a temporary directory, all
    files are checked by a single `edulint check` run, and the temporary
    directory is removed afterwards. Entries that are not strings (for
    example NaN for missing code) are not linted and get an empty list.

    Arguments:
        code_series -- A pandas Series with code strings and an index of IDs.

    Returns:
        A dictionary mapping the Series IDs to a list of error codes and messages.
        If the linter fails and writes to stderr, every ID maps to a single
        ("SYNTAX_ERROR", <stderr>) entry instead.

    Raises:
        RuntimeError -- If the linter cannot be run or exits with an
            unexpected status without writing to stderr.
    """
    import sys  # local import: only needed to locate the running interpreter

    parsed_results: Dict[Any, List[Tuple[str, str]]] = {
        original_id: [] for original_id in code_series.index
    }
    temp_dir = Path(mkdtemp())
    temp_file_map = {}

    try:
        for original_id, code_string in code_series.items():
            if not isinstance(code_string, str):
                # Missing code cannot be written to a file; leave it without messages.
                continue
            temp_file = temp_dir / f"temp_{original_id}.py"
            # Python source files are UTF-8 by default, whatever the platform's locale is.
            with open(temp_file, "w", encoding="utf-8") as f:
                f.write(code_string)
            temp_file_map[str(temp_file)] = original_id

        if not temp_file_map:
            # Nothing to lint; do not start the linter without any file.
            return parsed_results

        # Run edulint with the interpreter that executes this code, instead of
        # the Windows-only `py` launcher, so the same environment is used on
        # every platform.
        command = [sys.executable, "-m", "edulint", "check", *temp_file_map]
        result = subprocess.run(command, text=True, capture_output=True)

        if result.returncode != 1:
            if result.stderr:
                return {
                    original_id: [("SYNTAX_ERROR", result.stderr.strip())]
                    for original_id in code_series.index
                }
            # NOTE(review): exit status 0 without stderr is taken to mean that
            # no problems were found -- confirm against the edulint CLI.
            if result.returncode != 0:
                raise RuntimeError(f"Subprocess returned non-zero exit status: {result.returncode}")

        for original_id, message_tuple in _parse_linter_output(result.stdout, temp_file_map):
            parsed_results[original_id].append(message_tuple)

        return parsed_results

    except Exception as e:
        # Chain the original exception so that its traceback is not lost.
        raise RuntimeError(f"Unexpected error during linting: {e}") from e
    finally:
        shutil.rmtree(temp_dir)

In [None]:
# Enable the pandas option 'future.no_silent_downcasting'.
# NOTE(review): presumably set for the Series.replace() calls in the cells
# that follow -- confirm it is still needed.
pd.set_option('future.no_silent_downcasting', True)

In [None]:
# Lint the demo code of every task and flatten the result into one row per
# (task, linter code) message.
democode_messages = generate_linter_messages(items["democode"])
democode_messages = [
    {'item': idx, 'defect': code}
    for idx, code_message_list in democode_messages.items()
    for code, _ in code_message_list
]
# Name the columns explicitly so that they exist even when no message was produced.
democode_messages = pd.DataFrame(democode_messages, columns=['item', 'defect'])

# Keep only linter codes that map to a known defect; copy so that the column
# assignment below works on its own frame rather than on a slice.
democode_messages = democode_messages[democode_messages["defect"].isin(code_to_defect_id.keys())].copy()
# Every remaining code is a key of the mapping, so a plain lookup is enough.
democode_messages["defect"] = democode_messages["defect"].map(code_to_defect_id).astype(int)

In [None]:
# Lint the example solution of every task and flatten the result into one row
# per (task, linter code) message.
solution_messages = generate_linter_messages(items["solution"])
solution_messages = [
    {'item': idx, 'defect': code}
    for idx, code_message_list in solution_messages.items()
    for code, _ in code_message_list
]
# Name the columns explicitly so that they exist even when no message was produced.
solution_messages = pd.DataFrame(solution_messages, columns=['item', 'defect'])

# Keep only linter codes that map to a known defect; copy so that the column
# assignment below works on its own frame rather than on a slice.
solution_messages = solution_messages[solution_messages["defect"].isin(code_to_defect_id.keys())].copy()
# Every remaining code is a key of the mapping, so a plain lookup is enough.
solution_messages["defect"] = solution_messages["defect"].map(code_to_defect_id).astype(int)

In [None]:
# Z-score per task and defect: how far the task's defect rate is from the
# overall rate, in units of the overall standard deviation of that defect.
common_defects = (defect_presence.groupby(log["item"]).mean() - defect_presence.mean(axis=0)) / defect_presence.std(axis=0)

plt.figure(figsize=(20, 10), layout="constrained")

# nanmax keeps the colour scale defined even if some z-scores are NaN.
biggest_value = np.nanmax(common_defects.abs().to_numpy())

task_names = [items['name'].loc[idx] for idx in common_defects.index]
defect_names = [defects['defect name'].loc[idx] for idx in common_defects.columns]

ax = sns.heatmap(
    common_defects.T,
    xticklabels=task_names,
    yticklabels=defect_names,
    cmap="Blues",
    cbar=True,
    vmin=-biggest_value,
    vmax=biggest_value
)

# Marker style per source of template code.
solution_style = {'color': 'red', 's': 100, 'marker': 'o'}
democode_style = {'color': 'yellow', 's': 100, 'marker': 'x'}

# Mark the cells whose defect is already present in the task's own template.
# Positions are looked up by id in the heatmap's index and columns, so tasks
# or defects sharing a name cannot be mixed up.
# NOTE(review): only tasks in `highest_variance_tasks` are annotated although
# the heatmap shows all tasks -- confirm that this is intended.
for messages, style in ((solution_messages, solution_style), (democode_messages, democode_style)):
    annotated = messages[messages["item"].isin(highest_variance_tasks)]
    for row in annotated.itertuples(index=False):
        x_pos = common_defects.index.get_loc(row.item)
        y_pos = common_defects.columns.get_loc(row.defect)
        ax.scatter(x_pos + 0.5, y_pos + 0.5, **style)

# Legend proxies hold no data points, so they add nothing to the heatmap
# itself (proxies placed at (0, 0) would show up as stray markers).
legend_markers = [ax.scatter([], [], **solution_style), ax.scatter([], [], **democode_style)]

plt.title("Task-Level Defect Anomalies (Z-scores, Tasks With Highest Variance)")
plt.xlabel("")
plt.ylabel("")
plt.legend(legend_markers, ['example solution', 'demo code'], title='Defect present in:', loc='lower left')

plt.savefig(task_defect_images / 'defects_in_task_templates.png', dpi=RESOLUTION)
plt.close()

## Defect pair co-occurrence significantly impacted by task