# Set up and global variables

In [None]:
from pathlib import Path
from collections import defaultdict

import matplotlib.pyplot as plt
import re
import seaborn as sns
import pandas as pd
import numpy as np
import random
import json
from matplotlib.colors import ListedColormap
from IPython.display import display, HTML
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, cross_val_score, LeaveOneGroupOut, train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import src.ipython_loader as loader

# Figure resolution constant (the savefig calls in the cells below pass dpi=300 literally).
RESOLUTION = 300
# Dataset version; it is interpolated into the dataset directory and the file names read below.
VERSION = '0.0.0'
DATASET_PATH = Path('data') / 'datasets' / f'ipython_{VERSION}'
# Two-colour map: red, green.
BINARY_CMAP = ListedColormap(['red', 'green'])
# NOTE(review): later cells reference `save`, `big_figsize`, `long_figsize` and `small_figsize`,
# none of which are assigned in this cell — confirm where they are defined.

## Plotting utils

In [None]:
def plot_histogram(values, title, bins=10, cutoff=None, save=False):
    """
    Plot a histogram of the given values and optionally save it under ``images/``.

    Arguments:
        values -- Array-like of numbers to plot; the caller's data is never modified.
        title -- Figure title; also used to derive the file name when saving.

    Keyword Arguments:
        bins -- Passed through to ``plt.hist``. (default: {10})
        cutoff -- If given, values at or above it are clamped to it. (default: {None})
        save -- Write the figure to ``images/<title in snake case>.png``. (default: {False})
    """
    if cutoff is not None:
        # np.minimum builds a new array, so the caller's array is left untouched
        # (the previous in-place assignment silently clamped the caller's data).
        values = np.minimum(np.asarray(values), cutoff)

    plt.figure(figsize=(10, 5))

    plt.hist(values, bins=bins)
    plt.title(title)
    plt.xlabel('Value')
    plt.ylabel('Count')
    plt.grid(True)

    if save:
        plt.savefig(f'images/{title.lower().replace(" ", "_")}.png', dpi=300)
    plt.show()

In [None]:
def task_and_defect_description(task, defect, items, defects, log, defect_log):
    """
    Build an HTML card describing a task, a defect and one random matching submission.

    Arguments:
        task -- Index label of the task in ``items``.
        defect -- Index label of the defect in ``defects`` (and column of ``defect_log``).
        items -- Task table with 'name', 'instructions' and 'solution' columns.
        defects -- Defect table with 'defect name', 'defect type', 'severity',
            'description', 'code example' and 'code fix example' columns.
        log -- Submission table with 'item' and 'answer' columns.
        defect_log -- Per-submission defect table (booleans or counts), indexed like ``log``.

    Returns:
        An HTML string. Code snippets are HTML-escaped; names, instructions and
        descriptions are inserted as-is (presumably they may contain markup — confirm).
    """
    import html  # local import keeps this cell independent of the notebook's import cell

    task_row = items.loc[task]
    defect_row = defects.loc[defect]
    # `> 0` yields a boolean mask for count tables as well; `bool & int` is a bitwise AND,
    # which would drop submissions whose count is even.
    submissions = log[(log["item"] == task) & (defect_log[defect] > 0)]

    def code_block(value):
        """Wrap ``value`` in the dark <pre> box, escaped so the code shows verbatim."""
        return (
            '<pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; '
            'border-radius: 5px; font-family: monospace;">'
            f'{html.escape(str(value))}</pre>'
        )

    if len(submissions):
        example = submissions["answer"].iloc[random.randrange(len(submissions))]
    else:
        example = 'No submissions found'

    return f"""
    <div style="display: flex; justify-content: space-between; gap: 20px;">
        <!-- Task Section -->
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{task_row["name"]}</h3>
            <div><strong>Instructions:</strong><br>{task_row["instructions"]}</div>
            <div><strong>Solution:</strong><br>
                {code_block(task_row["solution"])}
            </div>
        </div>
        
        <!-- Defect Section -->
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{defect_row["defect name"]}</h3>
            <div><strong>Defect Type:</strong> {defect_row["defect type"]}</div>
            <div><strong>Severity:</strong> {defect_row["severity"]}</div>
            <div><strong>Description:</strong><br>{defect_row["description"]}</div>
            
            <div style="display: flex; justify-content: space-between; margin-top: 20px;">
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Example:</strong><br>
                    {code_block(defect_row["code example"])}
                </div>
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Fix Example:</strong><br>
                    {code_block(defect_row["code fix example"])}
                </div>
            </div>
        </div>
    </div>
    
    <!-- Code Snippet Section -->
    <div style="border: 1px solid #ccc; padding: 10px; margin-top: 20px; border-radius: 5px;">
        <strong>Example Submission:</strong><br>
        {code_block(example)}
    </div>
    """

In [None]:
def task_defect_plot(matrix, title='', save=False, interactive=False, *args, **kwargs):
    """
    Draw a task-by-defect heatmap (tasks on the x axis, defects on the y axis).

    Reads the notebook-level ``items``, ``defects``, ``log``, ``defect_log`` and ``big_figsize``.

    Arguments:
        matrix -- DataFrame indexed by task id with one column per defect id.

    Keyword Arguments:
        title -- Plot title; also used to derive the file name when saving. (default: {''})
        save -- Write the figure to ``images/``. (default: {False})
        interactive -- Hide labels and colour bar and show task/defect details
            below the figure when a cell is clicked. (default: {False})

    Extra positional and keyword arguments are forwarded to ``sns.heatmap``.

    Returns:
        The created matplotlib figure.
    """
    defect_names = [defects['defect name'].loc[idx][:20] for idx in matrix.columns]
    task_names = [items['name'].loc[idx][:20] for idx in matrix.index]

    fig, ax = plt.subplots(figsize=big_figsize, layout="constrained")
    if interactive:
        sns.heatmap(matrix.T, cbar=False, *args, **kwargs)
    else:
        sns.heatmap(matrix.T, xticklabels=task_names, yticklabels=defect_names, cbar=True, *args, **kwargs)
        ax.tick_params(axis='x', labelsize=7)
        ax.tick_params(axis='y', labelsize=8)
        plt.title(title)
    plt.xlabel("")
    plt.ylabel("")

    if save:
        # Drop an optional " t=<value>" suffix from the file name. Slicing with str.find()
        # chopped the last character whenever the title had no such suffix (find() == -1).
        stem = title.partition(' t=')[0].lower().replace(' ', '_')
        plt.savefig('images/' + stem + '.png', dpi=300)

    if interactive:
        output_html = display(HTML("<b>Click a cell to see details</b>"), display_id=True)

        def on_click(event):
            """Show the description of the clicked task-defect cell."""
            if event.inaxes != ax:
                return

            x = int(event.xdata)
            y = int(event.ydata)

            if 0 <= x < len(task_names) and 0 <= y < len(defect_names):
                # The description helper needs the data tables as well; the boolean
                # presence mask selects the submissions that contain the defect.
                html = HTML(task_and_defect_description(
                    matrix.index[x], matrix.columns[y], items, defects, log, defect_log > 0,
                ))
                output_html.update(html)

        fig.canvas.mpl_connect('button_press_event', on_click)

    return fig

In [None]:
# Abbreviations used to shorten defect names for plot labels.
# abbreviate_text walks this dict in insertion order (top to bottom), replacing
# case-insensitive substring matches, and stops once the text is short enough,
# so entries near the top are applied first.
ordered_abbreviations = {
    'whitespace': 'ws',
    'constant': 'const',
    'variable': 'var',
    'function': 'func',
    'parameter': 'param',
    'expression': 'expr',
    'argument': 'arg',
    'operator': 'op',
    'operation': 'op',
    'augmentable': 'aug',
    'assignment': 'assign',
    'container': 'cont',
    'statement': 'stmt',
    'arithmetic': 'arith',
    'condition': 'cond',
    'identifier': 'identif',
    'multiple': 'multi',
    'redundant': 'redun',
    # NOTE(review): 'necces' looks like a misspelling of 'necess' — confirm before changing labels.
    'necessary': 'necces',
    'comparison': 'compar',
    'negated': 'neg',
    'unreachable': 'unreach',
    'inappropriate': 'inapp',
    'parenthesis': '()',
}

In [None]:
def abbreviate_text(text: str, ordered_abbreviations: dict | None=None, max_length: int=20):
    """
    Shorten text by applying a list of abbreviations in a specific order, and truncate the string if necessary.

    Arguments:
        text -- The original text string to be abbreviated.

    Keyword Arguments:
        ordered_abbreviations -- An ordered dictionary of abbreviations. (default: {None})
        max_length -- The target maximum length for the string. (default: {20})

    Returns:
        _description_
    """
    current_text = text

    if ordered_abbreviations:
        for full_word, abbr in ordered_abbreviations.items():
            # Use regex with word boundaries to ensure we replace full words only
            pattern = re.escape(full_word)
            current_text = re.sub(pattern, abbr, current_text, flags=re.IGNORECASE)

            if len(current_text) <= max_length:
                return current_text

    if len(current_text) > max_length:
        return current_text[:max_length - 3] + '...'
    
    return current_text

***

# Loading data

In [None]:
# Load the versioned dataset tables; each table uses its first CSV column as the index.
items = pd.read_csv(DATASET_PATH / f'items_{VERSION}.csv', index_col=0)
log = pd.read_csv(DATASET_PATH / f'log_{VERSION}.csv', index_col=0, parse_dates=['time'])
defects = pd.read_csv(DATASET_PATH / f'defects_{VERSION}.csv', index_col=0)
defect_log = pd.read_csv(DATASET_PATH / f'defect_log_{VERSION}.csv', index_col=0)
# CSV headers are read as strings; defect ids are used as integers everywhere else
defect_log.columns = defect_log.columns.astype(int)
# Path.read_text opens and closes the file itself, so no file handle is left dangling
code_to_defect_id = json.loads((DATASET_PATH / f'code_to_defect_id_{VERSION}.json').read_text())
# Boolean view of the defect table: True where the value is positive
defect_presence = defect_log > 0

In [None]:
# Shortened defect names, built with the abbreviation table.
defects['display name'] = [
    abbreviate_text(name, ordered_abbreviations) for name in defects['defect name']
]

***

# Feature analysis

## Anomalously frequent task-defect pairs

In [None]:
# Pairs whose per-task mean exceeds this value are flagged as unreasonable.
threshold = 0.9
# Mean of every defect column within each task (rows: tasks, columns: defects).
frequencies = defect_log.groupby(log['item']).mean()
unreasonable = frequencies.gt(threshold)

## task-defect rarity

In [None]:
thresholds = [0.01, 0.02, 0.03, 0.04]
defect_names = defects["defect name"]

# For every threshold, count per defect the tasks whose frequency is NOT below it.
# A comprehension keeps the loop variable local, so the notebook-level `threshold`
# and `rare` are no longer overwritten by this cell.
all_vals = [(~(frequencies < t)).sum(axis=0) for t in thresholds]

stack_data = pd.concat(all_vals, axis=1).fillna(0)
stack_data.columns = [f"t={t:.2f}" for t in thresholds]

# Order defects by median count, with a small bonus for the maximum
stack_data = stack_data.loc[(stack_data.median(axis=1) + 0.1 * stack_data.max(axis=1)).sort_values(ascending=False).index]

fig, ax = plt.subplots(figsize=long_figsize, layout='constrained')
x = np.arange(stack_data.shape[0])
bar_width = 0.2
n_thresholds = len(thresholds)

# Grouped bars: one bar per threshold, centred around each defect's tick
for i, col in enumerate(stack_data.columns):
    offset = (i - n_thresholds / 2) * bar_width + bar_width / 2
    ax.bar(x + offset, stack_data[col], width=bar_width, label=col)


ax.set_xticks(x, labels=[defect_names.loc[idx][:20] for idx in stack_data.index], rotation=90)
ax.set_xlabel('Defect')
ax.set_ylabel('Number of Common Tasks')
ax.set_title('Number of Common Task-Defect Pairs as Threshold Decreases')
ax.legend(title='Threshold')

if save:
    plt.savefig('images/number_of_common_task-defect_pairs_as_threshold_decreases.png', dpi=300)
plt.show()


In [None]:
# 1 where a pair's frequency is below the rarity threshold, forced to 0 for unreasonable pairs.
rare_threshold = 0.005
rare = frequencies.lt(rare_threshold).astype('int').mask(unreasonable, 0)

In [None]:
# Count how often each indicator value occurs
np.unique(rare.to_numpy().ravel(), return_counts=True)

In [None]:
# Display the rarity indicator table (rows: tasks, columns: defects).
rare

## characteristic task-defect pairs

In [None]:
# Standardise each task's per-defect mean against the overall per-defect mean and
# standard deviation; unreasonable pairs are set to 0.
z_score = (
    defect_log.groupby(log['item']).mean()
    .sub(defect_log.mean(), axis='columns')
    .div(defect_log.std(), axis='columns')
    .mask(unreasonable, 0)
)

In [None]:
# Plot with unreasonable pairs blanked out. `mask` works on a copy, so `z_score`
# is never left holding NaNs if plotting fails midway.
fig = task_defect_plot(z_score.mask(unreasonable), title="Task-Defect Pair Z-Scores", interactive=False, save=save)

In [None]:
# Interactive variant of the z-score heatmap. `mask` works on a copy, so `z_score`
# is never left holding NaNs if plotting fails midway.
fig = task_defect_plot(z_score.mask(unreasonable), title="Task-Defect Pair Z-Scores", interactive=True)

In [None]:
# Close the figure returned by the previous plotting cell.
plt.close(fig)

In [None]:
# Unreasonable pairs are stored as 0 in `z_score`, so they must be blanked explicitly;
# a bare dropna() would keep them and skew the quantile.
reasonable_z_scores = z_score.mask(unreasonable).stack().dropna()
quantile = 0.8
threshold = reasonable_z_scores.quantile(quantile)

plt.figure(figsize=small_figsize)

plt.hist(reasonable_z_scores, bins=100, color='skyblue', edgecolor='black')
# round() avoids float truncation in the label (e.g. int(0.29 * 100) == 28)
plt.axvline(x=threshold, color='red', linestyle='--', label=f'{round(quantile * 100)}-percentile Threshold (= {threshold:.2f})')

plt.title('Histogram of Z-Scores for Reasonable Task-Defect Pairs')
plt.xlabel('Z-Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)

if save:
    plt.savefig('images/histogram_of_z-scores_for_reasonable_task-defect_pairs.png', dpi=300)
plt.show()

In [None]:
# 1 where a pair's z-score exceeds the chosen quantile, forced to 0 for unreasonable pairs.
characteristic_threshold = reasonable_z_scores.quantile(quantile)
characteristic = z_score.gt(characteristic_threshold).astype(int).mask(unreasonable, 0)

In [None]:
# Plot with unreasonable pairs blanked out. `mask` works on a copy, so the integer
# `characteristic` table is not turned into floats by a NaN round-trip.
# The colour map constant defined in the setup cell is BINARY_CMAP.
fig = task_defect_plot(
    characteristic.mask(unreasonable),
    title=f"Characteristic Defects for Threshold t={characteristic_threshold:.2f}",
    interactive=False,
    save=save,
    cmap=BINARY_CMAP,
)


In [None]:
# Topic of every submission, looked up through the submission's task
log_by_topic = log.merge(items, left_on='item', right_index=True)['topic']
defect_frequencies_by_topic = defect_log.groupby(log_by_topic).mean()

# Reuse the per-topic means instead of running the same groupby a second time
topic_z_score = (defect_frequencies_by_topic - defect_log.mean()) / defect_log.std()

In [None]:
# Blank the z-score wherever the per-topic frequency is below 0.01
topic_z_score = topic_z_score.mask(defect_frequencies_by_topic < 0.01)

In [None]:
# Heatmap of topic-level z-scores: defects on the y axis, colour range fixed to [-2, 2]
plt.figure(figsize=big_figsize, layout="constrained")
sns.heatmap(
    topic_z_score.T,
    vmin=-2,
    vmax=2,
    cmap="vlag",
    cbar=True,
    yticklabels=[defects.loc[idx, 'defect name'][:30] for idx in topic_z_score.columns],
)
plt.title("Topic-Level Defect Anomalies (Z-scores)")
plt.xlabel("")
plt.ylabel("")

plt.show()

In [None]:
# Look up tasks whose name contains 'Velké'
items.loc[items['name'].str.contains('Velké')]

In [None]:
# Look up defects whose name contains 'for with redu'
defects.loc[defects['defect name'].str.contains('for with redu')]

## currently taught topic

In [None]:
# Pipe-separated list of (task name, defect id) pairs
currently_taught = pd.read_csv('data/currently_taught.txt', sep='|', index_col=False)

# Map task names to ids; for duplicated names the first task wins
task_name_to_id = items.drop_duplicates(subset='name').reset_index().set_index('name')['id']
currently_taught['Task ID'] = currently_taught['Task Name'].map(task_name_to_id)

# Turn the pair list into a 0/1 task-by-defect table shaped like `frequencies`
currently_taught = (
    pd.crosstab(currently_taught['Task ID'], currently_taught['Defect ID'])
    .gt(0)
    .astype(int)
    .reindex(index=frequencies.index, columns=frequencies.columns, fill_value=0)
)


In [None]:
# The colour map constant defined in the setup cell is BINARY_CMAP
fig = task_defect_plot(currently_taught, title="Related Defects", interactive=False, save=save, cmap=BINARY_CMAP)

## student-specific frequency

In [None]:
# prepare in advance
df = log.merge(defect_log, left_index=True, right_index=True)
df["time"] = pd.to_datetime(df["time"])
df = df.sort_values(by=['user', 'time'])

accuracy_log = []

# for each user
for user_id, history in tqdm(df.groupby('user')):
    correct_count = {defect: 0 for defect in defect_log.columns}
    encounter_count = {defect: 0 for defect in defect_log.columns}
    
    # iterate over history
    for i, (idx, row) in enumerate(history.iterrows()):
        accuracy_row = {}
        
        task_id = row['item']
        
        for defect in defect_log.columns:
            is_reasonable = not unreasonable.loc[task_id, defect]
            if row[defect] == 0 and is_reasonable:
                correct_count[defect] += 1
            if row[defect] == 1 or is_reasonable:
                encounter_count[defect] += 1
                accuracy_row[defect] = correct_count[defect] / encounter_count[defect]
            else:
                accuracy_row[defect] = np.nan

        accuracy_row['submission id'] = idx
        accuracy_log.append(accuracy_row)
    
# create dataframe
accuracy_log = pd.DataFrame(accuracy_log).set_index('submission id')
accuracy_log.index.name = 'submission id'

accuracy_at_least_once = accuracy_log[accuracy_log < 1]

student_specific_frequency = (accuracy_log - accuracy_log.mean()) / accuracy_log.std()

In [None]:
# All non-NaN student-specific scores as one flat array
values = student_specific_frequency.to_numpy().ravel()
values = values[~np.isnan(values)]

# Cut-offs at the 20% and 90% quantiles of the pooled scores
lower_quantile = 0.20
upper_quantile = 0.90
student_lower_threshold = np.quantile(values, lower_quantile)
student_upper_threshold = np.quantile(values, upper_quantile)

plt.figure(figsize=(10, 5))
plt.hist(values, bins=100)

plt.axvline(
    student_lower_threshold, color='red', linestyle='--', linewidth=2,
    label=f'{int(lower_quantile*100)}% threshold',
)
plt.axvline(
    student_upper_threshold, color='green', linestyle='--', linewidth=2,
    label=f'{int(upper_quantile*100)}% threshold',
)

plt.title('Distribution of Student-Specific Frequency')
plt.xlabel('Value')
plt.ylabel('Count')
plt.legend()
plt.grid(True)

if save:
    plt.savefig('images/distribution_of_student_specific_frequency.png', dpi=300)
plt.show()

In [None]:
# Three-level label: 1 above the upper threshold, -1 below the lower threshold, 0 otherwise.
# NOTE(review): the score is a z-scored accuracy (share of encounters without the defect),
# while the feature text for 1 reads "more frequently than peers" — confirm the sign.
student_specific_log = (
    student_specific_frequency.gt(student_upper_threshold)
    .astype('int')
    .mask(student_specific_frequency.lt(student_lower_threshold), -1)
)

In [None]:
# Count how often each label occurs
np.unique(student_specific_log.to_numpy().ravel(), return_counts=True)

In [None]:
# Histogram over every (submission, defect) accuracy below 1
plot_histogram(
    accuracy_at_least_once.to_numpy().flatten(),
    'Distribution of User-Defect Accuracy',
    bins=10,
)

In [None]:
# Same histogram after averaging the accuracies within each user
plot_histogram(
    accuracy_at_least_once.groupby(log['user']).mean().to_numpy().flatten(),
    'Distribution of User-Defect Accuracy (User Averages)',
    bins=10,
)

In [None]:
# Mean running accuracy per defect, highest first
accuracy_means = accuracy_log.mean().sort_values(ascending=False)
ticks = np.arange(len(accuracy_means))

plt.figure(figsize=long_figsize, layout="constrained")
plt.bar(ticks, accuracy_means)

plt.xticks(
    ticks,
    [defects.loc[idx, 'defect name'][:20] for idx in accuracy_means.index],
    rotation=90,
)
plt.xlabel("Defect")
plt.ylabel("Average Accuracy")
plt.title("Average Accuracy per Defect")
plt.grid(axis='y')

if save:
    plt.savefig('images/average_accuracy_per_defect.png', dpi=300)
plt.show()


In [None]:
# Number of tasks per defect for which the pair is not flagged as unreasonable
has_reasonable = (~unreasonable).sum()

# Same defect order as the accuracy plot
has_reasonable = has_reasonable.loc[accuracy_means.index]

plt.figure(figsize=(12, 6), layout="constrained")

ticks = np.arange(len(has_reasonable))

plt.bar(ticks, has_reasonable)
# Labels describe what is plotted (task counts); the old ones were copied from the accuracy plot
plt.title("Number of Reasonable Tasks per Defect")
plt.ylabel("Number of Reasonable Tasks")
plt.xlabel("Defect")
plt.xticks(ticks, [defects['defect name'][idx][:20] for idx in has_reasonable.index], rotation=90)
plt.grid(axis='y')

plt.show()

## defect multiplicity

In [None]:
# suppress outliers
# Values above 10 are overwritten with 10 in place.
# NOTE(review): `multiplicity_log` is not assigned in the cells shown here — confirm where it is built.
multiplicity_log[multiplicity_log > 10] = 10

In [None]:
# Mean multiplicity over the positive entries only, highest first
# (a single descending sort; the extra ascending sort before it was redundant)
means = multiplicity_log[multiplicity_log > 0].mean().sort_values(ascending=False)

ticks = np.arange(len(means)) + 0.4

plt.figure(figsize=long_figsize, layout="constrained")
plt.bar(ticks, means.values)

plt.xticks(ticks, [defects['defect name'][idx][:20] for idx in means.index], rotation=90)
plt.title("Defects by Mean Multiplicity (When Occurring)")
plt.xlabel("Defect")
plt.ylabel("Mean Multiplicity")

if save:
    plt.savefig('images/defects_by_mean_multiplicity.png', dpi=300)
plt.show()

In [None]:
# Variance of the multiplicity over the positive entries only, highest first.
# NOTE: the name `vars` shadows the builtin; it is kept because other cells may read it.
vars = multiplicity_log[multiplicity_log > 0].var().sort_values(ascending=False)

# Ticks and labels follow `vars` itself; taking them from `means` paired each bar
# with the label of a different defect.
ticks = np.arange(len(vars)) + 0.4

plt.figure(figsize=(10, 4), layout="constrained")
plt.bar(ticks, vars.values)

plt.xticks(ticks, [defects['defect name'][idx][:20] for idx in vars.index], rotation=90)
plt.title("Multiplicity Variance")
plt.xlabel("Defect")
plt.ylabel("Multiplicity Variance")

if save:
    plt.savefig('images/multiplicity_variance.png', dpi=300)
plt.show()

In [None]:
# Histogram of the positive multiplicities (non-positive entries are blanked to NaN)
plot_histogram(
    multiplicity_log[multiplicity_log > 0].to_numpy().flatten(),
    'Distribution of Multiplicity',
    bins=10,
)

In [None]:
# Bin edges and the label assigned to each bin. With right=False below, the bins are
# [0, 1) -> 0, [1, 5) -> 1 and [5, inf) -> 2.
multiplicity_thresholds = [0, 1, 5]
multiplicity_labels = [0, 1, 2]
bins = multiplicity_thresholds + [np.inf]

# flatten
multiplicity = multiplicity_log.values.flatten()

# add labels
multiplicity = pd.cut(multiplicity, bins=bins, labels=multiplicity_labels, right=False)

# reshape
# Back to the original table layout, stored as nullable integers.
multiplicity = pd.DataFrame(
    multiplicity.reshape(multiplicity_log.shape),
    index=multiplicity_log.index,
    columns=multiplicity_log.columns
).astype('Int64')


In [None]:
# Count how often each multiplicity label occurs
np.unique(multiplicity.to_numpy().ravel(), return_counts=True)

## student recently fixed

In [None]:
# Fixed seed so the simulated fixes below are reproducible.
np.random.seed(42)

recently_fixed_log = []

# prepare in advance to make the computation faster
df = log.merge(defect_log, left_index=True, right_index=True)
df["time"] = pd.to_datetime(df["time"])
# Chronological order within each user; `i` below is the position in that user's history.
df = df.sort_values(by=['user', 'time'])

# for each user
for user_id, history in tqdm(df.groupby('user')):
    # Position of the last submission in which each defect was (simulated as) fixed.
    last_fixed = {defect: None for defect in defect_log.columns}

    for i, (idx, row) in enumerate(history.iterrows()):
        recency_row = {}

        for defect in defect_log.columns:
            if row[defect] == 1:
                # 80% chance the student fixes it
                if np.random.rand() < 0.8:
                    # Fixed now: recency 0, and remember the position.
                    recency_row[defect] = 0
                    last_fixed[defect] = i
                else:
                    # Present but not fixed: NaN, even if the defect was fixed earlier.
                    # NOTE(review): such cells are later labelled like "never fixed" — confirm this is intended.
                    recency_row[defect] = np.nan
            else:
                # Defect absent: number of submissions since the last fix, or NaN if never fixed.
                if last_fixed[defect] is not None:
                    recency_row[defect] = i - last_fixed[defect]
                else:
                    recency_row[defect] = np.nan

        recency_row['submission id'] = idx
        recently_fixed_log.append(recency_row)

recently_fixed_log = pd.DataFrame(recently_fixed_log).set_index('submission id')
recently_fixed_log.index.name = 'submission id'

In [None]:
# Share of zero entries among the non-NaN recency values of each defect, lowest first
first_time_rate = (
    recently_fixed_log.eq(0)
    .where(recently_fixed_log.notna())
    .mean()
    .sort_values(ascending=True)
)
ticks = np.arange(len(first_time_rate))

plt.figure(figsize=long_figsize, layout="constrained")
plt.bar(ticks, first_time_rate)

plt.xticks(
    ticks,
    [defects.loc[idx, 'defect name'][:30] for idx in first_time_rate.index],
    rotation=90,
)
plt.xlabel("Defect")
plt.ylabel("First-Time Rate")
plt.title("Percentage of First-Time Occurances per Defect")
plt.grid(axis='y')

if save:
    plt.savefig('images/percentage_of_first_time_occurances_per_defect.png', dpi=300)
plt.show()


In [None]:
# bins
# pd.cut uses right-closed intervals by default, so the edges give (0, 1], (1, 2], (2, 4], ...
# and a recency of exactly 0 falls outside every bin.
bins = [0, 1, 2, 4, 9, 14, 19, 24, 29, np.inf]
bin_labels = ['1', '2', '3-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30+']
recency = recently_fixed_log.apply(lambda col: pd.cut(col, bins=bins, labels=bin_labels))

# histogram
# Per defect, count how many entries fall into each bin.
recency = recency.apply(lambda col: col.value_counts()).fillna(0).astype(int)

# scaling
# Divide each defect's counts by their total so every column sums to 1.
recency = recency.div(recency.sum(axis=0), axis=1)

# sort as the previous graph
recency = recency.loc[:, first_time_rate.index]

plt.figure(figsize=(12, 6), layout="constrained")
sns.heatmap(recency, cmap='viridis', cbar_kws={'label': 'frequency'}, xticklabels=[defects['defect name'][idx][:30] for idx in recency.columns])
plt.title("Number of Sessions Before Fix Reoccurance")
plt.xlabel("Defect")
plt.ylabel("Recency Bin")
plt.gca().invert_yaxis()

if save:
    plt.savefig('images/heatmap_of_recency_bins_per_defect.png', dpi=300)
plt.show()

In [None]:
# Median of the non-zero recencies per defect (the variable and the labels say
# "mean"/"average", but the statistic computed is the median), in the same defect
# order as the first-time-rate plot
mean_recency = (
    recently_fixed_log.mask(recently_fixed_log == 0)
    .median()
    .loc[first_time_rate.index]
)
ticks = np.arange(len(mean_recency))

plt.figure(figsize=long_figsize, layout="constrained")
plt.bar(ticks, mean_recency)

plt.xticks(
    ticks,
    [defects.loc[idx, 'defect name'][:30] for idx in mean_recency.index],
    rotation=90,
)
plt.xlabel("Defect")
plt.ylabel("Average Recency")
plt.title("Average Recency (# of Submissions Since Last Seen) per Defect")
plt.grid(axis='y')

if save:
    plt.savefig('images/average_recency_per_defect.png', dpi=300)
plt.show()


In [None]:
# With right=False below, the bins are [-inf, -0.1) -> 0, [-0.1, 5) -> 2 and [5, inf) -> 1.
recency_thresholds = [-np.inf, -0.1, 5, np.inf]
recency_labels = [0, 2, 1]  # 0 = never, 2 = recently, 1 = fixed but not recently

recency = recently_fixed_log.values.flatten()

# NaN is replaced by -1 so that it falls into the first bin (label 0).
recency = np.where(np.isnan(recency), -1, recency)

recency = pd.cut(recency, bins=recency_thresholds, labels=recency_labels, right=False)

# Back to the original table layout, stored as nullable integers.
recency = pd.DataFrame(
    recency.reshape(recently_fixed_log.shape),
    index=recently_fixed_log.index,
    columns=recently_fixed_log.columns
).astype('Int64')


In [None]:
# Count how often each recency label occurs
np.unique(recency.to_numpy().ravel(), return_counts=True)

## severity

In [None]:
# Scale every defect column by that defect's severity; missing products become 0
severity_log = defect_log.mul(
    defects.loc[defect_log.columns, 'severity'], axis='columns'
).fillna(0)

In [None]:
# Severity table with zeros blanked, so only non-zero severities enter the statistics
df = severity_log.mask(severity_log == 0)

# Per task: highest severity seen for each defect, averaged over the defects, lowest first
means = df.groupby(log['item']).max().mean(axis=1).sort_values()
ticks = np.arange(len(means))

plt.figure(figsize=(13, 4), layout="constrained")
plt.bar(ticks, means.values)

plt.xticks(ticks, [items.loc[idx, 'name'][:20] for idx in means.index], rotation=90)
plt.tick_params(axis='x', labelsize=7)
plt.xlabel("Task")
plt.ylabel("Severity")
plt.title("Mean Severity for Each Task")

if save:
    plt.savefig('images/mean_severity.png', dpi=300)
plt.show()

In [None]:
# Per row of `df`: take the two largest values, compute the second minus the first
# and negate it, giving the gap between them; then count how often each gap occurs.
differences = df.apply(lambda row: -row.nlargest(2).diff().iloc[-1], axis=1).value_counts().sort_index()

plt.figure(figsize=small_figsize, layout="constrained")

plt.bar(differences.index.astype(int), differences.values, edgecolor='black')
plt.title('Histogram of Differences in Severity')
plt.xlabel('Difference')
plt.ylabel('Count')

if save:
    plt.savefig(f'images/severity_differences.png', dpi=300)
plt.show()

# rejected

### time spent on task

In [None]:
# Response time relative to the mean response time of the same task
log['relative_time_spent'] = log['responseTime'] / log.groupby('item')['responseTime'].transform('mean')

# clip all submissions to X times the task mean
clip_threshold = 5

print('Clipping all submissions to', clip_threshold, 'times the task mean:', (log['relative_time_spent'] > clip_threshold).mean(), 'changed.')

# Clip only the ratio column. Assigning through a boolean row mask on the whole frame
# (`log[mask] = value`) overwrote every column of the matching rows with the threshold.
log['relative_time_spent'] = log['relative_time_spent'].clip(upper=clip_threshold)

In [None]:
# 0.90 quantile of the relative time spent.
time_spent_threshold = log['relative_time_spent'].quantile(0.90)

In [None]:
plt.figure(figsize=small_figsize, layout="constrained")

sns.histplot(log['relative_time_spent'], bins=1000, kde=True)
# The threshold is the 0.90 quantile, so the legend must say 90th (it used to say 75th)
plt.axvline(time_spent_threshold, color='red', linestyle='--', label='90th Percentile Threshold')
plt.title("Distribution of Relative Time Spent per Task (Values Over 5 Clipped)")
plt.xlabel("Relative Time Spent")
plt.ylabel("Count")
plt.legend()

if save:
    plt.savefig('images/distribution_of_relative_time_spent_per_task.png', dpi=300)
plt.show()


### associated with poor performance (locally)

In [None]:
from sklearn.metrics import precision_score

# Reload the data including incorrect submissions
_, not_log, _, not_defect_log, _ = loader.load(ipython_path, data_path, only_correct=False)

df = not_log[['item', 'correct']].merge(not_defect_log, left_index=True, right_index=True)

correlations = {}

# for each item
for task_id, task_df in df.groupby('item'):
    # Failure indicator of the task's submissions; identical for every defect, so computed once.
    # NOTE(review): `~` is a logical NOT only if 'correct' is boolean — confirm the dtype.
    incorrect = ~task_df['correct']
    corr_dict = {}
    # for each defect
    for defect in defect_log.columns:
        # Local name chosen so the notebook-level `defect_presence` table is not overwritten
        has_defect = task_df[defect]
        # Precision of "defect present" as a predictor of failure; undefined (NaN)
        # when either vector is constant within the task
        if has_defect.nunique() > 1 and incorrect.nunique() > 1:
            corr_dict[defect] = precision_score(incorrect, has_defect)
        else:
            corr_dict[defect] = np.nan

    correlations[task_id] = corr_dict

# construct df
performance = pd.DataFrame.from_dict(correlations, orient='index')
performance[unreasonable] = np.nan


In [None]:
# Heatmap of the per-pair precision values.
fig = task_defect_plot(performance, title="Defect-Failure Precision", interactive=False, save=save)

In [None]:
# All defined precision values as one flat series
performances = performance.stack().dropna()

plt.figure(figsize=(10, 6))

plt.hist(performances, bins=100, color='skyblue', edgecolor='black')
plt.title('Histogram of Task-Defect Pair Precision in Predicting Failures')
# The plotted quantity is a precision; the old 'Z-Score' label was copied from another plot
plt.xlabel('Precision')
plt.ylabel('Frequency')
plt.grid(True)

if save:
    plt.savefig('images/histogram_of_task-defect_pair_precision.png', dpi=300)
plt.show()

In [None]:
# Fixed precision cut-off above which a task-defect pair is flagged.
failure_threshold = 0.25
# failure_threshold = performances.quantile(quantile)

# The comparison yields a boolean table; NaN precisions compare as False.
fig = task_defect_plot(performance > failure_threshold, title=f"High failure rates t={failure_threshold:.2f}", interactive=False, save=save)

In [None]:
# Interactive variant; the colour map constant defined in the setup cell is BINARY_CMAP
fig = task_defect_plot(
    performance > failure_threshold,
    title=f"High failure rates t={failure_threshold:.2f}",
    interactive=True,
    cmap=BINARY_CMAP,
)

In [None]:
# Close the figure returned by the previous plotting cell.
plt.close(fig)

### future opportunity likelihood

In [None]:
# Every task id known to the items table.
all_tasks = items.index

opportunity_log = []

# prepare in advance to make the computation faster
df = log.merge(defect_log, left_index=True, right_index=True)
df["time"] = pd.to_datetime(df["time"])
df = df.sort_values(by=['user', 'time'])

# for each user
for user_id, history in tqdm(df.groupby('user')):
    # Tasks this user has submitted so far (the current submission included).
    completed_tasks = set()

    # iterate over history
    for submission_id, row in history.iterrows():
        # unfinished tasks
        completed_tasks.add(row['item'])
        remaining_tasks = [t for t in all_tasks if t not in completed_tasks]

        # Nothing left to attempt: the opportunity is 0 for every defect.
        if not remaining_tasks:
            opportunity_log.append({
                'submission id': submission_id,
                **{defect: 0.0 for defect in frequencies.columns}
            })
            continue

        # Remaining tasks with an id greater than the current task's id get double weight.
        weights = np.array([2.0 if t > row['item'] else 1.0 for t in remaining_tasks])
        # Weighted average of the per-task defect frequencies over the remaining tasks.
        # NOTE(review): `.loc` raises KeyError for a task id missing from `frequencies` —
        # confirm every item has at least one logged submission.
        weighted_avg = (frequencies.loc[remaining_tasks].T @ weights) / weights.sum()

        opportunity_log.append({
            'submission id': submission_id,
            **weighted_avg.to_dict()
        })


# create dataframe
opportunity_log = pd.DataFrame(opportunity_log).set_index('submission id')
opportunity_log.index.name = 'submission id'

In [None]:
# Histogram over every (submission, defect) opportunity value
plt.figure(figsize=(10, 5))
plt.hist(opportunity_log.to_numpy().ravel(), bins=100)

plt.xlabel('Opportunity')
plt.ylabel('Count')
plt.title('Histogram of Future Opportunities')
plt.grid(True)

if save:
    plt.savefig('images/histogram_of_future_opportunities.png', dpi=300)
plt.show()

In [None]:
# Mean opportunity per defect, highest first
# (a single descending sort; the extra ascending sort before it was redundant)
means = opportunity_log.mean().sort_values(ascending=False)

ticks = np.arange(len(means)) + 0.4
bar_width = 0.4

plt.figure(figsize=(10, 4), layout="constrained")

# Side-by-side bars: mean opportunity next to the overall mean of the defect column
plt.bar(ticks - bar_width / 2, means.values, label='Opportunity', width=bar_width)
plt.bar(ticks + bar_width / 2, defect_log[means.index].mean().values, label='Frequency', width=bar_width)

plt.xticks(ticks, [defects['defect name'][idx][:20] for idx in means.index], rotation=90)
plt.title("Defects by Mean Opportunity vs Frequency")
plt.xlabel("Defect")
plt.ylabel("Mean Opportunity")
plt.legend()

if save:
    plt.savefig('images/defects_by_mean_opportunity.png', dpi=300)
plt.show()

In [None]:
# Mean opportunity per task, centred per defect by subtracting the column mean
task_opportunities = opportunity_log.groupby(log['item']).mean()
task_opportunities = task_opportunities.sub(task_opportunities.mean(axis=0), axis='columns')

fig = task_defect_plot(
    task_opportunities,
    title="Future Opportunity to Make Defect by Task",
    interactive=False,
    save=save,
)

# filtering before sampling

In [None]:
def has_close_pair(row):
    """
    Check whether two distinct entries of ``row`` differ by one or less.

    NaN entries are ignored, and an entry is never compared with itself, so a row
    with fewer than two numbers yields False.

    Arguments:
        row -- Series or other one-dimensional array-like of numbers.

    Returns:
        True if any two distinct entries are at most 1 apart, False otherwise.
    """
    ordered = np.sort(np.asarray(row, dtype=float))
    ordered = ordered[~np.isnan(ordered)]
    # After sorting, the closest pair is always adjacent, so checking neighbours is enough.
    # (The full pairwise matrix used before included the diagonal, whose zero
    # differences made the check succeed for every non-empty row.)
    return bool(np.any(np.diff(ordered) <= 1))

In [None]:
# at least two defects
# Keep the submissions whose defect columns sum to more than one
filtered = defect_log.loc[defect_log.sum(axis=1).gt(1)]

In [None]:
# at most difference of one in severity
# Build a new frame instead of multiplying the slice of `defect_log` in place
filtered = filtered * defects.loc[filtered.columns]['severity']
# Absent defects have product 0; blank them so only the severities of the defects
# actually present in a submission are compared with each other
filtered = filtered[filtered.where(filtered > 0).apply(has_close_pair, axis=1)]

In [None]:
# apply the filter
# Both tables are rebound to the surviving submissions, so any cell run after this
# one sees only the filtered data.
defect_log = defect_log.loc[filtered.index]
log = log.loc[filtered.index]

# feature engineering

In [None]:
def expand_task_defect_table(log, task_defect_table):
    """Look up each submission's task row in a task-by-defect table.

    The result is indexed like ``log`` and has the columns of ``task_defect_table``.
    """
    per_submission = log[['item']].join(task_defect_table, on='item')
    return per_submission.drop(columns='item')


In [None]:
# Raw (mostly continuous) value of every feature, one table per feature, restricted to
# the submissions in `log`. Task-level tables are expanded to one row per submission.
feature_values = {
    'rare': expand_task_defect_table(log, frequencies),
    'characteristic': expand_task_defect_table(log, z_score),
    'currently_taught': expand_task_defect_table(log, currently_taught),
    'student specific': student_specific_frequency.loc[log.index],
    'multiplicity': multiplicity_log.loc[log.index],
    # use discrete values for the recently fixed (big difference between never fixed and recently fixed)
    'recently fixed': recency.loc[log.index].astype('int'), # recently_fixed_log.loc[log.index],
    'severity': severity_log.loc[log.index],
}

# Discretised integer version of the same features; the values are the keys of
# `feature_embeddings` below.
features = {
    'rare': expand_task_defect_table(log, rare).astype('int'),
    'characteristic': expand_task_defect_table(log, characteristic).astype('int64'),
    'currently_taught': expand_task_defect_table(log, currently_taught).astype('int'),
    'student specific': student_specific_log.loc[log.index].astype('int'),
    'multiplicity': multiplicity.loc[log.index].astype('int'),
    'recently fixed': recency.loc[log.index].astype('int'),
    'severity': severity_log.loc[log.index].astype('int'),
}

# Text attached to each discrete feature value; an empty string means nothing is said.
feature_embeddings = {
    'rare': {
        0: '',
        1: 'Defect is rare for this task.'
    },
    'characteristic': {
        0: '',
        1: 'Defect is much more common for this task than other tasks.'
    },
    'currently_taught': {
        0: '',
        1: 'A LLM has identified that this Defect relates to material currently being taught.'
    },
    # NOTE(review): the underlying score is a z-scored accuracy (share of encounters without
    # the defect) — confirm that 1 really means "more frequently" and -1 "less frequently".
    'student specific': {
        -1: 'Student makes this defect less frequently than peers.',
        0: '',
        1: 'Student makes this defect more frequently than peers.'
    },
    'multiplicity': {
        0: '',
        1: 'Defect occurs a few times.',
        2: 'Defect occurs many times.'
    },
    'recently fixed': {
        0: 'Student has never fixed this defect.',
        1: 'Student has fixed this defect at some point.',
        2: 'Student fixed this defect recently.'
    },
    # NOTE(review): there is no entry for severity 1 — confirm that value never occurs.
    'severity': {
        0: '',
        2: 'This defect is benign.',
        3: 'This defect is of moderate severity.',
        4: 'This defect is severe.',
        5: 'This defect is highly severe.'
    },
}

# testing sample

In [None]:
def generate_submission_html(submission_id, feature):
    """Render one submission next to its detected defects as an HTML snippet.

    The left cell shows the task name, its instructions and the submitted code;
    the right cell lists every defect flagged for the submission together with
    the value of ``feature`` for that defect.

    Reads the notebook-level tables ``log``, ``items``, ``defects``,
    ``defect_log`` and ``feature_values``.

    Args:
        submission_id: Index label of the submission in ``log`` / ``defect_log``.
        feature: Key of ``feature_values`` whose per-defect value is displayed.

    Returns:
        The HTML markup as a string (wrap it in ``IPython.display.HTML`` to show it).
    """
    # Local import: the name ``html`` is reused for plain strings throughout the notebook.
    from html import escape

    task_id = log.loc[submission_id, 'item']

    present_defects = defect_log.loc[submission_id]
    present_defects = present_defects[present_defects == 1].index.tolist()

    defect_rows = [
        {
            "Defect": defects.loc[defect, "defect name"],
            "Description": defects.loc[defect, "description"],
            f"{feature}": f"{feature_values[feature].loc[submission_id, defect]:.2f}"
        }
        for defect in present_defects
    ]
    defect_df = pd.DataFrame(defect_rows)

    # The answer is raw source code. Without escaping, text such as ``a<b`` or
    # ``x & y`` would be interpreted as markup and corrupt the rendered code.
    answer = escape(str(log.loc[submission_id, 'answer']), quote=False)

    # generated by chatGPT
    html = f"""
    <div style="background-color: #121212; color: #f0f0f0; font-family: 'Segoe UI', sans-serif; padding: 20px;">
        <div style="text-align: left;">
            <table style="width: 90%; margin-left: auto; border-collapse: collapse; background-color: #1e1e1e; border: 1px solid #444;">
                <tr>
                    <td style="vertical-align: top; width: 50%; border-right: 1px solid #333; padding: 20px; text-align: left;">
                        <h2 style="color: #ffffff;">{items.loc[task_id, 'name']}</h2>
                        <p><strong>Instructions:</strong><br>{items.loc[task_id, 'instructions']}</p>
                        <div style="background-color: #2b2b2b; color: #dcdcdc; padding: 15px; border-radius: 5px; overflow-x: auto; text-align: left;">
                            <pre style="margin: 0; white-space: pre-wrap;">{answer}</pre>
                        </div>
                    </td>
                    <td style="vertical-align: top; width: 50%; padding: 20px;">
                        <h2 style="color: #ffffff;">Detected Defects</h2>
                        {defect_df.to_html(index=False, escape=False, border=0, justify='left', classes='defect-table')}
                    </td>
                </tr>
            </table>
        </div>
    </div>
    """
    return html


In [None]:
feature = 'characteristic'

# Keep the submissions in which at least one defect carries the feature and
# draw 10 of them without replacement (fixed seed, so the draw is repeatable).
sampled_df = log[features[feature].sum(axis=1) > 0].sample(n=10, random_state=42)


In [None]:
# show the sampled submission at position 3 together with its values of `feature`
display(HTML(generate_submission_html(sampled_df.index[3], feature)))

# survey sample

In [None]:
def greedy_sample(features, log, n_samples=200, seed=42):
    """Sample log indexes greedily to balance features and maximize task coverage.

    Each round targets the feature that has been targeted least often so far,
    and among the not-yet-sampled submissions exhibiting it prefers those whose
    task has been sampled least often.

    Args:
        features: Mapping from feature name to a submission x defect table; a
            submission exhibits a feature when its row sum is positive.
        log: Submission table with an ``item`` column holding the task id.
        n_samples: Number of sampling rounds.
        seed: Seed for the ``random`` module, which drives every random choice.

    Returns:
        List of sampled index labels. It is shorter than ``n_samples`` when a
        round finds no remaining candidate for its feature (the round is skipped).
    """
    random.seed(seed)

    sample = []
    feature_counts = pd.Series(0, index=list(features))
    task_counts = pd.Series(0, index=log['item'].unique())

    for _ in tqdm(range(n_samples)):
        # least represented feature
        feature = feature_counts.idxmin()

        # filter submissions with feature and not in the sample
        table = features[feature]
        candidates = table[table.sum(axis=1) > 0].index.difference(sample)

        if candidates.empty:
            print("[WARNING] No candidates left for feature, skipping...")
            # bump the count so that the next round moves on to another feature
            feature_counts[feature] += 1
            continue

        # filter submissions with yet unused task
        least_used_tasks = task_counts[task_counts == task_counts.min()].index
        candidate_tasks = log.loc[candidates, 'item']
        task_candidates = candidate_tasks[candidate_tasks.isin(least_used_tasks)].index

        if not task_candidates.empty:
            choice = random.choice(task_candidates)
        else:
            print("[WARNING] No candidates left for task, choosing randomly...")
            choice = random.choice(candidates)

        sample.append(choice)
        # Record the pick. Without this update `idxmin()` would return the same
        # feature in every round and the features would never be balanced.
        feature_counts[feature] += 1
        task_counts[log.loc[choice, 'item']] += 1
    return sample


In [None]:
# draw the survey sample: 70 sampling rounds with the default seed
sample = greedy_sample(features, log, n_samples=70)

In [None]:
# Share of submissions containing each defect: in the sample, in the whole log,
# and a flat reference value of 1 / (number of defects).
global_freq = defect_log.mean()
sample_freq = defect_log.loc[sample].mean()
uniform = pd.Series(1 / len(defect_log.columns), index=defect_log.columns)

df = pd.DataFrame(
    {'Sample': sample_freq, 'Global': global_freq, 'Uniform': uniform}
).sort_values(by='Global', ascending=False)

ticks = np.arange(len(df))
labels = [defects['defect name'][idx][:20] for idx in df.index]  # names cut to 20 characters

bar_width = 0.4
plt.figure(figsize=(13, 5), layout='constrained')

# sample vs global: two bars per defect, placed left and right of the tick
for offset, column, color in (
    (-bar_width / 2, 'Global', 'lightgray'),
    (bar_width / 2, 'Sample', 'steelblue'),
):
    plt.bar(ticks + offset, df[column], width=bar_width, label=column, color=color)

# uniform
plt.plot(ticks, df['Uniform'], color='green', linestyle='--', label='Uniform')

plt.title("Defect Distribution: Sample vs Global vs Uniform")
plt.ylabel("Defect Frequency")
plt.xticks(ticks, labels, rotation=90)
plt.legend()

plt.savefig('images/sampled_defect_distribution.png', dpi=300)
plt.show()

In [None]:
# Share of submissions belonging to each task: in the sample, in the whole log,
# and a flat reference value of 1 / (number of tasks).
global_freq = log['item'].value_counts(normalize=True)
sample_freq = log.loc[sample]['item'].value_counts(normalize=True)
uniform = pd.Series(1 / log['item'].nunique(), index=log['item'].unique())

df = pd.DataFrame(
    {'Sample': sample_freq, 'Global': global_freq, 'Uniform': uniform}
).sort_values(by='Global', ascending=False)

ticks = np.arange(len(df))
labels = [items['name'].loc[idx][:20] for idx in df.index]  # names cut to 20 characters

bar_width = 0.4
plt.figure(figsize=(13, 5), layout='constrained')

# sample vs global: two bars per task, placed left and right of the tick
for offset, column, color in (
    (-bar_width / 2, 'Global', 'lightgray'),
    (bar_width / 2, 'Sample', 'steelblue'),
):
    plt.bar(ticks + offset, df[column], width=bar_width, label=column, color=color)

# uniform
plt.plot(ticks, df['Uniform'], color='green', linestyle='--', label='Uniform')

plt.title("Task Distribution: Sample vs Global vs Uniform")
plt.ylabel("Task Frequency")
plt.xticks(ticks, labels, rotation=90)
plt.tick_params(axis='x', labelsize=7)
plt.legend()

plt.savefig('images/sampled_task_distribution.png', dpi=300)
plt.show()

In [None]:
# Count how often each feature value occurs, one column per feature.
# Only the rows of the sampled submissions are counted: the plot title and file
# name describe the sample, so each table is restricted with `sample` first.
feature_counts = pd.DataFrame({
    name: pd.Series(table.loc[sample].values.flatten()).value_counts()
    for name, table in features.items()
}).fillna(0).astype(int)  # a value that never occurs for a feature counts as 0

fig, ax = plt.subplots(figsize=(13, 5), layout='constrained')
# transpose so that every feature becomes one stacked bar, split by value
feature_counts.T.plot(kind='bar', stacked=True, colormap='tab10', ax=ax)

plt.xlabel("Feature")
plt.ylabel("Number of Occurances")
plt.title("Stacked Bar Plot of Feature Values in the Sample")
plt.legend(title="Values", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=0)

plt.savefig('images/sampled_feature_distribution.png', dpi=300)
plt.show()

# export dataset

In [None]:
def create_export_dataframes(indexes, log, items, defects, defect_log, features):
    """Build the submission table and the per-defect table that are exported.

    For every index in ``indexes`` one submission record (answer, task name,
    instructions) is produced, plus one defect record for each defect whose
    entry in ``defect_log`` is positive. The ``additional context`` of a defect
    record is the concatenation, over all ``features``, of the
    ``feature_embeddings`` sentence for the defect's feature value, each
    followed by a newline (empty sentences therefore leave a bare newline).

    Returns:
        ``(submissions, defects)``: the first frame is indexed by ``indexes``,
        the second has a default integer index.
    """
    submission_rows, defect_rows = [], []

    for idx in indexes:
        submission = log.loc[idx]
        task = items.loc[submission['item']]
        submission_rows.append({
            'submission': submission['answer'],
            'task name': task['name'],
            'instructions': task['instructions'],
        })

        # one record per defect present in the submission
        flags = defect_log.loc[idx]
        for defect in flags[flags > 0].index:
            info = defects.loc[defect]
            context = ''.join(
                feature_embeddings[feature][table.at[idx, defect]] + '\n'
                for feature, table in features.items()
            )
            defect_rows.append({
                'submission id': idx,
                'defect id': defect,
                'name': info['defect name'],
                'description': info['description'],
                'code example': info['code example'],
                'code fix example': info['code fix example'],
                'additional context': context,
            })

    return pd.DataFrame(submission_rows, index=indexes), pd.DataFrame(defect_rows)


In [None]:
# Manual toggle: change `False` to `True` to rebuild the export from `sample` and
# overwrite the CSV files. As written, the previously exported files are loaded instead.
if False:
    sampled_submissions, sampled_defects = create_export_dataframes(sample, log, items, defects, defect_log, features)
    sampled_submissions.to_csv('data/export/sampled_submissions.csv', sep=';', index_label='index')
    sampled_defects.to_csv('data/export/sampled_defects.csv', sep=';', index_label='index')
else:
    # the first CSV column (written as 'index' above) becomes the frame index again
    sampled_submissions = pd.read_csv('data/export/sampled_submissions.csv', sep=';', index_col=0)
    sampled_defects = pd.read_csv('data/export/sampled_defects.csv', sep=';', index_col=0)

# survey results

In [None]:
# Load the survey answers and keep only those given for submissions that are still
# in the exported sample (some submissions were manually removed during the survey).
responses = pd.read_csv('data/responses.csv', sep=';').loc[
    lambda frame: frame['submission id'].isin(sampled_submissions.index)
]

In [None]:
# number of votes ('count') that each answer received within each submission
vote_counts = responses.groupby(['submission id', 'answer']).size().reset_index(name='count')
# per submission: True when more than one answer shares the highest vote count
ties = vote_counts.groupby('submission id', group_keys=False).apply(lambda x: (x['count'] == x['count'].max()).sum() > 1, include_groups=False)

In [None]:
# headline numbers of the survey
print('Number of respondents:', responses['respondent'].nunique())
print('Average number of responses:', responses.groupby('respondent')['answer'].count().mean())
print('Average number of answers per submission:', responses.groupby('submission id')['answer'].count().mean())
print('Percentage of tied results:', np.round(100 * ties.mean(), 2), '%')

In [None]:
submission_counts = (
    responses['submission id']
    .value_counts()  # responses received by each submission
    .value_counts()  # number of submissions for each response count
    .sort_index()
)

plt.figure(figsize=small_figsize, layout='constrained')
# the counts are turned into strings so that the x axis is categorical
plt.bar(submission_counts.index.astype(str), submission_counts.values)
plt.xlabel("Number of Responses")
plt.ylabel("Number of Submissions")
plt.title("Distribution of Responses per Submission")

plt.savefig('images/submission_response_distribution.png', dpi=300)
plt.show()

In [None]:
# Pick rate of a defect = times it was chosen / times it was present in the
# submission of a response (the defect rows are repeated once per response).
pick_rates = (responses.groupby('answer').size() / defect_log.loc[responses['submission id']].sum()).sort_values(ascending=False)
# defects that were never picked have no numerator and come out as NaN
pick_rates = pick_rates.dropna()

# NOTE(review): maps severity 2..5 onto 0..1 — presumably severities lie in that range; confirm
severities = (defects.loc[pick_rates.index]['severity'] - 2) / 3

plt.figure(figsize=long_figsize, layout='constrained')

ticks = np.arange(len(pick_rates))
width = 0.4

# Both series are labelled and a legend is drawn; otherwise the two bars of a
# defect cannot be told apart in the figure.
plt.bar(ticks - width / 2, pick_rates.values, width=width, label='Pick rate')
plt.bar(ticks + width / 2, severities.values, width=width, label='Normalized severity')
plt.title("Defect Pick Rates vs Normalized Severity")
plt.xlabel("Defect")
plt.ylabel("Pick Rate")
plt.xticks(ticks, [defects['defect name'][idx][:20] for idx in pick_rates.index], rotation=90)
plt.legend()

plt.savefig('images/defect_pick_rates.png', dpi=300)
plt.show()

In [None]:
# defect indicators of the submission behind each response (one row per response)
defect_log.loc[responses['submission id']]

In [None]:
# Winner of a submission = its answer with the most votes (`idxmax` keeps the
# first such row when several answers share the maximum).
winners = vote_counts.loc[
    vote_counts.groupby('submission id')['count'].idxmax()
].set_index('submission id')

# share of a submission's votes that went to its winner; the division and the
# assignment both align on the submission id
winners['consensus'] = winners['count'] / responses.groupby('submission id').size()

consensus = winners['consensus'].sort_values()

ticks = np.arange(len(consensus))

plt.figure(figsize=(13, 4), layout="constrained")
plt.bar(ticks, consensus.values)

plt.xlabel("Entry")
plt.ylabel("Consensus")
plt.title("Entry Consensus (Votes for Winner / Total Votes)")
plt.tick_params(axis='x', labelsize=7)

if save:
    plt.savefig('images/entry_consensus.png', dpi=300)
plt.show()

In [None]:
# `consensus` is sorted ascending: position 0 is the submission with the lowest consensus
display(HTML(generate_submission_html(consensus.index[0], 'severity')))

In [None]:
# submission with the second lowest consensus
display(HTML(generate_submission_html(consensus.index[1], 'severity')))

In [None]:
# `consensus` is sorted ascending: the last position is the submission with the highest consensus
display(HTML(generate_submission_html(consensus.index[-1], 'severity')))

In [None]:
# submission with the second highest consensus
display(HTML(generate_submission_html(consensus.index[-2], 'severity')))

# feature heuristics

In [None]:
def leave_one_group_out_for_model(model, X, y, groups):
    """Collect out-of-fold predictions with leave-one-group-out cross-validation.

    For every distinct value in ``groups`` the model is fitted on all other
    groups and used to predict the rows of the held-out group. ``X`` and ``y``
    are indexed positionally via ``.iloc``.

    The passed ``model`` is refitted in place, so after the call it holds the
    fit of the last fold.

    Returns:
        ``(predictions, ground_truth)``: two flat lists in fold order.
    """
    predictions, ground_truth = [], []

    splitter = LeaveOneGroupOut()
    for train_idx, test_idx in splitter.split(X, y, groups=groups):
        train_X, train_y = X.iloc[train_idx], y.iloc[train_idx]
        model.fit(train_X, train_y)

        predictions += list(model.predict(X.iloc[test_idx]))
        ground_truth += list(y.iloc[test_idx])

    return predictions, ground_truth

In [None]:
def summarize_model_performance(y_true, y_pred, model_name='Model'):
    """Return a one-row frame with accuracy, precision, recall and F1, rounded to 2 decimals.

    Precision, recall and F1 are computed with ``zero_division=0``. The columns
    are ``Model``, ``Accuracy``, ``Precision``, ``Recall`` and ``F1``.
    """
    scores = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1': f1_score(y_true, y_pred, zero_division=0),
    }
    summary = {'Model': model_name}
    for metric, score in scores.items():
        summary[metric] = np.round(score, 2)
    return pd.DataFrame([summary])

## differential features

In [None]:
# add losing defects: one row per (response, defect of the submission that was not picked)
df = responses.merge(sampled_defects[['submission id', 'defect id']], on='submission id', how='left')
df = df[df['answer'] != df['defect id']]

# add differential features: value for the picked defect minus value for the losing defect
for feature, values in feature_values.items():
    df[feature] = np.array(
        [
            values.loc[submission, picked] - values.loc[submission, other]
            for submission, picked, other in zip(df['submission id'], df['answer'], df['defect id'])
        ],
        dtype=float,
    )

differential_features = features.keys()

# pairwise encoding
pairwise_df = []

# keep track of the original responses for cross validation:
# all pairs of one (submission, respondent) response share a 'response id'
for index, (_, group) in enumerate(df.groupby(['submission id', 'respondent'])):
    for _, row in group.iterrows():
        deltas = {feature: row[feature] for feature in differential_features}

        forward = {
            'response id': index,
            'defect1': row['answer'],
            'defect2': row['defect id'],
            'first chosen': 1,
            **deltas,
        }
        # also add the reverse: swapped defects, negated differences, opposite label
        backward = {
            'response id': index,
            'defect1': row['defect id'],
            'defect2': row['answer'],
            'first chosen': 0,
            **{feature: -delta for feature, delta in deltas.items()},
        }
        pairwise_df.extend([forward, backward])

pairwise_df = pd.DataFrame(pairwise_df)


In [None]:
results = []

# evaluate each differential feature on its own as the only predictor
for feature in differential_features:
    # start from the full pairwise table on every round; the optional split below rebinds these names
    X, y, groups = pairwise_df[[feature]], pairwise_df['first chosen'], pairwise_df['response id']

    if training:
        X, _, y, _, groups, _ = train_test_split(
            X, y, groups, test_size=0.2, random_state=42, stratify=y
        )

    model = LogisticRegression(max_iter=1000)
    preds, truths = leave_one_group_out_for_model(model, X, y, groups)

    summary_df = summarize_model_performance(truths, preds, model_name=feature)
    results.append(summary_df)

final_results = pd.concat(results, ignore_index=True)
print(final_results)

# feature combinations

## data mining

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

X = pairwise_df[features.keys()]
y = pairwise_df['first chosen']

# convert into binary itemsets: every row becomes a list with one token per feature,
# "<feature>>" when the differential value is positive and "<feature><=" otherwise
itemsets = X.apply(lambda x: [f"{col}>" if x[col] > 0 else f"{col}<=" for col in X.columns], axis=1)

# encode (presumably one boolean column per distinct token — see the mlxtend documentation)
te = TransactionEncoder()
te_ary = te.fit(itemsets).transform(itemsets)
encoded = pd.DataFrame(te_ary, columns=te.columns_)

# the target is added as one more item so that rules can have it as a consequent
encoded['winner'] = y.values.astype(bool)

# run apriori
frequent_itemsets = apriori(encoded, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

# filter rules
# predicting the output variable
# NOTE(review): this also keeps rules whose consequents contain other items besides 'winner'
rules = rules[rules['consequents'].apply(lambda x: 'winner' in x)]
# sufficient confidence and support (stricter than the thresholds passed to the calls above)
rules = rules[
    (rules['confidence'] > 0.7) & 
    (rules['support'] > 0.15)
]
# sort
rules = rules.sort_values(by='lift', ascending=False)
# only one rule per antecedent (the first one, i.e. the one with the highest lift after sorting)
rules = rules.drop_duplicates(subset=['antecedents'])

In [None]:
# the five highest-lift rules (`rules` is already sorted by lift, descending)
best_rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(5)

# generated by chatGPT
# Dark-themed wrapper around the rules table. Braces of the CSS rules are doubled
# because the template is an f-string.
html = f"""
<div style="background-color: #121212; color: #f0f0f0; padding: 15px; font-family: 'Segoe UI', sans-serif; max-width: 900px; margin: 20px auto; border-radius: 6px;">
    <h2 style="text-align: center; margin-bottom: 15px;">Top Unique Association Rules Predicting Target=1</h2>
    {best_rules.to_html(index=False, border=0, classes='rules-table')}
</div>
<style>
    .rules-table {{
        width: 100%;
        border-collapse: collapse;
        color: #dcdcdc;
    }}
    .rules-table th {{
        background-color: #2b2b2b;
        padding: 8px;
        text-align: left;
    }}
    .rules-table td {{
        background-color: #1e1e1e;
        padding: 8px;
    }}
    .rules-table tr:hover td {{
        background-color: #3a3a3a;
    }}
</style>
"""
display(HTML(html))

In [None]:
derived_df = {}

# One 0/1 column per mined rule: 1 when a pair satisfies every condition of the
# rule's antecedents. A condition token is "<feature>>" (value > 0) or
# "<feature><=" (value <= 0).
for antecedents in best_rules['antecedents']:
    def match_rule(row):  # noqa: D103
        satisfied = all(
            (row[cond.split('>')[0]] > 0) if '>' in cond else (row[cond.split('<=')[0]] <= 0)
            for cond in antecedents
        )
        return 1 if satisfied else 0

    rule_name = " & ".join(sorted(antecedents))
    derived_df[rule_name] = pairwise_df.apply(match_rule, axis=1)

# key view taken from the dict before the name is rebound to the DataFrame
derived_rules = derived_df.keys()
derived_df = pd.DataFrame(derived_df)

In [None]:
results = []

# the target and the fold assignment are identical for every rule
y = pairwise_df['first chosen'].astype(bool)
groups = pairwise_df['response id']

# evaluate each derived rule column on its own as the only predictor
for derived_rule in derived_rules:
    X = derived_df[[derived_rule]]

    model = LogisticRegression(max_iter=1000)
    preds, truths = leave_one_group_out_for_model(model, X, y, groups)

    summary_df = summarize_model_performance(truths, preds, model_name=derived_rule)
    results.append(summary_df)

final_results = pd.concat(results, ignore_index=True)
print(final_results)

## decision tree

In [None]:
# one row per (response, defect of the submission that was not picked)
discretized_df = responses.merge(sampled_defects[['submission id', 'defect id']], on='submission id', how='left')
discretized_df = discretized_df[~(discretized_df['answer'] == discretized_df['defect id'])]

defect1_features = []
defect2_features = []

# add categorical features
# In `discretized_df`, "defect1 <feature>" holds the value of the picked defect
# ('answer') and "defect2 <feature>" the value of the losing defect ('defect id').
for feature, discretized_values in features.items():
    defect1_name = f'defect1 {feature}'
    defect2_name = f'defect2 {feature}'

    defect1_features.append(defect1_name)
    defect2_features.append(defect2_name)

    discretized_df[defect1_name] = np.zeros(len(discretized_df))
    discretized_df[defect2_name] = np.zeros(len(discretized_df))

    for idx, row in discretized_df.iterrows():
        discretized_df.at[idx, defect1_name] = discretized_values.loc[row['submission id'], row['answer']]
        discretized_df.at[idx, defect2_name] = discretized_values.loc[row['submission id'], row['defect id']]

discretized_pairwise_df = []

# all pairs of one (submission, respondent) response share a 'response id'
for index, (_, group) in enumerate(discretized_df.groupby(['submission id', 'respondent'])):
    for _, row in group.iterrows():
        picked_values = [row[name] for name in defect1_features]  # features of row['answer']
        other_values = [row[name] for name in defect2_features]   # features of row['defect id']

        # The "defect1 *" columns of a record must describe the defect stored in its
        # 'defect1' field (and likewise for defect2). Previously the two groups were
        # swapped in both records, so the training columns were mislabelled and did
        # not match the way `predict_defect_pair` fills them.
        discretized_pairwise_df.append({
            'response id': index,
            'defect1': row['answer'],
            'defect2': row['defect id'],
            'first chosen': 1,
            **dict(zip(defect1_features, picked_values)),
            **dict(zip(defect2_features, other_values)),
        })

        # also add the reverse
        discretized_pairwise_df.append({
            'response id': index,
            'defect1': row['defect id'],
            'defect2': row['answer'],
            'first chosen': 0,
            **dict(zip(defect1_features, other_values)),
            **dict(zip(defect2_features, picked_values)),
        })

discretized_pairwise_df = pd.DataFrame(discretized_pairwise_df)


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# predictors: the discretized feature values of both defects of every pair
X = discretized_pairwise_df[defect1_features + defect2_features]
y = discretized_pairwise_df['first chosen']

# shallow tree (depth 3), fitted on all pairs
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
tree.fit(X, y)

plt.figure(figsize=big_figsize, layout="constrained")
plot_tree(tree, feature_names=X.columns, filled=True, rounded=True, class_names=['Chosen Second','Chosen First'])
# the accuracy in the title is measured on the same data the tree was fitted on
plt.title(f"Shallow Decision Tree for Feature Interactions (ACC: {tree.score(X, y):.2f})")
plt.show()

Redundant with the previous results.

In [None]:
# importance of every predictor in the fitted tree, largest first
importances = tree.feature_importances_
importance_df = pd.DataFrame(
    list(zip(X.columns, importances)),
    columns=['feature', 'importance'],
).sort_values(by='importance', ascending=False)

print(importance_df.head(10))

# combination model

In [None]:
# Combined predictors: differential features, discretized per-defect features and
# derived rule columns, joined column-wise.
# NOTE(review): the same two statements are repeated at the top of the next cell.
X = pd.concat([pairwise_df[differential_features], discretized_pairwise_df[defect1_features + defect2_features], derived_df[derived_rules]], axis=1)
y = pairwise_df['first chosen'].astype(bool)

In [None]:
# Combined predictors: differential features, discretized per-defect features and
# derived rule columns, joined column-wise.
X = pd.concat([pairwise_df[differential_features], discretized_pairwise_df[defect1_features + defect2_features], derived_df[derived_rules]], axis=1)
y = pairwise_df['first chosen'].astype(bool)
# Defined here so that the cell does not depend on a `groups` left over from an
# earlier cell, and so that the optional split below keeps it aligned with X and y.
groups = pairwise_df['response id']

if training:
    X, _, y, _, groups, _ = train_test_split(
        X, y, groups, test_size=0.2, random_state=42, stratify=y
    )

models = {
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=4444),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": GaussianNB()
}

results_df = []

for name, model in models.items():
    # `groups` must have the same rows as X and y; passing the unsplit
    # `pairwise_df['response id']` fails with a length mismatch when `training` is set.
    preds, truths = leave_one_group_out_for_model(model, X, y, groups=groups)
    summary_df = summarize_model_performance(truths, preds, model_name=name)
    results_df.append(summary_df)

final_results_df = pd.concat(results_df).reset_index(drop=True)
print(final_results_df)

In [None]:
# NOTE(review): presumably a deliberate stop so that running all cells does not
# continue into the final evaluation below — confirm
raise RuntimeError

# final evaluation

In [None]:
np.random.seed(42)

# defects whose name does not mention "whitespace" (case-insensitive; missing names are kept)
non_whitespace_defects = defects.loc[
    ~defects['defect name'].str.lower().str.contains('whitespace', na=False)
].index

# restrict the defect log to those defects
kept_columns = non_whitespace_defects.intersection(defect_log.columns)
filtered_defect_log = defect_log[kept_columns]

# only submissions with at least two remaining defects can be ordered
defects_per_submission = filtered_defect_log.sum(axis=1)
candidates = filtered_defect_log[defects_per_submission >= 2].index

filtered_features = {name: table.loc[candidates] for name, table in features.items()}

evaluation_sample = greedy_sample(filtered_features, log.loc[candidates], n_samples=7, seed=42)

In [None]:
# only the per-defect table is needed for the evaluation; the submission table is discarded
_, evaluation_defects = create_export_dataframes(evaluation_sample, log, items, defects, filtered_defect_log, filtered_features)

In [None]:
def predict_defect_pair(index, defect1, defect2):  # noqa: D103
    """Predict with the 'Decision Tree' entry of ``models`` for one defect pair.

    The input row is built for submission ``index`` in the same column order as
    the combination model's training data: differential features
    (``defect1`` minus ``defect2``), discretized features of both defects, then
    the derived rule indicators.

    Returns:
        The first (only) element of the model's prediction for that row.
    """
    # differential features: value of defect1 minus value of defect2
    differential = pd.Series(0., index=differential_features)
    for feature, values in feature_values.items():
        differential[feature] = values.loc[index, defect1] - values.loc[index, defect2]

    def rule_holds(antecedents):
        # 1 when every "<feature>>" / "<feature><=" condition holds for `differential`, else 0
        return int(all(
            (differential[cond.split('>')[0]] > 0) if '>' in cond else (differential[cond.split('<=')[0]] <= 0)
            for cond in antecedents
        ))

    # derived rule indicators
    derived = pd.Series(0., index=derived_rules)
    for antecedents in best_rules['antecedents']:
        derived[" & ".join(sorted(antecedents))] = rule_holds(antecedents)

    # discretized features of each defect
    discretized = pd.Series(0., index=defect1_features + defect2_features)
    for feature, values in features.items():
        discretized[f'defect1 {feature}'] = values.loc[index, defect1]
        discretized[f'defect2 {feature}'] = values.loc[index, defect2]

    model_input = pd.DataFrame([pd.concat([differential, discretized, derived])])
    return models['Decision Tree'].predict(model_input)[0]

In [None]:
max_iters = 10
ordered_defects = {}

# Order the defects of every evaluation submission with the pairwise model:
# repeated passes swap any pair (i, j), i < j, for which the model does not
# predict that the defect at position i is chosen first.
for index, group in evaluation_defects.groupby('submission id'):
    found_defects = group['defect id'].tolist()

    iteration = 0
    changed = True

    # The pairwise predictions need not be consistent with a single order, so
    # the number of passes is capped.
    while changed and iteration < max_iters:
        changed = False
        iteration += 1

        for i in range(len(found_defects)):
            for j in range(i + 1, len(found_defects)):
                if not predict_defect_pair(index, found_defects[i], found_defects[j]):
                    found_defects[i], found_defects[j] = found_defects[j], found_defects[i]
                    changed = True

    # `changed` is still set only when the pass budget ran out while swaps were
    # being made; a clean final pass means the order settled, even on the last pass.
    if changed:
        print(f"Warning: max iterations reached for submission_id {index}. Result may be unstable.")

    ordered_defects[index] = found_defects


In [None]:
# Baseline ordering for every submission; it is meant to differ from the model's order.
unordered_defects = {}

for k, v in ordered_defects.items():
    if len(v) == 2:
        # swap the two items
        unordered_defects[k] = [v[1], v[0]]
    else:
        # shuffle for other lengths
        shuffled = random.sample(v, len(v))
        # A shuffle can reproduce the model's order by chance; redraw until it
        # differs (impossible only when all entries are equal, e.g. a single defect).
        while shuffled == v and len(set(v)) > 1:
            shuffled = random.sample(v, len(v))
        unordered_defects[k] = shuffled

In [None]:
from IPython.display import display, HTML

# Only relevant CSS
# Stylesheet that is written in front of the markup of every exported task page.
css = """
<style>
.survey-container {
    display: flex;
    flex-direction: column;
    align-items: center;
    width: 100%;
    margin: auto;
    font-family: Arial, sans-serif;
    color: #333;
    background-color: #f4f4f4;
    padding: 20px;
    box-sizing: border-box;
}

.survey-content {
    display: flex;
    justify-content: space-between;
    width: 100%;
    flex-wrap: wrap;
    gap: 20px;
}

.task-section, .defects-section {
    width: 48%;
    padding: 15px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    border-radius: 8px;
    background-color: #f5f5f5;
}

.code-block {
    background-color: #f0f0f0;
    padding: 10px;
    border-radius: 5px;
    overflow-x: auto;
    font-family: monospace;
}

.defect-button {
    background-color: #dcdcdc;
    border: none;
    padding: 15px;
    text-align: left;
    width: 100%;
    margin-bottom: 10px;
    box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1);
    display: flex;
    flex-direction: column;
    align-items: flex-start;
    border-radius: 8px;
    cursor: default;
}

.defect-content {
    width: 100%;
}

.defect-context {
    margin-top: 10px;
    padding: 10px;
    background-color: #f8f8f8;
    border-top: 1px solid #ddd;
    font-style: italic;
}
</style>
"""

from html import escape as escape_html


def _has_text(value):
    """Return True when a defect field holds a non-empty, non-missing value."""
    # a missing cell is NaN, which is truthy and would otherwise be rendered as "nan"
    return bool(pd.notna(value) and value)


def _render_defect_column(defect_ids):
    """Return the markup of one column of defect cards, in the order of ``defect_ids``."""
    parts = []
    for defect_id in defect_ids:
        defect = defects.loc[defect_id]
        parts.append(f"""
            <button class="defect-button unclickable">
                <div class="defect-content">
                    <p><strong>{defect['defect name']}</strong>: {defect['description']}</p>
        """)
        if _has_text(defect.get('code example')):
            parts.append(f"""
                    <pre class="code-block"><strong>Example:</strong>\n{defect['code example']}</pre>
            """)
        if _has_text(defect.get('code fix example')):
            parts.append(f"""
                    <pre class="code-block"><strong>Fix:</strong>\n{defect['code fix example']}</pre>
            """)
        if _has_text(defect.get('additional context')):
            parts.append(f"""
                    <div class="defect-context">
                        <strong>Additional Context:</strong> {defect['additional context']}
                    </div>
            """)
        parts.append("""
                </div>
            </button>
        """)
    return "".join(parts)


# Write one HTML page per evaluation submission: model-ordered defects on the
# left, the task and the submission in the middle, the baseline order on the right.
for task_number, idx in enumerate(evaluation_sample):
    row = log.loc[idx]
    task = items.loc[row['item']]

    # The answer is raw source code; escape it so that text such as ``a<b`` or
    # ``x & y`` is shown literally instead of being parsed as markup.
    answer = escape_html(str(row['answer']), quote=False)

    html = "".join([
        """
    <div class="survey-container">
        <div class="survey-content" style="display: flex; gap: 20px;">
            <!-- Left column: ordered defects -->
            <div class="defects-section ordered-defects" style="flex: 1;">
    """,
        _render_defect_column(ordered_defects[idx]),
        f"""
            </div>
            <!-- Center column: task section -->
            <div class="task-section" style="flex: 2;">
                <h3>Task {task_number + 1}: {task['name']}</h3>
                <p><strong>Instructions:</strong> {task['instructions']}</p>
                <h4>Student Submission:</h4>
                <pre class="code-block">{answer}</pre>
            </div>
            <!-- Right column: unordered defects -->
            <div class="defects-section unordered-defects" style="flex: 1;">
    """,
        _render_defect_column(unordered_defects[idx]),
        """
            </div>
        </div>
    </div>
    """,
    ])

    filename = f"data/export/tasks/task{task_number + 1}.html"
    # create the target folder on first use instead of failing when it is missing
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(css + html)
