# loading data

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import random
from matplotlib.colors import ListedColormap
from scipy.stats import pointbiserialr
from IPython.display import display, HTML, update_display
from tqdm import tqdm

import src.ipython_loader as loader
from src.code_processing import generate_linter_messages

# Notebook-wide configuration.
save = True  # when True, the plotting cells also write their figure into images/
small_figsize = (8, 5)
long_figsize = (13, 5)
big_figsize = (13, 7)
# NOTE(review): `resolution` is not referenced by the visible cells; the savefig calls hard-code dpi=300.
resolution = 300 # dpi
binary_cmap = ListedColormap(['red', 'green'])  # two-colour map passed to the 0/1 heatmaps
data_path = Path('data')
ipython_path = data_path / 'ipython_new'

# IPython magic selecting the interactive widget backend (the click callback in task_defect_plot relies on it).
%matplotlib widget

In [None]:
# Load the tables used throughout the notebook via the project loader
# (the exact schema of each returned table is defined in src.ipython_loader).
items, log, defects, defect_log, code_to_defect_id = loader.load(ipython_path, data_path)

# Disabled: would drop from defect_log every defect whose name contains 'whitespace'.
#defect_log.drop(defects[['whitespace' in name for name in defects['defect name']]].index, axis=1, inplace=True)

# feature analysis

## plotting utils

In [None]:
def plot_histogram(values, title, bins=10, cutoff=None, save=False):
    """Plot a histogram of ``values`` and optionally save it under ``images/``.

    Args:
        values: Array-like of numbers to bin. It is never modified.
        title: Figure title; lower-cased with spaces replaced by underscores it
            also becomes the file name when ``save`` is true.
        bins: Passed through to ``plt.hist``.
        cutoff: If not ``None``, every value above it is drawn as ``cutoff`` so
            that outliers collect in the last bin.
        save: Write the figure to ``images/<title>.png`` before showing it.
    """
    if cutoff is not None:
        # Cap on a copy. Assigning into ``values`` would silently change the
        # caller's data, and a truthiness test would ignore ``cutoff=0``.
        values = np.minimum(values, cutoff)

    plt.figure(figsize=(10, 5))

    plt.hist(values, bins=bins)
    plt.title(title)
    plt.xlabel('Value')
    plt.ylabel('Count')
    plt.grid(True)

    if save:
        plt.savefig(f'images/{title.lower().replace(" ", "_")}.png', dpi=300)
    plt.show()

In [None]:
def task_and_defect_description(task, defect):
    """Build an HTML fragment describing a task, a defect and one example submission.

    Reads the notebook globals ``items``, ``defects``, ``log`` and ``defect_log``.

    Args:
        task: Index label of the task in ``items``.
        defect: Index label of the defect in ``defects`` (also a column of
            ``defect_log``).

    Returns:
        An HTML string with the task on the left, the defect on the right and,
        below them, a randomly picked submission to ``task`` that is flagged
        with ``defect`` (or a placeholder text when there is none).
    """
    from html import escape

    task_row = items.loc[task]
    defect_row = defects.loc[defect]
    submissions = log[(log["item"] == task) & (defect_log[defect])]

    # Source code is placed inside <pre> elements; without escaping, characters
    # such as '<' or '&' in the code would be interpreted as markup.
    solution = escape(str(task_row["solution"]))
    code_example = escape(str(defect_row["code example"]))
    code_fix_example = escape(str(defect_row["code fix example"]))
    if len(submissions):
        picked = random.randint(0, len(submissions) - 1)
        example_submission = escape(str(submissions["answer"].iloc[picked]))
    else:
        example_submission = 'No submissions found'

    return f"""
    <div style="display: flex; justify-content: space-between; gap: 20px;">
        <!-- Task Section -->
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{task_row["name"]}</h3>
            <div><strong>Instructions:</strong><br>{task_row["instructions"]}</div>
            <div><strong>Solution:</strong><br>
                <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{solution}</pre>
            </div>
        </div>
        
        <!-- Defect Section -->
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{defect_row["defect name"]}</h3>
            <div><strong>Defect Type:</strong> {defect_row["defect type"]}</div>
            <div><strong>Severity:</strong> {defect_row["severity"]}</div>
            <div><strong>Description:</strong><br>{defect_row["description"]}</div>
            
            <div style="display: flex; justify-content: space-between; margin-top: 20px;">
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Example:</strong><br>
                    <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{code_example}</pre>
                </div>
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Fix Example:</strong><br>
                    <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{code_fix_example}</pre>
                </div>
            </div>
        </div>
    </div>
    
    <!-- Code Snippet Section -->
    <div style="border: 1px solid #ccc; padding: 10px; margin-top: 20px; border-radius: 5px;">
        <strong>Example Submission:</strong><br>
        <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{example_submission}</pre>
    </div>
    """

In [None]:
def task_defect_plot(matrix, title='', save=False, interactive=False, *args, **kwargs):
    """Draw a task-by-defect matrix as a heatmap with tasks on x and defects on y.

    Args:
        matrix: DataFrame indexed by task id with one column per defect id.
        title: Plot title (only drawn in the static variant). When saving, the
            part before an optional " t=<threshold>" suffix becomes the file name.
        save: Write the figure to ``images/<title>.png``.
        interactive: If true, draw an unlabelled heatmap without colour bar and
            show the task/defect description of whichever cell is clicked.
        *args, **kwargs: Forwarded to ``sns.heatmap``.

    Returns:
        The matplotlib figure.
    """
    defect_names = [defects['defect name'].loc[idx][:20] for idx in matrix.columns]
    task_names = [items['name'].loc[idx][:20] for idx in matrix.index]

    fig, ax = plt.subplots(figsize=big_figsize, layout="constrained")
    if interactive:
        sns.heatmap(matrix.T, *args, cbar=False, **kwargs)
    else:
        sns.heatmap(matrix.T, *args, xticklabels=task_names, yticklabels=defect_names, cbar=True, **kwargs)
        ax.tick_params(axis='x', labelsize=7)
        ax.tick_params(axis='y', labelsize=8)
        plt.title(title)
    plt.xlabel("")
    plt.ylabel("")

    if save:
        # Strip the " t=..." suffix only when it is present. Slicing with the
        # result of str.find cut off the last character of titles without it
        # (find returns -1 in that case).
        stem = title.partition(' t=')[0].lower().replace(' ', '_')
        plt.savefig('images/' + stem + '.png', dpi=300)

    if interactive:
        output_html = display(HTML("<b>Click a cell to see details</b>"), display_id=True)

        def on_click(event):
            # Ignore clicks outside the heatmap axes.
            if event.inaxes == ax:
                # Heatmap cells are one data unit wide, so truncating the
                # coordinates yields the column (task) and row (defect) position.
                x = int(event.xdata)
                y = int(event.ydata)

                if 0 <= x < len(task_names) and 0 <= y < len(defect_names):
                    html = HTML(task_and_defect_description(matrix.index[x], matrix.columns[y]))
                    output_html.update(html)

        fig.canvas.mpl_connect('button_press_event', on_click)

    return fig

## task-defect reasonableness

TODO issues in task templates

### anomalously common tasks

In [None]:
# Share of each task's submissions that exhibit each defect (tasks x defects).
upper_limit = 0.9
frequencies = defect_log.groupby(log['item']).mean()
# A defect found in more than `upper_limit` of a task's submissions is flagged as unreasonable.
unreasonable = frequencies.gt(upper_limit)

In [None]:
# Static, labelled heatmap of the flagged pairs (written to images/ when `save` is set).
fig = task_defect_plot(unreasonable, title=f"Anomalously common task-defect pairs for threshold t={upper_limit}", interactive=False, save=save, cmap=binary_cmap)

In [None]:
# Same matrix as a clickable heatmap: clicking a cell displays the task/defect details.
fig = task_defect_plot(unreasonable, title=f"Anomalously common task-defect pairs for threshold t={upper_limit}", interactive=True)

In [None]:
# Close the figure created by the previous cell.
plt.close(fig)

### task-defect pairs with very few submissions

In [None]:
# Number of submissions per task that exhibit each defect.
lower_limit = 10
counts = defect_log.groupby(log['item']).sum()
# 1 where a task-defect pair has fewer than `lower_limit` such submissions, else 0.
few_submissions = counts.lt(lower_limit).astype('int')
# Pairs with too little data are treated as unreasonable as well.
unreasonable = unreasonable | few_submissions

In [None]:
# Static heatmap of the pairs with too few submissions (written to images/ when `save` is set).
fig = task_defect_plot(few_submissions, title=f"Task-Defect Pairs with too Few Submissions for t={lower_limit}", interactive=False, save=save, cmap=binary_cmap)

In [None]:
# Clickable variant of the same heatmap.
fig = task_defect_plot(few_submissions, title=f"Task-Defect Pairs with too Few Submissions for t={lower_limit}", interactive=True)

In [None]:
# Close the figure created by the previous cell.
plt.close(fig)

## task-defect rarity

In [None]:
# For several rarity thresholds, count per defect how many tasks are NOT rare
# (frequency at or above the threshold and enough submissions), then draw the
# counts as grouped bars, one bar per threshold.
thresholds = [0.01, 0.02, 0.03, 0.04]
defect_names = defects["defect name"]

all_vals = []
for threshold in thresholds:
    rare = ((frequencies < threshold) | few_submissions).astype('int')
    common = 1 - rare
    # number of tasks in which each defect counts as common
    vals = common.sum(axis=0)
    all_vals.append(vals)

# one row per defect, one column per threshold
stack_data = pd.concat(all_vals, axis=1).fillna(0)
stack_data.columns = [f"t={t:.2f}" for t in thresholds]

# order defects by their median count, with 10% of the maximum added to the sort key
stack_data = stack_data.loc[(stack_data.median(axis=1) + 0.1 * stack_data.max(axis=1)).sort_values(ascending=False).index]

fig, ax = plt.subplots(figsize=long_figsize, layout='constrained')
x = np.arange(stack_data.shape[0])
bar_width = 0.2
n_thresholds = len(thresholds)

for i, col in enumerate(stack_data.columns):
    # centre the group of bars on the tick position
    offset = (i - n_thresholds / 2) * bar_width + bar_width / 2
    ax.bar(x + offset, stack_data[col], width=bar_width, label=col)


ax.set_xticks(x, labels=[defect_names.loc[idx][:20] for idx in stack_data.index], rotation=90)
ax.set_xlabel('Defect')
ax.set_ylabel('Number of Common Tasks')
ax.set_title('Number of Common Task-Defect Pairs as Threshold Decreases')
ax.legend(title='Threshold')

if save:
    plt.savefig('images/number_of_common_task-defect_pairs_as_threshold_decreases.png', dpi=300)
plt.show()


In [None]:
# Final rarity flag: 1 where a task-defect pair is rarer than `rare_threshold`
# or has too few submissions, else 0.
rare_threshold = 0.02
# Use the threshold chosen here rather than `threshold`, which is whatever value
# the last executed loop or histogram cell happened to leave behind.
rare = ((frequencies < rare_threshold) | few_submissions).astype('int')

## characteristic task-defect pairs

In [None]:
# Standardise each task's defect frequency against the defect's overall mean
# and standard deviation across all submissions.
z_score = (
    defect_log.groupby(log['item']).mean()
    .sub(defect_log.mean(), axis='columns')
    .div(defect_log.std(), axis='columns')
)
# Unreasonable pairs are excluded from the analysis.
z_score[unreasonable] = np.nan

In [None]:
# Static heatmap of the z-scores (written to images/ when `save` is set).
fig = task_defect_plot(z_score, title=f"Task-Defect Pair Z-Scores", interactive=False, save=save)

In [None]:
# Clickable variant of the z-score heatmap.
fig = task_defect_plot(z_score, title=f"Task-Defect Pair Z-Scores", interactive=True)

In [None]:
# Close the figure created by the previous cell.
plt.close(fig)

In [None]:
# Histogram of all non-NaN z-scores with the chosen quantile marked as threshold.
reasonable_z_scores = z_score.stack().dropna()
quantile = 0.8
# NOTE: this rebinds the global `threshold` that the rarity cells also use.
threshold = reasonable_z_scores.quantile(quantile)

plt.figure(figsize=small_figsize)

plt.hist(reasonable_z_scores, bins=100, color='skyblue', edgecolor='black')
plt.axvline(x=threshold, color='red', linestyle='--', label=f'{int(quantile * 100)}-percentile Threshold (= {threshold:.2f})')

plt.title('Histogram of Z-Scores for Reasonable Task-Defect Pairs')
plt.xlabel('Z-Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)

if save:
    plt.savefig('images/histogram_of_z-scores_for_reasonable_task-defect_pairs.png', dpi=300)
plt.show()

In [None]:
# A pair is characteristic when its z-score exceeds the chosen quantile of all reasonable z-scores.
characteristic_threshold = reasonable_z_scores.quantile(quantile)
characteristic = (z_score > characteristic_threshold).astype(int)
# Unreasonable pairs are marked as missing rather than as 0.
characteristic[unreasonable] = np.nan

In [None]:
# Static heatmap of the characteristic flags (written to images/ when `save` is set).
fig = task_defect_plot(characteristic, title=f"Characteristic Defects for Threshold t={characteristic_threshold:.2f}", interactive=False, save=save, cmap=binary_cmap)

In [None]:
# Topic of the task behind every submission.
log_by_topic = log.merge(items, left_on='item', right_index=True)['topic']
# Share of each topic's submissions that exhibit each defect.
defect_frequencies_by_topic = defect_log.groupby(log_by_topic).mean()

# Standardise the per-topic frequencies against each defect's overall mean and
# standard deviation; the per-topic means are the frame computed just above.
topic_z_score = defect_frequencies_by_topic.sub(defect_log.mean(), axis='columns').div(defect_log.std(), axis='columns')

In [None]:
# Hide the z-score of defects found in fewer than 1% of a topic's submissions.
topic_z_score[defect_frequencies_by_topic < 0.01] = np.nan

In [None]:
# Heatmap of the topic-level z-scores (defects on y, topics on x), colour scale limited to [-2, 2].
plt.figure(figsize=(13, 10), layout="constrained")
sns.heatmap(topic_z_score.T, vmin=-2, vmax=2, yticklabels=[defects['defect name'].loc[idx][:30] for idx in topic_z_score.columns], cmap="vlag", cbar=True)
plt.ylabel("")
plt.xlabel("")
plt.title("Topic-Level Defect Anomalies (Z-scores)")

plt.show()

In [None]:
# Ad-hoc lookup: tasks whose name contains 'Velké'.
items[items['name'].str.contains('Velké')]

In [None]:
# Ad-hoc lookup: defects whose name contains 'for with redu'.
defects[defects['defect name'].str.contains('for with redu')]

## currently taught topic

## rejected

### time spent on task

In [None]:
# Response time of each submission relative to the mean response time of its task.
log['relative_time_spent'] = log['responseTime'] / log.groupby('item')['responseTime'].transform('mean')

# clip all submissions to X times the task mean
clip_threshold = 5.0

print('Clipping all submissions to', clip_threshold, 'times the task mean:', (log['relative_time_spent'] > clip_threshold).mean(), 'changed.')

# Clip only the relative-time column. Assigning through a boolean row mask
# (log[mask] = value) would overwrite every column of the matching rows.
log['relative_time_spent'] = log['relative_time_spent'].clip(upper=clip_threshold)

In [None]:
# 90th percentile of the relative time spent.
time_spent_threshold = log['relative_time_spent'].quantile(0.90)

In [None]:
# Distribution of the relative time spent with the percentile threshold marked.
plt.figure(figsize=(10, 4))
sns.histplot(log['relative_time_spent'], bins=1000, kde=True)
# time_spent_threshold is the 0.90 quantile, so the label has to say 90th, not 75th.
plt.axvline(time_spent_threshold, color='red', linestyle='--', label='90th Percentile Threshold')
plt.title("Distribution of Relative Time Spent per Task (Values Over 5 Clipped)")
plt.xlabel("Relative Time Spent")
plt.ylabel("Count")
plt.legend()
plt.tight_layout()

if save:
    plt.savefig('images/distribution_of_relative_time_spent_per_task.png', dpi=300)
plt.show()


### associated with poor performance (locally)

In [None]:
from sklearn.metrics import precision_score

# Reload the data with only_correct=False (presumably this keeps incorrect submissions — confirm in the loader).
_, not_log, _, not_defect_log, _ = loader.load(ipython_path, data_path, only_correct=False)

# one row per submission: task, correctness flag and the defect columns
df = not_log[['item', 'correct']].merge(not_defect_log, left_index=True, right_index=True)

correlations = {}

# for each item
for task_id, task_df in df.groupby('item'):
    corr_dict = {}
    # for each defect
    for defect in defect_log.columns:
        # get vectors
        defect_presence = task_df[defect]
        incorrect = ~task_df['correct']
        # calculate the precision of "defect present" as a predictor of "submission incorrect";
        # skipped (NaN) when either vector is constant within the task
        if defect_presence.nunique() > 1 and incorrect.nunique() > 1:
            corr = precision_score(incorrect, defect_presence)
            # corr, _ = pointbiserialr(defect_presence, incorrect)
            corr_dict[defect] = corr
        else:
            corr_dict[defect] = np.nan

    correlations[task_id] = corr_dict

# construct df (tasks x defects) and blank out the unreasonable pairs
performance = pd.DataFrame.from_dict(correlations, orient='index')
performance[unreasonable] = np.nan


In [None]:
# Static heatmap of the per-pair precision (written to images/ when `save` is set).
fig = task_defect_plot(performance, title="Defect-Failure Precision", interactive=False, save=save)

In [None]:
# Histogram of the precision values of all task-defect pairs that have one.
performances = performance.stack().dropna()

plt.figure(figsize=(10, 6))

plt.hist(performances, bins=100, color='skyblue', edgecolor='black')
plt.title('Histogram of Task-Defect Pair Precision in Predicting Failures')
# The plotted values are precision scores, not z-scores.
plt.xlabel('Precision')
plt.ylabel('Frequency')
plt.grid(True)

if save:
    plt.savefig('images/histogram_of_task-defect_pair_precision.png', dpi=300)
plt.show()

In [None]:
# Fixed precision threshold; the quantile-based alternative is kept disabled below.
failure_threshold = 0.25
# failure_threshold = performances.quantile(quantile)

# Static heatmap of the pairs whose precision exceeds the threshold.
fig = task_defect_plot(performance > failure_threshold, title=f"High failure rates t={failure_threshold:.2f}", interactive=False, save=save)

In [None]:
# Clickable variant of the high-failure heatmap.
fig = task_defect_plot(performance > failure_threshold, title=f"High failure rates t={failure_threshold:.2f}", interactive=True, cmap=binary_cmap)

In [None]:
# Close the figure created by the previous cell.
plt.close(fig)

There are some interesting takeaways (long function, unused variable), but overall this analysis does not produce any meaningful results.

## Defect multiplicity

TODO Z-Score?

In [None]:
# Reload the defect log with only_presence=False (presumably per-submission defect counts instead of flags — confirm in the loader).
_, _, _, multiplicity_log, _ = loader.load(ipython_path, data_path, only_presence=False)

# every submission in the multiplicity log must also exist in the main log
assert multiplicity_log.index.difference(log.index).empty

# suppress outliers: cap every count at 10
multiplicity_log[multiplicity_log > 10] = 10

In [None]:
# Mean count per defect over the submissions where the defect occurs at all
# (non-positive entries become NaN and are skipped by mean), sorted descending.
means = multiplicity_log[multiplicity_log > 0].mean().sort_values().sort_values(ascending=False)

ticks = np.arange(len(means)) + 0.4

plt.figure(figsize=(10, 4), layout="constrained")
plt.bar(ticks, means.values)

# tick labels: defect names truncated to 20 characters
plt.xticks(ticks,[defects['defect name'][idx][:20] for idx in means.index], rotation=90)
plt.title(f"Defects by Mean Multiplicity (When Occuring)")
plt.xlabel("Defect")
plt.ylabel("Mean Multiplicity")

if save:
    plt.savefig('images/defects_by_mean_multiplicity.png', dpi=300)
plt.show()

In [None]:
# Box plot of the per-submission counts of each defect, restricted to positive counts.
multiplicity = multiplicity_log[multiplicity_log > 0]

# long format: one row per (defect, count) observation
multiplicity = multiplicity.melt(var_name='Defect', value_name='Count')
# drops the NaN rows produced by the mask above
multiplicity = multiplicity[multiplicity['Count'] > 0]

multiplicity['Defect Name'] = multiplicity['Defect'].map(lambda x: defects['defect name'].loc[x][:20])

plt.figure(figsize=(14, 6), layout="constrained")
# same defect order as the mean-multiplicity bar chart (relies on `means` from the previous cell)
sns.boxplot(data=multiplicity, x='Defect Name', y='Count', order=[defects['defect name'].loc[idx][:20] for idx in means.index])
plt.xticks(rotation=90)
plt.title("Box Plot of Defect Multiplicity")
plt.ylabel("Multiplicity (Count per Submission)")
plt.xlabel("Defect")

if save:
    plt.savefig('images/box_plot_of_defect_multiplicity.png', dpi=300)
plt.show()


## Recency

In [None]:
# For every submission and every defect present in it, record how many of the
# user's submissions lie between this occurrence and the previous occurrence of
# the same defect (0 for the user's first occurrence, NaN when the defect is absent).
recency_log = []

# prepare in advance to make the computation faster
df = log.merge(defect_log, left_index=True, right_index=True)
df["time"] = pd.to_datetime(df["time"])
df = df.sort_values(by=['user', 'time'])

# for each user
for user_id, history in tqdm(df.groupby('user')):
    # position (within the user's history) at which each defect was last seen
    last_seen = {defect: None for defect in defect_log.columns}

    # iterate over history
    for i, (idx, row) in enumerate(history.iterrows()):
        recency_row = {}
        for defect in defect_log.columns:
            if row[defect] == 1:

                if last_seen[defect] is None:
                    # first occurrence for this user
                    recency_row[defect] = 0
                else:
                    recency_row[defect] = i - last_seen[defect]

                last_seen[defect] = i
            else:
                recency_row[defect] = np.nan

        recency_row['submission id'] = idx
        recency_log.append(recency_row)

# create dataframe (the list of row dicts is replaced by a frame indexed by submission id)
recency_log = pd.DataFrame(recency_log).set_index('submission id')
recency_log.index.name = 'submission id'

In [None]:
# Per defect: share of its occurrences that are a user's first occurrence
# (recency 0); entries where the defect is absent are masked out before averaging.
first_time_rate = (recency_log == 0)[~recency_log.isna()].mean().sort_values(ascending=True)

plt.figure(figsize=(12, 6), layout="constrained")

ticks = np.arange(len(first_time_rate))

plt.bar(ticks, first_time_rate)
plt.title("Percentage of First-Time Occurances per Defect")
plt.ylabel("First-Time Rate")
plt.xlabel("Defect")
plt.xticks(ticks, [defects['defect name'][idx][:30] for idx in first_time_rate.index], rotation=90)
plt.grid(axis='y')

if save:
    plt.savefig('images/percentage_of_first_time_occurances_per_defect.png', dpi=300)
plt.show()


In [None]:
# bins: edges are used with pd.cut's default right-closed intervals, so recency 0
# (first occurrence) falls outside every bin and becomes NaN
bins = [0, 1, 2, 4, 9, 14, 19, 24, 29, np.inf]
bin_labels = ['1', '2', '3-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30+']
recency = recency_log.apply(lambda col: pd.cut(col, bins=bins, labels=bin_labels))

# histogram: number of occurrences per bin for every defect
recency = recency.apply(lambda col: col.value_counts()).fillna(0).astype(int)
# NOTE(review): the row order of this frame comes from value_counts; confirm the bins end up in ascending order.

# scaling: each defect's column sums to 1
recency = recency.div(recency.sum(axis=0), axis=1)

# sort as the previous graph
recency = recency.loc[:, first_time_rate.index]

plt.figure(figsize=(12, 6), layout="constrained")
sns.heatmap(recency, cmap='viridis', cbar_kws={'label': 'frequency'}, xticklabels=[defects['defect name'][idx][:30] for idx in recency.columns])
plt.title("Number of Sessions Before Defect Reoccurance")
plt.xlabel("Defect")
plt.ylabel("Recency Bin")
plt.gca().invert_yaxis()

if save:
    plt.savefig('images/heatmap_of_recency_bins_per_defect.png', dpi=300)
plt.show()

In [None]:
# Typical recency per defect, ignoring first occurrences (recency 0).
# NOTE(review): the statistic is a median although the variable name and the plot labels say mean/average.
mean_recency = recency_log.replace(0, np.nan).median()

# sort as the other graphs
mean_recency = mean_recency.loc[first_time_rate.index]

plt.figure(figsize=(12, 6), layout="constrained")

ticks = np.arange(len(mean_recency))

plt.bar(ticks, mean_recency)
plt.title("Average Recency (# of Submissions Since Last Seen) per Defect")
plt.ylabel("Average Recency")
plt.xlabel("Defect")
plt.xticks(ticks, [defects['defect name'][idx][:30] for idx in first_time_rate.index], rotation=90)
plt.grid(axis='y')

if save:
    plt.savefig('images/average_recency_per_defect.png', dpi=300)
plt.show()


## student-specific frequency

In [None]:
# Running per-user, per-defect accuracy: after each submission, the share of the
# user's counted encounters so far in which the defect was avoided.
# prepare in advance
df = log.merge(defect_log, left_index=True, right_index=True)
df["time"] = pd.to_datetime(df["time"])
df = df.sort_values(by=['user', 'time'])

accuracy_log = []

# for each user
for user_id, history in tqdm(df.groupby('user')):
    correct_count = {defect: 0 for defect in defect_log.columns}
    encounter_count = {defect: 0 for defect in defect_log.columns}
    
    # iterate over history
    for i, (idx, row) in enumerate(history.iterrows()):
        accuracy_row = {}
        
        task_id = row['item']
        
        for defect in defect_log.columns:
            is_reasonable = not unreasonable.loc[task_id, defect]
            # avoiding the defect only counts on tasks where the pair is reasonable
            if row[defect] == 0 and is_reasonable:
                correct_count[defect] += 1
            # an encounter is any occurrence of the defect, or any reasonable task
            if row[defect] == 1 or is_reasonable:
                encounter_count[defect] += 1
                accuracy_row[defect] = correct_count[defect] / encounter_count[defect]
            else:
                accuracy_row[defect] = np.nan

        accuracy_row['submission id'] = idx
        accuracy_log.append(accuracy_row)
    
# create dataframe (the list of row dicts is replaced by a frame indexed by submission id)
accuracy_log = pd.DataFrame(accuracy_log).set_index('submission id')
accuracy_log.index.name = 'submission id'

# keep only entries where the defect has been made at least once so far (accuracy below 1)
accuracy_at_least_once = accuracy_log[accuracy_log < 1]

# per-defect standardisation of the running accuracy
student_specific_frequency = (accuracy_log - accuracy_log.mean()) / accuracy_log.std()

In [None]:
# Histogram of all non-NaN standardised accuracies with the lower and upper quantile thresholds marked.
values = pd.Series(student_specific_frequency.values.flatten()).dropna().values

upper_quantile = 0.90
student_upper_threshold = np.quantile(values, upper_quantile)
lower_quantile = 0.20
student_lower_threshold = np.quantile(values, lower_quantile)

plt.figure(figsize=(10, 5))

plt.hist(values, bins=100)

plt.axvline(student_lower_threshold, color='red', linestyle='--', linewidth=2, label=f'{int(lower_quantile*100)}% threshold')
plt.axvline(student_upper_threshold, color='green', linestyle='--', linewidth=2, label=f'{int(upper_quantile*100)}% threshold')

plt.title('Distribution of Student-Specific Frequency')
plt.xlabel('Value')
plt.ylabel('Count')
plt.legend()
plt.grid(True)

if save:
    plt.savefig(f'images/distribution_of_student_specific_frequency.png', dpi=300)
plt.show()

In [None]:
# Three-valued feature: 1 above the upper threshold, -1 below the lower threshold, 0 otherwise.
student_specific = (student_specific_frequency > student_upper_threshold).astype('int')
student_specific[student_specific_frequency < student_lower_threshold] = -1

In [None]:
# Histogram of all running accuracies below 1.
plot_histogram(accuracy_at_least_once.values.flatten(), 'Distribution of User-Defect Accuracy', bins=10)

In [None]:
# Same values, first averaged per user and defect.
plot_histogram(accuracy_at_least_once.groupby(log['user']).mean().values.flatten(), 'Distribution of User-Defect Accuracy (User Averages)', bins=10)

In [None]:
# Mean running accuracy per defect, sorted descending, as a bar chart.
accuracy_means = accuracy_log.mean().sort_values(ascending=False)

plt.figure(figsize=(12, 6), layout="constrained")

ticks = np.arange(len(accuracy_means))

plt.bar(ticks, accuracy_means)
plt.title("Average Accuracy per Defect")
plt.ylabel("Average Accuracy")
plt.xlabel("Defect")
plt.xticks(ticks, [defects['defect name'][idx][:20] for idx in accuracy_means.index], rotation=90)
plt.grid(axis='y')

if save:
    plt.savefig('images/average_accuracy_per_defect.png', dpi=300)
plt.show()


In [None]:
# Number of tasks per defect for which the task-defect pair is reasonable.
has_reasonable = (~unreasonable).sum()

# same defect order as the accuracy chart
has_reasonable = has_reasonable.loc[accuracy_means.index]

plt.figure(figsize=(12, 6), layout="constrained")

ticks = np.arange(len(has_reasonable))

plt.bar(ticks, has_reasonable)
# The bars are task counts; the labels copied from the accuracy chart were wrong.
plt.title("Number of Reasonable Tasks per Defect")
plt.ylabel("Number of Reasonable Tasks")
plt.xlabel("Defect")
plt.xticks(ticks, [defects['defect name'][idx][:20] for idx in has_reasonable.index], rotation=90)
plt.grid(axis='y')

plt.show()

## Currently Taught Topic

Manually set for topics, or by frequency in student submissions.

In [None]:
def add_concept_to_defects(keyword: str, tag: str):
    """Tag every defect whose example code contains ``keyword`` with ``tag``.

    Works on the notebook-global ``defects`` frame in place: the ``concepts``
    column (created on first use) is a string of space-terminated tags. A defect
    that already carries ``tag`` is not tagged again.

    Args:
        keyword: Substring searched for in the ``code example`` and
            ``code fix example`` columns.
        tag: Concept name appended to ``concepts`` for matching defects.
    """
    if 'concepts' not in defects.columns:
        defects['concepts'] = ''

    def mentions_keyword(code):
        # Examples may be missing (None/NaN), so only strings are searched.
        return isinstance(code, str) and keyword in code

    # Search for the requested keyword; the hard-coded 'if' made every call
    # tag the same set of defects.
    mask = defects['code fix example'].apply(mentions_keyword)
    mask |= defects['code example'].apply(mentions_keyword)
    mask &= defects['concepts'].apply(lambda concepts: tag not in concepts)
    defects['concepts'] += mask.apply(lambda hit: tag + ' ' if hit else '')

In [None]:
# Tag defects by the language constructs that appear in their example code.
add_concept_to_defects('if ', 'if')
add_concept_to_defects('for ', 'for')
add_concept_to_defects('while ', 'while')
# The signature is (keyword, tag): a quote character in the code marks the
# 'string' concept. The two arguments were passed the other way round.
add_concept_to_defects('\'', 'string')
add_concept_to_defects('"', 'string')

In [None]:
# Display the defect table including the new 'concepts' column.
defects

## Future Opportunity Likelihood

In [None]:
# For every submission, estimate how likely each defect is on the tasks the
# user has not attempted yet: a weighted mean of the per-task defect frequencies.
all_tasks = items.index

opportunity_log = []

# prepare in advance to make the computation faster
df = log.merge(defect_log, left_index=True, right_index=True)
df["time"] = pd.to_datetime(df["time"])
df = df.sort_values(by=['user', 'time'])

# for each user
for user_id, history in tqdm(df.groupby('user')):
    completed_tasks = set()

    # iterate over history
    for submission_id, row in history.iterrows():
        # unfinished tasks (the current task counts as completed)
        completed_tasks.add(row['item'])
        remaining_tasks = [t for t in all_tasks if t not in completed_tasks]

        # nothing left to attempt: no future opportunity for any defect
        if not remaining_tasks:
            opportunity_log.append({
                'submission id': submission_id,
                **{defect: 0.0 for defect in frequencies.columns}
            })
            continue

        # tasks with an id greater than the current one weigh twice as much
        # (presumably because they come later in the course — confirm)
        weights = np.array([2.0 if t > row['item'] else 1.0 for t in remaining_tasks])
        weighted_avg = (frequencies.loc[remaining_tasks].T @ weights) / weights.sum()

        opportunity_log.append({
            'submission id': submission_id,
            **weighted_avg.to_dict()
        })


# create dataframe (the list of row dicts is replaced by a frame indexed by submission id)
opportunity_log = pd.DataFrame(opportunity_log).set_index('submission id')
opportunity_log.index.name = 'submission id'

In [None]:
# Histogram of all opportunity values (every submission, every defect).
plt.figure(figsize=(10, 5))

plt.hist(opportunity_log.values.flatten(), bins=100)
plt.title('Histogram of Future Opportunities')
plt.xlabel('Opportunity')
plt.ylabel('Count')
plt.grid(True)

if save:
    plt.savefig(f'images/histogram_of_future_opportunities.png', dpi=300)
plt.show()

In [None]:
# Mean opportunity per defect (sorted descending) next to the defect's overall frequency.
means = opportunity_log.mean().sort_values().sort_values(ascending=False)

ticks = np.arange(len(means)) + 0.4
bar_width = 0.4

plt.figure(figsize=(10, 4), layout="constrained")

# two bars per defect, placed left and right of the tick
plt.bar(ticks - bar_width / 2, means.values, label='Opportunity', width=bar_width)
plt.bar(ticks + bar_width / 2, defect_log[means.index].mean().values, label='Frequency', width=bar_width)

plt.xticks(ticks,[defects['defect name'][idx][:20] for idx in means.index], rotation=90)
plt.title(f"Defects by Mean Opportunity vs Frequency")
plt.xlabel("Defect")
plt.ylabel("Mean Opportunity")
plt.legend()

if save:
    plt.savefig('images/defects_by_mean_opportunity.png', dpi=300)
plt.show()

In [None]:
# Mean opportunity per task and defect, centred per defect (column mean subtracted).
task_opportunities = opportunity_log.groupby(log['item']).mean()
task_opportunities = (task_opportunities - task_opportunities.mean(axis=0))

fig = task_defect_plot(task_opportunities, title="Future Opportunity to Make Defect by Task", interactive=False, save=save)

## Severity

In [None]:
# Severity of every defect present in a submission; absent defects (product 0) become NaN.
severity_log = defect_log * defects.loc[defect_log.columns]['severity']
severity_log[severity_log == 0] = np.nan

In [None]:
# Per task: take the maximum recorded severity of each defect, then average
# over the defects; tasks are sorted ascending by that value.
means = severity_log.groupby(log['item']).max().mean(axis=1).sort_values()

ticks = np.arange(len(means))

plt.figure(figsize=(13, 4), layout="constrained")

plt.bar(ticks, means.values)

plt.xticks(ticks, [items['name'][idx][:20] for idx in means.index], rotation=90)
plt.title("Mean Severity for Each Task")
plt.xlabel("Task")
plt.ylabel("Severity")
plt.tick_params(axis='x', labelsize=7)
# No legend: the single bar series has no label, so plt.legend() only emitted
# a "no artists with labels" warning and an empty legend.

if save:
    plt.savefig('images/mean_severity.png', dpi=300)
plt.show()

In [None]:
# Per submission: gap between the two highest severities among its defects
# (highest minus second highest), then how often each gap occurs.
# NOTE(review): a submission without any defect yields an empty nlargest result, and `.iloc[-1]` would fail on it — confirm such rows cannot occur.
differences = severity_log.apply(lambda row: -row.nlargest(2).diff().iloc[-1], axis=1).value_counts().sort_index()

plt.figure(figsize=(10, 5))

plt.bar(differences.index, differences.values, edgecolor='black')
plt.title('Histogram of Differences in Severity')
plt.xlabel('Difference')
plt.ylabel('Count')
plt.grid(True)

if save:
    plt.savefig(f'images/severity_differences.png', dpi=300)
plt.show()

# filtering before sampling

TODO too long, short, ...

In [None]:
def has_close_pair(row):
    """Return True if two distinct entries of ``row`` differ by one or less.

    NaN entries are ignored. A row with fewer than two usable entries has no
    pair and yields False.

    Args:
        row: One-dimensional numeric array-like (for example a DataFrame row).

    Returns:
        bool: Whether at least one pair of different positions holds values
        whose absolute difference is at most 1.
    """
    values = np.asarray(row, dtype=float)
    values = np.sort(values[~np.isnan(values)])
    # After sorting, the closest pair is adjacent, so neighbouring differences
    # suffice. Comparing every value with every value also compared each value
    # with itself (difference 0) and therefore always reported a close pair.
    return bool(np.any(np.diff(values) <= 1))

In [None]:
# at least two defects: keep submissions whose defect flags sum to more than one
filtered = defect_log[defect_log.sum(axis=1) > 1]

In [None]:
# at most difference of one in severity
# Severity of the defects that are present; absent defects are masked as NaN so
# that their zero product cannot pair up with other zeros or with severity 1.
present_severity = filtered.mul(defects.loc[filtered.columns, 'severity'], axis='columns').where(filtered > 0)
filtered = filtered[present_severity.apply(has_close_pair, axis=1)]

In [None]:
# apply the filter: from here on defect_log and log only contain the selected submissions
defect_log = defect_log.loc[filtered.index]
log = log.loc[filtered.index]

# feature engineering

In [None]:
def expand_task_defect_table(log, task_defect_table):
    """Look up, for every submission, the task-level row of a task-defect table.

    Args:
        log: Submission log with an ``item`` column holding the task id.
        task_defect_table: Frame indexed by task id with one column per defect.

    Returns:
        A frame indexed like ``log`` whose columns are those of
        ``task_defect_table``; each row is the table row of the submission's task.
    """
    tasks = log[['item']]
    expanded = tasks.join(task_defect_table, on='item')
    # the task id was only needed as the join key
    return expanded.drop(columns='item')
    

In [None]:
# Raw per-submission feature values, shown next to each defect in the preview.
feature_values = {
    'rare': expand_task_defect_table(log, frequencies),  # TODO expand to log
    'characteristic': expand_task_defect_table(log, characteristic),
    #'currently_taught': currently_taught,
    #'student_frequency': student_frequency_log,
    #'multiplicity': multiplicity_log,
    # 'recently_fixed': recently_fixed_log,
    'severity': severity_log.loc[log.index],
}

# Binary per-submission feature flags used for sampling.
# NOTE(review): the original built 'rare' from `~reasonable`, a name that is not
# defined anywhere in this notebook; the 0/1 `rare` frame from the rarity
# section is used instead — confirm this is the intended source.
# Missing values (e.g. unreasonable pairs, which are NaN in `characteristic`)
# count as "feature absent"; without fillna the integer cast fails on NaN.
features = {
    'rare': expand_task_defect_table(log, rare).fillna(0).astype('int'),
    'characteristic': expand_task_defect_table(log, characteristic).fillna(0).astype('int'),
}

In [None]:
# Display the rarity flags. The original displayed `reasonable`, which is not
# defined anywhere in this notebook — confirm `rare` is the frame meant here.
rare

In [None]:
# Display the characteristic flags per task and defect.
characteristic

# testing sample

In [None]:
def generate_submission_html(submission_id, feature):
    """Render one submission and its detected defects as an HTML fragment.

    Reads the notebook globals ``log``, ``defect_log``, ``defects``, ``items``
    and ``feature_values``.

    Args:
        submission_id: Index label of the submission in ``log``/``defect_log``.
        feature: Key of ``feature_values``; its value for every present defect
            is shown in an extra table column, formatted to two decimals.

    Returns:
        An HTML string with the task and the submitted code on the left and a
        table of the defects detected in the submission on the right.
    """
    from html import escape

    task_id = log.loc[submission_id, 'item']

    present_defects = defect_log.loc[submission_id]
    present_defects = present_defects[present_defects == 1].index.tolist()

    defect_rows = [
        {
            "Defect": defects.loc[defect, "defect name"],
            "Description": defects.loc[defect, "description"],
            f"{feature}": f"{feature_values[feature].loc[submission_id, defect]:.2f}"
        }
        for defect in present_defects
    ]

    defect_df = pd.DataFrame(defect_rows)

    # The answer is source code shown inside <pre>; escape it so that '<' and
    # '&' are displayed literally instead of being parsed as markup.
    answer = escape(str(log.loc[submission_id, 'answer']))

    html = f"""
    <div style="background-color: #121212; color: #f0f0f0; font-family: 'Segoe UI', sans-serif; padding: 20px;">
        <div style="text-align: left;">
            <table style="width: 90%; margin-left: auto; border-collapse: collapse; background-color: #1e1e1e; border: 1px solid #444;">
                <tr>
                    <td style="vertical-align: top; width: 50%; border-right: 1px solid #333; padding: 20px; text-align: left;">
                        <h2 style="color: #ffffff;">{items.loc[task_id, 'name']}</h2>
                        <p><strong>Instructions:</strong><br>{items.loc[task_id, 'instructions']}</p>
                        <div style="background-color: #2b2b2b; color: #dcdcdc; padding: 15px; border-radius: 5px; overflow-x: auto; text-align: left;">
                            <pre style="margin: 0; white-space: pre-wrap;">{answer}</pre>
                        </div>
                    </td>
                    <td style="vertical-align: top; width: 50%; padding: 20px;">
                        <h2 style="color: #ffffff;">Detected Defects</h2>
                        {defect_df.to_html(index=False, escape=False, border=0, justify='left', classes='defect-table')}
                    </td>
                </tr>
            </table>
        </div>
    </div>
    """
    return html


In [None]:
# Feature to preview.
feature = 'rare'

# identify submissions where feature is represented (flag set for at least one defect)
sampled_df = log[features[feature].sum(axis=1) > 0]

# sample 10 unique submissions (fixed seed for reproducibility)
sampled_df = sampled_df.sample(n=10, random_state=42)


In [None]:
# Preview the fourth sampled submission.
display(HTML(generate_submission_html(sampled_df.index[3], feature)))

# survey sample

In [None]:
from collections import defaultdict
import random

def greedy_sample(features, log, n_samples=200, seed=42):
    """Greedily pick submissions so that features are balanced and tasks are spread out.

    In every round the feature targeted least often so far is chosen, and a
    random submission exhibiting it is drawn, preferring submissions of the
    tasks that are least represented in the sample.

    Args:
        features: Mapping of feature name to a frame indexed by submission id;
            a submission exhibits the feature when its row sum is positive.
        log: Submission log with an ``item`` column, indexed by submission id.
        n_samples: Number of rounds. The result is shorter when a feature runs
            out of candidates.
        seed: Seed for the module-level random generator.

    Returns:
        List of sampled submission ids, in the order they were drawn.
    """
    random.seed(seed)

    sample = []
    feature_counts = pd.Series(0, index=list(features))
    task_counts = pd.Series(0, index=log['item'].unique())

    for _ in tqdm(range(n_samples)):
        # least represented feature
        feature = feature_counts.idxmin()

        # filter submissions with feature and not in the sample
        candidates = features[feature]
        candidates = candidates[candidates.sum(axis=1) > 0].index.difference(sample)

        if candidates.empty:
            print("[WARNING] No candidates left for feature, skipping...")
            # count the round anyway so that an exhausted feature does not block the others
            feature_counts[feature] += 1
            continue

        # filter submissions with yet unused task
        least_used_tasks = task_counts[task_counts == task_counts.min()].index
        candidate_tasks = log.loc[candidates, 'item']
        task_candidates = candidate_tasks[candidate_tasks.isin(least_used_tasks)].index

        if not task_candidates.empty:
            choice = random.choice(task_candidates)
        else:
            print("[WARNING] No candidates left for task, choosing randomly...")
            choice = random.choice(candidates)

        sample.append(choice)
        # Record the pick for both balances. Without the feature update the
        # first feature stayed "least represented" forever, and the fallback
        # branch left its task uncounted.
        feature_counts[feature] += 1
        task_counts[log.loc[choice, 'item']] += 1
    return sample


In [None]:
# Draw the survey sample: 50 rounds with the default seed.
sample = greedy_sample(features, log, n_samples=50)

In [None]:
# Compare how often each defect occurs in the sample and in the whole
# (filtered) log, with a uniform distribution as reference.
sample_freq = defect_log.loc[sample].mean()
global_freq = defect_log.mean()
uniform = pd.Series(1 / len(defect_log.columns), index=defect_log.columns)

df = pd.DataFrame({
    'Sample': sample_freq,
    'Global': global_freq,
    'Uniform': uniform
}).sort_values(by='Global', ascending=False)

labels = [defects['defect name'][idx][:20] for idx in df.index]
ticks = np.arange(len(df))

bar_width = 0.4
plt.figure(figsize=(13, 5), layout='constrained')

# sample vs global
plt.bar(ticks - bar_width/2, df['Global'], width=bar_width, label='Global', color='lightgray')
plt.bar(ticks + bar_width/2, df['Sample'], width=bar_width, label='Sample', color='steelblue')

# uniform
plt.plot(ticks, df['Uniform'], color='green', linestyle='--', label='Uniform')

plt.xticks(ticks, labels, rotation=90)
plt.ylabel("Defect Frequency")
plt.title("Defect Distribution: Sample vs Global vs Uniform")
plt.legend()

# NOTE: saved regardless of the `save` flag
plt.savefig(f'images/sampled_defect_distribution.png', dpi=300)
plt.show()

In [None]:
# Compare the share of each task in the sample and in the whole (filtered) log,
# with a uniform distribution as reference.
sample_freq = log.loc[sample]['item'].value_counts(normalize=True)
global_freq = log['item'].value_counts(normalize=True)
uniform = pd.Series(1 / log['item'].nunique(), index=log['item'].unique())

df = pd.DataFrame({
    'Sample': sample_freq,
    'Global': global_freq,
    'Uniform': uniform
}).sort_values(by='Global', ascending=False)

labels = [items['name'].loc[idx][:20] for idx in df.index]
ticks = np.arange(len(df))

bar_width = 0.4
plt.figure(figsize=(13, 5), layout='constrained')

# sample vs global
plt.bar(ticks - bar_width/2, df['Global'], width=bar_width, label='Global', color='lightgray')
plt.bar(ticks + bar_width/2, df['Sample'], width=bar_width, label='Sample', color='steelblue')

# uniform
plt.plot(ticks, df['Uniform'], color='green', linestyle='--', label='Uniform')

plt.xticks(ticks, labels, rotation=90)
plt.tick_params(axis='x', labelsize=7)
plt.ylabel("Task Frequency")
plt.title("Task Distribution: Sample vs Global vs Uniform")
plt.legend()

# NOTE: saved regardless of the `save` flag
plt.savefig(f'images/sampled_task_distribution.png', dpi=300)
plt.show()

In [None]:
# How often each feature value occurs among the sampled submissions.
# The rows are restricted to `sample`: the plot is titled and saved as the
# distribution "in the sample", but the counts covered every submission.
feature_counts = pd.DataFrame({
    name: pd.Series(frame.loc[sample].values.flatten()).value_counts()
    for name, frame in features.items()
}).fillna(0).astype(int)

fig, ax = plt.subplots(figsize=(13, 5), layout='constrained')
# one stacked bar per feature, one segment per feature value
feature_counts.T.plot(kind='bar', stacked=True, colormap='tab10', ax=ax)

plt.xlabel("Feature")
plt.ylabel("Number of Occurances")
plt.title("Stacked Bar Plot of Feature Values in the Sample")
plt.legend(title="Values", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=0)

# NOTE: saved regardless of the `save` flag
plt.savefig(f'images/sampled_feature_distribution.png', dpi=300)
plt.show()