# setup

In [None]:
import math
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
from ipywidgets import Output
from IPython.display import display, HTML

from src.code_processing import parse_code_string, generate_linter_messages
import src.ipython_loader as loader

# Enable interactive backend
%matplotlib widget

# Default (width, height) in inches, passed as `figsize=` by most plots below.
figsize = (10, 7)
resolution = 300 # dpi
# Root directory of the input data; questions.csv is also read from and written to it.
data_path = Path('data')
# Directory handed to loader.load as its first argument.
ipython_path = data_path / 'ipython_new'
# Only read by the guard at the top of the "defect severity" cell.
use_counts = True

In [None]:
def get_regression_line(x: pd.Series, y: pd.Series) -> pd.Series:
    """Evaluate the least-squares line fitted to (x, y) at the points of x.

    Args:
        x: Independent variable.
        y: Dependent variable. It is paired with ``x`` by position
            (``np.polyfit`` ignores the pandas index), so both series must be
            in the same order.

    Returns:
        A series with the index of ``x`` holding ``slope * x + intercept``.
    """
    slope, intercept = np.polyfit(x, y, 1)
    return slope * x + intercept

# final submissions

In [None]:
# Load the tables used throughout this section; incorrect submissions are kept.
items, log, defects, defect_log, code_to_defect_id = loader.load(
    ipython_path,
    data_path,
    only_correct=False,
)

In [None]:
# Print a short summary of the response-time column.
log['responseTime'].info()

## severity baseline

The severity baseline does not span the whole scale, is not uniformly distributed, and contains many ties.

TODO Move where appropriate

In [None]:
# Severity x defect-type contingency table, padded so that every severity level
# from 1 to 5 has a row even when no defect carries it.
df = pd.crosstab(defects['severity'], defects['defect type'])
missing_levels = [level for level in range(1, 6) if level not in df.index]
for level in missing_levels:
    df.loc[level] = np.zeros(len(df.columns), dtype=int)
df.sort_index(inplace=True)

fig, ax = plt.subplots(figsize=(10, 8), layout="constrained")

sns.heatmap(df, annot=True, cmap="Blues", fmt="d", ax=ax)

# Red cross per defect type at its count-weighted mean severity. The row for
# the first index value spans [0, 1], hence the -0.5 to land on cell centres.
mean_severity = df.mul(df.index, axis=0).sum() / df.sum()
ax.scatter(np.arange(len(df.columns)) + 0.5, mean_severity - 0.5, marker='X', color='red', s=100)

ax.invert_yaxis()
colorbar = ax.collections[0].colorbar
colorbar.set_label('counts', rotation=90, labelpad=15)
ax.set_title('Distribution of defects by severity and type')
plt.show()

## tasks

### distribution

Skewed towards earlier tasks

In [None]:
# Number of submissions per task; value_counts orders tasks from most to least submitted.
counts = log['item'].value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(20, 7), layout="constrained")

# One bar per task, in the order of `counts`.
ticks = np.arange(len(counts))
tick_labels = [f"{idx}: {items['name'].loc[idx]}" for idx in counts.index]

p = ax.bar(ticks, counts.values)

ax.set_xticks(ticks, labels=tick_labels, rotation=90, ha='right')
ax.set(
    xlabel='task',
    ylabel='number of submissions',
    title='Distribution of Submissions Over Items',
)

# fig.savefig('task_counts.png', dpi=300)
plt.show()

In [None]:
# Dense rank of every task by its number of submissions (1 = most submissions);
# indexed by task id, in the order of `counts`.
count_ranks = counts.rank(method='dense', ascending=False).astype(int)
# Dense rank of the task ids themselves (1 = highest id), in the same order as
# `counts` but with a fresh 0..n-1 index, so it pairs with `count_ranks` by position only.
id_ranks = pd.Series(counts.index).rank(method='dense', ascending=False).astype(int)

Earlier, simpler tasks have fewer submissions than later, more complex ones.

In [None]:
fig, ax = plt.subplots(figsize=figsize, layout="constrained")

p = ax.scatter(count_ranks, id_ranks, marker='o', color='b')

# Least-squares trend of the id rank as a function of the count rank.
trend = get_regression_line(count_ranks, id_ranks)
ax.plot(count_ranks, trend, color='red', linestyle='--', label='Linear Regression')

ax.set(
    xlabel='count rank',
    ylabel='id',
    title='Task Id vs Rank by Number of Submissions',
)
ax.legend()

# plt.savefig('images/id_vs_rank.png', dpi=300)
plt.show()

### number of submissions vs difficulty

Trying to prove that more difficult tasks have fewer submissions - does not work, use a citation instead!

TODO Very suspicious results, should be double checked.

In [None]:
# Share of incorrect submissions per task, indexed by task id (sorted by id).
error_rate = 1 - log.groupby('item')['correct'].mean()

# `count_ranks` follows the order of value_counts (most submitted task first)
# while `error_rate` is sorted by task id. Matplotlib and np.polyfit pair the
# values by position, so the error rates must be looked up by task id first;
# otherwise each rank is plotted against the error rate of a different task.
error_rate_by_rank = error_rate.loc[count_ranks.index]

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

p = ax.scatter(count_ranks, error_rate_by_rank, marker='o', color='b')

ax.plot(
    count_ranks,
    get_regression_line(count_ranks, error_rate_by_rank),
    color='red',
    linestyle='--',
    label='Linear Regression',
)

ax.set_xlabel('count rank')
ax.set_ylabel('error rate')
ax.set_title('Error Rate vs Number of Submissions Rank')
ax.legend()  # the regression line carries a label, so show it

# plt.savefig('error_rate_vs_count_ranks.png', dpi=300)
plt.show()

In [None]:
# Share of all submissions that belongs to each task (indexed by task id).
percent = log.groupby('item')['correct'].count() / len(log)

fig, ax = plt.subplots(figsize=figsize, layout="constrained")

p = ax.scatter(percent, error_rate, marker='o', color='b')

# Least-squares trend of the error rate as a function of the submission share.
trend = get_regression_line(percent, error_rate)
ax.plot(percent, trend, color='red', linestyle='--', label='Linear Regression')

ax.set(
    xlabel='percent',
    ylabel='error rate',
    title='Error Rate vs Percent of Submissions',
)

# plt.savefig('percent_vs_error_rate.png', dpi=300)
plt.show()

### anomalies

high z-scores are almost always impulses for changes to the system

problems with task templates - reproduced results of another paper

TODO look for other takeaways

TODO more tasks

TODO filter out low absolute counts

TODO relationship with difficulty?

In [None]:
# Z-score of each task's mean defect value against the mean and standard
# deviation of that defect over all submissions.
task_means = defect_log.groupby(log["item"]).mean()
common_defects = (task_means - defect_log.mean(axis=0)) / defect_log.std(axis=0)

# The 30 tasks whose z-scores vary the most across defects.
z_score_variance = common_defects.var(axis=1).sort_values(ascending=False)
highest_variance_tasks = z_score_variance[:30].index
defect_names = [defects['defect name'].loc[idx] for idx in defect_log.columns]
task_names = [items['name'].loc[idx] for idx in highest_variance_tasks]

plt.figure(figsize=(10, 10), layout="constrained")
# Symmetric colour limits keep a z-score of zero at the centre of the palette.
biggest_value = common_defects.abs().values.max()
sns.heatmap(
    common_defects.loc[highest_variance_tasks].T,
    xticklabels=task_names,
    yticklabels=defect_names,
    cmap="vlag",
    cbar=True,
    vmin=-biggest_value,
    vmax=biggest_value,
)
plt.title("Task-Level Defect Anomalies (Z-scores, Tasks With Highest Variance)")
plt.xlabel("")
plt.ylabel("")

# plt.savefig('defect_anomalies.png', dpi=300)
plt.show()

Interactive version

In [None]:
def task_and_defect_description(task, defect):
    """Build the HTML detail panel for one task-defect pair.

    Args:
        task: Index label of the task in ``items``.
        defect: Index label of the defect in ``defects``; also a column of
            ``defect_log``.

    Returns:
        An HTML string showing the task (name, instructions, solution), the
        defect (name, type, severity, description, code examples) and the
        first logged submission of the task that contains the defect, or a
        short notice when no such submission exists.
    """
    from html import escape  # local import: only this function needs it

    task_row = items.loc[task]
    defect_row = defects.loc[defect]

    pre_style = (
        "background-color: #2e2e2e; color: #f5f5f5; padding: 10px; "
        "border-radius: 5px; font-family: monospace;"
    )

    def code_block(code):
        # Source code routinely contains <, > and &; escape it so the browser
        # renders it verbatim instead of interpreting it as markup.
        return f'<pre style="{pre_style}">{escape(str(code))}</pre>'

    # `> 0` works for boolean presence flags and for integer counts alike;
    # combining a boolean mask with raw integer counts through `&` would be a
    # bitwise AND and drop submissions with an even count.
    has_defect = defect_log[defect] > 0
    examples = log.loc[(log["item"] == task) & has_defect, "answer"]
    if examples.empty:
        # Pairs with a negative z-score may have no example at all.
        example_html = "<em>No submission of this task contains this defect.</em>"
    else:
        example_html = code_block(examples.iloc[0])

    return f"""
    <div style="display: flex; justify-content: space-between; gap: 20px;">
        <!-- Task Section -->
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{task_row["name"]}</h3>
            <div><strong>Instructions:</strong><br>{task_row["instructions"]}</div>
            <div><strong>Solution:</strong><br>
                {code_block(task_row["solution"])}
            </div>
        </div>
        
        <!-- Defect Section -->
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{defect_row["defect name"]}</h3>
            <div><strong>Defect Type:</strong> {defect_row["defect type"]}</div>
            <div><strong>Severity:</strong> {defect_row["severity"]}</div>
            <div><strong>Description:</strong><br>{defect_row["description"]}</div>
            
            <div style="display: flex; justify-content: space-between; margin-top: 20px;">
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Example:</strong><br>
                    {code_block(defect_row["code example"])}
                </div>
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Fix Example:</strong><br>
                    {code_block(defect_row["code fix example"])}
                </div>
            </div>
        </div>
    </div>
    
    <!-- Code Snippet Section -->
    <div style="border: 1px solid #ccc; padding: 10px; margin-top: 20px; border-radius: 5px;">
        <strong>Example Submission:</strong><br>
        {example_html}
    </div>
    """

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from ipywidgets import Output
from IPython.display import display, HTML

# Z-scores of the per-task defect means, restricted to the 30 tasks with the
# highest variance and transposed to defects (rows) x tasks (columns).
common_defects = (defect_log.groupby(log["item"]).mean() - defect_log.mean(axis=0)) / defect_log.std(axis=0)
highest_variance_tasks = common_defects.var(axis=1).sort_values(ascending=False)[:30].index
common_defects = common_defects.loc[highest_variance_tasks].T

fig, ax = plt.subplots(figsize=(10, 8), layout="constrained")
biggest_value = common_defects.abs().values.max() # for axis scale

sns.heatmap(
    common_defects,
    xticklabels=[items['name'].loc[idx] for idx in common_defects.columns],
    yticklabels=[defects['defect name'].loc[idx] for idx in common_defects.index],
    cmap="vlag",
    cbar=True,
    vmin=-biggest_value,
    vmax=biggest_value,
    ax=ax
)
ax.set_title("Task-Level Defect Anomalies (Z-scores, Tasks With Highest Variance)")
ax.set_xlabel("")
ax.set_ylabel("")

# Create a dedicated output area for the click details
click_output = Output()


def on_click(
    event,
    heatmap_ax=ax,
    tasks=tuple(common_defects.columns),
    defect_ids=tuple(common_defects.index),
):
    """Show the task/defect description for the heatmap cell that was clicked.

    The axes and the row/column labels are bound as defaults when the cell
    runs. The figure stays interactive while later cells rebind ``ax``,
    ``task_names`` and ``defect_names``, so reading those globals at click
    time would check against the wrong objects.
    """
    if event.inaxes != heatmap_ax:
        return
    # Heatmap cell (row, column) covers [column, column + 1) x [row, row + 1).
    column, row = int(event.xdata), int(event.ydata)
    if not (0 <= column < len(tasks) and 0 <= row < len(defect_ids)):
        return

    # Update the click output area
    with click_output:
        click_output.clear_output(wait=True)
        display(HTML(task_and_defect_description(tasks[column], defect_ids[row])))


# Connect the click event
fig.canvas.mpl_connect('button_press_event', on_click)

# Show the heatmap and output
plt.show()
display(click_output)

## defects

### distribution

Trivial "stylistic" defects dominate (especially whitespace).

TODO how does it look after removing whitespace

In [None]:
# Column sums of defect_log per defect, largest first.
counts = defect_log.sum(axis=0).sort_values(ascending=False)
# The same sums relative to the number of logged submissions, in percent.
percentages = counts / len(defect_log) * 100

In [None]:
fig, ax = plt.subplots(figsize=figsize, layout="constrained")

bar_names = [defects['defect name'].loc[idx] for idx in counts.index]
p = ax.bar(bar_names, percentages)

# Annotate every bar with the absolute value behind its percentage.
ax.bar_label(p, labels=counts, label_type='edge', rotation=45)

ax.set_ylabel('% of submissions')
plt.xticks(rotation=45, ha='right')
ax.set_ylim(0, max(percentages) + 5)
ax.set_title(f'Submissions containing a defect ({len(defect_log)} submissions in total)')

#plt.savefig('defect_histogram.png', dpi=300)
plt.show()

Fewer than 20 % of submissions have more than one defect.

In [None]:
# Number of distinct defects per submission. `> 0` makes this a count of
# unique defects whether defect_log stores presence flags or instance counts.
defects_per_submission = (defect_log > 0).sum(axis=1)

# value_counts orders by frequency; sort by the number of defects so that
# position k really is "k defects".
counts = defects_per_submission.value_counts().sort_index()

tail_start = 5  # submissions with this many defects or more share one bar
num_of_submissions = [str(i) for i in range(tail_start)] + ['>=5']
# reindex fills in defect counts that never occur, so bars cannot shift.
head = counts.reindex(range(tail_start), fill_value=0)
tail = counts[counts.index >= tail_start].sum()
defect_counts = [int(value) for value in head] + [int(tail)]
defect_percentage = [value / len(defect_log) * 100 for value in defect_counts]

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

p = ax.bar(num_of_submissions, defect_percentage)

ax.bar_label(p, labels=defect_counts, label_type='edge')

ax.set_ylabel('% of submissions')
ax.set_xlabel('Number of defects')
ax.set_title('Submissions by the number of unique defects ({} in total)'.format(len(defect_log)))

#plt.savefig('submissions_histogram.png', dpi=300)
plt.show()

In [None]:
# Number of distinct defects per submission (presence flags or counts alike).
defects_per_submission = (defect_log > 0).sum(axis=1)
max_idx = int(defects_per_submission.max())

# Cumulative number of submissions with at most k defects for k = 0..max_idx.
# The histogram has to be sorted by the number of defects before cumsum:
# value_counts orders by frequency, which would accumulate in the wrong order.
# Defect counts that never occur inherit the previous cumulative value.
counts = (
    defects_per_submission.value_counts()
    .sort_index()
    .cumsum()
    .reindex(range(max_idx + 1))
    .ffill()
    .fillna(0)
    .astype(int)
)

num_of_submissions = ["0"] + [f"<={i}" for i in range(1, max_idx + 1)]
defect_counts = counts.tolist()
defect_percentage = [(x / len(defect_log)) * 100 for x in defect_counts]

fig, ax1 = plt.subplots(layout="constrained", figsize=(10, 6))

bars = ax1.bar(num_of_submissions, defect_counts, alpha=0.6, color='blue', label='Absolute counts')

# Second y axis: the same cumulative curve expressed in percent.
ax2 = ax1.twinx()
ax2.plot(num_of_submissions, defect_percentage, marker='o', color='red', label='Cumulative %')

ax1.set_ylabel('Absolute counts')
ax2.set_ylabel('% of submissions')
ax1.set_xlabel('Number of defects')
ax1.set_title(f'Submissions by the number of unique defects ({len(defect_log)} in total)')

# Matching limits make both axes describe the same bar heights.
ax1.set_ylim(0, len(defect_log))
ax2.set_ylim(0, 100)

ax1.grid(visible=True, linestyle='--', linewidth=0.5)
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))

#plt.savefig('submissions_cumulative_histogram.png', dpi=300)
plt.show()

### correlation and co-occurrence

No interesting or super strong correlations

In [None]:
plt.figure(figsize=figsize, layout="constrained")
# Take an explicit writable copy: the array behind a DataFrame can be
# read-only (pandas Copy-on-Write), which makes an in-place fill_diagonal fail.
correlations = defect_log.corr().to_numpy(copy=True)
# The diagonal is trivially 1 and would dominate the colour scale.
np.fill_diagonal(correlations, 0)
defect_names = defects['defect name'].loc[defect_log.columns]
sns.heatmap(
    correlations,
    xticklabels=defect_names.apply(lambda x: x[:20]),  # shortened to keep the x axis readable
    yticklabels=defect_names,
    vmin=-1,
    vmax=1,
    cmap="vlag",
)

plt.xticks(rotation = 90, ha='right')
plt.yticks(rotation = 0, ha='right')

plt.title('Correlation between the presence of defects (pearson)')
#plt.savefig('defect_correlation.png', dpi=300)
plt.show()

Almost all defect pairs are quite rare, focusing analysis on them is a bad idea. We need a more general approach.

In [None]:
# Co-occurrence matrix: entry (a, b) is the dot product of the defect_log
# columns of defects a and b. A writable copy is taken because the array
# behind a DataFrame can be read-only (pandas Copy-on-Write).
co_occurances = defect_log.T.dot(defect_log).to_numpy(copy=True)
# A defect trivially co-occurs with itself; blank the diagonal.
np.fill_diagonal(co_occurances, 0)
defect_names = [defects['defect name'].loc[idx] for idx in defect_log.columns]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

# Your data (assuming 'co_occurances' and 'defect_names' are defined elsewhere)
# Bucket the raw counts into four levels:
# 0 (< 100), 1 (100 - 249), 2 (250 - 999) and 3 (>= 1000).
level_edges = [100, 250, 1000]
thresholded = np.digitize(co_occurances, level_edges)

# One discrete colour per level.
cmap = ListedColormap(sns.color_palette("Blues", 4))

fig, ax = plt.subplots(figsize=figsize, layout="constrained")

p = ax.imshow(thresholded, cmap=cmap, vmin=0, vmax=3)

ax.set_yticks(np.arange(len(defect_names)), labels=defect_names)

ax.set_title('Co-occurrence of defects thresholded at different levels')

cbar = fig.colorbar(p, ax=ax, ticks=[0, 1, 2, 3], shrink=0.8)
cbar.set_label('Thresholded Co-occurrence Levels')

cbar.set_ticks([0, 1, 2, 3])
cbar.set_ticklabels(['< 100', '100 - 250', '250 - 1000', '>= 1000'])

plt.show()


In [None]:
# Plotly rendering of the raw co-occurrence counts; the defect names are
# passed as the x and y coordinates of the image.
fig = px.imshow(co_occurances, x = defect_names, y = defect_names)
# Hide the tick labels on both axes.
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)
fig.update_layout(yaxis={"dtick":1},margin={"t":0,"b":0})
# Last expression of the cell: displays the figure.
fig

### co-occurrence - manual exploration

manually looking for frequent and interesting defect pairs - bad idea

by category

In [None]:
# Split the defect columns of defect_log by defect type.
categories = defects['defect type'].loc[defect_log.columns]
dfs_by_category = {category: defect_log[columns].copy() for category, columns in categories.groupby(categories).groups.items()}
categories = categories.unique()
NUM_ROWS = 2
NUM_COLS = math.ceil(len(categories) / 2)


# One figure per category; each panel shows its co-occurrence with one category.
for this_category in categories:
    fig, axes = plt.subplots(NUM_ROWS, NUM_COLS, figsize=figsize)
    fig.suptitle("Programming Defect Co-occurance by Categories: {} (absolute counts)".format(this_category), fontsize=16)
    axes = axes.flatten()
    this_df = dfs_by_category[this_category]

    for j, other_category in enumerate(categories):
        other_df = dfs_by_category[other_category]

        product_df = this_df.T.dot(other_df)

        if this_category == other_category:
            # Blank the self co-occurrence on an explicit copy: writing through
            # `DataFrame.values` only works when it happens to be a writable view.
            product_values = product_df.to_numpy(copy=True)
            np.fill_diagonal(product_values, 0)
            product_df = pd.DataFrame(product_values, index=product_df.index, columns=product_df.columns)

        sns.heatmap(product_df, annot=True, cmap="Reds", fmt="d", ax=axes[j], cbar=False)
        axes[j].set_title(other_category)
        axes[j].set(xlabel="", ylabel="")
        axes[j].set_xticks(np.arange(product_df.shape[1]), labels=defects["defect name"].loc[product_df.columns].apply(lambda x: x[:20]), rotation=40, ha="right")
        # Row labels are shared by every panel of the figure: show them only
        # in the first column of the grid.
        if j % NUM_COLS == 0:
            axes[j].set_yticks(np.arange(product_df.shape[0]), labels=defects["defect name"].loc[product_df.index].apply(lambda x: x[:30]), rotation=0)
        else:
            axes[j].set_yticks([])

    # With an odd number of categories the last panel of the grid stays unused.
    for unused_ax in axes[len(categories):]:
        unused_ax.set_visible(False)

    # The layout is the same for every panel, so adjust it once per figure.
    fig.subplots_adjust(
        left=0.25,
        bottom=0.25,
        right=0.95,
        top=0.9,
        wspace=0.1,
        hspace=0.85
    )

    #plt.savefig('co_occurance_{}.png'.format(this_category), dpi=300)
    plt.show()

by severity

In [None]:
# Split the defect columns of defect_log by severity level.
categories = defects['severity'].loc[defect_log.columns]
dfs_by_category = {category: defect_log[columns].copy() for category, columns in categories.groupby(categories).groups.items()}
categories = categories.unique()
NUM_ROWS = 2
# Round up: flooring yields too few panels for an odd number of severity
# levels (and zero columns for a single level).
NUM_COLS = math.ceil(len(categories) / 2)


# One figure per severity level; each panel shows its co-occurrence with one level.
for this_category in categories:
    fig, axes = plt.subplots(NUM_ROWS, NUM_COLS, figsize=figsize)
    fig.suptitle("Programming Defect Co-occurance by Categories: {} (absolute counts)".format(this_category), fontsize=16)
    axes = axes.flatten()
    this_df = dfs_by_category[this_category]

    for j, other_category in enumerate(categories):
        other_df = dfs_by_category[other_category]

        product_df = this_df.T.dot(other_df)

        if this_category == other_category:
            # Blank the self co-occurrence on an explicit copy: writing through
            # `DataFrame.values` only works when it happens to be a writable view.
            product_values = product_df.to_numpy(copy=True)
            np.fill_diagonal(product_values, 0)
            product_df = pd.DataFrame(product_values, index=product_df.index, columns=product_df.columns)

        sns.heatmap(product_df, annot=True, cmap="Reds", fmt="d", ax=axes[j], cbar=False)
        axes[j].set_title(other_category)
        axes[j].set(xlabel="", ylabel="")
        axes[j].set_xticks(np.arange(product_df.shape[1]), labels=defects["defect name"].loc[product_df.columns].apply(lambda x: x[:20]), rotation=40, ha="right")
        # Row labels are shared by every panel of the figure: show them only
        # in the first column of the grid.
        if j % NUM_COLS == 0:
            axes[j].set_yticks(np.arange(product_df.shape[0]), labels=defects["defect name"].loc[product_df.index].apply(lambda x: x[:30]), rotation=0)
        else:
            axes[j].set_yticks([])

    # With an odd number of levels the last panel of the grid stays unused.
    for unused_ax in axes[len(categories):]:
        unused_ax.set_visible(False)

    # The layout is the same for every panel, so adjust it once per figure.
    fig.subplots_adjust(
        left=0.25,
        bottom=0.25,
        right=0.95,
        top=0.9,
        wspace=0.1,
        hspace=0.85
    )

    #plt.savefig('co_occurance_{}.png'.format(this_category), dpi=300)
    plt.show()

# defects by tasks

## defects in task templates

some anomalies can be explained by task templates and democode

In [None]:
# Lint the demo code of every task; each result is iterated as (code, message) pairs.
democode_messages = items["democode"].apply(generate_linter_messages)
# Flatten to one row per (task, linter code). Naming the columns keeps the
# frame well-formed even when the linter reports nothing at all.
democode_messages = pd.DataFrame(
    [
        {'item': idx, 'defect': code}
        for idx, code_message_list in democode_messages.items()
        for code, _ in code_message_list
    ],
    columns=['item', 'defect'],
)
# Keep only linter codes that map to a tracked defect. The copy avoids
# assigning into a slice of the unfiltered frame (SettingWithCopyWarning).
is_tracked = democode_messages["defect"].isin(code_to_defect_id.keys())
democode_messages = democode_messages[is_tracked].copy()
# Every remaining code is a key of the mapping, so a plain lookup suffices.
democode_messages["defect"] = democode_messages["defect"].map(code_to_defect_id).astype(int)

In [None]:
# Lint the example solution of every task; each result is iterated as (code, message) pairs.
solution_messages = items["solution"].apply(generate_linter_messages)
# Flatten to one row per (task, linter code). Naming the columns keeps the
# frame well-formed even when the linter reports nothing at all.
solution_messages = pd.DataFrame(
    [
        {'item': idx, 'defect': code}
        for idx, code_message_list in solution_messages.items()
        for code, _ in code_message_list
    ],
    columns=['item', 'defect'],
)
# Keep only linter codes that map to a tracked defect. The copy avoids
# assigning into a slice of the unfiltered frame (SettingWithCopyWarning).
is_tracked = solution_messages["defect"].isin(code_to_defect_id.keys())
solution_messages = solution_messages[is_tracked].copy()
# Every remaining code is a key of the mapping, so a plain lookup suffices.
solution_messages["defect"] = solution_messages["defect"].map(code_to_defect_id).astype(int)

In [None]:
# Marker styles for defects found in the example solution / the demo code.
solution_marker = {"color": "green", "s": 100, "marker": "o"}
democode_marker = {"color": "purple", "s": 50, "marker": "x"}

plt.figure(figsize=figsize, layout="constrained")

# `common_defects` is expected in the defects (rows) x tasks (columns) layout
# produced by the interactive heatmap cell above. Labels and marker positions
# are derived from it directly instead of from `task_names` / `defect_names`,
# which other cells rebind.
biggest_value = common_defects.abs().values.max()

ax = sns.heatmap(
    common_defects,
    xticklabels=[items['name'].loc[idx] for idx in common_defects.columns],
    yticklabels=[defects['defect name'].loc[idx] for idx in common_defects.index],
    cmap="vlag",
    cbar=True,
    vmin=-biggest_value,
    vmax=biggest_value
)

for messages, marker_style in ((solution_messages, solution_marker), (democode_messages, democode_marker)):
    # Only task/defect pairs that have a cell in the heatmap can be marked.
    shown = messages[messages["item"].isin(common_defects.columns) & messages["defect"].isin(common_defects.index)]
    x_pos = common_defects.columns.get_indexer(shown["item"])
    y_pos = common_defects.index.get_indexer(shown["defect"])
    # +0.5 moves the marker from the cell corner to the cell centre.
    ax.scatter(x_pos + 0.5, y_pos + 0.5, **marker_style)

# Empty scatters on this axes serve as legend handles. Creating them with
# plt.scatter before plt.figure would draw stray points on whichever figure
# happened to be current.
legend_markers = [ax.scatter([], [], **solution_marker), ax.scatter([], [], **democode_marker)]

ax.set_title("Task-Level Defect Anomalies (Z-scores, Tasks With Highest Variance)")
ax.set_xlabel("")
ax.set_ylabel("")
ax.legend(legend_markers, ['example solution', 'demo code'], title='Defect present in:', loc='lower left')

#plt.savefig('defect_anomalies.png', dpi=300)
plt.show()

# defect severity

defects repeated many times might jump in perceived severity - how often does this happen?

not that often, but it does happen -> candidate for the survey

In [None]:
# NOTE(review): the guard used to read `if use_counts: raise RuntimeError`,
# which always aborted the notebook with the configured `use_counts = True`
# although this cell is the one that needs counts. It is flipped here; confirm
# that defect_log really stores instance counts rather than presence flags.
if not use_counts:
    raise RuntimeError("The defect count distribution needs instance counts (use_counts = True).")
MAX_COUNT = 20
# use counts, not presence
defect_counts = defect_log
# histogram for each defect: number of instances -> number of submissions
defect_counts = {col: defect_counts[col][defect_counts[col] > 0].value_counts() for col in defect_counts.columns}
# fill empty brackets with 0
defect_counts = pd.DataFrame(defect_counts).fillna(0).astype(int)
# Bins 1 .. MAX_COUNT - 1 hold exact counts; the last bin (MAX_COUNT) collects
# everything >= MAX_COUNT. Previously the tail sum overwrote bin MAX_COUNT - 1
# and skipped the first tail row.
tail = defect_counts[defect_counts.index >= MAX_COUNT].sum(axis=0)
defect_counts = defect_counts.reindex(range(1, MAX_COUNT + 1), fill_value=0)
defect_counts.loc[MAX_COUNT] = tail
# normalize each defect to percentages, matching the colour bar label
defect_counts = defect_counts.div(defect_counts.sum(axis=0), axis=1) * 100

# order by severity
defect_counts = defect_counts[sorted(defect_counts.columns.tolist(), key=lambda x: defects["severity"].loc[x])]

# rotate: defects become rows, instance counts become columns
defect_counts = defect_counts.T

fig, ax = plt.subplots(figsize=figsize, layout="constrained")

sns.heatmap(defect_counts, cmap="Blues", ax=ax)
# Red cross per defect at its mean number of instances. The column for count k
# spans [k - 1, k], hence the -0.5 to land on the cell centre.
for i, row in enumerate(defect_counts.index):
    ax.scatter((defect_counts.loc[row] * defect_counts.columns.values).sum() / defect_counts.loc[row].sum() - 0.5, i + 0.5, marker='X', color='red', s=100)
ax.invert_yaxis()
ax.set_aspect('equal', 'box')

colorbar = ax.collections[0].colorbar
colorbar.set_label('% of submissions', rotation=90, labelpad=15)

# Heatmap cells are centred on k + 0.5, so the tick labels go there too.
plt.xticks(ticks=np.arange(MAX_COUNT) + 0.5, labels=[str(i) for i in range(1, MAX_COUNT)] + [f">= {MAX_COUNT}"], rotation=30)
plt.yticks(ticks=np.arange(len(defect_counts.index)) + 0.5, labels=[defects["defect name"].loc[row] for row in defect_counts.index], rotation=0)

# Dashed separators between groups of defects with the same severity.
severity = [defects["severity"].loc[row] for row in defect_counts.index]
for idx in range(1, len(severity)):
    if severity[idx] != severity[idx - 1]:
        plt.axhline(y=idx, color='black', linestyle='--', linewidth=1)

plt.xlabel("# of instances")
plt.title('Distribution of defect counts')
# plt.savefig('num_of_defects.png', dpi=300)
plt.show()

In [None]:
# Correlations with correctness need both correct and incorrect submissions.
# The original guard read the name `only_correct`, which no cell of this
# notebook defines; derive the condition from the data instead.
if not log['correct'].all():
    # Rows: defects, columns: tasks.
    # NOTE(review): `correct - 1` correlates like `correct` itself, i.e. with
    # success, while the title below says "failure" - confirm the intended sign.
    correlations = pd.DataFrame({
        task: defect_log[log['item'] == task].corrwith(log['correct'] - 1) for task in log['item'].unique()
    })
    counts = defect_log.groupby(log["item"]).sum().T
    correlations = correlations.reindex(index=counts.index, columns=counts.columns)
    # Too few occurrences make a correlation meaningless.
    correlations[counts < 100] = np.nan

    # Variance per task means variance down each column (axis=0); axis=1
    # would rank the defects instead of the tasks.
    highest_variance_tasks = correlations.var(axis=0).sort_values(ascending=False)[:30].index
    defect_names = [defects['defect name'].loc[idx] for idx in correlations.index]
    task_names = [items['name'].loc[idx] for idx in highest_variance_tasks]

    plt.figure(figsize=figsize, layout="constrained")
    # Plot the correlations computed above (not the z-scores in common_defects).
    sns.heatmap(correlations[highest_variance_tasks], xticklabels=task_names, yticklabels=defect_names, cmap="vlag", cbar=True, vmin=-1, vmax=1)
    plt.title("Correlation between the presence of defects and failure (pearson)")
    plt.xlabel("")
    plt.ylabel("")

    # plt.savefig('success_correlations_across_tasks.png', dpi=300)
    plt.show()

# Exploring specific defect co-occurrences

In [None]:
def wrap_text(text, max_length=100):
    """Greedily wrap text so that lines do not exceed ``max_length`` characters.

    Words are split on whitespace and re-joined with single spaces. A single
    word longer than ``max_length`` is kept intact on a line of its own.

    Args:
        text: The text to wrap.
        max_length: Maximum line length in characters.

    Returns:
        The wrapped text with lines separated by newlines; an empty string
        when ``text`` contains no words.
    """
    lines = []
    line = []
    length = 0  # length of " ".join(line)

    for word in text.split():
        # A separating space is only needed when the line already has a word.
        needed = length + 1 + len(word) if line else len(word)
        if line and needed > max_length:
            lines.append(" ".join(line))
            line = [word]
            length = len(word)
        else:
            line.append(word)
            length = needed

    if line:
        lines.append(" ".join(line))

    return "\n".join(lines)


def item_info(idx):
    """Print the name, the wrapped instructions and the example solution of a task."""
    task = items.loc[idx]
    heavy_rule, light_rule = '=' * 50, '-' * 50

    print('Task: ', task['name'])
    print(heavy_rule)
    print('Description: ', wrap_text(task['instructions']), sep='\n')
    print(light_rule)
    print('Example solution: ', task['solution'], sep='\n')


def defect_info(idx):
    """Print the type, name, wrapped description and code example of a defect."""
    defect = defects.loc[idx]
    heavy_rule, light_rule = '=' * 50, '-' * 50

    print(defect["defect type"], ": ", defect['defect name'])
    print(heavy_rule)
    print('Description: ', wrap_text(defect['description']), sep='\n')
    print(light_rule)
    print('Code example: ', defect['code example'], sep='\n')

In [None]:
# The defect pair under inspection (index labels in `defects`).
defect1, defect2 = 29, 30
first_name = defects.loc[defect1]['defect name']
second_name = defects.loc[defect2]['defect name']
print(f"({defect1}, {defect2}) {first_name}, {second_name}")
defect_info(defect1)
print("\n\n")
defect_info(defect2)

# Submissions that contain at least one instance of both defects.
has_both = (defect_log.loc[:, defect1] >= 1) & (defect_log.loc[:, defect2] >= 1)
df = log.loc[defect_log[has_both].index]
# Displayed as the cell output: how many such submissions every task has.
df["item"].value_counts()

In [None]:
# Task to inspect, restricted to the submissions selected in the previous cell.
item = 60
item_df = df[df["item"] == item]
item_info(item)
print('-'*50)
# Print at most the first five matching submissions.
for number, (submission_index, answer) in enumerate(item_df["answer"].iloc[:5].items(), start=1):
    print(f"Solution {number}, index: {submission_index}", answer, sep="\n")
    print("-"*50)

In [None]:
# Index label in `log` of the submission that the next cell adds to questions.csv.
index = 103332


In [None]:
# defect descriptions
description_fields = ["defect name", "description", "code example", "code fix example"]
defect1_row = defects.loc[defect1, description_fields]
defect2_row = defects.loc[defect2, description_fields]
defect1_row.index = ["defect 1 name", "defect 1 description", "defect 1 code example", "defect 1 code fix example"]
defect2_row.index = ["defect 2 name", "defect 2 description", "defect 2 code example", "defect 2 code fix example"]

# item description
item_row = items.loc[item, ["name", "instructions"]]
item_row.index = ["item name", "item instructions"]

# user submission
log_row = log.loc[[index]].reset_index()[["index", "answer"]].iloc[0]
log_row.index = ["submission index", "submission text"]

new_row = pd.concat([item_row, log_row, defect1_row, defect2_row])
new_question = pd.DataFrame([new_row])

# Append to the question file, creating it on first use instead of failing
# with FileNotFoundError.
# NOTE: re-running this cell appends the same question again.
questions_path = data_path / "questions.csv"
if questions_path.exists():
    previous_questions = pd.read_csv(questions_path, sep=";", index_col=0)
    questions = pd.concat([previous_questions, new_question], ignore_index=True)
else:
    questions = new_question
questions.to_csv(questions_path, sep=";")

In [None]:
#pd.DataFrame([new_row]).to_csv("questions.csv", sep=";")

# clustering

tasks by defect vs task category

submissions by defect vs task

submissions by defect vs task category

tasks by defect vs complexity?

users by defect vs performance?

submissions by defect vs user?

# feature engineering

In [None]:
# Reload the tables with only_correct=False and only_final=False.
items, log, defects, defect_log, code_to_defect_id = loader.load(
    ipython_path,
    data_path,
    only_correct=False,
    only_final=False,
)

# TODO drop whitespace

# Remove every defect whose name mentions whitespace from the defect log.
is_whitespace_defect = ['whitespace' in name for name in defects['defect name']]
whitespace_defect_ids = defects[is_whitespace_defect].index
defect_log.drop(columns=whitespace_defect_ids, inplace=True)

## is the task-defect "possible"?

In [None]:
# Minimum share of a task's submissions that must contain a defect.
threshold = 0.10
# Presence flag of every defect, joined to the task of each submission.
has_defect = defect_log > 0
merged = log[['item']].merge(has_defect, left_index=True, right_index=True)
# Share of each task's submissions that contain each defect.
frequencies = merged.groupby('item').mean()
# A task-defect pair is "reasonable" when that share reaches the threshold.
reasonable = frequencies.ge(threshold)

### exploration

In [None]:
fig, ax = plt.subplots(figsize=figsize, layout="constrained")

# Number of defects that pass the threshold, one bar per task.
defects_per_task = reasonable.sum(axis=1)
ticks = np.arange(len(reasonable))
task_labels = [items["name"].loc[idx] for idx in reasonable.index]

p = ax.bar(ticks, defects_per_task.values)

ax.set_xticks(ticks, labels=task_labels, rotation=90)
ax.set(
    xlabel='task',
    ylabel='number of defects',
    title=f'Reasonable Task-Defect Pairs (t={threshold})',
)

# plt.savefig('images/reasonable_task.png', dpi=300)
plt.show()

In [None]:
# Minimum share of a task's submissions that must contain a defect.
threshold = 0.10
has_defect = defect_log > 0
merged = log[['item']].merge(has_defect, left_index=True, right_index=True)
# Share of each task's submissions that contain each defect.
frequencies = merged.groupby('item').mean()
reasonable = frequencies.ge(threshold)

fig, ax = plt.subplots(figsize=figsize, layout="constrained")

# Number of tasks for which each defect passes the threshold, largest first.
vals = reasonable.sum(axis=0).sort_values(ascending=False)
ticks = np.arange(reasonable.shape[1])
defect_labels = [defects["defect name"].loc[idx] for idx in vals.index]

p = ax.bar(ticks, vals.values)

ax.set_xticks(ticks, labels=defect_labels, rotation=90)
ax.set(
    xlabel='defect',
    ylabel='number of tasks',
    title=f'Reasonable Task-Defect Pairs (t={threshold})',
)

# plt.savefig('images/reasonable_defect.png', dpi=300)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

thresholds = [0.20, 0.10, 0.05, 0.01]  # strict to loose
defect_names = defects["defect name"]

# The per-task defect frequencies do not depend on the threshold, so they are
# computed once instead of once per threshold.
merged = log[['item']].merge(defect_log > 0, left_index=True, right_index=True)
frequencies = merged.groupby('item').mean()

# Step 1: compute reasonable counts for each threshold
all_vals = []
for threshold in thresholds:
    reasonable = frequencies >= threshold
    vals = reasonable.sum(axis=0)
    all_vals.append(vals)

# Step 2: build DataFrame and compute incremental differences
stack_data = pd.concat(all_vals, axis=1).fillna(0)
stack_data.columns = [f"t={t:.2f}" for t in thresholds]

# Compute deltas from strict to loose. Walking backwards guarantees that the
# column being subtracted still holds its original (cumulative) value.
delta_data = stack_data.copy()
for i in range(len(delta_data.columns) - 1, 0, -1):
    delta_data.iloc[:, i] -= delta_data.iloc[:, i - 1]

# Optional: sort defects by total reasonable tasks
delta_data = delta_data.loc[delta_data.sum(axis=1).sort_values(ascending=False).index]

# Plot with strictest threshold at the bottom of the stack
fig, ax = plt.subplots(figsize=(12, 6), layout='constrained')

bottom = np.zeros(len(delta_data))
x = np.arange(len(delta_data))

for col in delta_data.columns:
    heights = delta_data[col].to_numpy()
    ax.bar(x, heights, bottom=bottom, label=col)
    bottom = bottom + heights

ax.set_xticks(x, labels=[defect_names.loc[idx] for idx in delta_data.index], rotation=90)
ax.set_xlabel('Defect')
ax.set_ylabel('Number of Reasonable Tasks (Δ per Threshold)')
ax.set_title('New Reasonable Task-Defect Pairs as Threshold Decreases')
ax.legend(title='Threshold')
plt.show()


## characteristic defect

## currently taught topic

## connected to poor performance

### on the task

### poor performing students

## recently repeated

## often repeated

## many times repeated in a single submission

## inside an already complex expression

## student skill level

## how much time has the student spent on the task (compared to the average)

## how long since the defect has been introduced