# loading data

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import random
from matplotlib.colors import ListedColormap
from scipy.stats import pointbiserialr
from IPython.display import display, HTML, update_display
from tqdm import tqdm

import src.ipython_loader as loader
from src.code_processing import generate_linter_messages

# Default figure size (inches) for plots that do not choose their own.
figsize = (10, 7)
# NOTE(review): `resolution` is not referenced by any visible cell; the savefig calls hard-code dpi=300.
resolution = 300 # dpi
# Root data folder and the sub-folder that is handed to the loader.
data_path = Path('data')
ipython_path = data_path / 'ipython_new'

# Interactive (ipympl) backend; the interactive heatmaps below register click callbacks via mpl_connect.
%matplotlib widget

In [None]:
# Load the five objects used throughout the notebook. Judging by later usage: `items` is indexed by
# task id, `log` and `defect_log` share a submission index, `defects` is indexed by defect id.
# NOTE(review): the loader's exact schema is not visible here — confirm against src.ipython_loader.
items, log, defects, defect_log, code_to_defect_id = loader.load(ipython_path, data_path)

# Disabled experiment: would drop the whitespace-related defect columns from defect_log.
#defect_log.drop(defects[['whitespace' in name for name in defects['defect name']]].index, axis=1, inplace=True)

# feature engineering

## plotting utils

In [None]:
def task_and_defect_description(task, defect):
    """Build an HTML card describing a task, a defect and one example submission.

    Reads the notebook-level frames `items`, `defects`, `log` and `defect_log`.

    Args:
        task: Index label of the task in `items`.
        defect: Index label of the defect in `defects` (also a column of `defect_log`).

    Returns:
        str: HTML with the task (name, instructions, solution), the defect
        (name, type, severity, description, code example and fix) and one randomly
        chosen submission to `task` that contains `defect`, or the text
        'No submissions found' when there is none.
    """
    # Function-scope import keeps the cell self-contained.
    from html import escape

    def as_code(value):
        # Source code routinely contains '<', '>' and '&'; unescaped they are parsed as
        # markup inside <pre> and the snippet renders truncated or garbled.
        return escape(str(value))

    task_row = items.loc[task]
    defect_row = defects.loc[defect]
    # `> 0` yields a boolean mask whether defect_log stores flags or counts.
    submissions = log[(log["item"] == task) & (defect_log[defect] > 0)]

    if len(submissions):
        # The example is picked with the global `random` state, so it changes between calls.
        example = submissions["answer"].iloc[random.randint(0, len(submissions) - 1)]
    else:
        example = 'No submissions found'

    return f"""
    <div style="display: flex; justify-content: space-between; gap: 20px;">
        <!-- Task Section -->
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{task_row["name"]}</h3>
            <div><strong>Instructions:</strong><br>{task_row["instructions"]}</div>
            <div><strong>Solution:</strong><br>
                <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{as_code(task_row["solution"])}</pre>
            </div>
        </div>
        
        <!-- Defect Section -->
        <div style="width: 48%; border: 1px solid #ccc; padding: 10px; border-radius: 5px;">
            <h3>{defect_row["defect name"]}</h3>
            <div><strong>Defect Type:</strong> {defect_row["defect type"]}</div>
            <div><strong>Severity:</strong> {defect_row["severity"]}</div>
            <div><strong>Description:</strong><br>{defect_row["description"]}</div>
            
            <div style="display: flex; justify-content: space-between; margin-top: 20px;">
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Example:</strong><br>
                    <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{as_code(defect_row["code example"])}</pre>
                </div>
                <div style="width: 48%; padding: 10px;">
                    <strong>Code Fix Example:</strong><br>
                    <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{as_code(defect_row["code fix example"])}</pre>
                </div>
            </div>
        </div>
    </div>
    
    <!-- Code Snippet Section -->
    <div style="border: 1px solid #ccc; padding: 10px; margin-top: 20px; border-radius: 5px;">
        <strong>Example Submission:</strong><br>
        <pre style="background-color: #2e2e2e; color: #f5f5f5; padding: 10px; border-radius: 5px; font-family: monospace;">{as_code(example)}</pre>
    </div>
    """

In [None]:
def task_defect_plot(matrix, title='', save=False, interactive=False, *args, **kwargs):
    """Draw a task x defect matrix as a heatmap (defects on the y axis, tasks on the x axis).

    Reads the notebook-level frames `items` and `defects` for the tick labels.

    Args:
        matrix: DataFrame indexed by task id with one column per defect id.
        title: Figure title (static mode only). When saving, the file name is derived
            from the part of the title before ' t=' so the threshold value is left out.
        save: If True, write the figure to 'images/<name>.png' at 300 dpi.
        interactive: If True, draw without tick labels, colorbar and title, and register
            a click handler that shows the task/defect card for the clicked cell.
        *args, **kwargs: Forwarded to `seaborn.heatmap`.

    Returns:
        The matplotlib Figure that was created.
    """
    defect_names = [defects['defect name'].loc[idx][:20] for idx in matrix.columns]
    task_names = [items['name'].loc[idx][:20] for idx in matrix.index]

    fig, ax = plt.subplots(figsize=(13, 7), layout="constrained")
    if interactive:
        sns.heatmap(matrix.T, cbar=False, *args, **kwargs)
    else:
        sns.heatmap(matrix.T, xticklabels=task_names, yticklabels=defect_names, cbar=True, *args, **kwargs)
        ax.tick_params(axis='x', labelsize=7)
        ax.tick_params(axis='y', labelsize=8)
        plt.title(title)
    plt.xlabel("")
    plt.ylabel("")

    if save:
        # Keep only the part before ' t='. split() returns the whole title when the marker
        # is absent, whereas slicing with find()'s -1 silently dropped the last character.
        file_stem = title.split(' t=')[0].lower().replace(' ', '_')
        plt.savefig('images/' + file_stem + '.png', dpi=300)

    if interactive:
        output_html = display(HTML("<b>Click a cell to see details</b>"), display_id=True)

        def on_click(event):
            # Ignore clicks outside the heatmap axes or without data coordinates.
            if event.inaxes != ax or event.xdata is None or event.ydata is None:
                return

            # Heatmap cells are unit squares, so truncation gives the cell position.
            x = int(event.xdata)
            y = int(event.ydata)

            if 0 <= x < len(task_names) and 0 <= y < len(defect_names):
                html = HTML(task_and_defect_description(matrix.index[x], matrix.columns[y]))
                output_html.update(html)

        fig.canvas.mpl_connect('button_press_event', on_click)

    return fig

## Reasonableness for the task

In [None]:
# Attach each submission's task id to its defect-presence flags (joined on the shared index),
# then average per task: frequencies[task, defect] = share of the task's submissions with the defect.
merged = log[['item']].merge(defect_log > 0, left_index=True, right_index=True)
frequencies = merged.groupby('item').mean()

In [None]:
# At 0.8 and below, harmless defects (augmentable assignment and so on) start to appear,
# so the cut-off for "anomalously common" is set higher.
upper_limit = 0.9

# Boolean heatmap of the pairs whose frequency exceeds the limit; save=True also writes a PNG under images/.
fig = task_defect_plot(frequencies > upper_limit, title=f"Anomalously common task-defect pairs for threshold t={upper_limit}", interactive=False, save=True)

In [None]:
# NOTE(review): numpy and pyplot are already imported in the first cell; these re-imports are redundant.
import numpy as np
import matplotlib.pyplot as plt

# Candidate minimum frequencies for calling a task-defect pair "reasonable".
thresholds = [0.01, 0.02, 0.03, 0.04]
defect_names = defects["defect name"]

# For every threshold, count per defect how many tasks reach that frequency.
# Note: the loop variable `threshold` stays defined (0.04) after this cell.
all_vals = []
for threshold in thresholds:
    reasonable = frequencies >= threshold
    vals = reasonable.sum(axis=0)
    all_vals.append(vals)

# One row per defect, one column per threshold.
stack_data = pd.concat(all_vals, axis=1).fillna(0)
stack_data.columns = [f"t={t:.2f}" for t in thresholds]

# Order defects by median count across thresholds, nudged by 10% of the maximum.
stack_data = stack_data.loc[(stack_data.median(axis=1) + 0.1 * stack_data.max(axis=1)).sort_values(ascending=False).index]

fig, ax = plt.subplots(figsize=(12, 7), layout='constrained')
x = np.arange(stack_data.shape[0])
bar_width = 0.2
n_thresholds = len(thresholds)

# Grouped bars: one bar per threshold, centred around each defect's tick.
for i, col in enumerate(stack_data.columns):
    offset = (i - n_thresholds / 2) * bar_width + bar_width / 2
    ax.bar(x + offset, stack_data[col], width=bar_width, label=col)


# Tick labels are defect names truncated to 20 characters.
ax.set_xticks(x, labels=[defect_names.loc[idx][:20] for idx in stack_data.index], rotation=90)
ax.set_xlabel('Defect')
ax.set_ylabel('Number of Reasonable Tasks')
ax.set_title('Number of Reasonable Task-Defect Pairs as Threshold Decreases')
ax.legend(title='Threshold')

plt.savefig('images/number_of_reasonable_task-defect_pairs_as_threshold_decreases.png', dpi=300)
plt.show()


In [None]:
# Minimum share of a task's submissions that must contain a defect for the pair to count.
reasonable_threshold = 0.02
# A pair is "reasonable" when it is neither too rare nor anomalously common. The upper bound
# reuses `upper_limit` from the anomaly plot instead of repeating the literal 0.9, so the two
# cannot drift apart when the limit is tuned.
reasonable = (frequencies >= reasonable_threshold) & (frequencies < upper_limit)

In [None]:
# Static heatmap of the reasonable pairs; save=True also writes a PNG under images/.
# `.2f` prints the threshold as 0.02 (the previous spec `1f` meant "width 1" and rendered 0.020000).
fig = task_defect_plot(reasonable, title=f"Reasonable Task-Defect Pairs for threshold t={reasonable_threshold:.2f}", interactive=False, save=True)

In [None]:
# Interactive version of the same heatmap. The title must report `reasonable_threshold`, the
# value `reasonable` was built with; `threshold` is a leftover loop variable from an earlier cell.
fig = task_defect_plot(reasonable, title=f"Reasonable Task-Defect Pairs for threshold t={reasonable_threshold:.2f}", interactive=True)

In [None]:
# Close the figure currently held in `fig` (the interactive heatmap from the previous cell).
plt.close(fig)

## Characteristic for the task

In [None]:
# Per-task deviation of each defect's rate from its rate over all submissions,
# expressed in units of the defect's standard deviation over all submissions.
z_score = (defect_log.groupby(log['item']).mean() - defect_log.mean()) / defect_log.std()

In [None]:
# Keep z-scores only for reasonable pairs; every other cell becomes NaN.
# `.where` builds a new frame instead of comparing a boolean frame with `== False`
# and mutating `z_score` in place.
z_score = z_score.where(reasonable)

In [None]:
# Static heatmap of the masked z-scores; save=True also writes a PNG under images/.
fig = task_defect_plot(z_score, title=f"Task-Defect Pair Z-Scores", interactive=False, save=True)

In [None]:
# Flatten the matrix to one value per (task, defect) pair and drop the masked (NaN) pairs.
reasonable_z_scores = z_score.stack().dropna()
# Pairs above the 80th percentile of z-scores are later treated as characteristic.
quantile = 0.8
# Note: this rebinds the notebook-level name `threshold` to a z-score cut-off.
threshold = reasonable_z_scores.quantile(quantile)

plt.figure(figsize=(10, 6))

plt.hist(reasonable_z_scores, bins=100, color='skyblue', edgecolor='black')
# Mark the percentile cut-off on the histogram.
plt.axvline(x=threshold, color='red', linestyle='--', label=f'{int(quantile * 100)}-percentile Threshold (= {round(threshold, 2)})')

plt.title('Histogram of Z-Scores for Reasonable Task-Defect Pairs')
plt.xlabel('Z-Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)

plt.savefig('images/histogram_of_z-scores_for_reasonable_task-defect_pairs.png', dpi=300)
plt.show()

In [None]:
# 1.0 where the pair's z-score exceeds the percentile cut-off, 0.0 where it does not,
# NaN for pairs that are not reasonable. Casting to float up front avoids writing NaN
# into an integer frame, and `.where` replaces the `== False` in-place masking.
# (The misspelled name is kept because the plotting cell below reads it.)
chrarcteristic = (z_score > threshold).astype(float).where(reasonable)

In [None]:
# Two-colour heatmap (blue = 0, red = 1) of the characteristic flags; masked pairs are NaN and left blank.
# save=True also writes a PNG under images/.
fig = task_defect_plot(chrarcteristic, title=f"Characteristic Defects for Threshold t={threshold}", interactive=False, save=True, cmap=ListedColormap(['blue', 'red']))

In [None]:
# Interactive version of the z-score heatmap: clicking a cell shows the task/defect details.
fig = task_defect_plot(z_score, title=f"Task-Defect Pair Z-Scores", interactive=True)

In [None]:
# Close the figure currently held in `fig` (the interactive heatmap from the previous cell).
plt.close(fig)

In [None]:
# Topic of the task each submission belongs to (looked up through the task table).
log_by_topic = log.merge(items, left_on='item', right_index=True)['topic']
# Share of each topic's submissions that contain each defect.
defect_frequencies_by_topic = defect_log.groupby(log_by_topic).mean()

# Same standardisation as the task-level z-score, per topic. The per-topic means are
# reused from above instead of running the identical groupby a second time.
topic_z_score = (defect_frequencies_by_topic - defect_log.mean()) / defect_log.std()

In [None]:
# Blank out topic-defect pairs whose frequency within the topic is below 1%.
topic_z_score[defect_frequencies_by_topic < 0.01] = np.nan

In [None]:
# Diverging heatmap of the topic-level z-scores, colour scale clipped to [-2, 2];
# y tick labels are defect names truncated to 20 characters.
plt.figure(figsize=(13, 10), layout="constrained")
sns.heatmap(topic_z_score.T, vmin=-2, vmax=2, yticklabels=[defects['defect name'].loc[idx][:20] for idx in topic_z_score.columns], cmap="vlag", cbar=True)
plt.ylabel("")
plt.xlabel("")
plt.title("Topic-Level Defect Anomalies (Z-scores)")

plt.show()

In [None]:
# Ad-hoc lookup: tasks whose name contains 'Velké'.
items[items['name'].str.contains('Velké')]

## Time Spent on Task

## Associated with Poor Performance Locally

In [None]:
# Reload with only_correct=False so the log carries a 'correct' column to correlate against.
# NOTE(review): presumably this includes incorrect submissions — confirm against the loader.
_, not_log, _, not_defect_log, _ = loader.load(ipython_path, data_path, only_correct=False)

# One row per submission: task id, correctness flag and the defect columns.
df = not_log[['item', 'correct']].merge(not_defect_log, left_index=True, right_index=True)

correlations = {}

# for each item
for task_id, task_df in df.groupby('item'):
    # Failure indicator; it does not depend on the defect, so it is computed once per task.
    # The bool cast makes `~` a logical negation even if 'correct' is stored as 0/1.
    incorrect = ~task_df['correct'].astype(bool)
    corr_dict = {}
    # for each defect (columns of the frame that was merged into df)
    for defect in not_defect_log.columns:
        defect_presence = task_df[defect]
        # The correlation is undefined when either vector is constant.
        if defect_presence.nunique() > 1 and incorrect.nunique() > 1:
            corr, _ = pointbiserialr(defect_presence, incorrect)
            corr_dict[defect] = corr
        else:
            corr_dict[defect] = np.nan

    correlations[task_id] = corr_dict

# construct df: rows are tasks, columns are defects
performance = pd.DataFrame.from_dict(correlations, orient='index')
# Keep only reasonable pairs. The mask is reindexed to `performance` first, so tasks or
# defects missing from `reasonable` are treated as not reasonable instead of breaking the masking.
reasonable_mask = reasonable.reindex(index=performance.index, columns=performance.columns, fill_value=False)
performance = performance.where(reasonable_mask)


In [None]:
# Debugging leftover: negated correctness flags for `task_df`, the last group left over from the loop above.
~task_df['correct']

In [None]:
# Debugging leftover: raw correctness flags for the same last group.
task_df['correct']

In [None]:
# Static heatmap of the defect-failure correlations; save=True also writes a PNG under images/.
fig = task_defect_plot(performance, title="Defect-Failure Correlation", interactive=False, save=True)

In [None]:
# One correlation value per reasonable (task, defect) pair.
performances = performance.stack().dropna()

plt.figure(figsize=(10, 6))

plt.hist(performances, bins=100, color='skyblue', edgecolor='black')
plt.title('Histogram of Task-Defect Pair Correlations with Failure')
# The plotted values are correlation coefficients, not z-scores.
plt.xlabel('Correlation')
plt.ylabel('Frequency')
plt.grid(True)

plt.savefig('images/histogram_of_task-defect_pair_correlations_with_failure.png', dpi=300)
plt.show()

In [None]:
# Fixed correlation cut-off; the commented alternative would derive it from a quantile instead.
failure_threshold = 0.1 # performances.quantile(quantile)

# Boolean heatmap of pairs whose correlation with failure exceeds the cut-off (NaN compares as False).
fig = task_defect_plot(performance > failure_threshold, title=f"High failure rates t={failure_threshold:.2f}", interactive=False, save=True)

In [None]:
# Interactive version of the same heatmap: clicking a cell shows the task/defect details.
fig = task_defect_plot(performance > failure_threshold, title=f"High failure rates t={failure_threshold:.2f}", interactive=True)

In [None]:
# Close the figure currently held in `fig` (the interactive heatmap from the previous cell).
plt.close(fig)

There are some interesting takeaways (e.g. for *long function* and *unused variable*), but overall this analysis does not produce any meaningful results.

## Defect multiplicity

In [None]:
# Reload with only_presence=False.
# NOTE(review): presumably this yields per-submission occurrence counts rather than flags — confirm against the loader.
_, _, _, multiplicity_log, _ = loader.load(ipython_path, data_path, only_presence=False)

# Every submission in the multiplicity matrix must also exist in the main log.
assert multiplicity_log.index.difference(log.index).empty

# suppress outliers: cap every count at 10 (clip returns a new frame instead of
# assigning through a boolean mask)
multiplicity_log = multiplicity_log.clip(upper=10)

In [None]:
# Share of all submissions in which each defect occurs more than once, ascending.
multiplicity = (multiplicity_log > 1).mean().sort_values()

plt.figure(figsize=(14, 4), layout="constrained")

ticks = np.arange(len(multiplicity))

# plt.bar with explicit positions matches the other bar charts in this notebook and draws
# one bar per defect regardless of how seaborn interprets a bare array.
plt.bar(ticks, multiplicity.values)
# Title and axis label describe what is plotted: a share of submissions, not a mean count.
plt.title("Share of Submissions with Repeated Defect Occurrences per Defect")
plt.xlabel("Defect")
plt.ylabel("Share of Submissions with >1 Occurrence")
plt.xticks(ticks, [defects['defect name'].loc[idx][:20] for idx in multiplicity.index], rotation=90)

plt.show()


In [None]:
# Work on a fixed random sample of 5000 submissions (seeded for reproducibility).
# Note: this rebinds `multiplicity`, which held the per-defect series in the previous cell.
multiplicity = multiplicity_log.sample(n=5000, random_state=42)

# Long format: one row per (submission, defect) with its count; keep only defects that occur.
multiplicity = multiplicity.melt(var_name='Defect', value_name='Count')
multiplicity = multiplicity[multiplicity['Count'] > 0]

# Human-readable label: defect name truncated to 20 characters.
multiplicity['Defect Name'] = multiplicity['Defect'].map(lambda x: defects['defect name'].loc[x][:20])

plt.figure(figsize=(14, 6), layout="constrained")
sns.boxplot(data=multiplicity, x='Defect Name', y='Count')
plt.xticks(rotation=90)
plt.title("Box Plot of Defect Multiplicity per Submission")
plt.ylabel("Multiplicity (Count per Submission)")
plt.xlabel("Defect")
plt.show()


## Recency

In [None]:
# Builds `recency_log`: for every submission and defect, the number of the user's submissions
# since the defect was last seen (0 = first time for this user, NaN = defect not present).
recency_log = []

# prepare in advance to make the computation faster
df = log.merge(defect_log, left_index=True, right_index=True)
df = df.sort_values(by=['user', 'time'])

# for each user
for user_id, history in tqdm(df.groupby('user')):
    # position (within this user's history) at which each defect was last seen
    last_seen = {defect: None for defect in defect_log.columns}

    # i is the position of the submission in the user's history, idx its index label
    for i, (idx, row) in enumerate(history.iterrows()):
        recency_row = {}
        for defect in defect_log.columns:
            if row[defect] == 1:

                if last_seen[defect] is None:
                    # first occurrence for this user
                    recency_row[defect] = 0
                else:
                    # submissions elapsed since the previous occurrence (always >= 1)
                    recency_row[defect] = i - last_seen[defect]

                last_seen[defect] = i
            else:
                recency_row[defect] = np.nan

        recency_row['submission id'] = idx
        recency_log.append(recency_row)

# create dataframe (the list of row dicts is replaced by a frame indexed by submission id)
recency_log = pd.DataFrame(recency_log).set_index('submission id')
# NOTE(review): set_index already names the index 'submission id'; this line looks redundant.
recency_log.index.name = 'submission id'

In [None]:
# Among the submissions where a defect is present (non-NaN recency), the share in which it is
# the user's first occurrence (recency 0). Sorted ascending; later plots reuse this order.
first_time_rate = (recency_log == 0)[~recency_log.isna()].mean().sort_values(ascending=True)

plt.figure(figsize=(12, 6), layout="constrained")

ticks = np.arange(len(first_time_rate))

plt.bar(ticks, first_time_rate)
plt.title("Percentage of First-Time Occurances per Defect")
plt.ylabel("First-Time Rate")
plt.xlabel("Defect")
# Tick labels are defect names truncated to 20 characters.
plt.xticks(ticks, [defects['defect name'][idx][:20] for idx in first_time_rate.index], rotation=90)
plt.grid(axis='y')

plt.savefig('images/percentage_of_first_time_occurances_per_defect.png', dpi=300)
plt.show()


In [None]:
# bins: right-closed intervals (0, 1], (1, 2], (2, 4], ... so first-time occurrences
# (recency 0) fall outside every bin and are excluded from this plot
bins = [0, 1, 2, 4, 9, 14, 19, 24, 29, np.inf]
bin_labels = ['1', '2', '3-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30+']
recency = recency_log.apply(lambda col: pd.cut(col, bins=bins, labels=bin_labels))

# histogram: count per bin and defect, with rows forced into bin order
recency = recency.apply(lambda col: col.value_counts()).reindex(bin_labels).fillna(0).astype(int)

# scaling: each defect's column sums to 1
recency = recency.div(recency.sum(axis=0), axis=1)

# sort as the previous graph
recency = recency.loc[:, first_time_rate.index]

plt.figure(figsize=(12, 6), layout="constrained")
# Tick labels must follow the reordered columns; taking them from defect_log.columns
# attached the wrong defect name to most columns.
sns.heatmap(recency, cmap='viridis', cbar_kws={'label': 'frequency'}, xticklabels=[defects['defect name'][idx][:20] for idx in recency.columns])
# Recency is counted in submissions of the same user, not in sessions.
plt.title("Number of Submissions Before Defect Reoccurrence")
plt.xlabel("Defect")
plt.ylabel("Recency Bin")
plt.gca().invert_yaxis()

plt.savefig('images/heatmap_of_recency_bins_per_defect.png', dpi=300)
plt.show()

In [None]:
# Mean number of submissions between repeated occurrences; first-time occurrences
# (recency 0) are excluded by turning them into NaN.
# sort as the other graphs: the bars follow first_time_rate's order, so bars and tick labels
# refer to the same defect (sorting by value while labelling by first_time_rate.index
# mislabelled the bars).
mean_recency = recency_log.replace(0, np.nan).mean().loc[first_time_rate.index]

plt.figure(figsize=(12, 6), layout="constrained")

ticks = np.arange(len(mean_recency))

plt.bar(ticks, mean_recency)
plt.title("Average Recency (# of Submissions Since Last Seen) per Defect")
plt.ylabel("Average Recency")
plt.xlabel("Defect")
plt.xticks(ticks, [defects['defect name'][idx][:20] for idx in mean_recency.index], rotation=90)
plt.grid(axis='y')

plt.savefig('images/average_recency_per_defect.png', dpi=300)
plt.show()


In [None]:
# Mean recency per task and defect (NaN entries are skipped by mean); displayed for inspection only.
recency_log.groupby(log['item']).mean()

In [None]:
# Display the full recency matrix for inspection.
recency_log

## Currently Taught Topic

Manually set for topics, or by frequency in student submissions.

In [None]:
def add_concept_to_defects(keyword: str, tag: str):
    """Tag every defect whose example code mentions `keyword` with `tag`.

    Mutates the notebook-level `defects` frame: creates the 'concepts' column on first
    use (empty strings) and appends ``tag + ' '`` for each defect whose 'code example' or
    'code fix example' contains `keyword`. A defect that already carries `tag` is left
    unchanged, so repeated calls are idempotent.

    Args:
        keyword: Substring searched for in the two example columns.
        tag: Concept label appended to the space-separated 'concepts' string.
    """
    if 'concepts' not in defects.columns:
        defects['concepts'] = ''

    def mentions_keyword(code):
        # Missing examples (None/NaN) never match; the search uses `keyword`, which was
        # previously ignored in favour of a hard-coded 'if'.
        return isinstance(code, str) and keyword in code

    mask = defects['code fix example'].apply(mentions_keyword)
    mask |= defects['code example'].apply(mentions_keyword)
    # Compare whole tags, so an existing tag that merely contains `tag` does not block it.
    mask &= defects['concepts'].apply(lambda concepts: tag not in concepts.split())
    defects['concepts'] += mask.apply(lambda hit: tag + ' ' if hit else '')

In [None]:
# Arguments are (keyword searched in the example code, concept tag to attach).
add_concept_to_defects('if ', 'if')
add_concept_to_defects('for ', 'for')
add_concept_to_defects('while ', 'while')
# String literals are recognised by their quote characters and tagged 'string';
# these two calls previously had the keyword and the tag swapped.
add_concept_to_defects('\'', 'string')
add_concept_to_defects('"', 'string')

In [None]:
# Display the defect table, including the 'concepts' column added above.
defects

## Future Opportunity Likelihood

In [None]:
import pandas as pd
import numpy as np

def compute_future_opportunity(submission_id):
    """Estimate, per defect, how likely the user is to meet it in tasks not yet attempted.

    Reads the notebook-level frames `log` (indexed by submission id, with 'user', 'item'
    and 'time' columns) and `frequencies` (task x defect frequency matrix).

    Args:
        submission_id: Index label of the submission in `log`.

    Returns:
        dict: defect id -> weighted average of the defect's frequency over the tasks the
        user has not submitted to up to and including this submission. Tasks with an id
        greater than the current task weigh 2, all others 1. Every defect maps to 0.0
        when no task remains.
    """
    # Get submission row
    row = log.loc[submission_id]
    user, task, current_time = row['user'], row['item'], row['time']

    # Tasks completed up to this point. A set of the *values* is required: `x in series`
    # tests the index labels (submission ids), not the task ids.
    completed_up_to_now = set(
        log.loc[(log['user'] == user) & (log['time'] <= current_time), 'item']
    )

    # Define future tasks
    remaining_tasks = np.array(
        [t for t in sorted(frequencies.index) if t not in completed_up_to_now]
    )
    if remaining_tasks.size == 0:
        return {defect: 0.0 for defect in frequencies.columns}

    # Weight by task ID: tasks after the current one count double. An array (not a list)
    # makes the comparison element-wise.
    weights = np.where(remaining_tasks > task, 2, 1)

    # Get defect frequencies for those tasks
    future_freqs = frequencies.loc[remaining_tasks]

    # Weighted average: divide the weighted sum by the total weight so the result stays
    # on the same 0..1 scale as `frequencies`.
    weighted_avg = future_freqs.mul(weights, axis=0).sum(axis=0) / weights.sum()
    return weighted_avg.to_dict()


In [None]:
# Smoke test on an arbitrary submission. The function looks the submission up with
# log.loc[...], so it must receive an index label; no 'id' column of `log` is used
# anywhere else in this notebook.
compute_future_opportunity(log.index[100])

# filtering

In [None]:
def has_close_pair(row):
    """Check if two distinct entries of `row` differ by one or less.

    Args:
        row: One-dimensional numeric data (e.g. a pandas Series). NaN entries are ignored.

    Returns:
        bool: True if at least one pair of different positions holds values whose
        absolute difference is <= 1; False otherwise, including for fewer than two values.
    """
    values = np.asarray(row, dtype=float)
    values = np.sort(values[~np.isnan(values)])
    # After sorting, the closest pair is always adjacent, so neighbouring gaps suffice.
    # Comparing every value with every value (itself included) always found a difference
    # of 0 on the diagonal and therefore returned True for any input.
    return bool(np.any(np.diff(values) <= 1))

In [None]:
# at least two defects
# Keep submissions whose defect columns sum to more than one, i.e. at least two defects.
filtered = defect_log[defect_log.sum(axis=1) > 1]

In [None]:
# at most difference of one in severity
# at most difference of one in severity
# Severity of every defect that is present; absent defects become NaN rather than 0 so they
# can never form a "close pair" among themselves or with a severity-1 defect.
present = filtered > 0
severities = present.mul(defects.loc[filtered.columns, 'severity'], axis=1).where(present)
# `filtered` itself is only row-filtered (no in-place scaling), which keeps this cell idempotent.
filtered = filtered[severities.apply(has_close_pair, axis=1)]

In [None]:
# apply the filter
# Restrict both notebook-level frames to the submissions that passed the filter.
# Note: this overwrites `defect_log` and `log`; every later cell sees only the filtered rows.
defect_log = defect_log.loc[filtered.index]
log = log.loc[filtered.index]

# small sample

In [None]:
def generate_submission_html(submission_id, characteristics, threshold):
    """Render one submission and its detected defects as a dark-themed HTML card.

    Reads the notebook-level frames `log`, `items`, `defect_log` and `defects`.

    Args:
        submission_id: Index label of the submission in `log` and `defect_log`.
        characteristics: Task x defect DataFrame; the value for this submission's task is
            shown next to each detected defect, formatted to two decimals.
        threshold: Number shown in the header of the characteristic column.

    Returns:
        str: HTML with the task name, instructions and submitted code on the left and a
        table of the detected defects on the right.
    """
    # Function-scope import; the local name `html` below holds the result string.
    from html import escape

    # Fetch submission info
    sub = log.loc[submission_id]
    task_id = sub['item']
    code = sub['answer']

    task_name = items.loc[task_id, 'name']
    instructions = items.loc[task_id, 'instructions']

    # `> 0` selects present defects whether defect_log stores flags or counts.
    defect_flags = defect_log.loc[submission_id]
    present_defects = defect_flags[defect_flags > 0].index.tolist()

    defect_rows = [
        {
            "Defect": defects.loc[defect, "defect name"],
            "Description": defects.loc[defect, "description"],
            f"Characteristic (t={threshold:.2f})": f"{characteristics.loc[task_id, defect]:.2f}"
        }
        for defect in present_defects
    ]

    defect_df = pd.DataFrame(defect_rows)

    # The submitted code is escaped: '<', '>' and '&' in source code would otherwise be
    # parsed as markup inside <pre> and corrupt the rendering.
    html = f"""
    <div style="background-color: #121212; color: #f0f0f0; font-family: 'Segoe UI', sans-serif; padding: 20px;">
        <div style="text-align: left;">
            <table style="width: 90%; margin-left: auto; border-collapse: collapse; background-color: #1e1e1e; border: 1px solid #444;">
                <tr>
                    <td style="vertical-align: top; width: 50%; border-right: 1px solid #333; padding: 20px; text-align: left;">
                        <h2 style="color: #ffffff;">{task_name}</h2>
                        <p><strong>Instructions:</strong><br>{instructions}</p>
                        <div style="background-color: #2b2b2b; color: #dcdcdc; padding: 15px; border-radius: 5px; overflow-x: auto; text-align: left;">
                            <pre style="margin: 0; white-space: pre-wrap;">{escape(str(code))}</pre>
                        </div>
                    </td>
                    <td style="vertical-align: top; width: 50%; padding: 20px;">
                        <h2 style="color: #ffffff;">Detected Defects</h2>
                        {defect_df.to_html(index=False, escape=False, border=0, justify='left', classes='defect-table')}
                    </td>
                </tr>
            </table>
        </div>
    </div>
    """
    return html


In [None]:
# Identify task-defect pairs whose frequency is at least `threshold`.
# NOTE(review): the score used here is `frequencies`, not a z-score, although the record
# key below is still called 'z_score'. `frequencies` was computed before `log` was filtered.
# This also rebinds the notebook-level names `characteristic` and `threshold`.
characteristic = frequencies
threshold = 0.2
flagged = (characteristic >= threshold)

qualified = []

# find all submission ids for relevant task-defect pairs
for task, row in flagged.iterrows():
    for defect, is_significant in row.items():
        if is_significant:
            # submissions to this task that contain this defect
            submission_ids = log[(log['item'] == task) & (defect_log[defect])].index
            for sid in submission_ids:
                qualified.append({
                    'submission_id': sid,
                    'task': task,
                    'defect': defect,
                    'z_score': characteristic.loc[task, defect]
                })

# one row per (submission, flagged defect)
qualified_df = pd.DataFrame(qualified)

# sample 10 unique submissions (seeded; needs at least 10 distinct qualifying submissions)
sampled_df = qualified_df.drop_duplicates(subset='submission_id').sample(n=10, random_state=42)


In [None]:
# Display the ten sampled submissions.
sampled_df

In [None]:
# Render the third sampled submission as an HTML card.
display(HTML(generate_submission_html(sampled_df['submission_id'].iloc[2], characteristic, threshold)))

# random sample

In [None]:
def random_sample(sample_size=20, random_state=42):
    """Draw submission ids from `log` so that every task is equally likely overall.

    Each submission is weighted by the inverse of the number of submissions its task
    has, which gives all tasks the same total weight.

    Args:
        sample_size: Number of submissions to draw.
        random_state: Seed forwarded to `DataFrame.sample`.

    Returns:
        Index of the sampled rows of `log`.
    """
    submissions_per_task = log['item'].value_counts()
    inverse_task_size = 1 / log['item'].map(submissions_per_task).to_numpy()
    picked = log.sample(n=sample_size, weights=inverse_task_size, random_state=random_state)
    return picked.index

In [None]:
def create_export_dataframes(indexes):
    """Create submission and defect dataframes for export from given indexes.

    Reads the notebook-level frames `log`, `items`, `defect_log` and `defects`.

    Args:
        indexes: Iterable of submission ids (index labels of `log` / `defect_log`).

    Returns:
        tuple: ``(submission_df, defect_df)``. `submission_df` is indexed by `indexes` and
        holds the submitted code, task name and instructions. `defect_df` has one row per
        (submission, present defect) with the defect's metadata and 'last encountered':
        the number of the user's submissions since the defect previously occurred, or 0
        when this is its first occurrence for the user (same convention as `recency_log`).
    """
    submission_rows = []
    defect_rows = []

    for idx in indexes:
        row = log.loc[idx]
        task = items.loc[row['item']]
        submission_rows.append({
            'submission': row['answer'],
            'task name': task['name'],
            'instructions': task['instructions']
        })

        # The user's submissions up to and including this one, oldest first. A stable sort
        # keeps the original order of submissions that share a timestamp.
        so_far = log[(log['user'] == row['user']) & (log['time'] <= row['time'])]
        history_ids = so_far.sort_values(by='time', kind='stable').index
        defect_history = defect_log.loc[history_ids].astype(bool)
        # Position of the current submission within that history.
        current_position = int(np.flatnonzero(history_ids == idx)[0])

        present = defect_log.loc[idx]
        for defect in present[present > 0].index:
            defect_info = defects.loc[defect]
            # Positions of earlier submissions that contain the defect. The previous formula
            # subtracted the cumulative occurrence count from the position, which is not a
            # distance to the last occurrence.
            earlier = np.flatnonzero(defect_history[defect].to_numpy()[:current_position])
            last_encountered = int(current_position - earlier[-1]) if earlier.size else 0
            defect_rows.append({
                'submission id': idx,
                'defect id': defect,
                'severity': defect_info['severity'],
                'name': defect_info['defect name'],
                'description': defect_info['description'],
                'code example': defect_info['code example'],
                'code fix example': defect_info['code fix example'],
                'last encountered': last_encountered
            })

    return pd.DataFrame(submission_rows, index=indexes), pd.DataFrame(defect_rows)

In [None]:
# Build the export frames for a seeded sample of 20 submissions with at least two defects.
# NOTE(review): this repeats the logic of create_export_dataframes inline, but without the
# 'last encountered' column. The loop variables `defect_row`, `defect_history` and `defect`
# stay defined afterwards and are read by the "student history" cell at the end of the notebook.
submission_df = []
submission_index = []
defect_df = []
for idx, row in log[defect_log.sum(axis=1) >= 2].sample(20, random_state=42).iterrows():
    submission_index.append(idx)
    submission_row = {}
    submission_row['submission'] = row['answer']
    submission_row['task name'] = items.loc[row['item']]['name']
    submission_row['instructions'] = items.loc[row['item']]['instructions']
    submission_df.append(submission_row)

    # previously made defects: the user's defect flags up to and including this submission, oldest first
    defect_history = defect_log.loc[
        log[(log['user'] == row['user']) & (log['time'] <= row['time'])].sort_values(by='time').index
    ].reset_index(drop=True).astype(bool)

    # one export row per defect present in this submission
    for defect in defect_log.loc[idx][defect_log.loc[idx] > 0].index:
        defect_row = {}
        defect_row['submission id'] = idx
        defect_row['defect id'] = defect
        defect_row['severity'] = defects.loc[defect]['severity']
        defect_row['name'] = defects.loc[defect]['defect name']
        defect_row['description'] = defects.loc[defect]['description']
        defect_row['code example'] = defects.loc[defect]['code example']
        defect_row['code fix example'] = defects.loc[defect]['code fix example']
        defect_df.append(defect_row)

# the lists of row dicts are replaced by DataFrames under the same names
submission_df = pd.DataFrame(submission_df, index=submission_index)
defect_df = pd.DataFrame(defect_df)

In [None]:
# Preview the exported submissions.
submission_df.head()

In [None]:
# Preview the exported defect rows.
defect_df.head()

## filtering

Look for uninformative entries that might pollute the survey pool.

In [None]:
# empty or overly long submissions
lengths = submission_df['submission'].apply(len).sort_values(ascending=False)
plt.figure(figsize=figsize, layout='constrained')
plt.plot(range(len(lengths)), lengths)

plt.xticks(range(len(lengths)), lengths.index, rotation=90)
plt.show()

In [None]:
# Print the submissions longer than 500 or shorter than 100 characters for manual review.
for idx in lengths[(lengths > 500) | (lengths < 100)].index:
    print(idx, submission_df.loc[idx]['submission'])
    print('=' * 50)

In [None]:
# duplicities - tasks: print the submissions of every task that appears more than once in the sample
# Note: this rebinds the notebook-level name `task_names`.
task_names = submission_df['task name'][submission_df['task name'].duplicated(keep=False)].unique()
for name in task_names:
    for idx in submission_df[submission_df['task name'] == name].index:
        print(idx, submission_df.loc[idx]['submission'])
    print('=' * 50)

In [None]:
# duplicities - defect sets
# Submissions are grouped by the exact set of defects they contain, so pairs, triples and
# larger combinations are handled alike. frozenset is used from the start because plain
# sets are not hashable and cannot be compared by `duplicated`.
ids_sets = defect_df.groupby('submission id')['defect id'].apply(frozenset)
duplicates = ids_sets[ids_sets.duplicated(keep=False)]

# defect set -> ids of the submissions sharing it
submissions_by_defect_set = {}
for submission_id, defect_set in duplicates.items():
    submissions_by_defect_set.setdefault(defect_set, []).append(submission_id)

# Each shared defect set is printed once (not once per submission that has it).
for duplicate, submission_ids in submissions_by_defect_set.items():
    for idx in submission_ids:
        print(idx, submission_df.loc[idx]['submission'])
    print(set(duplicate))
    print('=' * 50)

In [None]:
# Export is switched off; change the condition to True to write both frames as
# semicolon-separated CSV files under data/export.
if False:   
    defect_df.to_csv(data_path / 'export' / 'defects.csv', sep=';', index_label='index')
    submission_df.to_csv(data_path / 'export' / 'submissions.csv', sep=';', index_label='index')

# additional filtering

In [None]:
def has_close_pair(row):
    """Check if two distinct entries of `row` differ by one or less.

    NOTE(review): this re-defines the function of the same name from the earlier
    "filtering" section; one definition would be enough.

    Args:
        row: One-dimensional numeric data (e.g. a pandas Series). NaN entries are ignored.

    Returns:
        bool: True if at least one pair of different positions holds values whose
        absolute difference is <= 1; False otherwise, including for fewer than two values.
    """
    values = np.asarray(row, dtype=float)
    values = np.sort(values[~np.isnan(values)])
    # After sorting, the closest pair is always adjacent, so neighbouring gaps suffice.
    # Comparing every value with every value (itself included) always found a difference
    # of 0 on the diagonal and therefore returned True for any input.
    return bool(np.any(np.diff(values) <= 1))

In [None]:
# at most difference of one in severity
# at most difference of one in severity
# Presence is re-derived from `filtered` (> 0), so the result does not depend on whether an
# earlier cell already scaled `filtered` by severity; multiplying in place a second time
# would square the severities. Absent defects become NaN so they never form a close pair.
present = filtered > 0
severities = present.mul(defects.loc[filtered.columns, 'severity'], axis=1).where(present)
filtered = filtered[severities.apply(has_close_pair, axis=1)]

In [None]:
# apply the filter
# Restrict both notebook-level frames to the submissions that passed the filter.
# Note: this overwrites `defect_log` and `log`; every later cell sees only the filtered rows.
defect_log = defect_log.loc[filtered.index]
log = log.loc[filtered.index]

# task history

## is the task-defect "possible"?

Minimum threshold

In [None]:
def threshold_by_frequency(threshold: float = 0.05):
    """Flag the task-defect pairs that are common enough.

    Joins the notebook-level `log` and `defect_log` on their index, averages the defect
    columns per task and compares the result with `threshold`.

    Args:
        threshold: Minimum mean defect value per task for a pair to be kept.

    Returns:
        Boolean DataFrame (tasks x defects), True where the per-task mean is >= `threshold`.
    """
    per_task_frequency = (
        pd.merge(log[['item']], defect_log, left_index=True, right_index=True)
        .groupby('item')
        .mean()
    )
    return per_task_frequency.ge(threshold)


In [None]:
# Minimum per-task frequency for a pair to be kept (rebinds the notebook-level `threshold`).
threshold = 0.05

kept_pairs = threshold_by_frequency(threshold)

# Full (untruncated) names for the tick labels; rebinds `defect_names` and `task_names`.
defect_names = [defects['defect name'].loc[idx] for idx in kept_pairs.columns]
task_names = [items['name'].loc[idx] for idx in kept_pairs.index]

# Boolean heatmap: defects on the y axis, tasks on the x axis.
plt.figure(figsize=(18, 10), layout="constrained")
sns.heatmap(kept_pairs.T, xticklabels=task_names, yticklabels=defect_names, cbar=True)
plt.title("Sufficiently common task-defect pairs for threshold t={}".format(threshold))
plt.xlabel("")
plt.ylabel("")

# plt.savefig('defect_anomalies.png', dpi=300)
plt.show()

Threshold vs percent of allowed task-defect pairs

In [None]:
# Sweep the threshold over 0.00, 0.01, ..., 0.99 and record the fraction of pairs kept.
steps = 100
thresholds = [step / steps for step in range(steps)]

# The per-task frequencies do not depend on the threshold, so they are computed once
# instead of repeating the merge and groupby on every iteration.
pair_frequencies = log[['item']].merge(defect_log, left_index=True, right_index=True).groupby('item').mean()

remaining_fractions = []
for threshold in thresholds:
    kept_pairs = pair_frequencies >= threshold
    remaining_fractions.append(kept_pairs.values.mean())

In [None]:
# Fraction of task-defect pairs that survive each threshold.
fig, ax = plt.subplots(layout="constrained", figsize=figsize)

ax.plot(thresholds, remaining_fractions, color='red', linestyle='-', marker='o', label='Cumulative')

ax.set_xlabel('Threshold')
ax.set_ylabel('Fraction left')
ax.set_title('Threshold vs Fraction of Kept Task-Defect Pairs')
ax.grid(True)
ax.legend()

plt.show()

In [None]:
# Fraction of pairs lost when the threshold is raised by one step (difference of neighbours).
incremental_fractions = [remaining_fractions[i] - remaining_fractions[i + 1] for i in range(len(remaining_fractions) - 1)]
start = 2 # for better zoom: skip the first steps

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

# thresholds[start:-1] and incremental_fractions[start:] have the same length.
ax.plot(thresholds[start:-1], incremental_fractions[start:], color='blue', linestyle='-', marker='o', label='Incremental')

ax.set_xlabel('Threshold')
# This plot shows per-step losses, so it no longer carries the cumulative plot's labels.
ax.set_ylabel('Fraction dropped per step')
ax.set_title('Threshold vs Fraction of Task-Defect Pairs Dropped per Step')
ax.grid(True)
ax.legend()

plt.show()

percent of allowed submissions vs threshold

individually for tasks

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue into the
# unfinished sections below — confirm, and replace with an explanatory message if so.
raise RuntimeError

## characteristic defect

Task-specific defect Z-scores

In [None]:
# Task-level z-scores: per-task defect rate minus the overall rate, divided by the overall std.
common_defects = (defect_log.groupby(log["item"]).mean() - defect_log.mean(axis=0)) / defect_log.std(axis=0)
# Show only the 30 tasks whose z-scores vary most across defects.
highest_variance_tasks = common_defects.var(axis=1).sort_values(ascending=False)[:30].index
# Rebinds the notebook-level `defect_names` and `task_names`.
defect_names = [defects['defect name'].loc[idx] for idx in defect_log.columns]
task_names = [items['name'].loc[idx] for idx in highest_variance_tasks]

plt.figure(figsize=figsize, layout="constrained")
# Symmetric colour scale around zero, sized to the largest absolute z-score over all tasks.
biggest_value = common_defects.abs().values.max()
sns.heatmap(common_defects.loc[highest_variance_tasks].T, xticklabels=task_names, yticklabels=defect_names, cmap="vlag", cbar=True, vmin=-biggest_value, vmax=biggest_value)
plt.title("Task-Level Defect Anomalies (Z-scores, Tasks With Highest Variance)")
plt.xlabel("")
plt.ylabel("")

# plt.savefig('defect_anomalies.png', dpi=300)
plt.show()

Issues in task templates removed (handled differently)

In [None]:
# Lint every task's template code.
# NOTE(review): generate_linter_messages is assumed to return an iterable of
# (code, message) pairs per task, as the unpacking below requires — confirm.
democode_messages = items["democode"].apply(generate_linter_messages)
# One row per (task, linter code). Naming the columns up front keeps the frame usable
# even when no template produces a message.
democode_messages = pd.DataFrame(
    [
        {'item': idx, 'defect': code}
        for idx, code_message_list in democode_messages.items()
        for code, _ in code_message_list
    ],
    columns=['item', 'defect'],
)
# Keep only linter codes that map to a known defect and translate them to defect ids.
# `assign` on the filtered frame avoids writing into a slice (SettingWithCopyWarning),
# and `map` is the direct lookup for a key -> value mapping.
known_codes = democode_messages["defect"].isin(code_to_defect_id.keys())
democode_messages = democode_messages[known_codes].assign(
    defect=lambda frame: frame["defect"].map(code_to_defect_id).astype(int)
)

Possibly other anomalies? TODO

"Too common" defects are signals that something is off - Setting the threshold - binary map at different levels

Feature - task-defect Z-score

Filtering for good representatives - big difference in scores

In [None]:
# Per-task mean of every defect column, and the number of non-null values per task and defect.
task_profiles = defect_log.groupby(log['item']).mean()
task_counts = defect_log.groupby(log['item']).count()

In [None]:
# Display the task id of every submission for inspection.
log['item']

In [None]:
# Display the submission log for inspection.
log

## currently taught topic

Manually label

## connected to poor performance

# student history

In [None]:
# number of submissions since last encountered
# number of submissions since last encountered
# NOTE(review): scratch line — `defect_row`, `defect_history` and `defect` are not defined in
# this cell; it only runs on values left over from a loop in an earlier cell. The expression
# subtracts the cumulative occurrence count (at the last occurrence) from the row position,
# which does not look like a distance to the previous occurrence — confirm the intent.
defect_row['last encountered'] = (defect_history.index - defect_history[defect].cumsum().where(defect_history[defect]).ffill()).iloc[-1]

## recently repeated

## often repeated

## many times repeated

## inside an already complex expression