# loading data

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
from ipywidgets import Output
from IPython.display import display, HTML

import src.ipython_loader as loader

from src.code_processing import parse_code_string, generate_linter_messages

# Plot defaults shared by the figures in this notebook.
figsize = (10, 7)
resolution = 300  # dpi

# Input locations, relative to the working directory.
data_path = Path('data')
ipython_path = data_path.joinpath('ipython_new')

# The project loader returns four objects; they are used as pandas tables
# throughout the rest of the notebook.
items, log, defects, defect_log = loader.load(ipython_path, data_path)

In [None]:
# 

In [None]:
# Per-item defect profile: the mean of every defect_log column over the
# submissions that belong to the same item.
# TODO: experiment with aggregations other than the mean.
item_profiles = defect_log.groupby(by=log['item']).mean()

# random sample

In [None]:
# Draw a reproducible sample of 20 submissions whose defect_log row sums to at
# least two and build two tables from it:
#   submission_df - one row per sampled submission (code, task name, instructions)
#   defect_df     - one row per (submission, defect) pair with defect metadata
submission_rows = []
submission_index = []
defect_rows = []
sampled = log[defect_log.sum(axis=1) >= 2].sample(20, random_state=42)
for idx, row in sampled.iterrows():
    item = items.loc[row['item']]
    submission_index.append(idx)
    submission_rows.append({
        'submission': row['answer'],
        'task name': item['name'],
        'instructions': item['instructions'],
    })

    # Defects in the same user's other submissions made up to this point in
    # time, oldest first.  The sampled submission itself is dropped: it always
    # contains the defects listed below and would otherwise count as their
    # own "last encounter".
    earlier = log[(log['user'] == row['user']) & (log['time'] <= row['time'])]
    earlier = earlier.drop(index=idx).sort_values(by='time')
    defect_history = defect_log.loc[earlier.index].reset_index(drop=True).astype(bool)

    found = defect_log.loc[idx]
    for defect in found[found > 0].index:
        defect_info = defects.loc[defect]
        # Positions (0 = oldest) of the earlier submissions with this defect.
        seen_at = np.flatnonzero(defect_history[defect].to_numpy())
        if seen_at.size:
            # The sampled submission sits at position len(defect_history),
            # so 1 means "the user's previous submission".
            last_encountered = len(defect_history) - seen_at[-1]
        else:
            # The user has not made this defect before.
            last_encountered = np.nan

        defect_rows.append({
            'submission id': idx,
            'defect id': defect,
            'severity': defect_info['severity'],
            'name': defect_info['defect name'],
            'description': defect_info['description'],
            'code example': defect_info['code example'],
            'code fix example': defect_info['code fix example'],
            'frequency': item_profiles.loc[row['item'], defect],
            # number of submissions since last encountered
            'last encountered': last_encountered,
            # TODO impact
        })

submission_df = pd.DataFrame(submission_rows, index=submission_index)
defect_df = pd.DataFrame(defect_rows)

In [None]:
# Preview the first rows of the sampled submissions table.
submission_df.head()

In [None]:
# Preview the first rows of the per-submission defect table.
defect_df.head()

## filtering

Look for uninformative entries that might pollute the survey pool.

In [None]:
# empty or overly long submissions:
# length (via len) of every sampled submission, longest first
lengths = submission_df['submission'].map(len).sort_values(ascending=False)
positions = range(len(lengths))

plt.figure(figsize=figsize, layout='constrained')
plt.plot(positions, lengths)

# label every point with the index of its submission
plt.xticks(positions, lengths.index, rotation=90)
plt.show()

In [None]:
# Print the submissions shorter than 100 or longer than 500, in the order
# given by `lengths`.
outlier_ids = lengths[(lengths < 100) | (lengths > 500)].index
for idx, code in submission_df.loc[outlier_ids, 'submission'].items():
    print(idx, code)
    print('=' * 50)

In [None]:
# Inspect one submission by its (hard-coded) index label.
print(submission_df.loc[98002, 'submission'])

In [None]:
# duplicates - tasks that occur more than once in the sample
sampled_task_names = submission_df['task name']
task_names = sampled_task_names[sampled_task_names.duplicated(keep=False)].unique()
for name in task_names:
    # print all sampled submissions of this task, then a separator
    same_task = submission_df.loc[sampled_task_names == name, 'submission']
    for idx, code in same_task.items():
        print(idx, code)
    print('=' * 50)

In [None]:
# duplicates - submissions that share exactly the same combination of defects
# frozensets are hashable (plain sets are not), so `duplicated` can compare
# them; this works for combinations of any size (pairs, triples, ...)
ids_sets = defect_df.groupby('submission id')['defect id'].apply(frozenset)
duplicates = ids_sets[ids_sets.duplicated(keep=False)]

# dict.fromkeys keeps first-seen order and visits every repeated combination
# once instead of once per submission that has it
for duplicate in dict.fromkeys(duplicates):
    submission_ids = [sid for sid, ids in duplicates.items() if ids == duplicate]
    for idx in submission_ids:
        print(idx, submission_df.loc[idx]['submission'])
    print(set(duplicate))
    print('=' * 50)

In [None]:
# Export of the survey tables is switched off; change the condition to write
# both tables as semicolon-separated CSV files under data_path / 'export'.
if False:
    export_dir = data_path / 'export'
    defect_df.to_csv(export_dir / 'defects.csv', sep=';', index_label='index')
    submission_df.to_csv(export_dir / 'submissions.csv', sep=';', index_label='index')

# context sensitive

In [None]:
def has_close_pair(row):
    """Check if two different entries of `row` differ by one or less.

    Every entry is compared only with the other entries, never with itself,
    so a row with fewer than two comparable values yields False.  NaN entries
    never form a pair.

    Values are compared exactly as given: callers that encode "absent" as 0
    should mask those entries (e.g. with NaN) first, otherwise two zeros
    count as a close pair.

    Args:
        row: one-dimensional numeric data, e.g. a pandas Series.

    Returns:
        True if at least one pair of distinct entries has an absolute
        difference <= 1, False otherwise.
    """
    # After sorting, the closest pair is always adjacent, so it is enough to
    # look at consecutive differences.  NaNs are sorted to the end and every
    # difference involving them is NaN, which fails the comparison.
    ordered = np.sort(np.asarray(row, dtype=float))
    return bool(np.any(np.diff(ordered) <= 1))

Keep only the submissions that contain at least one pair of defects whose severities differ by at most 1.

In [None]:
# at least two defects: keep the rows of defect_log whose entries sum to more
# than one
defect_totals = defect_log.sum(axis=1)
filtered = defect_log.loc[defect_totals > 1]

In [None]:
# Severity of every defect column, aligned with the columns of `filtered`.
severity_by_defect = defects.loc[filtered.columns, 'severity']
present = filtered > 0

# Severity-weighted table as before (absent defects stay 0), built without
# modifying the slice of defect_log in place.
filtered = filtered.mul(severity_by_defect, axis='columns')

# For the comparison use the plain severity of the defects that actually occur
# in a submission and NaN for the absent ones.  Otherwise the zeros of two
# absent defects would look like a close pair, and entries larger than 1
# (other cells test defect_log with `> 0`, so they may be counts) would scale
# the severity.
present_severity = present.astype(float).mul(severity_by_defect, axis='columns').where(present)
filtered = filtered[present_severity.apply(has_close_pair, axis=1)]

In [None]:
# Restrict both tables to the submissions that passed the severity filter.
# NOTE: this overwrites the loaded tables, so the cell is not idempotent.
kept = filtered.index
defect_log, log = defect_log.loc[kept], log.loc[kept]

## repeated in a single submission

In [None]:
# Display the current defect_log table.
defect_log

## characteristic defect