# loading data

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
from ipywidgets import Output
from IPython.display import display, HTML


from src.code_processing import parse_code_string, generate_linter_messages

# plotting defaults (not referenced by the cells visible in this part of the notebook)
figsize = 10, 7  # presumably (width, height) as matplotlib expects - confirm at the call sites
resolution = 300  # dpi

# data locations, relative to the notebook's working directory
data_path = Path('data')
ipython_path = data_path.joinpath('ipython_new')

In [2]:
## load ipython items
def _first_entry_text(serialized):
    """Evaluate one serialized CSV cell and return element ``[0][1]`` of the result."""
    # NOTE(review): eval() executes whatever expression the cell contains, so this is
    # only acceptable while item.csv is a trusted file. If the cells turn out to be
    # plain literals, ast.literal_eval would be a safer replacement - confirm first.
    return eval(serialized)[0][1]


# read the raw item table (';'-separated, first column is the index)
items = pd.read_csv(ipython_path / 'item.csv', sep=";", index_col=0)

# drop unused columns
items = items[['name', 'instructions', 'solution', 'democode']]

# extract user instructions
items["instructions"] = items["instructions"].apply(_first_entry_text)

# extract and decode the example solutions and the demo code
for code_column in ("solution", "democode"):
    items[code_column] = items[code_column].apply(_first_entry_text).apply(parse_code_string)

In [3]:
## load the ipython log
log = pd.read_csv(ipython_path / 'log.csv', sep=";")

# drop unused columns
log = log[["id", "user", "item", "answer", "correct", "responseTime", "time"]]

# parse timestamps (missing values become NaT and are removed by dropna below)
log["time"] = pd.to_datetime(log["time"])

# drop rows with missing values BEFORE casting "correct" to bool:
# NaN is truthy, so casting first would turn a missing "correct" into True,
# hide it from dropna and let the row pass the "only correct answers" filter
log = log.dropna()
log["correct"] = log["correct"].astype(bool)
log = log.drop_duplicates()

# only correct answers
log = log[log["correct"]].drop(columns="correct")

# decode submissions - done after the filter so that discarded rows are not decoded
# (assumes parse_code_string is a pure per-value decoder - TODO confirm)
log["answer"] = log["answer"].apply(parse_code_string)

# only one answer per session: order by time first, so that "last" means the most
# recent submission of a (user, item) pair rather than the last row in the file
log = log.sort_values(by="time", kind="stable")
log = log.reset_index().groupby(["user", "item"], as_index=False).last().set_index("index")

# sorted by time (groupby orders the rows by user/item, so sort again)
log = log.sort_values(by="time", kind="stable")

In [None]:
# keep only items with more than 100 (correct) submissions
# note: `(counts > 100).index` would list *every* item, because comparing a Series
# yields a boolean Series with the same index - the mask has to be applied first
submission_counts = log["item"].value_counts()
frequent_items = submission_counts[submission_counts > 100].index
items = items.loc[items.index.isin(frequent_items)]

# filter the removed items out of the log as well
log = log[log["item"].isin(items.index)]

In [6]:
## load the defect table
def _split_codes(raw_codes):
    """Turn a comma-separated string of codes into a tuple of stripped codes."""
    return tuple(code.strip() for code in raw_codes.split(","))


defects = pd.read_csv(data_path / 'defects.csv')

# drop unused columns
defects = defects[[
    "defect name",
    "EduLint code",
    "defect type",
    "description",
    "code example",
    "code fix example",
    "severity",
    "id",
]]

# drop defects not detected by EduLint (no code listed)
defects = defects.dropna(subset=["EduLint code"])

# convert EduLint codes from string to tuple
defects["EduLint code"] = defects["EduLint code"].apply(_split_codes)

# drop two defects by their row label (the labels are hard-coded):
# - "missing docstring": not really appropriate in the context
# - "mixed indentation": exceptionally noisy (errors during logging, students copy-pasting, ...)
defects = defects.drop(index=[66, 4])

# map every single EduLint code to the row label of the defect it belongs to
code_to_defect_id = {
    code: defect_id
    for defect_id, codes in defects["EduLint code"].items()
    for code in codes
}

In [None]:
## load the EduLint messages corresponding to the entries in the ipython log
# open the message log
messages = pd.read_csv(ipython_path / 'message_log.csv', index_col=0, header=0)

# some of the messages associated with the "trailing whitespace" defect are likely logging errors
is_logging_noise = messages['message'].isin(['no newline at end of file', 'trailing whitespace'])
# messages whose log entry is still present in the (filtered) ipython log
in_log = messages["log entry"].isin(log.index)
# messages whose code belongs to one of the tracked defects
has_defect = messages["defect"].isin(code_to_defect_id.keys())

messages = (
    messages[~is_logging_noise & in_log & has_defect]
    # use defect ids instead of message codes; every remaining code is a key of the
    # mapping (see has_defect), so a plain dictionary lookup via map() is sufficient
    # and avoids the slower, downcast-warning-prone Series.replace
    .assign(defect=lambda frame: frame["defect"].map(code_to_defect_id).astype(int))
    .reset_index(drop=True)
)

In [8]:
# vectorize defects: count messages per (log entry, defect) pair and
# add an all-zero row for every log entry that has no message at all
defect_counts = pd.crosstab(index=messages["log entry"], columns=messages["defect"])
defect_counts = defect_counts.reindex(log.index, fill_value=0)

# replace defect counts with presence (1 = defect present, 0 = absent)
defect_log = defect_counts.gt(0).astype(int)

# keep only detected defects, i.e. those that appear as a column of defect_log
detected = defects.index.isin(defect_log.columns)
defects = defects.loc[detected]

In [9]:
# per-item profile: the mean of every defect column over the submissions of that item
# TODO: experiment with aggregations other than the mean
item_of_submission = log['item']
item_profiles = defect_log.groupby(by=item_of_submission).mean()

# random sample

In [26]:
def _submissions_since_last(previous_flags):
    """Return how many submissions ago a defect was last encountered.

    Parameters
    ----------
    previous_flags : sequence of bool
        One entry per earlier submission, oldest first; True where that
        submission contained the defect.

    Returns
    -------
    int or float
        1 if the immediately preceding submission contained the defect, 2 if
        the one before it was the most recent occurrence, and so on.
        NaN if the defect does not occur in any earlier submission.
    """
    hits = np.flatnonzero(np.asarray(previous_flags, dtype=bool))
    if hits.size == 0:
        return np.nan
    return len(previous_flags) - int(hits[-1])


# reproducible sample of 20 submissions that contain at least two defects
sampled = log[defect_log.sum(axis=1) >= 2].sample(20, random_state=42)

submission_rows = []
submission_index = []
defect_rows = []
for idx, row in sampled.iterrows():
    submission_index.append(idx)
    submission_rows.append({
        'submission': row['answer'],
        'task name': items.loc[row['item'], 'name'],
        'instructions': items.loc[row['item'], 'instructions'],
    })

    # previously made defects: the same user's strictly earlier submissions, oldest first.
    # The sampled submission itself is excluded - it always contains the defect, so
    # including it would make the "last encountered" distance meaningless.
    earlier = log[(log['user'] == row['user']) & (log['time'] < row['time'])].sort_values(by='time', kind='stable')
    defect_history = defect_log.loc[earlier.index].astype(bool)

    present = defect_log.loc[idx]
    for defect in present[present > 0].index:
        defect_info = defects.loc[defect]
        defect_rows.append({
            'submission id': idx,
            'severity': defect_info['severity'],
            'name': defect_info['defect name'],
            'description': defect_info['description'],
            'code example': defect_info['code example'],
            'code fix example': defect_info['code fix example'],
            'frequency': item_profiles.loc[row['item'], defect],
            # number of submissions since last encountered (NaN if never before);
            # the previous formula subtracted a running count of occurrences from the
            # position, which yields the number of defect-free submissions instead
            'last encountered': _submissions_since_last(defect_history[defect].to_numpy()),
            # TODO impact
        })

submission_df = pd.DataFrame(submission_rows, index=submission_index)
defect_df = pd.DataFrame(defect_rows)

In [None]:
# Set to True to write the sampled tables to data/export as ';'-separated CSV files.
EXPORT_SAMPLE = False

if EXPORT_SAMPLE:
    export_path = data_path / 'export'
    # to_csv does not create missing parent directories, so make sure the target exists
    export_path.mkdir(parents=True, exist_ok=True)
    defect_df.to_csv(export_path / 'defects.csv', sep=';', index_label='index')
    submission_df.to_csv(export_path / 'submissions.csv', sep=';', index_label='index')