# loading data

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
from ipywidgets import Output
from IPython.display import display, HTML


from src.code_processing import parse_code_string, generate_linter_messages

figsize = (10, 7)
resolution = 300 # dpi
data_path = Path('data')
ipython_path = data_path / 'ipython_new'

In [2]:
## load ipython items
items = pd.read_csv(ipython_path / 'item.csv', sep=";", index_col=0)

# drop unused columns
items = items[['name', 'instructions', 'solution', 'democode']]
# extract user instructions
items["instructions"] = items["instructions"].apply(lambda x: eval(x)[0][1])
# extract and decode example solutions
items["solution"] = items["solution"].apply(lambda x: eval(x)[0][1]).apply(parse_code_string)
items["democode"] = items["democode"].apply(lambda x: eval(x)[0][1]).apply(parse_code_string)

In [3]:
## load the ipython log
log = pd.read_csv(ipython_path / 'log.csv', sep=";")

# drop unused columns
log = log[["id", "user", "item", "answer", "correct", "responseTime", "time"]]
# correct data types
log["time"] = pd.to_datetime(log["time"])
log["correct"] = log["correct"].astype(bool)
# drop problematic rows
log.dropna(inplace=True)
log.drop_duplicates(inplace=True)
# decode submissions
log["answer"] = log["answer"].apply(parse_code_string)
# only correct answers
log = log[log["correct"]]
log.drop('correct', axis=1, inplace=True)
# only one answer per session
log = log.reset_index().groupby(["user", "item"], as_index=False).last().set_index("index")
# sorted by time
log.sort_values(by='time', inplace=True)

In [4]:
# keep only items with at least 100 (correct) submission
items = items.loc[items.index.isin((log["item"].value_counts() > 100).index)]

# filter them out of the log also
log = log[log["item"].isin(items.index)]

In [5]:
## load the defect table
defects = pd.read_csv(data_path / 'defects.csv')

# drop unused columns
defects = defects[["defect name", "EduLint code", "defect type", "description", "code example", "code fix example", "severity", "id"]]
# drop defects not detected by EduLint
defects.dropna(subset=["EduLint code"], inplace=True)
# convert EduLint codes from string to tuple
defects["EduLint code"] = defects["EduLint code"].apply(lambda x: tuple(map(str.strip, x.split(","))))
# drop the "missing docstring" defect (not really appropriate in the context)
# drop the "mixed indentation" defect (it is exceptionally noisy - errors during logging, students copy-pasting, ...)
defects.drop([66, 4], axis=0, inplace=True)
# create a dictionary mapping EduLint codes to the index of the associated defect
code_to_defect_id = {val: idx for idx, val in defects['EduLint code'].explode().items()}

In [6]:
## load the EduLint messages corresponding to the entries in the ipython log
# open the message log
messages = pd.read_csv(ipython_path / 'message_log.csv', index_col=0, header=0)

# remove some of the messages associated with the "trailing whitespace" defect (they are likely logging errors)
messages = messages[~messages['message'].isin(['no newline at end of file', 'trailing whitespace'])]

# keep only the messages still in the ipython log
messages = messages[messages["log entry"].isin(log.index)]

# keep only messages with an associated defect
messages = messages[messages["defect"].isin(code_to_defect_id.keys())]

# use defect ids instead of message codes
messages["defect"] = messages["defect"].replace(code_to_defect_id).astype(int)

messages.reset_index(drop=True, inplace=True)

  messages["defect"] = messages["defect"].replace(code_to_defect_id).astype(int)


In [7]:
# vectorize defects
defect_log = pd.crosstab(messages["log entry"], messages["defect"]).reindex(log.index, fill_value=0)

# replace defect counts with presence
defect_log = (defect_log > 0).astype(int)

# keep only detected defects
defects = defects.loc[defects.index.isin(defect_log.columns)]

In [8]:
# experiment with other options then mean
item_profiles = defect_log.groupby(log['item']).mean()

# random sample

In [9]:
submission_df = []
submission_index = []
defect_df = []
for idx, row in log[defect_log.sum(axis=1) >= 2].sample(20, random_state=42).iterrows():
    submission_index.append(idx)
    submission_row = {}
    submission_row['submission'] = row['answer']
    submission_row['task name'] = items.loc[row['item']]['name']
    submission_row['instructions'] = items.loc[row['item']]['instructions']
    submission_df.append(submission_row)

    # previously made defects
    defect_history = defect_log.loc[
        log[(log['user'] == row['user']) & (log['time'] <= row['time'])].sort_values(by='time').index
    ].reset_index(drop=True).astype(bool)

    for defect in defect_log.loc[idx][defect_log.loc[idx] > 0].index:
        defect_row = {}
        defect_row['submission id'] = idx
        defect_row['defect id'] = defect
        defect_row['severity'] = defects.loc[defect]['severity']
        defect_row['name'] = defects.loc[defect]['defect name']
        defect_row['description'] = defects.loc[defect]['description']
        defect_row['code example'] = defects.loc[defect]['code example']
        defect_row['code fix example'] = defects.loc[defect]['code fix example']
        defect_row['frequency'] = item_profiles.loc[row['item'], defect]
        # number of submissions since last encountered
        defect_row['last encountered'] = (defect_history.index - defect_history[defect].cumsum().where(defect_history[defect]).ffill()).iloc[-1]
        # TODO impact
        defect_df.append(defect_row)

submission_df = pd.DataFrame(submission_df, index=submission_index)
defect_df = pd.DataFrame(defect_df)

In [42]:
submission_df.head()

Unnamed: 0,submission,task name,instructions
29624,def chessboard(n):\n for i in range(n):\n ...,Šachovnice,"Napište funkci <b>chessboard(n)</b>, která vyk..."
299230,def censorship(text):\n t = len(text)\n ...,Cenzura,"Napište funkci <b>censorship(text)</b>, která ..."
284317,"def censor_number(k, n):\n p = []\n for ...",Číselná cenzura,"Napište funkci <b>censor_number(k, n)</b>, kte..."
78519,def empty_square(n):\n for i in range(n):\n...,Prázdný čtverec,"Napište funkci <b>empty_square(n)</b>, která ..."
38610,"def impose_fine(age, beer):\n if beer == Tr...",Pokuta,Policista potřebuje pomoct s udělováním pokut....


In [41]:
defect_df.head()

Unnamed: 0,submission id,defect id,severity,name,description,code example,code fix example,frequency,last encountered
0,29624,24,3,long line,Line with more than 100 characters.,income = (gross_wages + taxable_interest + (di...,income = (gross_wages\r\n + taxable_i...,0.003668,28.0
1,29624,39,4,inappropriate whitespace: visible,Violating PEP 8 conventions about whitespace (...,"x= 3 *y\nprint( x, y )","x = 3*y\nprint(x, y)\n",0.624358,2.0
2,299230,6,4,while as for,While loop with the number of iterations known...,i = 0\nwhile i <= n:\n print(i)\n i += 1,for i in range(n):\n print(i),0.00517,12.0
3,299230,12,4,all if branches start/end with identical code,All if/elif/else branches start or end with th...,"if cond:\n print(""foo"")\n i += 1\nelse:\...","if cond:\n print(""foo"")\nelse:\n print(""ba...",0.014032,14.0
4,299230,36,2,negated condition with else clause,The code contains an if statement with an else...,if not cond:\n # body 1\nelse:\n # body 2,if cond:\n # body 2\nelse:\n # body 1,0.089365,14.0


## filtering

Look for uniformative entries that might pollute the survey pool.

In [36]:
# duplicities - tasks
task_names = submission_df['task name'][submission_df['task name'].duplicated(keep=False)].unique()
for name in task_names:
    for idx in submission_df[submission_df['task name'] == name].index:
        print(idx, submission_df.loc[idx]['submission'])
    print('=' * 50)

38610 def impose_fine(age, beer):
    if beer == True and age < 18:
        return True
    return False
59076 def impose_fine(age, beer):
    while age < 18 and beer == True:
        return True
    else: 
        return False
73706 def alergy_list(a, b):
    for i in range(a, b+1):
        if i % 5 != 0:
        	print(i, end=" ")
        elif i % 10 == 0:  
            print(i, end=" ")            
    print()
287950 def alergy_list(a, b):
    for i in range(a,b+1,1):
        if i % 10 == 0:
            print(i, end=" ")
        elif i % 5 != 0:
            print(i, end=" ")
    print()
364394 def near_fifty(n):
    if n in range(40,61) or n in range(140,161):
        return True
    else:
        return False
175696 def near_fifty(n):
    if (n>=40 and n<=60) or (n>=140 and n<=160):
        return True
    else:return False
198099 def duplication(text):
    vystup=""
    for znak in text:
        
        vystup += znak*2
    return vystup
565657 def duplication(text):
    dup_resu

In [None]:
# defect pairs
# TODO check for triples

ids = defect_df.groupby('submission id')['defect id'].unique().apply(set).duplicated(keep=False)
ids = ids[ids].index
submission_df[ids]

In [61]:
ids

Index([73706, 198099, 287950, 324923, 402206, 565657], dtype='int64', name='submission id')

In [47]:
defect_df.groupby('submission id')['defect id'].unique().apply(set)

submission id
29624            {24, 39}
38610             {17, 3}
59076            {17, 33}
73706             {5, 39}
78519        {76, 71, 39}
98002            {77, 39}
175696        {3, 55, 39}
198099           {76, 39}
216886           {36, 38}
284317           {76, 61}
287950            {5, 39}
299230    {12, 39, 6, 36}
324923           {53, 39}
350702       {62, 38, 39}
364394            {3, 39}
391489            {3, 38}
402206           {53, 39}
429324       {26, 76, 39}
565657           {76, 39}
583215       {72, 37, 39}
Name: defect id, dtype: object

In [12]:
if False:   
    defect_df.to_csv(data_path / 'export' / 'defects.csv', sep=';', index_label='index')
    submission_df.to_csv(data_path / 'export' / 'submissions.csv', sep=';', index_label='index')