# Loading data

In [None]:
import ast
import textwrap
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

from src.code_processing import parse_code_string

# Default figure size handed to plt.subplots(figsize=...) by the matplotlib cells below.
figsize = (10, 7)
# NOTE(review): not referenced by the visible cells; the commented-out savefig calls hard-code dpi=300.
resolution = 300 # dpi
# Root folder of the input files (relative to the notebook's working directory).
data_path = Path('data')
# Sub-folder holding item.csv, log.csv and messages.txt.
ipython_path = data_path / 'ipython_old'

In [None]:
## load ipython items
items_raw = pd.read_csv(ipython_path / 'item.csv', sep=";", index_col=0)


def _first_entry_text(cell):
    """Return element [0][1] of the Python literal stored as text in `cell`.

    The cell is parsed with ast.literal_eval rather than eval, so content of
    the CSV file can never be executed as code.
    """
    return ast.literal_eval(cell)[0][1]


# keep only the used columns; .copy() so the assignments below do not write into a slice of items_raw
items = items_raw[['name', 'instructions', 'solution']].copy()
# extract user instructions
items["instructions"] = items["instructions"].apply(_first_entry_text)
# extract and decode example solutions
items["solution"] = items["solution"].apply(_first_entry_text).apply(parse_code_string)

In [None]:
## load the ipython log
log_raw = pd.read_csv(ipython_path / 'log.csv', sep=";")

# keep only the used columns; .copy() so the assignments below do not write into a slice of log_raw
log = log_raw[["id", "user", "item", "answer", "correct", "responseTime", "time"]].copy()
log["time"] = pd.to_datetime(log["time"])
# drop incomplete rows BEFORE casting `correct`: astype(bool) turns a missing value (NaN) into True,
# which would let rows with an unknown result pass the "only correct answers" filter below
log = log.dropna()
log["correct"] = log["correct"].astype(bool)
log = log.drop_duplicates()
# decode submissions
log["answer"] = log["answer"].apply(parse_code_string)
# only correct answers
log = log[log["correct"]]
# only one answer per session, first because EduLint might already be integrated
# ("first" follows the row order of log.csv; the original row label is kept as the index, named "index")
log = log.reset_index().groupby(["user", "item"], as_index=False).first().set_index("index")

In [None]:
## load the defect table
defects_raw = pd.read_csv(data_path / 'defects.csv')

defects = (
    defects_raw[["defect name", "EduLint code", "defect type", "description", "code example", "code fix example", "severity", "id"]]
    # drop defects not detected by EduLint
    .dropna(subset=["EduLint code"])
    # independent frame, so the column assignment below does not write into a slice of defects_raw
    .copy()
)
# convert EduLint codes from a comma-separated string to a tuple of stripped codes
defects["EduLint code"] = defects["EduLint code"].apply(lambda x: tuple(map(str.strip, x.split(","))))
# map every EduLint code to the index label of its defect row (the frame's row index, not the "id" column);
# a code listed for several defects keeps the last one
code_to_defect_id = {val: idx for idx, val in defects['EduLint code'].explode().items()}

In [None]:
## load the EduLint messages corresponding to the entries in the ipython log
# Each non-empty line is a Python literal of the form (log entry, [(code, message), ...]).
# It is parsed with ast.literal_eval instead of eval so the file content is never executed.
with open(ipython_path / "messages.txt", "r") as f:
    parsed_lines = [ast.literal_eval(line) for line in f if line.strip()]

# one row per (log entry, message); explicit columns keep the frame usable when there are no messages
messages = pd.DataFrame(
    [
        {'log entry': idx, 'defect': code, 'message': message}
        for idx, code_message_list in parsed_lines
        for code, message in code_message_list
    ],
    columns=['log entry', 'defect', 'message'],
)

# keep only the messages still in the ipython log that also have an associated defect
still_logged = messages["log entry"].isin(log.index)
has_defect = messages["defect"].isin(list(code_to_defect_id))
messages = messages[still_logged & has_defect].copy()

# use defect ids instead of message codes (every remaining code is a key, so map never yields NaN)
messages["defect"] = messages["defect"].map(code_to_defect_id).astype(int)

In [None]:
# Presence matrix: one row per kept log entry, one column per defect id.
# crosstab counts the messages per (entry, defect); entries without any message
# are added as all-zero rows by the reindex, and counts are reduced to 0/1.
defect_log = (
    pd.crosstab(messages["log entry"], messages["defect"])
    .reindex(log.index, fill_value=0)
    .gt(0)
    .astype(int)
)

# restrict the defect table to defects that occur in at least one kept message
was_detected = defects.index.isin(defect_log.columns)
defects = defects[was_detected]

# restrict the task table to tasks with at least one kept submission
has_submission = items.index.isin(log["item"].unique())
items = items[has_submission]

# detected defects

In [None]:
# number of submissions containing each defect, most frequent defect first
counts = defect_log.sum().sort_values(ascending=False)
# the same numbers as a percentage of all submissions
percentages = counts.div(len(defect_log)).mul(100)

In [None]:
total_submissions = len(defect_log)
# bar captions: defect names in the same (descending) order as `counts`
bar_names = defects.loc[counts.index, 'defect name'].tolist()

fig, ax = plt.subplots(layout="constrained", figsize=figsize)
p = ax.bar(bar_names, percentages)

# bar height is the percentage, the label on top is the absolute count
ax.bar_label(p, labels=counts, label_type='edge', rotation=45)

ax.set_ylabel('% of submissions')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# head-room above the tallest bar for its rotated label
ax.set_ylim(0, percentages.max() + 5)
ax.set_title('Submissions containing a defect ({} submissions in total)'.format(total_submissions))

#plt.savefig('defect_histogram.png', dpi=300)
plt.show()

In [None]:
"""TODO
rare_codes = counts[counts < 100].index

counts = counts[counts >= 100]

percentages = counts / len(defect_log) * 100
fig, ax = plt.subplots(layout="constrained", figsize=figsize)

p = ax.bar(counts.index, percentages)

ax.bar_label(p, labels = counts, label_type='edge', rotation=45)

ax.set_ylabel('% of submissions')
plt.xticks(rotation = 45, ha='right')
plt.ylim(0, 40)
ax.set_title('EduLint codes without a corresponding defect (# of occurances < 100 omitted)'.format(len(defect_log)))

#plt.savefig('defect_histogram.png', dpi=300)
plt.show()
"""

# Number of defects per submission (histogram)

In [None]:
# How many submissions contain exactly k distinct defects, ordered by k.
# value_counts() orders by frequency, so without sort_index() the positional
# buckets below would not correspond to the labels 0..4 and >=5.
counts = defect_log.sum(axis=1).value_counts().sort_index()

# x-axis captions (number of defects); the name is kept for compatibility with other cells
num_of_submissions = [str(i) for i in range(5)] + ['>=5']
# a bucket no submission falls into counts as 0 instead of shifting the later buckets
defect_counts = [int(counts.get(i, 0)) for i in range(5)] + [int(counts[counts.index >= 5].sum())]
defect_percentage = [x / len(defect_log) * 100 for x in defect_counts]

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

p = ax.bar(num_of_submissions, defect_percentage)

# bar height is the percentage, the label on top is the absolute count
ax.bar_label(p, labels=defect_counts, label_type='edge')

ax.set_ylabel('% of submissions')
ax.set_xlabel('Number of defects')
ax.set_title('Submissions by the number of unique defects ({} in total)'.format(len(defect_log)))

#plt.savefig('submissions_histogram.png', dpi=300)
plt.show()

In [None]:
# number of distinct defects in every submission
defects_per_submission = defect_log.sum(axis=1)
max_idx = int(defects_per_submission.max())

# Cumulative number of submissions with at most k defects, for every k in 0..max_idx.
# The counts are put in order of k (value_counts() orders by frequency) and values of k
# that never occur are filled with 0 BEFORE accumulating, so each entry really is "<= k".
counts = (
    defects_per_submission.value_counts()
    .reindex(range(max_idx + 1), fill_value=0)
    .cumsum()
)

# x-axis captions (number of defects); the name is kept for compatibility with other cells
num_of_submissions = ["0"] + [f"<={i}" for i in range(1, max_idx + 1)]
defect_counts = counts.tolist()
defect_percentage = [(x / len(defect_log)) * 100 for x in defect_counts]

fig, ax1 = plt.subplots(layout="constrained", figsize=(10, 6))

bars = ax1.bar(num_of_submissions, defect_counts, alpha=0.6, color='blue', label='Absolute counts')

# second y-axis: the same cumulative values expressed in percent
ax2 = ax1.twinx()
ax2.plot(num_of_submissions, defect_percentage, marker='o', color='red', label='Cumulative %')

ax1.set_ylabel('Absolute counts')
ax2.set_ylabel('% of submissions')
ax1.set_xlabel('Number of defects')
ax1.set_title(f'Submissions by the number of unique defects ({len(defect_log)} in total)')

# both axes span the full range so that bars and line coincide
ax1.set_ylim(0, len(defect_log))
ax2.set_ylim(0, 100)

ax1.grid(visible=True, linestyle='--', linewidth=0.5)
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))

#plt.savefig('submissions_cumulative_histogram.png', dpi=300)
plt.show()

# Defect co-occurrence

## Co-occurrence matrix

In [None]:
# co-occurances
co_occurances = (defect_log.T.dot(defect_log)).values
np.fill_diagonal(co_occurances, 0)
defect_names = [defects['defect name'].loc[idx] for idx in defect_log.columns]

In [None]:
# interactive heatmap of the pairwise counts; x captions are hidden, every y caption is shown
fig = (
    px.imshow(co_occurances, x=defect_names, y=defect_names)
    .update_xaxes(showticklabels=False)
    .update_layout(yaxis=dict(dtick=1), margin=dict(t=0, b=0))
)
fig

In [None]:
# every pair's share of all submissions, in percent, rounded to one decimal
share_of_submissions = np.round(co_occurances / len(defect_log) * 100, 1)

fig, ax = plt.subplots(layout="constrained", figsize=figsize)
p = ax.imshow(co_occurances)
ax.set_yticks(np.arange(len(defect_names)), labels=defect_names)

# write the percentage into each cell (imshow puts column j on x and row i on y)
for (i, j), value in np.ndenumerate(share_of_submissions):
    ax.text(j, i, value, ha="center", va="center", color="w")

ax.set_title('Co-occurance of defects (% of all submissions)')

#plt.savefig('co_occurance.png', dpi=300)
plt.show()

In [None]:
# pairs that co-occur at least this many times are highlighted; the title is built from the same
# value so that mask and caption cannot disagree (the mask used `>` while the title said `>=`)
highlight_threshold = 1000

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

p = ax.imshow(co_occurances >= highlight_threshold)

ax.set_yticks(np.arange(len(defect_names)), labels=defect_names)

# annotate every cell with the pair's share of all submissions, in percent
for i in range(len(defect_names)):
    for j in range(len(defect_names)):
        ax.text(j, i, np.round(co_occurances[i, j] / len(defect_log) * 100, 1),
                ha="center", va="center", color="w")

ax.set_title('Co-occurance of defects (>= {} occurances highlighted)'.format(highlight_threshold))

#plt.savefig('co_occurance_1000.png', dpi=300)
plt.show()

In [None]:
# pairs that co-occur at least this many times are highlighted; the title is built from the same
# value so that mask and caption cannot disagree (the mask used `>` while the title said `>=`)
highlight_threshold = 250

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

p = ax.imshow(co_occurances >= highlight_threshold)

ax.set_yticks(np.arange(len(defect_names)), labels=defect_names)

# annotate every cell with the pair's share of all submissions, in percent
for i in range(len(defect_names)):
    for j in range(len(defect_names)):
        ax.text(j, i, np.round(co_occurances[i, j] / len(defect_log) * 100, 1),
                ha="center", va="center", color="w")

ax.set_title('Co-occurance of defects (>= {} occurances highlighted)'.format(highlight_threshold))

#plt.savefig('co_occurance_250.png', dpi=300)
plt.show()

In [None]:
# pairs that co-occur at least this many times are highlighted; the title is built from the same
# value so that mask and caption cannot disagree (the mask used `>` while the title said `>=`)
highlight_threshold = 100

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

p = ax.imshow(co_occurances >= highlight_threshold)

ax.set_yticks(np.arange(len(defect_names)), labels=defect_names)

# annotate every cell with the pair's share of all submissions, in percent
for i in range(len(defect_names)):
    for j in range(len(defect_names)):
        ax.text(j, i, np.round(co_occurances[i, j] / len(defect_log) * 100, 1),
                ha="center", va="center", color="w")

ax.set_title('Co-occurance of defects (>= {} occurances highlighted)'.format(highlight_threshold))

#plt.savefig('co_occurance_100.png', dpi=300)
plt.show()

## by defect category

In [None]:
# co-occurances
co_occurances = (defect_log.T.dot(defect_log)).values
np.fill_diagonal(co_occurances, 0)
defect_names = [defects['defect name'].loc[idx] for idx in defect_log.columns]

In [None]:
# defect type of every column of defect_log, then the presence matrix split per defect type
categories = defects['defect type'].loc[defect_log.columns]
dfs_by_category = {category: defect_log[columns].copy() for category, columns in categories.groupby(categories).groups.items()}
categories = categories.unique()
NUM_ROWS = 2
# round up so an odd number of categories (or a single one) still gets enough panels
NUM_COLS = max(1, int(np.ceil(len(categories) / NUM_ROWS)))

# one figure per category; each panel pairs that category (rows) with one category (columns)
for this_category in categories:
    this_df = dfs_by_category[this_category]
    # squeeze=False: always a 2-D array of axes, whatever the grid size
    fig, axes = plt.subplots(NUM_ROWS, NUM_COLS, figsize=figsize, squeeze=False)
    fig.suptitle("Programming Defect Co-occurance by Categories: {} (absolute counts)".format(this_category), fontsize=16)
    axes = axes.flatten()
    for j, other_category in enumerate(categories):
        other_df = dfs_by_category[other_category]

        # entry [a, b]: number of submissions containing defect a (this category) and defect b (other category)
        product_df = this_df.T.dot(other_df)

        if this_category == other_category:
            # a defect paired with itself is not a co-occurrence; zero it on a private copy,
            # because writing through `.values` depends on it being a writable view
            without_diagonal = product_df.to_numpy(copy=True)
            np.fill_diagonal(without_diagonal, 0)
            product_df = pd.DataFrame(without_diagonal, index=product_df.index, columns=product_df.columns)

        sns.heatmap(product_df, annot=True, cmap="Reds", fmt="d", ax=axes[j], cbar=False)
        axes[j].set_title(other_category)
        axes[j].set(xlabel="", ylabel="")
        # heatmap cell k spans [k, k + 1], so its label belongs at k + 0.5
        axes[j].set_xticks(np.arange(product_df.shape[1]) + 0.5, labels=defects["defect name"].loc[product_df.columns].apply(lambda x: x[:20]), rotation=40, ha="right")
        if j % NUM_COLS == 0:
            # only the first panel of each grid row shows the row captions
            axes[j].set_yticks(np.arange(product_df.shape[0]) + 0.5, labels=defects["defect name"].loc[product_df.index].apply(lambda x: x[:30]), rotation=0)
        else:
            axes[j].set_yticks([])

    # hide the panels left over when the grid has more cells than categories
    for unused_ax in axes[len(categories):]:
        unused_ax.set_visible(False)

    # leave room for the long, rotated defect names
    fig.subplots_adjust(
        left=0.25,
        bottom=0.25,
        right=0.95,
        top=0.9,
        wspace=0.1,
        hspace=0.85
    )

    #plt.savefig('co_occurance_{}.png'.format(this_category), dpi=300)
    plt.show()

# Defects in tasks

# Exploring specific defect co-occurrences

- (5, 32) using global variables, identifier breaks naming conventions - the global variable is what breaks the convention
- (15, 24) while as for, augmentable assignment - the increment in while is augmentable


In [None]:
def wrap_text(text, max_length=100):
    """Re-flow `text` so that no line is longer than `max_length` characters.

    Words are separated on any whitespace and re-joined with single spaces, so
    existing line breaks are not preserved. A word longer than `max_length` is
    never split; it is placed on a line of its own.

    Args:
        text: the text to wrap. None and NaN (a missing value read from a
            CSV file) are treated as empty text.
        max_length: maximum number of characters per output line.

    Returns:
        The wrapped lines joined by newline characters; an empty string for
        empty or missing input.
    """
    # missing values arrive as None or float NaN (NaN is the only value unequal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # collapse all whitespace first so textwrap only ever sees single spaces
    normalized = " ".join(str(text).split())
    return textwrap.fill(
        normalized,
        width=max_length,
        break_long_words=False,   # keep over-long words intact
        break_on_hyphens=False,   # a word is only ever broken at whitespace
    )


def item_info(idx):
    """Print the name, the wrapped instructions and the example solution of one task.

    `idx` is an index label of the global `items` frame.
    """
    task = items.loc[idx]
    heavy_rule, light_rule = '='*50, '-'*50

    print('Task: ', task['name'])
    print(heavy_rule)
    instructions = wrap_text(task['instructions'])
    print('Description: ', instructions, sep='\n')
    print(light_rule)
    print('Example solution: ', task['solution'], sep='\n')


def defect_info(idx):
    """Print the type, name, wrapped description and code example of one defect.

    `idx` is an index label of the global `defects` frame.
    """
    defect = defects.loc[idx]
    heavy_rule, light_rule = '='*50, '-'*50

    print(defect["defect type"], ": ", defect['defect name'])
    print(heavy_rule)
    description = wrap_text(defect['description'])
    print('Description: ', description, sep='\n')
    print(light_rule)
    print('Code example: ', defect['code example'], sep='\n')

In [None]:
# Show {index label of `defects`: defect name}; these labels are what defect1 / defect2 refer to below.
defects["defect name"].to_dict()

In [None]:
# the defect pair under inspection (index labels of `defects`, also the column labels of `defect_log`)
defect1, defect2 = 29, 30
print(f"({defect1}, {defect2}) {defects.loc[defect1]['defect name']}, {defects.loc[defect2]['defect name']}")
defect_info(defect1)
print("\n\n")
defect_info(defect2)

# log entries in which both defects are present
has_both = defect_log[defect1].ge(1) & defect_log[defect2].ge(1)
df = log.loc[defect_log.index[has_both.to_numpy()]]
# number of such submissions per task
df["item"].value_counts()

In [None]:
# task to look at (index label of `items`) and its submissions that contain both defects
item = 60
item_df = df.loc[df["item"].eq(item)]
item_info(item)
print('-'*50)

# print at most the first five submissions together with their log index
answers = item_df["answer"]
shown = min(len(item_df), 5)
for i in range(shown):
    print(f"Solution {i + 1}, index: {item_df.index[i]}", answers.iloc[i], sep="\n")
    print("-"*50)

In [None]:
# Label in `log.index` of the submission to export; the next cell reads it with log.loc[[index]].
index = 103332


In [None]:
# defect descriptions
defect1_row = defects.loc[defect1, ["defect name", "description", "code example", "code fix example"]]
defect2_row = defects.loc[defect2, ["defect name", "description", "code example", "code fix example"]]
defect1_row.index = ["defect 1 name", "defect 1 description", "defect 1 code example", "defect 1 code fix example"]
defect2_row.index = ["defect 2 name", "defect 2 description", "defect 2 code example", "defect 2 code fix example"]

# item description
item_row = items.loc[item, ["name", "instructions"]]
item_row.index = ["item name", "item instructions"]

# user submission
log_row = log.loc[[index]].reset_index()[["index", "answer"]].iloc[0]
log_row.index = ["submission index", "submission text"]

new_row = pd.concat([item_row, log_row, defect1_row, defect2_row])
pd.concat([
    pd.read_csv(data_path / "questions.csv", sep=";", index_col=0),
    pd.DataFrame([new_row])
], ignore_index=True).to_csv(data_path / "questions.csv", sep=";")

In [None]:
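# Disabled one-off bootstrap: writes `new_row` as the only row of a new questions.csv.
# Note that it targets the working directory, not `data_path` like the cell above.
#pd.DataFrame([new_row]).to_csv("questions.csv", sep=";")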