# Loading data

In [1]:
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import numpy as np

from pathlib import Path

from src.load_scripts import load_ipython_data

# Shared plot settings. `figsize` is handed to the plt.subplots calls in this
# notebook; `resolution` is not referenced by the cells shown here (the
# commented-out savefig calls spell out dpi=300 directly).
fig_width, fig_height = 10, 7
figsize = (fig_width, fig_height)
resolution = 300  # dpi

In [None]:
# Load the four objects the rest of the notebook works with. Only `defects`
# and `defect_log` are used by the cells visible here.
ipython_dir = Path('data/ipython')
defects_csv = Path('data/defects.csv')
item, defects, log, defect_log = load_ipython_data(ipython_dir, defects_csv)

TODO: repeat the analysis using only correct/final submissions.

# Exploration

## Defects Histogram

In [15]:
# Column totals of defect_log, most frequent defect first, plus the same
# numbers as a percentage of all rows.
# NOTE(review): assumes defect_log is a submissions x defects table of 0/1
# indicators, so a column total is "submissions with this defect" — confirm.
n_submissions = len(defect_log)
per_defect_totals = defect_log.sum(axis=0)
counts = per_defect_totals.sort_values(ascending=False)
percentages = counts / n_submissions * 100

In [None]:
# Bar chart of how often each defect occurs, as a share of all submissions.
fig, ax = plt.subplots(figsize=figsize, layout="constrained")

# Translate the defect ids in counts.index into their display names.
name_lookup = defects['defect name']
bar_names = [name_lookup.loc[defect_id] for defect_id in counts.index]
bars = ax.bar(bar_names, percentages)

# Bar heights are percentages; annotate each bar with the absolute count.
ax.bar_label(bars, labels=counts, label_type='edge', rotation=45)

title_template = 'Submissions containing a defect ({} submissions in total)'
ax.set_title(title_template.format(len(defect_log)))
ax.set_ylabel('% of submissions')
plt.xticks(rotation=45, ha='right')

#plt.savefig('defect_histogram.png', dpi=300)
plt.show()

## Submissions Histogram

In [17]:
# Row totals of defect_log, then how many rows share each total.
# NOTE(review): with 0/1 indicators a row total is the number of distinct
# defects in that submission — confirm against the loader.
defects_per_submission = defect_log.sum(axis=1)
counts = defects_per_submission.value_counts()

In [None]:
# Histogram of submissions by number of distinct defects: 0-4 individually,
# 5 and more pooled into a single bucket.
#
# value_counts() orders its result by frequency, not by the defect number, so a
# positional slice such as counts[:5] is not guaranteed to line up with the
# '0'..'4' labels. Sort by the index and pick the buckets by label instead.
counts_by_number = counts.sort_index()
low_buckets = counts_by_number.reindex(range(5), fill_value=0)  # absent buckets plot as 0
pooled_tail = counts_by_number[counts_by_number.index >= 5].sum()

# Despite its name, this holds the x-axis bucket labels (number of defects).
num_of_submissions = [str(i) for i in range(5)] + ['>=5']
defect_count = list(low_buckets) + [pooled_tail]
defect_percentage = [count / len(defect_log) * 100 for count in defect_count]

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

bars = ax.bar(num_of_submissions, defect_percentage)

# Bar heights are percentages; annotate each bar with the absolute count.
ax.bar_label(bars, labels=defect_count, label_type='edge')

ax.set_ylabel('% of submissions')
ax.set_xlabel('Number of defects')
ax.set_title('Submissions by the number of unique defects ({} in total)'.format(len(defect_log)))

#plt.savefig('submission_histogram.png', dpi=300)
plt.show()

## Co-occurrence matrix

In [9]:
# Co-occurrence matrix: entry (i, j) is the sum over rows of
# defect_i * defect_j. With 0/1 indicators that is the number of submissions
# containing both defects (assumed — TODO confirm the dtype of defect_log).

# Take an explicit copy: .values may hand back a view (read-only under pandas
# Copy-on-Write), and fill_diagonal below writes into the array in place.
co_occurances = defect_log.T.dot(defect_log).to_numpy(copy=True)
# The diagonal pairs each defect with itself; zero it so only real pairs remain.
np.fill_diagonal(co_occurances, 0)

In [None]:
# Number of distinct defect pairs above 250. The matrix is symmetric (it is
# X^T X), so each pair shows up twice and the total is halved.
np.sum(co_occurances > 250) / 2

In [None]:
# Interactive heatmap marking defect pairs whose co-occurrence count exceeds 1000.
name_lookup = defects['defect name']
defect_names = [name_lookup.loc[defect_id] for defect_id in defect_log.columns]

frequent_pairs = co_occurances > 1000
fig = px.imshow(frequent_pairs, x=defect_names, y=defect_names)
# The x labels would repeat the y labels, so hide them; dtick=1 sets the
# y-axis tick step to one.
fig.update_xaxes(showticklabels=False)
y_axis_options = {"dtick":1}
margin_options = {"t":0,"b":0}
fig.update_layout(yaxis=y_axis_options, margin=margin_options)
fig

In [None]:
# Static heatmap of defect co-occurrence. Pairs whose count exceeds `threshold`
# are highlighted; every cell is annotated with the pair's count as a
# percentage of all submissions.
threshold = 1000  # drives both the highlight mask and the title

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

ax.imshow(co_occurances > threshold)

ax.set_yticks(np.arange(len(defect_names)), labels=defect_names)

# Compute all percentages once instead of once per cell inside the loop.
percent_of_submissions = np.round(co_occurances / len(defect_log) * 100, 1)
for (row, col), value in np.ndenumerate(percent_of_submissions):
    # imshow draws matrix rows along the y axis, so text goes at (x=col, y=row).
    ax.text(col, row, value, ha="center", va="center", color="w")

# The wording mirrors the strict '>' comparison used for the mask.
ax.set_title('Co-occurance of defects (> {} occurances highlighted)'.format(threshold))

#plt.savefig('co_occurance_1000.png', dpi=300)
plt.show()

In [None]:
# Static heatmap of defect co-occurrence. Pairs whose count exceeds `threshold`
# are highlighted; every cell is annotated with the pair's count as a
# percentage of all submissions.
threshold = 250  # drives both the highlight mask and the title

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

ax.imshow(co_occurances > threshold)

ax.set_yticks(np.arange(len(defect_names)), labels=defect_names)

# Compute all percentages once instead of once per cell inside the loop.
percent_of_submissions = np.round(co_occurances / len(defect_log) * 100, 1)
for (row, col), value in np.ndenumerate(percent_of_submissions):
    # imshow draws matrix rows along the y axis, so text goes at (x=col, y=row).
    ax.text(col, row, value, ha="center", va="center", color="w")

# The wording mirrors the strict '>' comparison used for the mask.
ax.set_title('Co-occurance of defects (> {} occurances highlighted)'.format(threshold))

#plt.savefig('co_occurance_250.png', dpi=300)
plt.show()

In [None]:
# Static heatmap of defect co-occurrence. Pairs whose count exceeds `threshold`
# are highlighted; every cell is annotated with the pair's count as a
# percentage of all submissions.
threshold = 100  # drives both the highlight mask and the title

fig, ax = plt.subplots(layout="constrained", figsize=figsize)

ax.imshow(co_occurances > threshold)

ax.set_yticks(np.arange(len(defect_names)), labels=defect_names)

# Compute all percentages once instead of once per cell inside the loop.
percent_of_submissions = np.round(co_occurances / len(defect_log) * 100, 1)
for (row, col), value in np.ndenumerate(percent_of_submissions):
    # imshow draws matrix rows along the y axis, so text goes at (x=col, y=row).
    ax.text(col, row, value, ha="center", va="center", color="w")

# The wording mirrors the strict '>' comparison used for the mask.
ax.set_title('Co-occurance of defects (> {} occurances highlighted)'.format(threshold))

#plt.savefig('co_occurance_100.png', dpi=300)
plt.show()