In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

# Loading data

In [None]:
from src.load_scripts import load_ipython_item, load_ipython_log

# Everything is read from the local data folder; the EduLint output sits in a subfolder.
data_path = Path("data/")
edulint_path = data_path / "edulint"

# `item` is the task table; `log` holds the submissions and `features` the linter
# feature names used as column labels further down.
item = load_ipython_item(data_path)
log, features = load_ipython_log(data_path, edulint_path)

In [None]:
# Preview the first rows of the submission log.
log.head()

In [None]:
# Preview the first rows of the item table.
item.head()

In [None]:
import re


def find_example_message_from_message_code(messages, message_code):
    """Return the first double-quoted snippet of ``messages`` that mentions ``message_code``.

    The code is matched literally (regex metacharacters are escaped). The
    surrounding double quotes are part of the returned text. When no quoted
    snippet contains the code, the placeholder ``"<message_code>_unknown"``
    (also quoted) is returned instead.
    """
    quoted_with_code = re.compile(r'"[^"]*{}[^"]*"'.format(re.escape(message_code)))
    hit = quoted_with_code.search(messages)
    return hit.group() if hit is not None else f'"{message_code}_unknown"'


# Read the raw EduLint output once and pick one example message per feature code.
# The text is kept under its own name (`results_text`) so that re-running this
# cell can never overwrite the `messages` DataFrame used by the rest of the notebook.
with open(data_path / "edulint" / "results.txt") as f:
    results_text = f.read()

# Feature names are lower-case; the codes are looked up in upper case.
feature_descriptions = {
    feature_name: find_example_message_from_message_code(results_text, feature_name.upper())
    for feature_name in features
}

# Hand-written descriptions that replace the extracted examples for these two codes.
feature_descriptions['r1705'] = 'Unnecessary elif after return'
feature_descriptions['c0103'] = 'Naming style violation.'

In [None]:
# Stack the per-submission linter vectors into a matrix: one row per log entry,
# one column per linter feature, sharing the index of `log`.
linter_matrix = np.vstack(log["linter_messages"])
messages = pd.DataFrame(linter_matrix, index=log.index, columns=features)


In [None]:
# Reduce every entry to a presence flag. `> 0` agrees with `== 1` on 0/1 data and,
# unlike it, still reports a message that occurs several times in one submission.
# NOTE(review): assumes entries are 0/1 flags or non-negative counts — confirm in the loader.
# Length-normalised alternative kept for reference:
# messages.div(log['answer'].apply(lambda x: len(x.split())), axis=0)
messages = messages > 0

# Exploring and preprocessing

In [None]:
def example_submission_for_messages(*args, idx=0):
    """Print one submission in which all of the given linter messages are present.

    For every code the number of occurrences and its description are printed
    first. When several codes are given, the number of submissions containing
    all of them is printed as well.

    Args:
        *args: message codes (column names of ``messages``); at least one.
        idx: position of the submission to print among the matching ones.

    Raises:
        ValueError: if no message code is given.
        KeyError: if a code is not a column of ``messages`` / key of ``feature_descriptions``.
        IndexError: if fewer than ``idx + 1`` submissions match.
    """
    if not args:
        raise ValueError("At least one message code is required.")

    for arg in args:
        print(f'Total: {messages[arg].sum()}, Description: {feature_descriptions[arg]}')
    print('-------------------------------------------------------------')

    # A submission matches when every requested message is present. Testing `> 0`
    # works for boolean flags and for occurrence counts alike; summing count
    # columns (as done previously) selected the union instead of the intersection.
    mask = (messages[list(args)] > 0).all(axis=1)
    if len(args) > 1:
        print(f'Intersection total: {mask.sum()}')
    print(log[mask].iloc[idx]["answer"])

# Show the third matching submission (idx=2) for the codes W293 and E303.
example_submission_for_messages("w293", 'e303', idx=2)


In [None]:
# Number of submissions in which each message is present, rarest first.
counts = messages.sum(axis=0).sort_values()
# Normalise by the number of submissions (rows of `messages`). `counts.sum()` adds
# up message occurrences, and one submission can carry several messages, so it is
# neither a submission count nor the right denominator for a per-submission frequency.
n_submissions = len(messages)
fig = px.bar(x=[feature_descriptions[i] for i in counts.index], y=counts / n_submissions)
fig.update_layout(
    title=f"Selected linter messages and the frequency of their presence (in total {n_submissions} submissions)",
    xaxis_title="Message",
    yaxis_title="Frequency of submissions",
    showlegend=False,
)
fig.show()

In [None]:
# Pairwise correlation between the message columns.
feature_correlations = messages.corr()
message_codes = feature_correlations.columns

fig = px.imshow(
    feature_correlations,
    x=message_codes,
    y=message_codes,
    labels={"x": "Message codes", "y": "Messages", "color": "Correlation"},
    color_continuous_scale="Viridis",
)

# The y axis shows the human-readable description of each code instead of the code itself.
y_tick_text = [feature_descriptions[col] for col in message_codes]
fig.update_layout(
    title="Feature Correlogram Before Preprocessing",
    height=800,
    yaxis={"tickvals": list(range(len(message_codes))), "ticktext": y_tick_text},
)


In [None]:
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

# Turn |correlation| into a distance (perfect correlation -> 0), condense the
# square matrix and cluster the messages with Ward linkage.
feature_distances = 1 - np.abs(feature_correlations)
condensed_distances = squareform(feature_distances)
dist_linkage = hierarchy.ward(condensed_distances)
dendro = hierarchy.dendrogram(
    dist_linkage, leaf_rotation=90, labels=messages.columns.to_list()
)
plt.title('Dendrogram of Feature Correlations Before Preprocessing')
plt.show()

In [None]:
def combine_messages(description, *args):
    """Merge several message columns of ``messages`` into a single one, in place.

    The new column is named by joining the codes with ``+``. A boolean source
    column is merged with logical OR, any other dtype is added. ``description``
    is stored in ``feature_descriptions`` under the new name and the source
    columns are dropped from ``messages``.

    Args:
        description: text stored in ``feature_descriptions`` for the merged column.
        *args: two or more column names of ``messages`` to merge.

    Raises:
        ValueError: if fewer than two codes are given (with a single code the
            column would overwrite itself and then be dropped).
        KeyError: if any code is not a column of ``messages``. Raised before
            anything is modified, so a failed call leaves no half-built column.
    """
    if len(args) < 2:
        raise ValueError("combine_messages needs at least two message codes.")
    missing = [code for code in args if code not in messages.columns]
    if missing:
        raise KeyError(f"Unknown message columns: {missing}")

    new_name = '+'.join(args)
    combined = messages[args[0]].copy()
    for code in args[1:]:
        if messages[code].dtype == bool:
            combined = combined | messages[code]
        else:
            combined = combined + messages[code]

    # Mutate the shared frame only once the merged column is fully built.
    messages[new_name] = combined
    feature_descriptions[new_name] = description
    messages.drop(columns=list(args), inplace=True)

# Pairs of message codes that are merged under one shared description.
merge_plan = [
    ('Bad inline comment.', ('e261', 'e262')),
    ('Redefining var/foo.', ('f811', 'e0102')),
    ('Spaces in indentation.', ('e101', 'w191')),
    ('Bad indentation.', ('e111', 'e117')),
]
for merged_description, codes in merge_plan:
    combine_messages(merged_description, *codes)

In [None]:
# Merge E302 and E305 into a single "no spacing between blocks" feature.
combine_messages('No spacing between blocks.', 'e302', 'e305')

E305 and E302 both show students debugging / trying to calculate the answer manually / very confused about the basic principles of functions.

In [None]:
# Recompute the correlations on the current (merged) set of message columns.
feature_correlations = messages.corr()
# Derived from the matrix above instead of calling `messages.corr()` a second time.
feature_distances = 1 - np.abs(feature_correlations)

fig = px.imshow(
    feature_correlations,
    # x carries the raw codes; y is relabelled with the descriptions below. The two
    # labels were swapped here, contradicting both the data and the "before" plot.
    labels=dict(x="Message codes", y="Messages", color="Correlation"),
    x=feature_correlations.columns,
    y=feature_correlations.columns,
    color_continuous_scale="Viridis",
)

fig.update_layout(
    height=800,
    title="Feature Correlogram After Preprocessing",
    yaxis=dict(
        tickvals=list(range(len(feature_correlations.columns))),
        ticktext=[feature_descriptions[col] for col in feature_correlations.columns],
    ),
)


In [None]:
# Same clustering as before, now on the merged message columns:
# distance = 1 - |correlation|, condensed form, Ward linkage.
feature_distances = 1 - np.abs(feature_correlations)
dist_linkage = hierarchy.ward(squareform(feature_distances))

leaf_labels = messages.columns.to_list()
dendro = hierarchy.dendrogram(dist_linkage, labels=leaf_labels, leaf_rotation=90)

plt.title('Dendrogram of Feature Correlations After Preprocessing')
plt.show()

# How likely are users to repeat each of the detected mistakes?

# How severe are the messages?

## Looking at submissions

### Naively predicting whether the submission was unsuccessful

In [None]:
from sklearn.metrics import matthews_corrcoef

# Matthews correlation between the presence of each message and the submission
# being incorrect (`1 - correct`).
incorrect_submission = 1 - log["correct"]
scores = {
    msg: matthews_corrcoef(messages[msg], incorrect_submission)
    for msg in messages.columns
}

# Sort ascending by score so the bars are ordered.
labels, score = zip(*sorted(scores.items(), key=lambda x: x[1]))
fig = px.bar(x=score, y=[feature_descriptions[label] for label in labels])
fig.update_layout(
    title="Correlation between the presence of each message and whether the submission was unsuccessful",
    # The MCC values are plotted on x and the messages on y; the titles were swapped.
    xaxis_title="MCC",
    yaxis_title="Messages",
)

In [None]:
# Keep the per-message MCC scores as a Series indexed by message code for reuse in later cells.
correlation_scores = pd.Series(scores)

In [None]:
from sklearn.feature_selection import r_regression
from matplotlib import pyplot as plt

# Pearson's r between each message column and submission correctness, ascending.
# Note the target is `correct` here, i.e. the opposite sign of the MCC plot above.
r_scores = sorted(zip(messages.columns, r_regression(messages, log["correct"])), key=lambda x: x[1])
plt.barh(*zip(*r_scores))
# A figure has to stand on its own: give it a title and axis labels.
plt.title("Correlation between the presence of each message and whether the submission was correct")
plt.xlabel("Pearson's r")
plt.ylabel("Message code")
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff


def pretty_confusion_matrix(cm):
    """Show a 2x2 confusion matrix as an annotated Plotly heatmap.

    Args:
        cm: 2x2 array indexed as ``cm[actual, predicted]``, i.e. ``cm[0, 1]`` are
            the false positives and ``cm[1, 0]`` the false negatives.

    The figure is displayed with ``show()``; nothing is returned.
    """
    cell_names = ["True Negatives", "False Positives", "False Negatives", "True Positives"]
    cell_counts = [cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]]
    # Plotly annotation text uses HTML-like markup: a line break is `<br>`;
    # a plain "\n" is not rendered as a new line.
    labels = [f"{label}<br>{count}" for label, count in zip(cell_names, cell_counts)]

    # Create confusion matrix table
    cm_table = ff.create_annotated_heatmap(
        z=cm,
        x=["Predicted 0", "Predicted 1"],
        y=["Actual 0", "Actual 1"],
        colorscale="Blues",
    )
    cm_table.update_layout(
        title_text="Confusion Matrix",
        xaxis=dict(title="Predicted label"),
        yaxis=dict(title="True label"),
    )

    # Replace the default numeric annotations with "<name><br><count>".
    # NOTE(review): relies on the annotations being emitted row by row
    # ([0,0], [0,1], [1,0], [1,1]) — confirm against the installed plotly version.
    for annotation, text in zip(cm_table.layout.annotations, labels):
        annotation.text = text

    # Show confusion matrix
    cm_table.show()


def confusion_matrix_for_message(code):
    """Show the confusion matrix of ``log["correct"]`` against the presence of message ``code``."""
    message_present = messages[code]
    pretty_confusion_matrix(confusion_matrix(log["correct"], message_present))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Target is "submission was incorrect" (1 - correct); a third of the data is held out.
X_train, X_test, y_train, y_test = train_test_split(
    messages, 1 - log["correct"], test_size=0.33, random_state=42
)
# Seed the forest as well; otherwise the scores change on every Restart & Run All
# even though the split itself is fixed.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# Stored as `cv_scores` so that the per-message `scores` dict built for the MCC
# bar chart is not overwritten by an array of fold scores.
cv_scores = np.round(cross_val_score(clf, X_train, y_train, cv=5, scoring="matthews_corrcoef"), 3)
print(f"Scores on validation data for each fold: {cv_scores}.")
print(f"Score on the holdout test set: {matthews_corrcoef(y_test, clf.predict(X_test))}.")


In [None]:
from sklearn.metrics import precision_score, recall_score

pred = clf.predict(X_test)
print('Precision: ', precision_score(y_test, pred))
print('Recall: ', recall_score(y_test, pred))
# `confusion_matrix` takes (y_true, y_pred). With the arguments reversed the matrix
# is transposed and the "False Positives" / "False Negatives" cells trade places.
pretty_confusion_matrix(confusion_matrix(y_test, pred))

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of every message on the held-out set, scored with MCC.
importance = permutation_importance(clf, X_test, y_test, scoring="matthews_corrcoef", random_state=42)

fig = go.Figure()
per_feature = zip(messages.columns, importance.importances_mean, importance.importances)
for code, mean_importance, repeats in per_feature:
    # Only features whose mean importance exceeds 0.003 get a box.
    if not mean_importance > 0.003:
        continue
    box = go.Box(
        x=repeats,
        name=feature_descriptions[code],
        hoverinfo='name',
        hoverlabel=dict(namelength=-1),
    )
    fig.add_trace(box)

fig.update_layout(
    title="Permutation Feature Importance for a Model Trained to Predict the Success of a Submission from the presence of Linter Messages",
    xaxis_title="Importance",
    yaxis_title="Features",
    showlegend=False,
)


#### Why is there positive correlation for some of the messages?

## Looking at sessions

The end of a session is defined as a successful submission, a change to a different task, or no submission for more than 20 minutes.

In [None]:
# A session ends on a correct submission, when the next submission belongs to a
# different item, or when the next submission comes more than 20 minutes later.
session_gap = pd.Timedelta(minutes=20)

new_log = []
for _, user_history in log.groupby('user'):
    # Work on a time-ordered copy of this user's submissions.
    user_history = user_history.sort_values('time').copy()

    # Time until the *next* submission. `time.diff()` measures the gap to the
    # previous one, which flagged the first submission after a pause (the start
    # of the new session) instead of the last submission before it.
    time_to_next = user_history['time'].shift(-1) - user_history['time']

    # The last row of a user is always a session end: its shifted item is NaN,
    # and NaN never compares equal.
    user_history['sessionEnd'] = (
        user_history['correct']
        | user_history['item'].ne(user_history['item'].shift(-1))
        | (time_to_next > session_gap)
    )

    # Every submission inherits the outcome of the submission that closes its
    # session: keep `correct` on session ends only and back-fill the rest.
    user_history['sessionSucess'] = (
        user_history['correct']
        .astype(float)
        .where(user_history['sessionEnd'])
        .bfill()
        .astype(bool)
    )
    new_log.append(user_history)

# Restore index order and keep `messages` aligned with `log` row by row.
log = pd.concat(new_log).sort_index()
messages.sort_index(inplace=True)


### Predicting the success of the session

In [None]:
# The MCC below is computed on presence flags, so the plot is skipped unless
# every message column is boolean.
if (messages.dtypes == bool).all():
    session_failed = 1 - log["sessionSucess"]
    msg_scores = {
        feature_descriptions[msg]: matthews_corrcoef(messages[msg], session_failed)
        for msg in messages.columns
    }

    # Ascending by score: descriptions go on x, MCC values on y.
    x, y = zip(*sorted(msg_scores.items(), key=lambda pair: pair[1]))
    fig = px.bar(x=x, y=y)
    fig.update_layout(
        title="Correlation between the presence of each message and whether the session was unsuccessful",
        xaxis_title="Messages",
        yaxis_title="MCC",
    )
    # Inside an `if` block the figure is not the cell's last expression, so it is
    # never rendered automatically; show it explicitly.
    fig.show()

In [None]:
from sklearn.feature_selection import r_regression
from matplotlib import pyplot as plt

# Pearson's r between each message column and the success of the enclosing session, ascending.
r_scores = sorted(zip(messages.columns, r_regression(messages, log["sessionSucess"])), key=lambda x: x[1])
plt.barh(*zip(*r_scores))
# A figure has to stand on its own: give it a title and axis labels.
plt.title("Correlation between the presence of each message and whether the session was successful")
plt.xlabel("Pearson's r")
plt.ylabel("Message code")
plt.show()

In [None]:
#messages = messages[messages.columns[np.asarray(list(msg_scores.values())) > 0.01]]

In [None]:
from sklearn.naive_bayes import GaussianNB

# Target is "session was unsuccessful" (1 - sessionSucess); a third of the data is held out.
X_train, X_test, y_train, y_test = train_test_split(
    messages, 1 - log["sessionSucess"], test_size=0.33, random_state=0
)
clf = GaussianNB().fit(X_train, y_train)
# Stored as `cv_scores` so that the per-message `scores` dict built for the MCC
# bar chart is not overwritten by an array of fold scores.
cv_scores = np.round(cross_val_score(clf, X_train, y_train, cv=5, scoring="matthews_corrcoef"), 3)
print(f"Scores on validation data for each fold: {cv_scores}.")
print(f"Score on the holdout test set: {matthews_corrcoef(y_test, clf.predict(X_test))}.")


In [None]:
pred = clf.predict(X_test)
print('Precision: ', precision_score(y_test, pred))
print('Recall: ', recall_score(y_test, pred))
# `confusion_matrix` takes (y_true, y_pred). With the arguments reversed the matrix
# is transposed and the "False Positives" / "False Negatives" cells trade places.
pretty_confusion_matrix(confusion_matrix(y_test, pred))

### Look at only the last N submissions in a session

### Time until completion / the next submission

## Looking at students

### Do the messages distinguish successful students from unsuccessful?

## Looking at tasks

In [None]:
from sklearn.metrics import matthews_corrcoef

# Per-item MCC between the presence of each message and the submission being
# incorrect. Items with fewer than 500 submissions are skipped.
mcc_rows = []
index = []
for group in log['item'].unique():
    in_group = log['item'] == group
    # The variable used to hold `correct` itself, which flipped the sign of every
    # value relative to the "unsuccessful submission" convention used elsewhere
    # in this notebook.
    incorrect = 1 - log.loc[in_group, 'correct']
    if len(incorrect) < 500:
        continue
    mcc_rows.append(messages[in_group].apply(lambda col: matthews_corrcoef(col, incorrect), axis=0))
    index.append(group)

# Rows: messages; columns: item names.
df = pd.DataFrame(np.vstack(mcc_rows), index=[item.loc[i]['name'] for i in index], columns=messages.columns).T

In [None]:
# Heatmap of the per-item table built above (rows: messages, columns: items).
px.imshow(df)

In [None]:
from scipy.stats import entropy

def entropy_for_group(group):
    """Return the entropy of the distribution of values in ``group``.

    The values are tallied with ``value_counts`` and the tallies are handed to
    ``scipy.stats.entropy`` with its default settings.
    """
    value_frequencies = group.value_counts()
    return entropy(value_frequencies)

# Entropy of every message column within each item that has at least 800 submissions.
entropy_rows = []
index = []
item_counts = log['item'].value_counts()
for value in item_counts[item_counts >= 800].index:
    submissions_of_item = messages[log['item'] == value]
    entropy_rows.append(submissions_of_item.apply(entropy_for_group))
    index.append(value)

# Rows: messages (description cut to 35 characters); columns: item names.
short_descriptions = [feature_descriptions[msg][:35] for msg in messages.columns]
item_names = [item.loc[i]['name'] for i in index]
df = pd.DataFrame(np.vstack(entropy_rows), index=item_names, columns=short_descriptions).T
df['Mean Entropy'] = df.mean(axis=1)

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Messages with the five highest and the five lowest MCC scores, using the same
# 35-character labels as the rows of `df`.
best_pos_labels = [feature_descriptions[msg][:35] for msg in correlation_scores.sort_values(ascending=False)[:5].index]
best_neg_labels = [feature_descriptions[msg][:35] for msg in correlation_scores.sort_values()[:5].index]

sns.set_theme(rc={'figure.figsize':(15,8)})

# Plot the 25 rows with the highest mean entropy.
top_rows = df.mean(axis=1).sort_values(ascending=False)[:25].index
# Keep the Axes returned by `heatmap`. Chaining `.set_title(...)` bound the name to
# the title's Text object, and the tick handling only worked through `Text.axes`.
ax = sns.heatmap(df.loc[top_rows], vmin=0, vmax=1)
ax.set_title('Entropy of Linter Messages for the Most Frequent Items')

# Red: most positively correlated messages; blue: most negatively correlated ones.
for tick_label in ax.get_yticklabels():
    if tick_label.get_text() in best_pos_labels:
        tick_label.set_color("red")
    if tick_label.get_text() in best_neg_labels:
        tick_label.set_color("blue")

# Highlight the summary column.
for tick_label in ax.get_xticklabels():
    if tick_label.get_text() == 'Mean Entropy':
        tick_label.set_color("blue")