In [None]:
import pandas as pd
import numpy as np
import json
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
from tqdm import tqdm

In [None]:
# Load the task table and the submission log from the local cache directory.
data_path = Path('data/')
item = pd.read_csv(data_path / 'cached_item.csv', index_col=0)

# Append placeholder rows for these task ids so they can be looked up by index label.
# NOTE: the placeholders land at the END of the frame, so positional lookups
# (`.iloc[task_id]`) do not line up with task ids; use `.loc[task_id]`.
item = pd.concat([
    item,
    *[pd.DataFrame({'name': f'unknown_{i}', 'solution': 'pass'}, index=[idx]) for i, idx in enumerate([12, 118, 142, 143, 144, 145, 146])]
])

# load from cache
log = pd.read_csv(data_path / 'dataset.csv')
log['time'] = pd.to_datetime(log['time'])
# SECURITY NOTE(review): eval() executes arbitrary expressions stored in the CSV.
# If the column only ever holds plain list literals, ast.literal_eval would be a
# safer drop-in; confirm the serialized format before switching.
log['linter_messages'] = log['linter_messages'].apply(lambda x: np.array(eval(x)))

# Use a context manager so the file handle is closed deterministically
# (the previous bare open() was never closed).
with open(data_path / 'edulint' / 'features.json', 'r') as features_file:
    feature_descriptions = json.load(features_file)

In [None]:
# Preview the first rows of the submission log loaded above.
log.head()

In [None]:
# Preview the first rows of the task table (indexed by the CSV's first column).
item.head()

In [None]:
## naive model
# The naive distance of a submission is the sum of its `linter_messages` array.
log['distance'] = log['linter_messages'].apply(np.sum)

In [None]:
# Bar chart of the naive distance: one bar per distinct value below `cutoff`,
# plus one overflow bar that collects every submission at or above it.
cutoff = 10

values, counts = np.unique(log[log['distance'] < cutoff]['distance'], return_counts=True)
# The overflow bar is labelled ">= cutoff", so it has to include distance == cutoff.
# A strict ">" here silently dropped those submissions from the chart.
overflow_count = (log['distance'] >= cutoff).sum()

fig = px.bar(x=values.tolist() + [cutoff], y=counts.tolist() + [overflow_count], title='Naive detector - submissions')
fig.update_xaxes(
    tickmode = 'array',
    tickvals = values.tolist() + [cutoff],
    ticktext = list(map(str, values)) + [f'>= {cutoff}']
)
fig.update_layout(
    xaxis_title="Distance from profile", yaxis_title="# submissions"
)
fig.show()

In [None]:
def submission_to_string(idx, log, item):
    """Render one submission and its task as a printable multi-section report.

    Args:
        idx: Index label of the submission row in `log`.
        log: Submission frame with at least the columns `user`, `item`,
            `distance` and `answer`.
        item: Task frame indexed by task id, with at least the columns
            `name`, `instructions` and `solution`.

    Returns:
        A string with the sections SUBMISSION, DISTANCE, INSTRUCTIONS,
        SOLUTION and ANSWER, separated by a line of 50 dashes.

    Raises:
        KeyError: If `idx` is not in `log` or the task id is not in `item`.
    """
    # TODO maybe print side by side
    submission = log.loc[idx]
    # `submission['item']` is a task id, i.e. a LABEL of `item`'s index.
    # `item` is not ordered by id, so a positional `.iloc` lookup can return
    # a different task; `.loc` resolves the id itself.
    task = item.loc[submission['item']]

    separator = '\n' + "-" * 50 + '\n'
    sections = [
        f"SUBMISSION: by user: {submission['user']} of task: {submission['item']}-{task['name']}",
        f"DISTANCE:\n {submission['distance']}",
        f"INSTRUCTIONS:\n {task['instructions']}",
        f"SOLUTION:\n {task['solution']}",
        f"ANSWER:\n {submission['answer']}",
    ]
    return separator.join(sections)

In [None]:
# Print a report for the five submissions with the largest distance.
worst_first = log.sort_values(by='distance', ascending=False)
for idx in worst_first.index[:5]:
    report = submission_to_string(idx, log, item)
    print(report)
    print('\n' * 3)

In [None]:
# task with most mistakes

# Mean distance per task id, sorted from highest to lowest; the last line shows the top five.
mistake_counts = log.groupby('item')['distance'].mean().sort_values(ascending=False)
mistake_counts[:5]

In [None]:
# Bar chart of the mean distance per task, labelled with task names.
# `mistake_counts.index` holds task ids, which are labels of `item`'s index.
# `item` is not ordered by id, so a positional `.iloc[idx]` could attach the
# wrong name to a bar; `.loc` resolves the id itself.
task_names = [item.loc[idx, 'name'] for idx in mistake_counts.index]
fig = px.bar(x=task_names, y=mistake_counts.values, title='Average number of messages per submission for each task')
fig.update_layout(
    yaxis_title="Average number of messages", xaxis_title=""
)
fig

In [None]:
# Print task names ordered by mean distance; fall back to the raw id when the
# task is missing from `item`.
for idx in mistake_counts.index:
    try:
        # Look up by id label: positional `.iloc` does not match ids in `item`.
        print(item.loc[idx, 'name'])
    except KeyError:
        # Catch only the "unknown task id" case instead of swallowing every error.
        print(idx)

In [None]:
# Show name, instructions and reference solution of one task.
# `idx` stays bound afterwards and is read by the task-profile cell that follows.
idx = 128
# Look up by id label, matching the `item.loc[num]` cells further down;
# positional `.iloc` does not correspond to task ids in `item`.
print(item.loc[idx, 'name'])
print(item.loc[idx, 'instructions'])
print(item.loc[idx, 'solution'])

In [None]:
# Replace three feature labels with shorter names (mutates `feature_descriptions` in place).
feature_descriptions[0] = 'c0103_snake_case_naming_style'
feature_descriptions[24] = 'r1705_unnecessary_elif_after_return'
feature_descriptions[20] = 'f841_unused_local_variable'
# Rebinds `cutoff` (earlier the histogram bucket limit) to a minimum mean value.
cutoff = 0.15
# Pair each feature label with the mean of `linter_messages` over the submissions of
# task `idx`; `idx` is whatever an earlier cell left bound.
counts = dict(zip(feature_descriptions, log[log['item'] == idx]['linter_messages'].mean()))
# Keep only features whose mean reaches the cutoff ...
counts = {label:value for label, value in counts.items() if value >= cutoff}
# ... and order them from highest to lowest mean.
counts = dict(sorted(counts.items(), reverse=True, key=lambda x: x[1]))
fig = px.bar(x=counts.keys(), y=counts.values(), title=f'Naive detector - task {idx} profile')
fig.update_layout(
    yaxis_title="Average occurance count", xaxis_title=""
)
fig.show()

In [None]:
# user with most mistakes

# Mean distance per user, sorted from highest to lowest, showing the top five.
log.groupby('user')['distance'].mean().sort_values(ascending=False)[:5]

In [None]:
# Histogram of the per-user mean distance (up to 1000 bins requested).
px.histogram(log.groupby('user')['distance'].mean(), nbins=1000)

In [None]:
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plot the distance of one user's submissions, one subplot per session.
# A new session starts when more than one hour passed since the previous submission.
user_id = 18379796
user_history = log[log['user'] == user_id].copy()
# `final` marks the last submission before the task id changes (and the very last row).
user_history['final'] = np.append(user_history['item'][:-1].values != user_history['item'][1:].values, True)
# Gap between consecutive submissions, by POSITION. Subtracting two shifted Series
# slices (`time[1:] - time[:-1]`) aligns on index labels instead, which yields only
# zeros/NaT and therefore never detects a session break.
gaps = user_history['time'].diff().iloc[1:]
# Entry i is the position of the last submission of a session (gap to i + 1 > 1 h).
session_breakpoints = np.nonzero((gaps > pd.Timedelta(1, 'h')).to_numpy())[0].tolist()

fig = make_subplots(rows=math.ceil((len(session_breakpoints) + 1) / 4), cols=4)

start = 0
for i, end in enumerate(session_breakpoints + [len(user_history)]):
    # Positional slice covering rows start..end inclusive.
    session = user_history.iloc[start:end + 1]
    fig.add_trace(
        go.Scatter(
            # NOTE(review): x is the row index while the tick format below is a date
            # format; confirm whether `session['time']` was intended here.
            x=session.index,
            y=session['distance'],
            text='task id ' + session['item'].astype(str),
            mode='lines+markers',
            marker=dict(
                color=session['correct'].apply(lambda x: 'green' if x else 'red'),
                symbol=session['final'].apply(lambda x: 'x' if x else 'circle'),
                size=10
            ),
        ),
        col=i % 4 + 1, row=i // 4 + 1
    )
    start = end + 1

# Figure-wide settings do not depend on the session, so apply them once after the loop.
fig.update_layout(
    margin=dict(l=0,r=0,b=0,t=40),
    showlegend=False,
    title=f'Sessions of user id {user_id}'
)
fig.update_xaxes(
    tickformat="%H:%M<br>%d-%m"
)

fig.show()

In [None]:
# Bar chart of the features whose mean count for `user_id` reaches the cutoff,
# ordered from highest to lowest mean.
cutoff = 0.15
user_mean = log[log['user'] == user_id]['linter_messages'].mean()
per_feature = dict(zip(feature_descriptions, user_mean))
frequent = [(label, value) for label, value in per_feature.items() if value >= cutoff]
frequent.sort(key=lambda pair: pair[1], reverse=True)
counts = dict(frequent)
fig = px.bar(
    x=counts.keys(),
    y=counts.values(),
    title=f'Naive detector - user {user_id} profile',
)
fig.update_layout(yaxis_title="Average occurance count", xaxis_title="")
fig.show()

In [None]:
from src.linter_profile import make_task_means, make_task_profiles

# Attach two per-task columns computed by project helpers from the task table and the log.
# NOTE(review): presumably `msg_mean` is a per-task mean message vector and `profile`
# a derived task profile; confirm in src.linter_profile.
item['msg_mean'] = make_task_means(item, log)
item['profile'] = make_task_profiles(item, log)

In [None]:
from src.linter_profile import make_user_profiles

# Per-row profile column from the project helper, given the task means in `item['msg_mean']`.
# NOTE(review): the exact profile semantics live in src.linter_profile and are not visible here.
log['profile'] = make_user_profiles(log, task_means=item['msg_mean'])

In [None]:
from src.model import DistanceModel

# Overwrites the naive `distance` column (sum of messages) with DistanceModel's output,
# so earlier cells that read `distance` give different results if re-run after this one.
log['distance'] = DistanceModel().fit_predict(log['profile'], log['linter_messages'])

In [None]:
from sklearn.preprocessing import normalize


# Stack the per-submission message arrays into one matrix and L2-normalize each row.
# NOTE(review): `messages` is not read again in the visible part of the notebook.
messages = normalize(np.vstack(log['linter_messages']), 'l2', axis=1)

In [None]:
def only_not_first(log):
    """Return the rows of `log` that are not their user's earliest submission.

    The earliest submission of a user is the row whose index label equals
    the `idxmin` of `time` within that user's group.
    """
    earliest_label = log.groupby('user')['time'].transform('idxmin')
    is_first = earliest_label == log.index
    return log[~is_first]


def only_last(log):
    """Return, for each user, the row of `log` with that user's latest `time`."""
    latest_labels = log.groupby('user')['time'].idxmax()
    return log.loc[latest_labels]

In [None]:
import umap.umap_ as umap

# Embed the task profiles in 2-D with UMAP, then choose the KMeans cluster count
# (3..19) with the best silhouette score on that embedding and plot the result.
profiles = np.vstack(item['profile'])
# Seed UMAP so the embedding, and the clustering built on it, is reproducible.
reducer = umap.UMAP(random_state=42)
reducer.fit(profiles)
embedding = reducer.transform(profiles)

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

target = embedding

scores = []
best_score = -2  # below the silhouette minimum of -1, so the first model always wins
best_k = 3
kmeans = None
for n_clusters in tqdm(range(3, 20)):
    # Seed KMeans as well; otherwise the chosen k and the labels vary between runs.
    new = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42).fit(target)
    score = silhouette_score(target, new.predict(target), random_state=42, sample_size=25000)
    scores.append(score)
    print(np.round(score, 2), end='     ')
    if score > best_score:
        best_score = score
        best_k = n_clusters
        kmeans = new

labels = kmeans.predict(target)

centroids = kmeans.cluster_centers_

import plotly.express as px

# Points are tasks coloured by cluster; the second trace overlays the cluster centroids.
fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=labels.astype(str), hover_name=item.index)
fig.add_trace(px.scatter(x=centroids[:, 0], y=centroids[:, 1], size=np.zeros(centroids.shape[0]) + 1, opacity=.5).data[0])
fig.show()

In [None]:
# Print name and reference solution of the task with index label 9.
num = 9
print(item.loc[num]['name'])
print(item.loc[num]['solution'])

In [None]:
# Print name and reference solution of the task with index label 19.
num = 19
print(item.loc[num]['name'])
print(item.loc[num]['solution'])

In [None]:
import umap.umap_ as umap

# NOTE(review): `only_last_profile`, `user_profiles` and `new_log` are first assigned in the
# "# user profiles" cell further down this notebook; on a fresh top-to-bottom run this cell
# raises NameError. Confirm the intended cell order.
# Choose which submissions to keep: only `final` rows, or every row that is not `first`.
if only_last_profile:
    filtered_log = new_log[new_log['final'] == True]
else:
    filtered_log = new_log[new_log['first'] == False]

# Choose what to embed: per-submission user profiles or per-task profiles.
if user_profiles:
    profiles = np.vstack(filtered_log['freq_profile'])
else:
    profiles = np.vstack(item['profile'])

# Fit UMAP on the chosen profiles and project the same rows.
reducer = umap.UMAP()
reducer.fit(profiles)
embedding = reducer.transform(profiles)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# NOTE(review): `cluster_profiles` is not assigned anywhere in the visible source.
# Cluster either the raw profiles or their 2-D embedding.
if cluster_profiles:
    target = profiles
else:
    target = embedding

# Try k = 3..19 and keep the model with the highest silhouette score.
scores = []
best_score = -2
best_k = 3
kmeans = None
for n_clusters in tqdm(range(3, 20)):
    # NOTE(review): KMeans is not seeded here, so the selected model can vary between runs.
    new = KMeans(n_clusters=n_clusters, n_init='auto').fit(target)
    score = silhouette_score(target, new.predict(target), random_state=42, sample_size=25000)
    scores.append(score)
    print(np.round(score, 2), end='     ')
    if score > best_score:
        best_score = score
        best_k = n_clusters
        kmeans = new
        
labels = kmeans.predict(target)

In [None]:
# Centroids in plot coordinates: when clustering ran on the raw profiles they are passed
# through the fitted reducer; otherwise they are already in embedding space.
# NOTE(review): `cluster_profiles` is not assigned anywhere in the visible source.
if cluster_profiles:
    centroids = reducer.transform(kmeans.cluster_centers_)
else:
    centroids = kmeans.cluster_centers_

In [None]:
import plotly.express as px

# Scatter of the embedding coloured by cluster label; each point carries its row position as text.
fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=labels.astype(str), text=range(embedding.shape[0]))
# Overlay the cluster centroids as a second, semi-transparent trace.
fig.add_trace(px.scatter(x=centroids[:, 0], y=centroids[:, 1], size=np.zeros(centroids.shape[0]) + 1, opacity=.5).data[0])
fig.show()

In [None]:
# Pick one user to highlight.
# NOTE(review): element 300 of list(set(...)) depends on set iteration order, not on any
# stable ordering of users; confirm this is only meant as an arbitrary pick.
user = list(set(filtered_log['user']))[300]

# Refits the selected KMeans model on the embedding and rebinds `labels`.
# NOTE(review): the scatter below does not read `labels`; confirm the refit is intended.
labels = kmeans.fit_predict(embedding)
# Colour and size mark the rows of `filtered_log` that belong to `user`.
# NOTE(review): this assumes `embedding` has one row per `filtered_log` row, which only
# holds when the embedding was built from `filtered_log['freq_profile']`.
px.scatter(
    x=embedding[:, 0],
    y=embedding[:, 1],
    color=(filtered_log['user'] == user).astype(str),
    opacity=.5,
    size=((filtered_log['user'] == user) * 5 + 1),
    text=range(embedding.shape[0])
)

In [None]:
# Bar chart comparing the `freq_profile` vectors of four hard-coded rows of `filtered_log`,
# transposed so each feature is one x position.
# NOTE(review): the four ids are magic numbers; presumably index labels picked by hand from
# an earlier plot. Confirm they still exist in `filtered_log`.
fig = px.bar(np.vstack(filtered_log['freq_profile'][[10532, 8598, 5492, 9655]]).T)
fig.show()


In [None]:
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Session plot for one user based on `new_log`, one subplot per session.
# NOTE(review): `new_log` is first assigned in the "# user profiles" cell further down, and
# the column `dist_from_profile` is not created anywhere in the visible source.
user_id = 14723775 # filtered_log['user'].iloc[2170]
user_history = new_log[new_log['user'] == user_id]
# NOTE(review): this compares differences of the frame's INDEX with a one-hour Timedelta,
# which only makes sense for a datetime index; `new_log` is built from `log`, which is read
# without an index column, so confirm whether the `time` column was intended.
session_breakpoints = np.nonzero((user_history.index[1:] - user_history.index[:-1]) > pd.Timedelta(1, 'h'))[0].tolist()

# Four subplots per row, enough rows for every session.
fig = make_subplots(rows=math.ceil((len(session_breakpoints) + 1) / 4), cols=4)

start = 0
for i, end in enumerate(session_breakpoints + [len(user_history)]):
    # Rows start..end inclusive form one session.
    session = user_history[start:end + 1]
    fig.add_trace(
        go.Scatter(
            x=session.index,
            y=session['dist_from_profile'],
            text='task id ' + session['item'].astype(str),
            mode='lines+markers',
            marker=dict(
                # Green for correct submissions, red otherwise; 'x' marks `final` rows.
                color=session['correct'].apply(lambda x: 'green' if x else 'red'),
                symbol=session['final'].apply(lambda x: 'x' if x else 'circle'),
                size=10
            ),
        ),
        col=i % 4 + 1, row=i // 4 + 1
    )
    # These figure-wide settings are re-applied on every iteration with the same values.
    fig.update_layout(
        margin=dict(l=0,r=0,b=0,t=40),
        showlegend=False,
        title=f'Sessions of user id {user_id}'
    )
    fig.update_xaxes(
        tickformat="%H:%M<br>%d-%m"
    )
    start = end + 1

fig.show()

In [None]:
# Rows not flagged `first`, ordered by `dist_from_profile` descending, top five.
# NOTE(review): `dist_from_profile` is not created anywhere in the visible source.
new_log[new_log['first'] == False].sort_values('dist_from_profile', ascending=False).head()

In [None]:
# Print every answer user 14723775 submitted for task 66.
for i, submission in new_log[(new_log['user'] == 14723775) & (new_log['item'] == 66)].iterrows():
    print(submission['answer'])

In [None]:
# Print every answer user 52565485 submitted for task 66.
for i, submission in new_log[(new_log['user'] == 52565485) & (new_log['item'] == 66)].iterrows():
    print(submission['answer'])

In [None]:
# Print every answer user 41811185 submitted for task 74.
for i, submission in new_log[(new_log['user'] == 41811185) & (new_log['item'] == 74)].iterrows():
    print(submission['answer'])

In [None]:
# Print every answer user 39127736 submitted for task 39.
for i, submission in new_log[(new_log['user'] == 39127736) & (new_log['item'] == 39)].iterrows():
    print(submission['answer'])

In [None]:
# user profiles
from src.linter_profile import freq_profile
from sklearn.linear_model import Ridge
from scipy.spatial.distance import euclidean, cosine
from sklearn.preprocessing import normalize

# Switches read by cells of this notebook. Within this cell only `subtract_task` is used;
# `user_profiles` and `only_last_profile` are read by the embedding cell that appears
# earlier in the file.
user_profiles = True
euclidean_distance = True
linear_model = False
subtract_task = False
only_last_profile = False

# Build `new_log`: the log re-assembled user by user, each user's rows sorted by time
# and extended with the columns `final`, `first`, `norm_messages` and `freq_profile`.
result = []
for user_id in set(log['user']):
    user_history = log[log['user'] == user_id].sort_values('time')

    # `final`: last row before the task id changes, plus the user's very last row.
    user_history['final'] = np.append(user_history['item'][:-1].values != user_history['item'][1:].values, True)
    # `first`: only the user's chronologically first row.
    user_history['first'] = [True] + [False] * (len(user_history) - 1)

    # Each message vector reshaped to one row and passed through sklearn's normalize.
    user_history['norm_messages'] = user_history['linter_messages'].apply(lambda x: normalize(x.reshape(1, -1)))

    if subtract_task:
        # NOTE(review): `item['mean']` is not created in the visible source (the task-means
        # cell stores `item['msg_mean']`); this branch is disabled by the flag above, so
        # confirm the column name before enabling it.
        user_history['freq_profile'] = freq_profile(np.vstack(user_history['linter_messages']), np.vstack(item['mean'][user_history['item']])).tolist()
    else:
        # NOTE(review): freq_profile is a project helper; presumably it returns one profile
        # row per submission. Confirm in src.linter_profile.
        user_history['freq_profile'] = freq_profile(np.vstack(user_history['linter_messages'])).tolist()

    result.append(user_history)

new_log = pd.concat(result)


In [None]:
# X: stacked `freq_profile` rows; y: stacked `linter_messages` rows (one row per submission).
X, y = np.vstack(new_log['freq_profile']), np.vstack(new_log['linter_messages'])

In [None]:
from sklearn.linear_model import Ridge

# Fit a Ridge regression from profiles to messages and display its score on the SAME data
# it was trained on (no held-out split).
model = Ridge().fit(X, y)
model.score(X, y)

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between profile and messages, computed column by column.
predictions = X  # model.predict(X) / X

per_dim = [pearsonr(predictions[:, dim], y[:, dim]) for dim in range(y.shape[1])]
correlations = [cor for cor, _ in per_dim]
pvalues = [p for _, p in per_dim]

# One bar per dimension, annotated with the p-value rounded to two decimals.
rounded_pvalues = np.round(np.array(pvalues), 2)
fig = px.bar(correlations, text=rounded_pvalues)
fig.show()

In [None]:
# Heatmap of the pairwise column correlations of X, colour scale fixed to [-1, 1].
px.imshow(pd.DataFrame(X).corr(), zmin=-1, zmax=1)

In [None]:
# Heatmap of the pairwise column correlations of y, colour scale fixed to [-1, 1].
px.imshow(pd.DataFrame(y).corr(), zmin=-1, zmax=1)

In [None]:
import plotly.graph_objects as go

# Heatmap of the Ridge coefficient matrix; each cell's text reads "<x label> -> <y label>".
# The comprehension's `x` and `y` are local to it and do not rebind the notebook-level `y`.
fig = go.Figure(data=go.Heatmap(z=model.coef_, text=[[f'{x} -> {y}' for x in feature_descriptions] for y in feature_descriptions]))
# Anchor the x scale to y so the cells are drawn square.
fig.update_layout(xaxis=dict(scaleanchor='y',constrain='domain'))
fig.show()