In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from tqdm import tqdm

In [None]:
# Root directory of all cached inputs used by this notebook.
data_path = Path('data/')

# Task table; the first CSV column becomes the index (used below as task id).
item = pd.read_csv(data_path / 'cached_item.csv', index_col=0)

# Load the cached submission log and index it by parsed timestamp.
log = pd.read_csv(data_path / 'dataset.csv')
log['time'] = pd.to_datetime(log['time'])
# SECURITY NOTE(review): eval() executes whatever text is stored in the
# 'linter_messages' column. This is only acceptable while dataset.csv is a
# trusted, locally generated cache; if the cells are plain list literals,
# ast.literal_eval would be the safe replacement — confirm the stored format.
log['linter_messages'] = log['linter_messages'].apply(lambda x: np.array(eval(x)))
log = log.set_index('time')

# Use a context manager so the file handle is closed deterministically.
with open(data_path / 'edulint' / 'features.json', 'r') as features_file:
    feature_descriptions = json.load(features_file)

In [None]:
# task profiles
from src.linter_profile import task_profile

# Placeholder rows for task ids 12 and 118 (presumably they occur in the log
# but are missing from the cached item table — TODO confirm). Only ids that are
# not already in the index are appended, so re-running this cell cannot create
# duplicate index entries.
missing_ids = [task_id for task_id in (12, 118) if task_id not in item.index]
if missing_ids:
    item = pd.concat([
        item,
        pd.DataFrame({'name': 'unknown', 'solution': 'pass'}, index=missing_ids),
    ])

# Length of one linter-message vector, taken from the first log row; used for
# the all-zero fallback of tasks that have no submissions.
n_features = log['linter_messages'].iloc[0].shape[0]

profiles = []
means = []
for task_id in item.index:
    history = log['linter_messages'][log['item'] == task_id]
    if len(history) == 0:
        # No submissions for this task: fall back to zero vectors.
        profiles.append(np.zeros(n_features))
        means.append(np.zeros(n_features))
    else:
        # Stack once into a (submissions, features) matrix and reuse it for
        # both the profile and the per-feature mean, instead of relying on a
        # reduction over an object-dtype Series of arrays.
        stacked = np.vstack(history)
        profiles.append(task_profile(stacked))
        means.append(stacked.mean(axis=0))
item['profile'] = profiles
item['mean'] = means

In [None]:
# user profiles
from src.linter_profile import freq_profile
from sklearn.linear_model import Ridge
from scipy.spatial.distance import euclidean, cosine
from sklearn.preprocessing import normalize

# Switches read by this cell and by the cells below.
user_profiles = True        # embed user profiles (True) or task profiles (False)
euclidean_distance = True   # euclidean (True) or cosine (False) distance
linear_model = True         # pass profiles through a Ridge model before comparing
subtract_task = True        # hand the per-task mean vectors to freq_profile
only_last_profile = True    # keep only rows flagged 'final' when filtering
cluster_profiles = True     # cluster raw profiles (True) or the 2-D embedding (False)

result = []
for user_id in set(log['user']):
    # Submissions of one user in chronological order ('time' is the index name).
    user_history = log[log['user'] == user_id].sort_values('time')

    # 'final': the next submission belongs to a different task (always True on
    # the user's last row). 'first': the user's very first submission.
    items = user_history['item'].to_numpy()
    user_history['final'] = np.append(items[:-1] != items[1:], True)
    user_history['first'] = [True] + [False] * (len(user_history) - 1)

    # Each message vector normalised as a single sample; every cell keeps the
    # (1, n_features) shape that sklearn's normalize returns.
    user_history['norm_messages'] = user_history['linter_messages'].apply(lambda x: normalize(x.reshape(1, -1)))

    messages = np.vstack(user_history['linter_messages'])
    if subtract_task:
        # Second argument: the 'mean' vector of the task of each submission.
        task_means = np.vstack(item['mean'].loc[user_history['item'].to_numpy()])
        user_history['freq_profile'] = freq_profile(messages, task_means).tolist()
    else:
        user_history['freq_profile'] = freq_profile(messages).tolist()

    result.append(user_history)

new_log = pd.concat(result)

if linear_model:
    # Fit profile -> raw messages and replace the profile by the fitted values.
    X, y = np.vstack(new_log['freq_profile']), np.vstack(new_log['linter_messages'])
    model = Ridge().fit(X, y)
    new_log['freq_profile'] = model.predict(X).tolist()

# Stack both sides into (rows, features) matrices once. This yields 1-D rows for
# the scipy distance functions (recent SciPy releases reject the (1, n) arrays
# stored in 'norm_messages') and replaces one model.predict call per row by a
# single batched call.
profile_matrix = np.vstack(new_log['freq_profile'])
actual_matrix = np.vstack(new_log['norm_messages'])
if linear_model:
    # NOTE(review): the model was fitted on freq_profile -> linter_messages, yet
    # it is applied to the normalised messages here (kept from the original
    # cell) — confirm this is the intended comparison.
    actual_matrix = model.predict(actual_matrix)

distance = euclidean if euclidean_distance else cosine
new_log['dist_from_profile'] = [
    distance(profile, actual) for profile, actual in zip(profile_matrix, actual_matrix)
]

In [None]:
import umap.umap_ as umap

# Select the log rows whose profiles are embedded: either only rows flagged
# 'final', or every row except each user's first one.
if only_last_profile:
    filtered_log = new_log[new_log['final']]
else:
    filtered_log = new_log[~new_log['first']]

# One row per profile: user profiles from the filtered log, or task profiles.
if user_profiles:
    profiles = np.vstack(filtered_log['freq_profile'])
else:
    profiles = np.vstack(item['profile'])

# Seed UMAP so the 2-D layout (and everything picked from the plots below) is
# reproducible across kernel restarts. Note that umap-learn runs single-threaded
# when a random_state is given.
reducer = umap.UMAP(random_state=42)
reducer.fit(profiles)
embedding = reducer.transform(profiles)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Cluster either the raw profile vectors or their 2-D embedding.
if cluster_profiles:
    target = profiles
else:
    target = embedding

# Try k = 3..19 and keep the model with the highest silhouette score.
scores = []
best_score = -2  # below the minimum possible silhouette (-1), so the first k always wins
best_k = 3
kmeans = None
for n_clusters in tqdm(range(3, 20)):
    # Seeded like the silhouette sampling below, so the chosen k and the
    # cluster labels are reproducible between runs.
    candidate = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42).fit(target)
    score = silhouette_score(target, candidate.predict(target), random_state=42, sample_size=25000)
    scores.append(score)
    print(np.round(score, 2), end='     ')
    if score > best_score:
        best_score = score
        best_k = n_clusters
        kmeans = candidate

# Cluster assignment of every row of `target` under the selected model.
labels = kmeans.predict(target)

In [None]:
# Cluster centres for plotting: centres found on the raw profiles are passed
# through the fitted reducer; centres found on the embedding are used as they are.
centroids = (
    reducer.transform(kmeans.cluster_centers_)
    if cluster_profiles
    else kmeans.cluster_centers_
)

In [None]:
import plotly.express as px

# Embedded points coloured by cluster label; the hover/text value is the row
# position within `embedding`.
fig = px.scatter(
    x=embedding[:, 0],
    y=embedding[:, 1],
    color=labels.astype(str),
    text=range(embedding.shape[0]),
)

# Overlay the cluster centres as semi-transparent markers of uniform size.
centroid_sizes = np.ones(centroids.shape[0])
centroid_scatter = px.scatter(
    x=centroids[:, 0],
    y=centroids[:, 1],
    size=centroid_sizes,
    opacity=.5,
)
fig.add_trace(centroid_scatter.data[0])
fig.show()

In [None]:
# Pick one user (position 300 in the de-duplicated user list; raises IndexError
# when there are fewer than 301 users) and highlight their rows in the embedding.
user = list(set(filtered_log['user']))[300]

# The original cell also ran `labels = kmeans.fit_predict(embedding)` here. The
# plot below uses neither result, and the call refitted the selected model on
# the 2-D embedding, silently invalidating `kmeans`/`labels` for the other
# cells. It has been removed.
is_selected = filtered_log['user'] == user

px.scatter(
    x=embedding[:, 0],
    y=embedding[:, 1],
    color=is_selected.astype(str),
    opacity=.5,
    size=(is_selected * 5 + 1),  # selected user's points: size 6, all others: size 1
    text=range(embedding.shape[0])
)

In [None]:
# Compare the profiles of four hand-picked rows. The numbers are row positions
# in `filtered_log` (the same numbering the scatter plots show as point text),
# so they are looked up with .iloc: the Series is indexed by time, and integer
# keys through plain [] only work via a deprecated positional fallback.
fig = px.bar(np.vstack(filtered_log['freq_profile'].iloc[[10532, 8598, 5492, 9655]]).T)
fig.show()


In [None]:
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plot distance-from-profile over time for one user, one subplot per session.
user_id = filtered_log['user'].iloc[2170]
user_history = new_log[new_log['user'] == user_id]

# A gap of more than one hour between consecutive rows ends a session; entry i
# in `session_breakpoints` means row i is the last row of its session.
gaps = user_history.index[1:] - user_history.index[:-1]
session_breakpoints = np.nonzero(gaps > pd.Timedelta(1, 'h'))[0].tolist()

# Grid of four columns with as many rows as needed for all sessions.
fig = make_subplots(rows=math.ceil((len(session_breakpoints) + 1) / 4), cols=4)

start = 0
# The appended len(user_history) closes the last session.
for i, end in enumerate(session_breakpoints + [len(user_history)]):
    # Positional slice, inclusive of the breakpoint row itself.
    session = user_history.iloc[start:end + 1]
    fig.add_trace(
        go.Scatter(
            x=session.index,
            y=session['dist_from_profile'],
            text='task id ' + session['item'].astype(str),
            mode='lines+markers',
            marker=dict(
                # green/red by the 'correct' flag; 'x' marks rows flagged 'final'
                color=session['correct'].apply(lambda x: 'green' if x else 'red'),
                symbol=session['final'].apply(lambda x: 'x' if x else 'circle'),
                size=10
            ),
        ),
        col=i % 4 + 1, row=i // 4 + 1
    )
    start = end + 1

# Figure-wide settings do not depend on the session, so they are applied once
# after the loop instead of on every iteration.
fig.update_layout(
    margin=dict(l=0,r=0,b=0,t=40),
    showlegend=False,
    title=f'Sessions of user id {user_id}'
)
fig.update_xaxes(
    tickformat="%H:%M<br>%d-%m"
)

fig.show()

In [None]:
# The five rows with the largest distance from the profile, ignoring each
# user's first submission.
(
    new_log
    .loc[~new_log['first']]
    .sort_values(by='dist_from_profile', ascending=False)
    .head()
)

In [None]:
# Print the answer text of the first matching row: user 18489457 on task 39.
print(new_log.loc[(new_log['user'] == 18489457) & (new_log['item'] == 39), 'answer'].iloc[0])

In [None]:
# Print the answer text of the first matching row: user 28707279 on task 73.
print(new_log.loc[(new_log['user'] == 28707279) & (new_log['item'] == 73), 'answer'].iloc[0])

In [None]:
# Print the answer text of the first matching row: user 28700003 on task 39.
print(new_log.loc[(new_log['user'] == 28700003) & (new_log['item'] == 39), 'answer'].iloc[0])

In [None]:
# user profiles
from src.linter_profile import freq_profile
from sklearn.linear_model import Ridge
from scipy.spatial.distance import euclidean, cosine
from sklearn.preprocessing import normalize

# Switches for this second run of the user-profile computation.
user_profiles = True        # embed user profiles (True) or task profiles (False)
euclidean_distance = True   # euclidean (True) or cosine (False) distance
linear_model = False        # pass profiles through a Ridge model before comparing
subtract_task = True        # hand the per-task mean vectors to freq_profile
only_last_profile = False   # keep only rows flagged 'final' when filtering

result = []
for user_id in set(log['user']):
    # Submissions of one user in chronological order ('time' is the index name).
    user_history = log[log['user'] == user_id].sort_values('time')

    # 'final': the next submission belongs to a different task (always True on
    # the user's last row). 'first': the user's very first submission.
    items = user_history['item'].to_numpy()
    user_history['final'] = np.append(items[:-1] != items[1:], True)
    user_history['first'] = [True] + [False] * (len(user_history) - 1)

    # Each message vector normalised as a single sample; every cell keeps the
    # (1, n_features) shape that sklearn's normalize returns.
    user_history['norm_messages'] = user_history['linter_messages'].apply(lambda x: normalize(x.reshape(1, -1)))

    messages = np.vstack(user_history['linter_messages'])
    if subtract_task:
        # Second argument: the 'mean' vector of the task of each submission.
        task_means = np.vstack(item['mean'].loc[user_history['item'].to_numpy()])
        user_history['freq_profile'] = freq_profile(messages, task_means).tolist()
    else:
        user_history['freq_profile'] = freq_profile(messages).tolist()

    result.append(user_history)

new_log = pd.concat(result)

# Column the profile is compared against: normalised messages by default, raw
# messages when the profile has been replaced by the linear model's output.
target = 'norm_messages'

if linear_model:
    target = 'linter_messages'
    X, y = np.vstack(new_log['freq_profile']), np.vstack(new_log['linter_messages'])
    model = Ridge().fit(X, y)
    new_log['freq_profile'] = model.predict(X).tolist()

# Stack both sides into (rows, features) matrices so that each comparison gets
# 1-D vectors: the cells of 'norm_messages' are (1, n) arrays, which recent
# SciPy releases reject in scipy.spatial.distance functions.
profile_matrix = np.vstack(new_log['freq_profile'])
actual_matrix = np.vstack(new_log[target])

distance = euclidean if euclidean_distance else cosine
new_log['dist_from_profile'] = [
    distance(profile, actual) for profile, actual in zip(profile_matrix, actual_matrix)
]

In [None]:
# Regression inputs: one row per log entry. X holds the profile vectors, y the
# raw linter-message vectors.
X = np.vstack(new_log['freq_profile'])
y = np.vstack(new_log['linter_messages'])

In [None]:
from sklearn.linear_model import Ridge

# Fit a ridge regression from profiles (X) to raw messages (y) and display its
# score on the same data it was fitted on.
model = Ridge()
model.fit(X, y)
model.score(X, y)

In [None]:
from scipy.stats import pearsonr

# Values correlated against y, column by column. Currently the raw profiles;
# the alternative noted in the original was model.predict(X).
predictions = X

# One Pearson test per output dimension: column `dim` of predictions against
# column `dim` of y.
per_dimension = [
    pearsonr(predictions[:, dim], y[:, dim])
    for dim in range(y.shape[1])
]
correlations = [cor for cor, _ in per_dimension]
pvalues = [p for _, p in per_dimension]

# Bar per dimension, annotated with the p-value rounded to two decimals.
pvalue_labels = np.round(np.array(pvalues), 2)
fig = px.bar(correlations, text=pvalue_labels)
fig.show()

In [None]:
# Heatmap of the fitted ridge coefficients; rows are targets, columns are features.
px.imshow(img=model.coef_)

In [None]:
# Pairwise correlation between the columns of X, colour scale fixed to [-1, 1].
px.imshow(img=pd.DataFrame(data=X).corr(), zmin=-1, zmax=1)

In [None]:
# Pairwise correlation between the columns of y, colour scale fixed to [-1, 1].
px.imshow(img=pd.DataFrame(data=y).corr(), zmin=-1, zmax=1)