In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from src.code_processing import decode_code_string
from src.linting import analyze_strings

# Flip to True to rebuild the cached CSVs from the raw exports;
# with False the previously cached files are loaded instead.
retrain_data = False

data_path = Path('data/')

if retrain_data:
    # --- tasks: load the raw export, keep name + solution, cache ---
    # NOTE(review): eval() executes whatever text is stored in the raw
    # 'solution' column. Only run this on a trusted export; if the values are
    # plain Python literals, ast.literal_eval would be the safer choice.
    item = (
        pd.read_csv(data_path / 'umimeprogramovatcz-ipython_item.csv', sep=';', index_col=0)
        .loc[:, ['name', 'solution']]
        .assign(
            solution=lambda df: df['solution']
            .apply(lambda raw: eval(raw)[0][1])
            .apply(decode_code_string)
        )
    )
    item.to_csv(data_path / 'cached_item.csv')

    # --- submissions: load the raw export, clean, index by time, cache ---
    # TODO drop rows referring to nonexistent items?
    log = (
        pd.read_csv(data_path / 'umimeprogramovatcz-ipython_log.csv', sep=';')
        .drop_duplicates()
        .dropna()
        .assign(
            time=lambda df: pd.to_datetime(df['time']),
            answer=lambda df: df['answer'].apply(decode_code_string),
        )
        .set_index('time')
    )
    # Discard submissions whose decoded answer is empty or whitespace-only.
    has_content = log['answer'].str.strip().astype(bool)
    log = log[has_content]
    log.to_csv(data_path / 'cached_log.csv')
else:
    # Tasks straight from the cache.
    item = pd.read_csv(data_path / 'cached_item.csv', index_col=0)

    # Submissions from the cache; the time column is parsed and restored as the index.
    log = pd.read_csv(data_path / 'cached_log.csv')
    log = log.assign(time=pd.to_datetime(log['time'])).set_index('time')

In [None]:
import plotly.express as px
# Number of submissions made by each user.
counts = log['user'].value_counts()

# Plot only users with more than 5 and fewer than 600 submissions.
mid_activity = counts[counts.gt(5) & counts.lt(600)]
fig = px.histogram(mid_activity, nbins=1000, title='Histogram of user activity')
# poisson?
fig.update_layout(xaxis_title="# submissions", yaxis_title="# users")

In [None]:
print('Looking for active but also representative users...')

# Half-open range [start, stop) of submissions per user.
start = 33  # alternative tried: 40
stop = 38  # alternative tried: 80
selected_counts = counts[counts.ge(start) & counts.lt(stop)]

# Report how many users fall in the range and what share of the log they cover.
print(
    f'In the range of {start} to {stop} submissions found {selected_counts.shape[0]} users, '
    f'with total {selected_counts.sum()} submissions, '
    f'corresponding to {selected_counts.sum() / log.shape[0] * 100}% of the data'
)

In [None]:
# Keep only the submissions made by the selected users.
log = log[log['user'].isin(selected_counts.index)]

In [5]:
# Results file is keyed by the selected submission-count range.
edulint_result_path = Path(f'data/edulint/edulint_results_{start}-{stop}.json')

# Feed the submitted code strings (in log order) to the linter wrapper.
submitted_code = (code for _, code in log['answer'].items())
result = analyze_strings(submitted_code, result_path=edulint_result_path)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Read the stored linter output; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open(edulint_result_path, 'r') as results_file:
    messages_per_submission = json.load(results_file)

# One whitespace-joined "document" of linter message tokens per submission.
result = [' '.join(alist) for alist in messages_per_submission]

# Count token occurrences per submission; tokens occurring in fewer than
# 0.1 % of the submissions are dropped (min_df=0.001).
vectorizer = CountVectorizer(min_df=0.001)
vectors = vectorizer.fit_transform(result)

# Store each submission's count vector as its own numpy array in the log.
# Assumes the JSON entries are in the same order as the rows of `log` — TODO confirm.
log['linter_messages'] = list(map(np.array, vectors.toarray().tolist()))

In [None]:
# Display the vocabulary kept by the fitted CountVectorizer.
vectorizer.get_feature_names_out()

In [None]:
from src.linter_profile import task_profile

# TODO add in preprocessing
# Placeholder row for task id 12. The membership check keeps the cell
# idempotent: without it every re-run appended another row labelled 12, and
# duplicate index labels break the per-task lookups on `item` further down.
placeholder_task_id = 12
if placeholder_task_id not in item.index:
    item = pd.concat([
        item,
        pd.DataFrame({'name': 'unknown', 'solution': 'pass'}, index=[placeholder_task_id]),
    ])

# Length of one linter-message vector; constant across the loop.
n_features = len(vectorizer.get_feature_names_out())

profiles = []
means = []
for task_id in item.index:
    # Linter-message vectors of all submissions to this task.
    history = log['linter_messages'][log['item'] == task_id]
    if len(history) == 0:
        # No submissions for this task: fall back to all-zero vectors.
        profiles.append(np.zeros(n_features))
        means.append(np.zeros(n_features))
    else:
        profiles.append(task_profile(np.vstack(history)))
        means.append(history.mean(axis=0))

# One profile vector and one mean vector per task, aligned with item.index.
item['profile'] = profiles
item['mean'] = means

In [None]:
from src.linter_profile import freq_profile

# Build, user by user, a copy of the log enriched with profile columns.
result = []
for user_id in set(log['user']):
    # This user's submissions in chronological order.
    user_history = log[log['user'] == user_id].sort_values('time')

    # 'final': the next submission belongs to another task (the very last row is always final).
    task_ids = user_history['item'].values
    user_history['final'] = np.append(task_ids[:-1] != task_ids[1:], True)
    # 'first': only the user's very first submission is flagged.
    user_history['first'] = [True] + [False] * (len(user_history) - 1)

    # Profile computed from the user's own message vectors.
    user_history['freq_profile'] = freq_profile(
        np.vstack(user_history['linter_messages'])
    ).tolist()

    # Same, with the per-task mean vectors of the attempted tasks as second argument.
    task_means = np.vstack(item['mean'][user_history['item']])
    user_history['task_relative_profile'] = freq_profile(
        np.vstack(user_history['linter_messages']), task_means
    ).tolist()

    # Euclidean distance between each row-normalised message vector and the
    # task-relative profile of the same row, rounded to two decimals.
    message_counts = np.vstack(user_history['linter_messages'])
    normalised = message_counts / (message_counts.sum(axis=1, keepdims=True) + 1e-6)
    deviation = normalised - np.vstack(user_history['task_relative_profile'])
    user_history['distance_from_profile'] = np.round(
        np.linalg.norm(deviation, axis=1), 2
    ).tolist()

    result.append(user_history)

new_log = pd.concat(result)

In [None]:
# Submissions other than each user's first one, largest distance from the profile on top.
new_log.loc[~new_log['first']].sort_values(by='distance_from_profile', ascending=False)

In [None]:
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# User to inspect; the trailing ids are other users that were looked at.
user_id = 50440544  # 15773986 # 11099474 # 28668216 # 24903035 # 39137547
user_history = new_log[new_log['user'] == user_id]

# Positions i where the gap between submission i and i + 1 exceeds one hour,
# i.e. the last row of every session except the final one.
session_breakpoints = np.nonzero(
    (user_history.index[1:] - user_history.index[:-1]) > pd.Timedelta(1, 'h')
)[0].tolist()

# One subplot per session, laid out four per row.
fig = make_subplots(rows=math.ceil((len(session_breakpoints) + 1) / 4), cols=4)

# The cursor is deliberately NOT called `start`: that name holds the lower
# bound of the selected submission range and is used to build the linter
# results path, so overwriting it here would corrupt later re-runs of that cell.
session_start = 0
for i, end in enumerate(session_breakpoints + [len(user_history)]):
    # Rows session_start..end inclusive, selected by position.
    session = user_history.iloc[session_start:end + 1]
    fig.add_trace(
        go.Scatter(
            x=session.index,
            y=session['distance_from_profile'],
            text='task id ' + session['item'].astype(str),
            mode='lines+markers',
            marker=dict(
                # green = correct submission, red = incorrect
                color=session['correct'].apply(lambda x: 'green' if x else 'red'),
                # 'x' marks the last submission before the task changes
                symbol=session['final'].apply(lambda x: 'x' if x else 'circle'),
                size=10
            ),
        ),
        col=i % 4 + 1, row=i // 4 + 1
    )
    session_start = end + 1

# Figure-wide styling does not depend on the session, so it is applied once.
fig.update_layout(
    margin=dict(l=0,r=0,b=0,t=40),
    showlegend=False,
    title=f'Sessions of user id {user_id}'
)
fig.update_xaxes(
    tickformat="%H:%M<br>%d-%m"
)

fig.show()

In [None]:
# Print the inspected user's submissions to task 50 with their distance from the profile.
task_rows = user_history[user_history['item'] == 50]
for answer, distance in zip(task_rows['answer'], task_rows['distance_from_profile']):
    print(answer, distance)

In [None]:
# Print the inspected user's submissions to task 1 with message counts, distance and profile.
task_rows = user_history[user_history['item'] == 1]
for answer, messages, distance, profile in zip(
    task_rows['answer'],
    task_rows['linter_messages'],
    task_rows['distance_from_profile'],
    task_rows['freq_profile'],
):
    print(answer, messages, distance, profile)