In [None]:
import pandas as pd
import numpy as np
import string
from pathlib import Path
from src.code_processing import get_code, parse_code, clean_code, remove_czech_symbols

# Set to True to rebuild the cleaned tables from the raw exports and refresh
# the cached CSV files; leave False to load the cached copies.
retrain_data = False

data_path = Path('data/')

if retrain_data:
    item = pd.read_csv(data_path / 'umimeprogramovatcz-ipython_item.csv', sep=';', index_col=0)
    # Explicit copy: a column of this two-column selection is overwritten on the next line.
    item = item[['name', 'solution']].copy()
    item['solution'] = item['solution'].apply(get_code).apply(parse_code).apply(clean_code).apply(remove_czech_symbols)

    log = pd.read_csv(data_path / 'umimeprogramovatcz-ipython_log.csv', sep=';')
    log = log.drop_duplicates().dropna()
    log['time'] = pd.to_datetime(log['time'])
    log = log.set_index('time')
    log['answer'] = log['answer'].apply(parse_code).apply(clean_code).apply(remove_czech_symbols)
    # Drop submissions whose cleaned answer is empty or whitespace only.
    log = log[log['answer'].str.strip().astype(bool)]

    item.to_csv(data_path / 'cached_item.csv')
    log.to_csv(data_path / 'cached_log.csv')
else:
    item = pd.read_csv(data_path / 'cached_item.csv', index_col=0)
    # The log was saved after dropna(), so it holds no genuine missing values.
    # With default settings read_csv turns text such as "None", "NA", "null"
    # or "nan" into NaN, which is presumably where the NaN in the reloaded
    # data came from (TODO confirm on the real cache). Disabling the default
    # NA markers keeps such answers as the strings that were written.
    log = pd.read_csv(data_path / 'cached_log.csv', keep_default_na=False)
    log['time'] = pd.to_datetime(log['time'])
    log = log.set_index('time')

In [None]:
# Summarise the loaded submission log: index, columns, dtypes and non-null counts.
log.info()

In [None]:
import plotly.express as px
# Number of rows in the log for every user.
counts = log['user'].value_counts()

# Plot only users with more than 5 and fewer than 600 submissions.
fig = px.histogram(
    counts.loc[lambda c: (c > 5) & (c < 600)],
    nbins=1000,
    title='Histogram of user activity',
)
# poisson?
fig.update_layout(xaxis_title="# submissions", yaxis_title="# users")

In [None]:
print('Looking for active but also representative users...')

# Inclusive bounds on the number of submissions per selected user.
start, stop = 100, 100  # 40, 80
selected_counts = counts[counts.between(start, stop)]

# Report how many users and submissions fall in the range, and their share of the log.
n_selected_users = len(selected_counts)
n_selected_submissions = selected_counts.sum()
selected_share = n_selected_submissions / log.shape[0] * 100
print(
    f'In the range of {start} to {stop} submissions found {n_selected_users} users, '
    f'with total {n_selected_submissions} submissions, corresponding to {selected_share}% of the data'
)

In [None]:
# Keep only the rows of the selected users. The explicit copy makes `log` an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
log = log.query('user in @selected_counts.index').copy()

In [None]:
import os
import subprocess
import json
from tqdm import tqdm


def edulint_analysis(code_string):
    """Run edulint on a piece of source code and return its parsed messages.

    The code is written to a file named ``temp_code.py`` inside a fresh
    temporary directory, and ``edulint`` is run on it as a module of the
    interpreter that executes this function. Every non-empty line of the
    tool's standard output is reduced to the text that starts two characters
    after its last ``:`` and ends before its first ``[``, with spaces
    replaced by underscores.

    Parameters
    ----------
    code_string : str
        Source code to analyse.

    Returns
    -------
    list of str
        One entry per non-empty output line; empty when edulint printed
        nothing to stdout.

    Notes
    -----
    The exit code and stderr of the subprocess are not inspected, so a failure
    to start edulint also yields an empty list.
    """
    import sys
    import tempfile

    # A private directory per call avoids clobbering a real temp_code.py in
    # the working directory and is removed even if the subprocess raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        code_path = os.path.join(tmp_dir, 'temp_code.py')
        # Python source files are read as UTF-8 by default, so write UTF-8
        # instead of the platform's locale encoding.
        with open(code_path, 'w', encoding='utf-8') as file:
            file.write(code_string)
        # sys.executable works on every platform; the `py` launcher exists
        # only on Windows and may point at a different environment.
        result = subprocess.run(
            [sys.executable, '-m', 'edulint', code_path], text=True, capture_output=True
        )
    return [msg[msg.rfind(':') + 2:msg.find('[')].replace(' ', '_') for msg in result.stdout.split('\n') if len(msg) > 0]

# Manual switch for the slow linting pass over every submission in `log`.
if True:
    # One list of linter messages per submission, in the row order of `log`.
    # Iterating the column directly avoids building a Series per row, and the
    # explicit total lets tqdm show real progress.
    result = [edulint_analysis(answer) for answer in tqdm(log['answer'], total=len(log))]

    # The context manager guarantees the file is flushed and closed.
    with open('edulint_result.json', 'w') as out_file:
        json.dump(result, out_file)

In [None]:
# Inspect the raw linter output: one list of parsed messages per row of `log`.
result

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Load the saved linter output (one list of messages per submission) and close
# the file deterministically.
with open('edulint_result.json', 'r') as in_file:
    messages_per_submission = json.load(in_file)

# One space-separated "document" per submission. Hyphens become underscores,
# presumably so that CountVectorizer's default tokenizer keeps each message as
# a single token (TODO confirm).
result = [' '.join(alist).replace('-', '_') for alist in messages_per_submission]
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(result)

# Store one dense count vector per submission; the rows of the dense matrix
# are used directly instead of round-tripping through Python lists.
log['linter_messages'] = list(vectors.toarray())

In [None]:
def _to_distribution(message_counts):
    """Divide a count vector by its sum; the 1e-6 term avoids division by zero."""
    return message_counts / (message_counts.sum() + 1e-6)


# Build, user by user, a running "profile" of linter messages and the distance
# of every submission from the profile accumulated before it.
result = []
for user_id in set(log['user']):
    user_history = log[log['user'] == user_id].sort_values('time')
    n_submissions = len(user_history)

    # 'final': the next submission belongs to a different item (or there is none).
    item_ids = user_history['item'].values
    user_history['final'] = np.append(item_ids[:-1] != item_ids[1:], True)

    # 'first': marks the user's earliest submission.
    user_history['first'] = np.arange(n_submissions) == 0

    # 'profile': normalised sum of the message vectors of all earlier submissions.
    message_vectors = user_history['linter_messages']
    earlier_totals = message_vectors.cumsum() - message_vectors
    user_history['profile'] = earlier_totals.apply(_to_distribution)

    # Euclidean distance between the normalised current vector and the profile,
    # rounded to two decimals.
    current = np.vstack(message_vectors.apply(_to_distribution))
    previous = np.vstack(user_history['profile'])
    user_history['distance_from_profile'] = np.round(
        np.linalg.norm(current - previous, axis=1), 2
    ).tolist()

    result.append(user_history)

new_log = pd.concat(result)

In [None]:
# Rank submissions by how far they are from the user's profile so far. Each
# user's first submission is skipped because its profile is still all zeros.
new_log.loc[~new_log['first']].sort_values(by='distance_from_profile', ascending=False)

In [None]:
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots

user_id = 15773986 # 11099474 # 28668216 # 24903035  # 39137547

# A pause longer than this between consecutive submissions starts a new session.
SESSION_GAP = pd.Timedelta(1, 'h')
# Number of session plots placed side by side in one row of the figure.
PLOTS_PER_ROW = 4

user_history = new_log[new_log['user'] == user_id]

# Positions k such that the pause between submission k and k + 1 exceeds
# SESSION_GAP, i.e. submission k is the last one of its session.
pauses = user_history.index[1:] - user_history.index[:-1]
session_breakpoints = np.nonzero(pauses > SESSION_GAP)[0].tolist()

fig = make_subplots(
    rows=math.ceil((len(session_breakpoints) + 1) / PLOTS_PER_ROW),
    cols=PLOTS_PER_ROW,
)

# A dedicated cursor name is used so the `start` bound of the user-selection
# cell is not overwritten.
session_start = 0
for i, end in enumerate(session_breakpoints + [len(user_history)]):
    # Breakpoints are inclusive session ends, hence the + 1.
    session = user_history[session_start:end + 1]
    fig.add_trace(
        go.Scatter(
            x=session.index,
            y=session['distance_from_profile'],
            text='task id ' + session['item'].astype(str),
            mode='lines+markers',
            marker=dict(
                # Colour encodes correctness; an 'x' marks the last submission for an item.
                color=session['correct'].apply(lambda x: 'green' if x else 'red'),
                symbol=session['final'].apply(lambda x: 'x' if x else 'circle'),
                size=10
            ),
        ),
        col=i % PLOTS_PER_ROW + 1, row=i // PLOTS_PER_ROW + 1
    )
    session_start = end + 1

# Figure-wide settings do not depend on the session, so they are applied once
# after all traces have been added.
fig.update_layout(
    margin=dict(l=0,r=0,b=0,t=40),
    showlegend=False,
    title=f'Sessions of user id {user_id}'
)
fig.update_xaxes(
    tickformat="%H:%M<br>%d-%m"
)

fig.show()

In [None]:
# Show the selected user's submissions together with the derived columns
# ('final', 'first', 'profile', 'distance_from_profile').
user_history

In [None]:
# Vocabulary learned by the CountVectorizer; entry k names the message counted
# at position k of every 'linter_messages' vector.
vectorizer.get_feature_names_out()

In [None]:
# Print every answer the selected user submitted for item 5, each followed by
# its linter-message count vector.
item_5_rows = user_history[user_history['item'] == 5]
for answer, messages in zip(item_5_rows['answer'], item_5_rows['linter_messages']):
    print(answer)
    print(messages)

In [None]:
# Print every answer the selected user submitted for item 48.
for answer in user_history.loc[user_history['item'] == 48, 'answer']:
    print(answer)

In [None]:
# Print every answer the selected user submitted for item 58.
for answer in user_history.loc[user_history['item'] == 58, 'answer']:
    print(answer)

In [None]:
# Print every answer the selected user submitted for item 90.
for answer in user_history.loc[user_history['item'] == 90, 'answer']:
    print(answer)