In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from src.code_processing import get_code, parse_code, clean_code, remove_czech_symbols
from src.linting import analyze_strings

# Set to True to force both caches to be rebuilt from the raw CSV exports.
retrain_data = False

data_path = Path('data/')
item_cache_path = data_path / 'cached_item.csv'
log_cache_path = data_path / 'cached_log.csv'

# Rebuild when explicitly requested, and also when a cache file is missing;
# otherwise the cached branch would fail with FileNotFoundError.
if retrain_data or not (item_cache_path.exists() and log_cache_path.exists()):
    # Items: keep the name and the reference solution, then normalise the solution code.
    # .assign() builds a new frame, so nothing is written into a slice of the raw frame.
    item = (
        pd.read_csv(data_path / 'umimeprogramovatcz-ipython_item.csv', sep=';', index_col=0)
        .loc[:, ['name', 'solution']]
        .assign(solution=lambda frame: (
            frame['solution']
            .apply(get_code)
            .apply(parse_code)
            .apply(clean_code)
            .apply(remove_czech_symbols)
        ))
    )
    item.to_csv(item_cache_path)

    # Log: deduplicate, drop incomplete rows, index by submission time and
    # normalise the submitted code the same way as the solutions.
    # TODO drop nonexistent items
    log = (
        pd.read_csv(data_path / 'umimeprogramovatcz-ipython_log.csv', sep=';')
        .drop_duplicates()
        .dropna()
        .assign(time=lambda frame: pd.to_datetime(frame['time']))
        .set_index('time')
        .assign(answer=lambda frame: (
            frame['answer']
            .apply(parse_code)
            .apply(clean_code)
            .apply(remove_czech_symbols)
        ))
        # Discard submissions whose cleaned answer is empty or whitespace only.
        .loc[lambda frame: frame['answer'].str.strip().astype(bool)]
    )
    log.to_csv(log_cache_path)
else:
    # Load the already processed frames from the cache.
    item = pd.read_csv(item_cache_path, index_col=0)
    log = (
        pd.read_csv(log_cache_path)
        .assign(time=lambda frame: pd.to_datetime(frame['time']))
        .set_index('time')
    )

In [2]:
# Preview the loaded submission log, which is indexed by submission time.
log

Unnamed: 0_level_0,id,user,item,answer,correct,moves,responseTime
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-08-02 14:14:38,8934,2121562,33,"def impose_fine(age, beer):\n return False",0,-1,4600
2018-08-02 14:14:44,8935,2121562,33,"def impose_fine(age, beer):\n return False",0,-1,9000
2018-08-02 14:14:45,8936,2121562,33,"def impose_fine(age, beer):\n return False",0,-1,10000
2018-08-02 14:14:46,8937,2121562,33,"def impose_fine(age, beer):\n return False",0,-1,10700
2018-08-02 14:14:48,8938,2121562,33,"def impose_fine(age, beer):\n return False",0,-1,12200
...,...,...,...,...,...,...,...
2023-09-17 21:17:31,427656,38849145,94,def helke(text):\n for letter in text:\n ...,0,0,752650
2023-09-17 21:19:57,427657,38849145,94,def helke(text):\n for letter in text:\n ...,1,0,899300
2023-09-17 21:46:45,427658,38849145,145,"def letter_distances(text):\n for index, le...",1,0,787950
2023-09-17 22:16:37,427659,38849145,26,def censorship(text):\n result = ''\n fo...,1,0,20100


In [3]:
import plotly.express as px
# Number of submissions per user.
counts = log['user'].value_counts()

# Plot only users with more than 5 and fewer than 600 submissions.
in_plot_range = (counts > 5) & (counts < 600)
fig = px.histogram(
    counts[in_plot_range],
    nbins=1000,
    title='Histogram of user activity',
)
# Open question: does the distribution look like a Poisson?
fig.update_layout(xaxis_title="# submissions", yaxis_title="# users")

In [4]:
print('Looking for active but also representative users...')

# Half-open range [start, stop) of per-user submission counts to keep.
# Previously tried: 40, 80.
# NOTE(review): the recorded output of this cell reports the range 55 to 56,
# so it seems to predate the current values - re-run to refresh it.
start, stop = 56, 57
within_range = (counts >= start) & (counts < stop)
selected_counts = counts[within_range]

selected_total = selected_counts.sum()
selected_share = selected_total / log.shape[0] * 100
print(
    f'In the range of {start} to {stop} submissions found {selected_counts.shape[0]} users, '
    f'with total {selected_total} submissions, corresponding to {selected_share}% of the data'
)

Looking for active but also representative users...
In the range of 55 to 56 submissions found 42 users, with total 2310 submissions, corresponding to 0.5870289473884835% of the data


In [5]:
# Keep only the submissions of the selected users. The explicit .copy() makes
# `log` an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
log = log.query('user in @selected_counts.index').copy()

In [6]:
# Result file is specific to the selected submission-count range.
edulint_result_path = Path(f'data/edulint_results_{start}-{stop}.json')

# Feed the answers lazily, in log order; the results are later matched back to
# the log rows by position.
# NOTE(review): analyze_strings is project code - presumably it lints each
# answer and stores the messages at result_path; confirm in src.linting.
answers = (answer for _, answer in log['answer'].items())
result = analyze_strings(answers, result_path=edulint_result_path)

Creating code files...


2310it [00:01, 2085.62it/s]


Done!
Processing files...


100%|██████████| 2310/2310 [1:48:46<00:00,  2.83s/it]


Done!
Cleaning up...
Done!
All finished!


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Load the stored linter output. Each entry is a sequence of strings for one
# submission; join them into one whitespace-separated document per submission.
# The context manager closes the file handle (the bare open() leaked it).
with open(edulint_result_path, 'r') as results_file:
    result = [' '.join(alist) for alist in json.load(results_file)]

# Bag-of-words counts per submission; tokens occurring in fewer than 0.1 % of
# the documents are dropped.
# NOTE(review): the default tokenizer also keeps ordinary words found in the
# messages (e.g. 'else', 'for', 'text' show up among the features), not only
# linter codes - confirm whether that is intended.
vectorizer = CountVectorizer(min_df=0.001)
vectors = vectorizer.fit_transform(result)

# One count vector per log row, matched by position. assign() returns a new
# frame, which avoids SettingWithCopyWarning when `log` is a filtered slice
# and keeps the cell idempotent on re-run.
log = log.assign(linter_messages=list(vectors.toarray()))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [8]:
# List the tokens kept by the fitted CountVectorizer (one feature per token).
vectorizer.get_feature_names_out()

array(['c0103', 'e101', 'e111', 'e115', 'e117', 'e201', 'e202', 'e203',
       'e211', 'e221', 'e222', 'e225', 'e228', 'e231', 'e251', 'e265',
       'e271', 'e275', 'e302', 'e303', 'e305', 'e501', 'e701', 'e703',
       'e741', 'else', 'f821', 'f841', 'for', 'len', 'money', 'r1704',
       'r1705', 'r6201', 'r6204', 'r6305', 'r6601', 'r6602', 'r6604',
       'r6611', 'result', 'tax', 'text', 'w0101', 'w0104', 'w0622',
       'w191', 'w291', 'w292', 'w293'], dtype=object)

In [9]:
from src.linter_profile import task_profile

# Placeholder row for task id 12, which has no entry in `item`.
# TODO add in preprocessing
# The membership guard keeps the cell idempotent: appending unconditionally
# would duplicate index 12 on every re-run and break label lookups on `item`.
if 12 not in item.index:
    item = pd.concat([item, pd.DataFrame({'name': 'unknown', 'solution': 'pass'}, index=[12])])

# Length of one linter-message count vector.
n_features = len(vectorizer.get_feature_names_out())

profiles = []
means = []
for task_id in item.index:
    # Count vectors of all submissions to this task.
    history = log.loc[log['item'] == task_id, 'linter_messages']
    if history.empty:
        # No submissions for this task among the selected users: use zero vectors.
        profiles.append(np.zeros(n_features))
        means.append(np.zeros(n_features))
    else:
        stacked = np.vstack(history)  # shape: (n_submissions, n_features)
        profiles.append(task_profile(stacked))
        means.append(stacked.mean(axis=0))  # per-feature mean count
item['profile'] = profiles
item['mean'] = means

In [10]:
# Preview the item table, now including the 'profile' and 'mean' columns.
item

Unnamed: 0,name,solution,profile,mean
1,Výpis čísel,"def numbers(n):\n for i in range(1, n + 1):...","[0.0, 0.2229944040256456, 0.02229944040256456,...","[0.0, 0.2702702702702703, 0.02702702702702703,..."
2,Fibonacciho posloupnost,def fibonacci(n):\n current = 1\n next =...,"[0.0, 0.15617376188860607, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0,..."
3,Faktoriál,def factorial(n):\n value = 1\n for i in...,"[0.16239245061540772, 0.032478490123081544, 0....","[0.20833333333333334, 0.041666666666666664, 0...."
4,Abeceda,def alphabet(n):\n for i in range(n):\n ...,"[0.0, 0.07293249574894728, 0.0, 0.0, 0.0729324...","[0.0, 0.08333333333333333, 0.0, 0.0, 0.0833333..."
5,Zdvojení znaků,"def duplication(text):\n output = """"\n f...","[0.0, 0.09166984970282113, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
143,Klesající zásoby,"def reserves(n, a, b):\r\n for i in range(1...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
144,Podpis pana Kaplana,def add_stars(name):\r\n name = name.upper(...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
145,Vzdálenosti písmen,def letter_distances(text):\r\n text = text...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
146,Odstranění nejčastějšího,"def shorten(text):\r\n most_common = """"\r\n...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [11]:
from src.linter_profile import freq_profile

# Enrich each user's chronologically ordered submissions with profile columns,
# then stitch the per-user frames back together into `new_log`.
per_user_frames = []
for user_id in set(log['user']):
    user_history = log[log['user'] == user_id].sort_values('time')

    # 'final': the next submission belongs to a different task (or there is none).
    task_ids = user_history['item'].to_numpy()
    user_history['final'] = np.concatenate([task_ids[:-1] != task_ids[1:], [True]])
    # 'first': only the user's very first submission is flagged.
    user_history['first'] = [position == 0 for position in range(len(user_history))]

    # NOTE(review): freq_profile is project code; the message matrix is stacked
    # afresh for each call in case it modifies its input.
    own_profile = freq_profile(np.vstack(user_history['linter_messages']))
    user_history['freq_profile'] = own_profile.tolist()

    # Mean message vector of the task each submission belongs to.
    task_means = np.vstack(item.loc[user_history['item'], 'mean'])
    relative_profile = freq_profile(np.vstack(user_history['linter_messages']), task_means)
    user_history['task_relative_profile'] = relative_profile.tolist()

    # Euclidean distance between each submission's normalised message counts
    # (epsilon guards against division by zero) and its task-relative profile.
    message_counts = np.vstack(user_history['linter_messages'])
    normalised = message_counts / (message_counts.sum(axis=1, keepdims=True) + 1e-6)
    deviation = normalised - np.vstack(user_history['task_relative_profile'])
    user_history['distance_from_profile'] = np.round(
        np.linalg.norm(deviation, axis=1), 2
    ).tolist()

    per_user_frames.append(user_history)

new_log = pd.concat(per_user_frames)

In [12]:
# Persist the enriched log; the file name records the selected submission-count range.
preprocessed_path = data_path / f'preprocessed_{start}-{stop}.csv'
new_log.to_csv(preprocessed_path)

In [13]:
# Submissions other than each user's first one, largest distance from the profile first.
not_first = ~new_log['first']
new_log[not_first].sort_values('distance_from_profile', ascending=False)

Unnamed: 0_level_0,id,user,item,answer,correct,moves,responseTime,linter_messages,final,first,freq_profile,task_relative_profile,distance_from_profile
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-10-13 10:30:49,303908,28662380,47,"def sum_dif(a, b):\n print(a,""+"",b,""="",a+b)...",1,-1,303850,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, ...",True,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.025194714908468925, 0.0, 0.0, 0.0, 0.0, -0...",1.81
2021-10-03 17:31:30,187694,28826821,47,"def sum_dif(a, b):\n print(a,""+"",b,""="", a+b...",1,-1,273250,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, ...",True,False,"[0.08164965809277261, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.24189586609322594, 0.0, 0.0, 0.0, 0.0, -0.0...",1.81
2021-09-21 14:26:02,182573,28613827,50,"def cross(z):\n print(z,"" "",z)\n print(""...",1,-1,64250,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...",False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.00974414740527073, 0.0, 0.0, 0.0, 0.0, -0....",1.81
2019-12-30 13:31:31,60232,12736130,47,"def sum_dif(a, b):\n print(a,""+"",b,""="",a+b)...",1,-1,66000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, ...",True,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.012154414578583605, 0.0, 0.0, 0.0, 0.0, -0...",1.80
2022-05-10 20:20:12,270088,36660119,50,"def cross(z):\n print(z,"" "",z)\n print(""...",1,-1,170800,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, ...",True,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08838834...","[-0.01132876612347424, 0.0, 0.0, 0.0, 0.0, -0....",1.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-13 14:57:09,160072,25753730,50,"def cross(z):\n print(z,"" "",z)\n print(""...",1,-1,86650,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...",True,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0018513166645016685, 0.0, 0.0, 0.0, 0.0, -...",0.24
2019-12-30 13:37:21,60236,12736130,50,"def cross(z):\n print(z,"" "",z,""\n""+"" "",z,"" ...",1,-1,211500,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, ...",True,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0029381830506087546, 0.0, 0.0, 0.0, 0.0, -...",0.24
2023-01-14 15:46:47,342562,45949150,47,"def sum_dif(a, b):\n print(a,""+"",b,""="",a + ...",1,0,161000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, ...",True,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.008103287583811016, 0.0, 0.0, 0.0, 0.0, -0...",0.23
2021-10-04 15:28:09,188498,29332624,50,"def cross(z):\n print(z,"" "",z)\n print(""...",1,-1,28900,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, ...",True,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.003995975948272682, 0.0, 0.0, 0.0, 0.0, -0...",0.22


In [14]:
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# User to inspect; other ids looked at: 15773986, 11099474, 28668216, 24903035, 39137547
user_id = 50440544
user_history = new_log[new_log['user'] == user_id]

# A session ends at every position that is followed by a gap of more than one hour.
gaps = user_history.index[1:] - user_history.index[:-1]
session_breakpoints = np.nonzero(gaps > pd.Timedelta(1, 'h'))[0].tolist()

# One subplot per session, laid out on a grid with a fixed number of columns.
n_cols = 4
fig = make_subplots(rows=math.ceil((len(session_breakpoints) + 1) / n_cols), cols=n_cols)

# Use a dedicated name for the running offset: reusing `start` would overwrite
# the notebook-level `start` that earlier cells use to build file names.
session_start = 0
for i, end in enumerate(session_breakpoints + [len(user_history)]):
    session = user_history[session_start:end + 1]
    fig.add_trace(
        go.Scatter(
            x=session.index,
            y=session['distance_from_profile'],
            text='task id ' + session['item'].astype(str),
            mode='lines+markers',
            marker=dict(
                # green = correct submission, red = incorrect
                color=session['correct'].apply(lambda x: 'green' if x else 'red'),
                # 'x' marks a submission flagged as 'final'
                symbol=session['final'].apply(lambda x: 'x' if x else 'circle'),
                size=10
            ),
        ),
        col=i % n_cols + 1, row=i // n_cols + 1
    )
    session_start = end + 1

# Figure-wide settings do not depend on the session, so apply them once
# after all traces have been added.
fig.update_layout(
    margin=dict(l=0, r=0, b=0, t=40),
    showlegend=False,
    title=f'Sessions of user id {user_id}'
)
fig.update_xaxes(
    tickformat="%H:%M<br>%d-%m"
)

fig.show()

In [15]:
# Print the inspected user's answers to task 50 with their distance from the profile.
task_50_rows = user_history[user_history['item'] == 50]
for answer, distance in zip(task_50_rows['answer'], task_50_rows['distance_from_profile']):
    print(answer, distance)

In [16]:
# Print the inspected user's answers to task 1 together with the raw message
# counts, the distance from the profile and the frequency profile.
task_1_rows = user_history[user_history['item'] == 1]
printed_columns = ['answer', 'linter_messages', 'distance_from_profile', 'freq_profile']
for values in zip(*(task_1_rows[column] for column in printed_columns)):
    print(*values)