In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from src.load_scripts import load_log, load_item

In [2]:
data_path = Path('data/')

retrain = False

if retrain:
    item = load_item(data_path)
    log = load_log(data_path, data_path / 'edulint')

    item.to_csv(data_path / 'cached_item.csv')
    log.to_csv(data_path / 'cached_log.csv')
else:
    item = pd.read_csv(data_path / 'cached_item.csv', index_col=0)
    log = pd.read_csv(data_path / 'cached_log.csv', index_col=0)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(min_df=0.005)
vectors = vectorizer.fit_transform(log['linter_messages'])
tfidf = TfidfTransformer().fit_transform(vectors)

log['linter_messages'] = list(map(np.array, vectors.toarray().tolist()))

In [4]:
with open(Path('data/edulint/results.txt'), 'r') as f:
    results = f.read().lower()
    
feature_descriptions = []
for feature_name in vectorizer.get_feature_names_out():
    begin = results.find(feature_name)
    feature_descriptions.append(results[begin:results.find('_"', begin)])

json.dump(feature_descriptions, open(Path('data/edulint/features.json'), 'w'))

In [5]:
def mean_in_out_class_distances(log: pd.DataFrame, target: str):
    in_classes = []
    out_classes = []
    for cls in np.unique(log[target]):
        in_classes.append(log[log[target] == cls]['distance_from_profile'].mean())
        out_classes.append(log[log[target] != cls]['distance_from_profile'].mean())
    return np.asarray(in_classes).mean(), np.asarray(out_classes).mean()


In [6]:
from src.linter_profile import MeanTaskProfiler, MeanNormTaskProfiler, NormSumTaskProfiler, NormForgetUserProfiler
from src.model import DistanceModel

dim = len(feature_descriptions)
profilers = [MeanNormTaskProfiler(dim), NormForgetUserProfiler(dim)]
model = DistanceModel('euclidean', 'l2')
for profiler in profilers:
    log['profile'] = profiler.build_profiles(log)
    log['distance_from_profile'] = model.calculate_distances(log['profile'], log['linter_messages'])
    print(mean_in_out_class_distances(log, 'item'))
    

(0.6047516921278043, 0.5514296103500583)
(0.5789256070117379, 0.5394568165010662)


In [16]:
log['task_profile'] = MeanNormTaskProfiler(dim).build_profiles(log)
log['user_profile'] = NormForgetUserProfiler(dim).build_profiles(log)

In [51]:
from sklearn.linear_model import LinearRegression

X = np.hstack([np.vstack(log['task_profile']), np.vstack(log['user_profile'])])
# X = np.vstack(log['task_profile'])
# X = np.vstack(log['user_profile'])
y = np.vstack(log['linter_messages'])
reg = LinearRegression().fit(X, y)
reg.score(X, y)

0.22451181677644344

In [38]:
X.shape

(160173, 62)

In [39]:
y.shape

(160173, 31)

In [35]:
log['linter_messages']

419       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
421       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
424       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
425       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
426       [0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                                ...                        
385037    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
385038    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
385039    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
385040    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
385041    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: linter_messages, Length: 160173, dtype: object

In [30]:
np.hstack([np.vstack(log['task_profile']), np.vstack(log['user_profile'])]).shape

(160173, 62)

In [13]:
log.sort_values(by='distance_from_profile', ascending=False)

Unnamed: 0,id,user,item,answer,correct,moves,responseTime,time,linter_messages,profile,distance_from_profile
169066,199845,30065896,53,"def middle_number(a, b, c):\n if a<b and b<...",1,-1,697350,2021-10-22 13:11:48,"[0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.351337e+00
126789,157568,22313410,66,"def decide(symbol1, symbol2):\n if symbol1=...",0,-1,371550,2021-05-05 21:46:26,"[0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.351203e+00
138227,169006,25070815,60,"def big_even(a, b):\n return a>b and a%2==0...",1,-1,30250,2021-06-03 19:31:29,"[0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.6485032414269816, 0.6878364276643...",1.347694e+00
120357,151172,23370541,90,1111111111111111111111111111111111111111111111...,0,-1,330750,2021-04-14 08:13:51,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.347478e+00
55547,86393,17126622,86,"def double(a, b):\n if b == a * 2:\n print...",1,-1,155100,2020-06-16 21:19:07,"[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.002325648847305583, 0.00348847327...",1.347331e+00
...,...,...,...,...,...,...,...,...,...,...,...
221051,253174,33901490,45,def print_sums(n):\n for i in range(n):\n ...,0,-1,19200,2022-03-23 10:28:59,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.645505e-07
221052,253175,33901490,45,def print_sums(n):\n for i in range(n):\n ...,0,-1,19350,2022-03-23 10:28:59,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.316404e-07
221053,253176,33901490,45,def print_sums(n):\n for i in range(n):\n ...,0,-1,19550,2022-03-23 10:28:59,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.053123e-07
221054,253177,33901490,45,def print_sums(n):\n for i in range(n):\n ...,0,-1,19750,2022-03-23 10:29:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.424983e-08


In [15]:
print(log.loc[126789]['answer'])

def decide(symbol1, symbol2):
    if symbol1=="N" and symbol2=="P":
        print (symbol1)
    elif symbol1=="P" and symbol2=="K":
        print (symbol1)
    elif symbol1=="K" and symbol2=="N":
        print (symbol1)
    elif symbol2=="N" and symbol1=="P":
        print (symbol2)
    elif symbol2=="P" and symbol1=="K":
        print (symbol2)
    elif symbol2=="K" and symbol1=="N":
        print (symbol2)
    print("Remiza")


In [11]:
from linter_profile import UserProfiler

for user, history in log.groupby(by='user'):
    UserProfiler().build_profile()

ModuleNotFoundError: No module named 'linter_profile'