In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from src.load_scripts import load_log, load_item

In [None]:
# Directory holding the raw inputs and the cached CSV snapshots.
data_path = Path('data/')

# Set to True to force `item` and `log` to be rebuilt from the raw data even
# when cached CSVs are already present.
retrain = False

item_cache = data_path / 'cached_item.csv'
log_cache = data_path / 'cached_log.csv'

# Rebuild when explicitly requested, or when either cache file is missing
# (e.g. on a fresh checkout), so a Restart & Run All does not fail with
# FileNotFoundError before any cache has been written.
if retrain or not (item_cache.exists() and log_cache.exists()):
    item = load_item(data_path)
    log = load_log(data_path, data_path / 'edulint')

    item.to_csv(item_cache)
    log.to_csv(log_cache)
else:
    # index_col=0 reads back the index column that to_csv wrote first.
    item = pd.read_csv(item_cache, index_col=0)
    log = pd.read_csv(log_cache, index_col=0)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bag-of-words over the linter output. A float min_df is a document-frequency
# proportion, so terms present in fewer than 0.5% of rows are dropped.
vectorizer = CountVectorizer(min_df=0.005)
vectors = vectorizer.fit_transform(log['linter_messages'])

# NOTE(review): `tfidf` is not read again in the visible cells; the column
# written below holds the raw counts, not TF-IDF weights. Confirm intended.
tfidf = TfidfTransformer().fit_transform(vectors)

# Overwrite the text column with one dense NumPy count vector per log row.
# This cell is therefore not re-runnable without reloading `log` first.
log['linter_messages'] = [np.array(row) for row in vectors.toarray().tolist()]

In [None]:
# Read the linter results and lower-case them before searching for feature
# names (presumably to match the vectorizer's lower-cased vocabulary).
with open(Path('data/edulint/results.txt'), 'r') as f:
    results = f.read().lower()

# One description per vectorizer feature, in vocabulary order. The list length
# must stay equal to the number of features because it is used as `dim` later.
feature_descriptions = []
for feature_name in vectorizer.get_feature_names_out():
    begin = results.find(feature_name)
    if begin == -1:
        # Name not present in the results text: fall back to the bare feature
        # name instead of slicing from index -1, which yields garbage.
        feature_descriptions.append(feature_name)
        continue
    end = results.find('_"', begin)
    if end == -1:
        # No closing marker after the match: keep everything to the end of the
        # text rather than silently dropping the final character.
        end = len(results)
    feature_descriptions.append(results[begin:end])

# Use a context manager so the output file is flushed and closed
# deterministically instead of relying on garbage collection.
with open(Path('data/edulint/features.json'), 'w') as out_file:
    json.dump(feature_descriptions, out_file)

In [None]:
# Spot-check one row by position; raises IndexError if `log` has fewer than
# 100001 rows.
log.iloc[100000]

In [None]:
def mean_in_out_class_distances(log: pd.DataFrame, target: str):
    """Average the per-class mean distance inside and outside each class.

    For every distinct value of ``log[target]`` two means of the
    ``'distance_from_profile'`` column are taken: one over the rows that
    carry that value and one over all remaining rows. The per-class means
    are then averaged without weighting by class size.

    Args:
        log: Frame containing ``target`` and ``'distance_from_profile'``.
        target: Name of the column that defines the classes.

    Returns:
        Tuple ``(mean of in-class means, mean of out-of-class means)``.
    """
    labels = log[target]
    distances = log['distance_from_profile']

    # One (inside, outside) pair per distinct label, in np.unique order.
    per_class = [
        (distances[labels == label].mean(), distances[labels != label].mean())
        for label in np.unique(labels)
    ]

    inside = np.asarray([pair[0] for pair in per_class])
    outside = np.asarray([pair[1] for pair in per_class])
    return inside.mean(), outside.mean()


In [None]:
from src.linter_profile import MeanTaskProfiler, MeanNormTaskProfiler, NormSumTaskProfiler, NormForgetUserProfiler
from src.model import DistanceModel

# `dim` is the number of feature descriptions; it is handed to each profiler
# constructor (presumably the profile vector length; confirm in src).
dim = len(feature_descriptions)
profilers = [NormForgetUserProfiler(dim)]
model = DistanceModel('euclidean', 'l2')

# For each profiler: build profiles, measure each row's distance from its
# profile, then report the in-class / out-of-class mean distance per user.
for profiler in profilers:
    built_profiles = profiler.build_profiles(log)
    log['profile'] = built_profiles
    log['distance_from_profile'] = model.calculate_distances(
        log['profile'],
        log['linter_messages'],
    )
    in_out_means = mean_in_out_class_distances(log, 'user')
    print(in_out_means)
    

In [None]:
# Task-level profiles first, then user-level profiles, each stored as its own
# column on `log` for the regression cells that follow.
task_profiler = MeanNormTaskProfiler(dim)
log['task_profile'] = task_profiler.build_profiles(log)

user_profiler = NormForgetUserProfiler(dim)
log['user_profile'] = user_profiler.build_profiles(log)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Design matrix: task-profile columns followed by user-profile columns.
# To use only one side, replace X with np.vstack(log['task_profile']) or
# np.vstack(log['user_profile']).
X = np.hstack([np.vstack(log['task_profile']), np.vstack(log['user_profile'])])
y = np.vstack(log['linter_messages'])

# Fixed seed so the forest, and therefore the score below, is reproducible
# across Restart & Run All.
reg = RandomForestRegressor(random_state=0).fit(X, y)

# NOTE: this is R^2 on the same data the model was fitted on, so it is an
# optimistic, in-sample figure rather than a generalisation estimate.
reg.score(X, y)

In [None]:
# In-sample predictions: `X` is the same matrix `reg` was fitted on.
# NOTE(review): `pred` is not referenced again in the visible cells.
pred = reg.predict(X)

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between column i of X and column i of y, for every
# column of y. Each entry is the (statistic, p-value) result from pearsonr.
# NOTE(review): this correlates the input matrix `X`, not the model output
# `pred`, with `y`; confirm that is the intended comparison.
results = [pearsonr(X[:, col], y[:, col]) for col in range(y.shape[1])]

In [None]:
# Average correlation coefficient across all columns (p-values are ignored).
np.mean([statistic for statistic, _pvalue in results])

In [None]:
# Show the log ordered by distance from profile, largest distance first.
# The sorted frame is only displayed; `log` itself is left unsorted.
log.sort_values(by='distance_from_profile', ascending=False)