In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from pycaret.classification import *

In [2]:
path = os.path.join('vocabulary', 'db', 'translation_exercise_results.csv')
path

'vocabulary\\db\\translation_exercise_results.csv'

In [3]:
raw_df = pd.read_csv(path, sep=';', parse_dates=['time'])
raw_df.head()

Unnamed: 0,user,session_id,lang,word_pl,correct_translation,user_answer,is_correct,time
0,default_user,1,latin,sławny,"cĕlĕbĕr, celebris, celebre",celeber celebris celebre,True,2023-10-28 20:54:36
1,default_user,2,latin,świątynia,"templum, templi",templum templi,True,2023-10-28 21:16:59
2,default_user,2,latin,"rynek, forum","fŏrum, fori",forum fori,True,2023-10-28 21:17:03
3,default_user,2,latin,śnieg,"nix, nivis",nix nivis,True,2023-10-28 21:17:09
4,default_user,2,latin,"odważny, silny","fortis, fortis, forte",fortis fortis forte,True,2023-10-28 21:17:39


In [46]:
df = raw_df.query('user == "default_user"')\
           .query('lang == "latin"')\
           .drop(['user', 'lang','session_id', 'correct_translation', 'user_answer'], axis=1)\
           .replace({True: 0, False: 1})\
           .rename(columns={'is_correct': 'wrong'})\
           .sort_values(['word_pl', 'time'])           

df.head(10)

Unnamed: 0,word_pl,wrong,time
109,Iliada,0,2023-11-07 22:00:26
150,Iliada,0,2023-11-08 20:45:56
223,Iliada,0,2023-12-07 20:14:33
296,Iliada,0,2023-12-17 20:48:17
424,Iliada,1,2023-12-24 12:07:07
462,Iliada,0,2023-12-24 19:12:35
642,Iliada,0,2024-01-20 12:11:25
120,artysta,0,2023-11-07 22:01:41
178,artysta,0,2023-11-08 21:42:11
256,artysta,0,2023-12-17 19:56:10


In [47]:
time_shift = df.groupby('word_pl')['time'].shift(1).to_frame().rename(columns={'time': 'time_shift'})
time_shift

Unnamed: 0,time_shift
109,NaT
150,2023-11-07 22:00:26
223,2023-11-08 20:45:56
296,2023-12-07 20:14:33
424,2023-12-17 20:48:17
...,...
1,NaT
336,2023-10-28 21:16:59
458,2023-12-17 21:32:36
497,2023-12-24 19:12:15


In [48]:
was_last_wrong = df.groupby('word_pl')['wrong'].shift(1).to_frame().rename(columns={'wrong': 'was_last_wrong'})
was_last_wrong

Unnamed: 0,was_last_wrong
109,
150,0.0
223,0.0
296,0.0
424,0.0
...,...
1,
336,0.0
458,0.0
497,1.0


In [49]:
df_with_timeshift_tmp = df.join(time_shift).join(was_last_correct)

time_col = df_with_timeshift_tmp.time
timeshift_col = df_with_timeshift_tmp.time_shift
time_diff_col = (time_col - timeshift_col)

df_with_timeshift = df_with_timeshift_tmp\
                        .assign(when_last_asked = time_diff_col)\
                        .drop(['time', 'time_shift'], axis=1)\
                        .query('when_last_asked != "NaT"')

df_with_timeshift['when_last_asked_min'] = (df_with_timeshift.when_last_asked.values / 60).astype('timedelta64[s]').astype('int')

df_with_timeshift

Unnamed: 0,word_pl,wrong,was_last_correct,when_last_asked,when_last_asked_min
150,Iliada,0,1.0,0 days 22:45:30,1365
223,Iliada,0,1.0,28 days 23:28:37,41728
296,Iliada,0,1.0,10 days 00:33:44,14433
424,Iliada,1,1.0,6 days 15:18:50,9558
462,Iliada,0,0.0,0 days 07:05:28,425
...,...,...,...,...,...
626,śnieg,0,1.0,26 days 16:25:12,38425
740,śnieg,0,1.0,8 days 04:12:38,11772
336,świątynia,0,1.0,50 days 00:15:37,72015
458,świątynia,1,1.0,6 days 21:39:39,9939


In [50]:
df_final = df_with_timeshift.drop('when_last_asked', axis=1)
df_final

Unnamed: 0,word_pl,wrong,was_last_correct,when_last_asked_min
150,Iliada,0,1.0,1365
223,Iliada,0,1.0,41728
296,Iliada,0,1.0,14433
424,Iliada,1,1.0,9558
462,Iliada,0,0.0,425
...,...,...,...,...
626,śnieg,0,1.0,38425
740,śnieg,0,1.0,11772
336,świątynia,0,1.0,72015
458,świątynia,1,1.0,9939


## other possible features:
#### when last correct answer (but how handle NaTs ? easier is 'was last correct'. ideally all history should be taken into account as a feature)
### how many times asked
### how many correct answers (ratio ?)
### part of speech
### jeśli np. kilka z rzędu odpowiedzi było złych szansa powinna być większa
### ale zamiast tak wymyślać zasady trzeba jakoś przedstawić cechy (np. sekwencja poprzednich odpowiedzi) i pozwolić algorytmowi znaleźć zasadę
### idealnie liczba kolumn mogłaby być dynamiczna, ale to trudniej zaprogramować i mogłoby się to bardzo rozrosnąć - czy to nie byłby problem, że liczba kolumn będzie zawsze większa niż liczba wierszy (np. dla każdego poprzedniego odpytania o dane słowo kolumny 'kiedy było' i 'jaka była odpowiedź'. dodatkowo liczba kolumn powinna być stała dla df, a tu dla różnych słów byłaby różna..
### na początek może lepiej wziąć stałą wartość, np. 3

In [51]:
train, test = train_test_split(df_final, test_size=0.25, random_state=42, shuffle=True)

In [52]:
s = setup(train, target='wrong')

Unnamed: 0,Description,Value
0,Session id,7112
1,Target,wrong
2,Target type,Binary
3,Original data shape,"(453, 4)"
4,Transformed data shape,"(453, 4)"
5,Transformed train set shape,"(317, 4)"
6,Transformed test set shape,"(136, 4)"
7,Numeric features,2
8,Categorical features,1
9,Preprocess,True


In [53]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7542,0.0,0.4156,0.6812,0.4986,0.353,0.3807,0.006
et,Extra Trees Classifier,0.7038,0.6976,0.52,0.5459,0.5136,0.3074,0.3194,0.021
lr,Logistic Regression,0.7005,0.4451,0.0311,0.2,0.0533,0.0346,0.0547,0.236
lightgbm,Light Gradient Boosting Machine,0.6973,0.6765,0.4878,0.5312,0.4922,0.2828,0.2926,0.03
dummy,Dummy Classifier,0.6942,0.5,0.0,0.0,0.0,0.0,0.0,0.006
lda,Linear Discriminant Analysis,0.6909,0.6924,0.4578,0.5016,0.4634,0.2545,0.2625,0.006
rf,Random Forest Classifier,0.6879,0.7001,0.4967,0.5135,0.4947,0.2736,0.2784,0.024
gbc,Gradient Boosting Classifier,0.6873,0.6741,0.4556,0.5156,0.4677,0.2547,0.263,0.013
nb,Naive Bayes,0.6846,0.672,0.1011,0.45,0.1559,0.0525,0.0775,0.007
qda,Quadratic Discriminant Analysis,0.6845,0.7215,0.4778,0.5044,0.4742,0.2569,0.2655,0.007


In [54]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [55]:
predictions = predict_model(best, data=test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.7237,0.605,0.3043,0.5833,0.4,0.2429,0.2646


In [57]:
df_final.wrong.sum() / len(df_final)

0.30413223140495865

In [59]:
predictions

Unnamed: 0,word_pl,was_last_correct,when_last_asked_min,wrong,prediction_label
309,spustoszyć,0.0,57568,0,1
444,pogrzeb,1.0,9927,0,0
124,dyskrecja,1.0,1329,0,0
281,dyskrecja,1.0,56059,1,0
492,miłość,1.0,9973,0,0
...,...,...,...,...,...
159,zamknięty,1.0,52,0,0
643,sprytny,1.0,38886,0,0
632,"być obecnym, być pomocnym",1.0,38885,1,0
150,Iliada,1.0,1365,0,0


In [67]:
type(best)

sklearn.linear_model._ridge.RidgeClassifier

In [76]:
to_be_predicted = df_final.sort_values(by=['word_pl', 'when_last_asked_min']).groupby('word_pl').last().reset_index()
to_be_predicted

Unnamed: 0,word_pl,wrong,was_last_correct,when_last_asked_min
0,Iliada,0,1.0,41728
1,artysta,0,1.0,56053
2,bać się,1,0.0,41672
3,biały,1,0.0,43112
4,bitwa,0,1.0,29780
...,...,...,...,...
110,złoto,0,1.0,66130
111,łatwy,0,1.0,41730
112,ślad,1,1.0,38427
113,śnieg,1,1.0,41670


In [84]:
best

In [85]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import RidgeClassifier

In [86]:
load_breast_cancer(return_X_y=True)

(array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
 