In [6]:
import pandas as pd
import numpy as np


In [7]:
# Load the raw review log and discard the columns that the rest of this
# notebook never reads.
unused_columns = ['ui_language', 'lexeme_string', 'timestamp']
data = pd.read_csv('data.csv').drop(columns=unused_columns)
data

Unnamed: 0,p_recall,delta,user_id,learning_language,lexeme_id,history_seen,history_correct,session_seen,session_correct
0,1.000000,27649635,u:FO,de,76390c1350a8dac31186187e2fe1e178,6,4,2,2
1,0.500000,27649635,u:FO,de,7dfd7086f3671685e2cf1c1da72796d7,4,4,2,1
2,1.000000,27649635,u:FO,de,35a54c25a2cda8127343f6a82e6f6b7d,5,4,1,1
3,0.500000,27649635,u:FO,de,0cf63ffe3dda158bc3dbd55682b355ae,6,5,2,1
4,1.000000,27649635,u:FO,de,84920990d78044db53c1b012f5bf9ab5,4,4,1,1
...,...,...,...,...,...,...,...,...,...
9994,1.000000,135265,u:iofD,de,230a37a2e4479a89b4f3f409841c5556,4,3,5,5
9995,1.000000,226278,u:irVT,es,065d3cd6bc42c437f89877740698c750,2,2,2,2
9996,1.000000,139465,u:irVT,es,b1b2b2203009f082a1cf172e42fa65a3,6,6,1,1
9997,0.666667,139465,u:irVT,es,86584daef5933b284384453795bbf0ed,9,8,3,2


In [8]:
# Derive ratio / rescaled features, then discard the raw columns they were
# computed from.
#
# NOTE(review): 'session_accuracy' is session_correct / session_seen. In every
# row displayed by the load cell that ratio is exactly equal to 'p_recall'
# (the value later used as the prediction target), so this feature looks like
# the target under another name -- confirm before using it as a model input.
#
# NOTE: this cell consumes the raw columns, so re-running it on its own raises
# KeyError; re-run the load cell first.
seconds_per_day = 60 * 60 * 24
raw_feature_cols = ['delta', 'history_seen', 'history_correct', 'session_seen', 'session_correct']

data = data.assign(
    accuracy_rate=data['history_correct'] / data['history_seen'],
    session_accuracy=data['session_correct'] / data['session_seen'],
    delta_days=data['delta'] / seconds_per_day,
).drop(columns=raw_feature_cols)
data

Unnamed: 0,p_recall,user_id,learning_language,lexeme_id,accuracy_rate,session_accuracy,delta_days
0,1.000000,u:FO,de,76390c1350a8dac31186187e2fe1e178,0.666667,1.000000,320.018924
1,0.500000,u:FO,de,7dfd7086f3671685e2cf1c1da72796d7,1.000000,0.500000,320.018924
2,1.000000,u:FO,de,35a54c25a2cda8127343f6a82e6f6b7d,0.800000,1.000000,320.018924
3,0.500000,u:FO,de,0cf63ffe3dda158bc3dbd55682b355ae,0.833333,0.500000,320.018924
4,1.000000,u:FO,de,84920990d78044db53c1b012f5bf9ab5,1.000000,1.000000,320.018924
...,...,...,...,...,...,...,...
9994,1.000000,u:iofD,de,230a37a2e4479a89b4f3f409841c5556,0.750000,1.000000,1.565567
9995,1.000000,u:irVT,es,065d3cd6bc42c437f89877740698c750,1.000000,1.000000,2.618958
9996,1.000000,u:irVT,es,b1b2b2203009f082a1cf172e42fa65a3,1.000000,1.000000,1.614178
9997,0.666667,u:irVT,es,86584daef5933b284384453795bbf0ed,0.888889,0.666667,1.614178


In [9]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column. One fitted encoder is kept per
# column so the same string -> code mapping can be re-applied to new rows.
# Note: LabelEncoder.transform raises ValueError for labels it did not see
# during fit, so new rows must use user/language/lexeme values present here.
categorical_cols = ['user_id', 'learning_language', 'lexeme_id']
label_encoders = {name: LabelEncoder() for name in categorical_cols}

for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

data

Unnamed: 0,p_recall,user_id,learning_language,lexeme_id,accuracy_rate,session_accuracy,delta_days
0,1.000000,1,0,1242,0.666667,1.000000,320.018924
1,0.500000,1,0,1336,1.000000,0.500000,320.018924
2,1.000000,1,0,537,0.800000,1.000000,320.018924
3,0.500000,1,0,132,0.833333,0.500000,320.018924
4,1.000000,1,0,1403,1.000000,1.000000,320.018924
...,...,...,...,...,...,...,...
9994,1.000000,456,0,359,0.750000,1.000000,1.565567
9995,1.000000,470,2,60,1.000000,1.000000,2.618958
9996,1.000000,470,2,1853,1.000000,1.000000,1.614178
9997,0.666667,470,2,1421,0.888889,0.666667,1.614178


In [51]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictors: every column except
# the target becomes a feature.
target_col = 'p_recall'
y = data[target_col]
X = data.drop(target_col, axis=1)

# NOTE(review): X therefore includes 'session_accuracy', which is computed as
# session_correct / session_seen and matches 'p_recall' in every row displayed
# earlier in this notebook. That is target leakage; consider excluding it from
# X (and from any frame passed to predict) -- confirm against the full data.

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [52]:
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Gradient-boosted regressor hyper-parameters, kept together so they are easy
# to tune in one place.
xgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'objective': 'reg:squarederror',
    'random_state': 42,
}
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
# NOTE(review): the MSE recorded for this cell is ~7e-08. An error that close
# to zero is what one would expect if a feature encodes the target; the
# 'session_accuracy' column (session_correct / session_seen) is the likely
# culprit -- treat this score with suspicion until that is ruled out.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 6.81577232639126e-08


In [53]:
# Spot-check the first few held-out rows: actual vs. predicted recall.
n_preview = 5
for actual, predicted in zip(y_test.iloc[:n_preview], y_pred[:n_preview]):
    print(f"True Recall: {actual:.2f}, Predicted Recall: {predicted:.2f}")

True Recall: 1.00, Predicted Recall: 1.00
True Recall: 0.75, Predicted Recall: 0.75
True Recall: 1.00, Predicted Recall: 1.00
True Recall: 1.00, Predicted Recall: 1.00
True Recall: 1.00, Predicted Recall: 1.00


In [60]:
# Example new data: two reviews using a user, language and lexemes that appear
# in the training rows shown earlier (the label encoders only know values they
# were fitted on).
new_data_raw = pd.DataFrame({
    'delta': [3600, 7200],
    'user_id': ['u:FO', 'u:FO'],
    'learning_language': ['de', 'de'],
    'lexeme_id': ['76390c1350a8dac31186187e2fe1e178', '7dfd7086f3671685e2cf1c1da72796d7'],
    'history_seen': [8, 6],
    'history_correct': [6, 4],
    'session_seen': [3, 2],
    'session_correct': [2, 1],
})

# Preprocess new data: same derived features as the training frame.
# new_data_raw is left untouched so the raw example stays inspectable.
new_data = new_data_raw.assign(
    accuracy_rate=new_data_raw['history_correct'] / new_data_raw['history_seen'],
    session_accuracy=new_data_raw['session_correct'] / new_data_raw['session_seen'],
    delta_days=new_data_raw['delta'] / (60 * 60 * 24),
)

# Encode categorical variables using the label encoders from training.
# transform() raises ValueError if a value was not present when fitting.
for col in categorical_cols:
    new_data[col] = label_encoders[col].transform(new_data[col])

# Keep exactly the columns the model was trained on, in training order.
# Selecting by X_train.columns (instead of dropping a hand-maintained list of
# raw columns) means the frame cannot drift from the training schema because
# of the key order of the dict above or a later change to the feature set.
# .copy() yields an independent frame so later column assignment is clean.
new_data = new_data[list(X_train.columns)].copy()
new_data

Unnamed: 0,user_id,learning_language,lexeme_id,accuracy_rate,session_accuracy,delta_days
0,1,0,1242,0.75,0.666667,0.041667
1,1,0,1336,0.666667,0.5,0.083333


In [61]:
# Perform inference
# This cell stores its result in new_data['predicted_recall']. Exclude that
# column from the model input so the cell can be re-run safely: without this,
# a second run would pass the extra column to predict() and fail on a feature
# mismatch. errors='ignore' makes the drop a no-op on the first run.
feature_frame = new_data.drop(columns=['predicted_recall'], errors='ignore')
predicted_recalls = model.predict(feature_frame)

# Only the first row's prediction is printed; all rows are stored below.
print(f"Predicted Recall Probability: {predicted_recalls[0]:.2f}")

new_data['predicted_recall'] = predicted_recalls


Predicted Recall Probability: 0.67


In [62]:
# Rank the scored rows from highest to lowest predicted recall.
# 'lexeme_id' here is the integer code produced by the label encoder, not the
# original hash string.
ranked_words = new_data.sort_values('predicted_recall', ascending=False)
print(ranked_words.loc[:, ['lexeme_id', 'predicted_recall']])

   lexeme_id  predicted_recall
0       1242          0.666679
1       1336          0.500014
