
## Data Preparation

Before processing the data, we first need to acquire the dataset; the steps are shown in [prepare_dataset.ipynb](prepare_dataset.ipynb).

## Parameters
This notebook shows the experiment that uses the improved Rasch embedding:  
Dataset: Assistment2017  
use_rasch: True  
use_answer_time: False

In [1]:
import numpy as np
import torch
import random
def setup_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds PyTorch (including all CUDA devices), NumPy's global generator and
    Python's ``random`` module with the same value, then configures cuDNN
    for repeatable behaviour.

    Args:
        seed: Integer seed forwarded unchanged to each library.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    # Restrict cuDNN to deterministic kernels.
    torch.backends.cudnn.deterministic = True
    # Benchmark autotuning can select a different algorithm from run to run,
    # which defeats the seeding above, so switch it off explicitly.
    torch.backends.cudnn.benchmark = False
# Fix the random seed before anything stochastic runs
setup_seed(seed=2023)

In [2]:
import pickle
def generate_q_matrix(path, n_skill, n_exercise):
    """Build a 0/1 exercise-by-skill indicator matrix from a pickled mapping.

    The pickle at ``path`` must contain an object with ``.items()`` whose keys
    index rows (exercises) and whose values are iterables of column indices
    (skills).

    Args:
        path: Location of the pickled mapping.
        n_skill: Largest skill index; the matrix gets ``n_skill + 1`` columns.
        n_exercise: Largest exercise index; the matrix gets ``n_exercise + 1`` rows.

    Returns:
        A ``numpy.ndarray`` of shape ``(n_exercise + 1, n_skill + 1)`` filled
        with zeros, except for a 1 at every (exercise, skill) pair found in
        the mapping.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        exercise_to_skills = pickle.load(handle)

    indicator = np.zeros(shape=(n_exercise + 1, n_skill + 1))
    linked_pairs = (
        (exercise, skill)
        for exercise, skills in exercise_to_skills.items()
        for skill in skills
    )
    for exercise, skill in linked_pairs:
        indicator[exercise][skill] = 1
    return indicator

In [3]:
# --- Sizes handed to generate_q_matrix and FRKT ---------------------------
n_exercise = 3162
n_skill = 102

# --- Model / optimisation settings ----------------------------------------
batch_size = 32
hidden_size = 128
k_components = 32
dropout = 0.2
use_rasch = True
seq_len = 200
lr = 1e-3

# --- Dataset ---------------------------------------------------------------
dataset_name = "assistment2017"
path = "../../data/anonymized_full_release_competition_dataset/problem2skill.pickle"
q_matrix = generate_q_matrix(path, n_skill, n_exercise)

# --- Device ----------------------------------------------------------------
# GPU 3 is preferred; when fewer than four GPUs are visible "cuda:3" would be
# an invalid device, so fall back to the first GPU, and to the CPU without CUDA.
if torch.cuda.is_available():
    device = torch.device("cuda:3" if torch.cuda.device_count() > 3 else "cuda:0")
else:
    device = torch.device("cpu")

## Training and Persistence

In [4]:
import logging

# Lower the root logger threshold to INFO so messages such as
# "save parameters to ..." are emitted.
logging.getLogger(name=None).setLevel(level=logging.INFO)

In [5]:
from FRKT import FRKT
# Build the model from the configuration cell.
frkt = FRKT(
    n_exercise,
    n_skill,
    batch_size,
    q_matrix,
    device=device,
    hidden_size=hidden_size,
    k_components=k_components,
    dropout=dropout,
    use_rasch=use_rasch,
)
# Request 100 epochs of training on the configured dataset.
frkt.train(dataset_name, seq_len=seq_len, epoch=100, lr=lr)

../../data/anonymized_full_release_competition_dataset/train0.txt


Training: 100%|██████████| 113/113 [00:49<00:00,  2.29it/s]


[Epoch 0] LogisticLoss: 0.607717, auc: 0.677895, accuracy: 0.674792 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.48it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 0] LogisticLoss: 0.577109, auc: 0.733438, accuracy: 0.694871


Training: 100%|██████████| 113/113 [00:51<00:00,  2.21it/s]


[Epoch 1] LogisticLoss: 0.569018, auc: 0.740813, accuracy: 0.700693 


Testing: 100%|██████████| 27/27 [00:03<00:00,  8.37it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 1] LogisticLoss: 0.563148, auc: 0.751223, accuracy: 0.703926


Training: 100%|██████████| 113/113 [00:49<00:00,  2.28it/s]


[Epoch 2] LogisticLoss: 0.557977, auc: 0.754882, accuracy: 0.709524 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.92it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 2] LogisticLoss: 0.556426, auc: 0.759601, accuracy: 0.709594


Training: 100%|██████████| 113/113 [00:49<00:00,  2.29it/s]


[Epoch 3] LogisticLoss: 0.550651, auc: 0.763325, accuracy: 0.714700 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.29it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 3] LogisticLoss: 0.551631, auc: 0.764739, accuracy: 0.713697


Training: 100%|██████████| 113/113 [00:49<00:00,  2.28it/s]


[Epoch 4] LogisticLoss: 0.543697, auc: 0.771296, accuracy: 0.719477 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.59it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 4] LogisticLoss: 0.546994, auc: 0.770984, accuracy: 0.716193


Training: 100%|██████████| 113/113 [00:50<00:00,  2.24it/s]


[Epoch 5] LogisticLoss: 0.537636, auc: 0.777930, accuracy: 0.723955 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.68it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 5] LogisticLoss: 0.543106, auc: 0.773998, accuracy: 0.719629


Training: 100%|██████████| 113/113 [00:49<00:00,  2.29it/s]


[Epoch 6] LogisticLoss: 0.532543, auc: 0.783279, accuracy: 0.727351 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.99it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 6] LogisticLoss: 0.538876, auc: 0.779136, accuracy: 0.723572


Training: 100%|██████████| 113/113 [00:52<00:00,  2.14it/s]


[Epoch 7] LogisticLoss: 0.527682, auc: 0.788180, accuracy: 0.731257 


Testing: 100%|██████████| 27/27 [00:03<00:00,  8.33it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 7] LogisticLoss: 0.539018, auc: 0.779678, accuracy: 0.722007


Training: 100%|██████████| 113/113 [00:51<00:00,  2.17it/s]


[Epoch 8] LogisticLoss: 0.524824, auc: 0.791038, accuracy: 0.733229 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.78it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 8] LogisticLoss: 0.537103, auc: 0.781665, accuracy: 0.722654


Training: 100%|██████████| 113/113 [00:51<00:00,  2.18it/s]


[Epoch 9] LogisticLoss: 0.522645, auc: 0.793234, accuracy: 0.734550 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.88it/s]


[Epoch 9] LogisticLoss: 0.539601, auc: 0.781023, accuracy: 0.721979


Training: 100%|██████████| 113/113 [00:51<00:00,  2.19it/s]


[Epoch 10] LogisticLoss: 0.520518, auc: 0.795235, accuracy: 0.736099 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.62it/s]


[Epoch 10] LogisticLoss: 0.536637, auc: 0.781521, accuracy: 0.724253


Training: 100%|██████████| 113/113 [00:51<00:00,  2.21it/s]


[Epoch 11] LogisticLoss: 0.518703, auc: 0.797056, accuracy: 0.737778 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.71it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 11] LogisticLoss: 0.535773, auc: 0.781923, accuracy: 0.724406


Training: 100%|██████████| 113/113 [00:50<00:00,  2.23it/s]


[Epoch 12] LogisticLoss: 0.517477, auc: 0.798143, accuracy: 0.737891 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.34it/s]


[Epoch 12] LogisticLoss: 0.536573, auc: 0.780919, accuracy: 0.724559


Training: 100%|██████████| 113/113 [00:52<00:00,  2.16it/s]


[Epoch 13] LogisticLoss: 0.515719, auc: 0.799849, accuracy: 0.739709 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.55it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 13] LogisticLoss: 0.536223, auc: 0.781943, accuracy: 0.725011


Training: 100%|██████████| 113/113 [00:51<00:00,  2.19it/s]


[Epoch 14] LogisticLoss: 0.514725, auc: 0.800718, accuracy: 0.740675 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.69it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 14] LogisticLoss: 0.535976, auc: 0.783234, accuracy: 0.725853


Training: 100%|██████████| 113/113 [00:50<00:00,  2.25it/s]


[Epoch 15] LogisticLoss: 0.512975, auc: 0.802476, accuracy: 0.741548 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.62it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 15] LogisticLoss: 0.536554, auc: 0.783266, accuracy: 0.726409


Training: 100%|██████████| 113/113 [00:49<00:00,  2.29it/s]


[Epoch 16] LogisticLoss: 0.512042, auc: 0.803336, accuracy: 0.741830 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.75it/s]


[Epoch 16] LogisticLoss: 0.535496, auc: 0.783174, accuracy: 0.725011


Training: 100%|██████████| 113/113 [00:48<00:00,  2.32it/s]


[Epoch 17] LogisticLoss: 0.511218, auc: 0.804080, accuracy: 0.742881 


Testing: 100%|██████████| 27/27 [00:03<00:00,  8.03it/s]


[Epoch 17] LogisticLoss: 0.538619, auc: 0.780475, accuracy: 0.724712


Training: 100%|██████████| 113/113 [00:49<00:00,  2.30it/s]


[Epoch 18] LogisticLoss: 0.510566, auc: 0.804682, accuracy: 0.742963 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.76it/s]
INFO:root:save parameters to saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


[Epoch 18] LogisticLoss: 0.535463, auc: 0.783930, accuracy: 0.727105


Training: 100%|██████████| 113/113 [00:53<00:00,  2.11it/s]


[Epoch 19] LogisticLoss: 0.509781, auc: 0.805451, accuracy: 0.743832 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.21it/s]


[Epoch 19] LogisticLoss: 0.537262, auc: 0.782299, accuracy: 0.724399


Training: 100%|██████████| 113/113 [00:51<00:00,  2.19it/s]


[Epoch 20] LogisticLoss: 0.508924, auc: 0.806299, accuracy: 0.744630 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.88it/s]


[Epoch 20] LogisticLoss: 0.538472, auc: 0.782065, accuracy: 0.725255


Training: 100%|██████████| 113/113 [00:50<00:00,  2.24it/s]


[Epoch 21] LogisticLoss: 0.507492, auc: 0.807524, accuracy: 0.745525 


Testing: 100%|██████████| 27/27 [00:03<00:00,  7.70it/s]


[Epoch 21] LogisticLoss: 0.538045, auc: 0.782714, accuracy: 0.725602


Training:  37%|███▋      | 42/113 [00:19<00:32,  2.20it/s]


KeyboardInterrupt: 

## Loading and Testing

In [6]:
import numpy as np
import torch
import random
def setup_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds PyTorch (including all CUDA devices), NumPy's global generator and
    Python's ``random`` module with the same value, then configures cuDNN
    for repeatable behaviour.

    Args:
        seed: Integer seed forwarded unchanged to each library.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    # Restrict cuDNN to deterministic kernels.
    torch.backends.cudnn.deterministic = True
    # Benchmark autotuning can select a different algorithm from run to run,
    # which defeats the seeding above, so switch it off explicitly.
    torch.backends.cudnn.benchmark = False
# Fix the random seed before anything stochastic runs
setup_seed(seed=2023)

import pickle
def generate_q_matrix(path, n_skill, n_exercise):
    """Build a 0/1 exercise-by-skill indicator matrix from a pickled mapping.

    The pickle at ``path`` must contain an object with ``.items()`` whose keys
    index rows (exercises) and whose values are iterables of column indices
    (skills).

    Args:
        path: Location of the pickled mapping.
        n_skill: Largest skill index; the matrix gets ``n_skill + 1`` columns.
        n_exercise: Largest exercise index; the matrix gets ``n_exercise + 1`` rows.

    Returns:
        A ``numpy.ndarray`` of shape ``(n_exercise + 1, n_skill + 1)`` filled
        with zeros, except for a 1 at every (exercise, skill) pair found in
        the mapping.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        exercise_to_skills = pickle.load(handle)

    indicator = np.zeros(shape=(n_exercise + 1, n_skill + 1))
    linked_pairs = (
        (exercise, skill)
        for exercise, skills in exercise_to_skills.items()
        for skill in skills
    )
    for exercise, skill in linked_pairs:
        indicator[exercise][skill] = 1
    return indicator

In [7]:
# --- Sizes handed to generate_q_matrix and FRKT ---------------------------
n_exercise = 3162
n_skill = 102

# --- Model / optimisation settings ----------------------------------------
batch_size = 32
hidden_size = 128
k_components = 32
dropout = 0.2
use_rasch = True
seq_len = 200
lr = 1e-3

# --- Dataset ---------------------------------------------------------------
dataset_name = "assistment2017"
path = "../../data/anonymized_full_release_competition_dataset/problem2skill.pickle"
q_matrix = generate_q_matrix(path, n_skill, n_exercise)

# --- Device ----------------------------------------------------------------
# GPU 3 is preferred; when fewer than four GPUs are visible "cuda:3" would be
# an invalid device, so fall back to the first GPU, and to the CPU without CUDA.
if torch.cuda.is_available():
    device = torch.device("cuda:3" if torch.cuda.device_count() > 3 else "cuda:0")
else:
    device = torch.device("cpu")

# Folder name used for each dataset, both under "../../data/" and under
# "saved_model/".
dataset_dir = {
    "assistment2009": "2009_skill_builder_data_corrected_collapsed",
    "assistment2012": "2012-2013-data-with-predictions-4-final",
    "assistment2015": "2015_100_skill_builders_main_problems",
    "assistment2017": "anonymized_full_release_competition_dataset",
    "algebra2005": "algebra_2005_2006",
    "statics": "statics",
    "EdNet-KT1": "EdNet-Contents/contents",
}

import os 

# Checkpoints live in saved_model/<dataset folder>/.
saved_path = os.path.join("saved_model", dataset_dir[dataset_name])
# The file name encodes the hyper-parameters of the run being loaded.
model_path = os.path.join(
    saved_path,
    f"model-seq_len{seq_len:03d}-lr{lr}-hidden_size{hidden_size:03d}-k{k_components}-use_rasch{use_rasch:01d}.pt",
)

In [8]:
from load_data import DATA
from load_data_akt import DATA as DT_DATA             # dataset from DTransformer
from load_data_akt import PID_DATA as DT_PID_DATA

# Choose the loader first, so an unsupported dataset_name fails with a clear
# ValueError rather than a KeyError from the dataset_dir lookup further down.
if dataset_name in ["assistment2009", "assistment2012", "assistment2015", "assistment2017", "EdNet-KT1", "algebra2005"]:
    dat = DATA(seqlen=seq_len, separate_char=',')
elif dataset_name in ["statics"]:
    # NOTE(review): PID_DATA is presumably the variant that also reads
    # exercise ids -- confirm against load_data_akt.
    if n_exercise > 0:
        dat = DT_PID_DATA(seqlen=seq_len, separate_char=',')
    else:
        dat = DT_DATA(seqlen=seq_len, separate_char=',')
else:
    raise ValueError(f"Unknown dataset: {dataset_name!r}")

# Every supported dataset reads its test split from <data_dir>/test.txt.
data_dir = "../../data/" + dataset_dir[dataset_name]
test_path = os.path.join(data_dir, "test.txt")

test_data = dat.load_data(test_path)

In [9]:
import logging

# Lower the root logger threshold to INFO so messages such as
# "load parameters from ..." are emitted.
logging.getLogger(name=None).setLevel(level=logging.INFO)

from FRKT import FRKT
# Show which checkpoint is about to be restored.
print(model_path)
frkt = FRKT(
    n_exercise,
    n_skill,
    batch_size,
    q_matrix,
    device=device,
    hidden_size=hidden_size,
    k_components=k_components,
    dropout=dropout,
    use_rasch=use_rasch,
)
frkt.load(model_path)
# Left as the last expression so the notebook displays the returned metrics.
frkt.eval(device, test_data)

# Author's note: "at" (presumably answer time -- TODO confirm) also appears to
# be important on the 2017 dataset. Figures recorded alongside this note:
#   (0.5253329277038574, 0.7833795517802937, 0.7426268005399308)
#   0.7873361

INFO:root:load parameters from saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


saved_model/anonymized_full_release_competition_dataset/model-seq_len200-lr0.001-hidden_size128-k32-use_rasch1.pt


Testing: 100%|██████████| 34/34 [00:04<00:00,  7.71it/s]


(0.5343742370605469, 0.7847922879040903, 0.7284745481219033)