* Base Source: https://www.kaggle.com/wangsg/a-self-attentive-model-for-knowledge-tracing
* My First Work: https://www.kaggle.com/leadbest/sakt-self-attentive-knowledge-tracing-submitter

1. Version 1: State Updates -> LB 0.765
2. Version 3: Random Selection of User Interactions -> LB 0.768
3. Version 6: Small Optimization -> LB 0.771?

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv
/kaggle/input/riiid-test-answer-prediction/example_test.csv
/kaggle/input/riiid-test-answer-prediction/questions.csv
/kaggle/input/riiid-test-answer-prediction/train.csv
/kaggle/input/riiid-test-answer-prediction/lectures.csv
/kaggle/input/riiid-test-answer-prediction/riiideducation/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/riiid-test-answer-prediction/riiideducation/__init__.py
/kaggle/input/saktmodel/valid_2.csv
/kaggle/input/saktmodel/best_model_ela_part.pt
/kaggle/input/saktmodel/train_200_valid_2.csv
/kaggle/input/saktmodel/train_100_valid_2.csv


In [2]:
import gc
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [3]:
#HDKIM
# Length of the per-user interaction window. It is the default `max_seq` of the
# dataset classes and of SAKTModel defined later in this notebook.
MAX_SEQ = 160
#HDKIMHDKIM

## Load data

In [4]:
%%time
# Column dtypes for the interaction CSVs; reused when the validation file is read.
dtype = dict(
    timestamp='int64',
    user_id='int64',
    content_id='int16',
    content_type_id='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float64',
)

# Only the six columns named in `dtype` are loaded (selected by position).
train_df = pd.read_csv(
    '/kaggle/input/saktmodel/train_200_valid_2.csv',
    usecols=[1, 2, 3, 4, 7, 8],
    dtype=dtype,
)
train_df.head()


CPU times: user 23.8 s, sys: 1.46 s, total: 25.3 s
Wall time: 42 s


Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time
0,0,841499248,6054,0,0,
1,0,1722355108,4260,0,1,
2,0,1973833483,5629,0,1,
3,0,488555728,5527,0,1,
4,0,1413146787,7900,0,1,


In [5]:
# Keep only rows whose content_type_id is 0 (presumably questions rather than
# lectures -- confirm against the dataset description).
train_df = train_df[train_df.content_type_id == 0]

# Arrange by timestamp. The default sort kind (quicksort) is not stable, so rows
# that share a timestamp could be reordered arbitrarily; a stable sort keeps
# such ties in their original file order, which preserves each user's sequence.
train_df = (
    train_df
    .sort_values('timestamp', ascending=True, kind='mergesort')
    .reset_index(drop=True)
)

In [6]:
# Validation interactions: same dtypes as the training file, but the wanted
# columns sit at different positions in this CSV.
valid = pd.read_csv(
    '/kaggle/input/saktmodel/valid_2.csv',
    usecols=[2, 3, 4, 5, 8, 9],
    dtype=dtype,
)
valid = valid.loc[valid['content_type_id'] == False]

valid.head(10)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time
0,0,866605366,6267,0,1,
1,0,1436067548,3709,0,1,
2,0,2119645200,7900,0,1,
3,0,2019867894,4421,0,0,
4,0,1801712607,5137,0,1,
5,0,863504442,5554,0,1,
6,0,220576717,4660,0,1,
7,0,108605579,5067,0,0,
8,0,811433702,4517,0,0,
9,0,38532797,6479,0,0,


In [7]:
question_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
# Missing parts default to 1, then every part is shifted to be 0-based.
# Assigning the result back (instead of calling fillna(inplace=True) on the
# column selection) avoids chained assignment, which does not update the frame
# under pandas Copy-on-Write.
question_df['part'] = question_df['part'].fillna(1) - 1

In [8]:
# Distinct (0-based) part ids and how many of them exist.
part_skill = question_df['part'].unique()
n_part_skill = len(part_skill)
print("part is:", part_skill)
print("number part skills", n_part_skill)

part is: [0 1 2 3 4 5 6]
number part skills 7


In [9]:
# Mean elapsed time over the training rows; used below to fill missing values.
elapsed_mean = train_df['prior_question_elapsed_time'].mean()
elapsed_mean

24430.802356914817

In [10]:
# Replace missing elapsed times with the training mean. The result is assigned
# back to the column because an inplace fillna on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].fillna(elapsed_mean)


In [11]:
def get_elapsed_time(ela):
    """Bucket an elapsed time: floor-divide by 1000, then cap the result at 300.

    Values whose bucket exceeds 300 return the integer 300; all other values
    return the floor-division result unchanged (same numeric type as the input).
    """
    bucket = ela // 1000
    return 300 if bucket > 300 else bucket

In [12]:
# Convert every training elapsed time into its capped bucket.
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].apply(get_elapsed_time)

In [13]:
# Attach the question metadata (including `part`) to every interaction row.
train_df = train_df.merge(question_df, how='left', left_on='content_id', right_on='question_id')
valid = valid.merge(question_df, how='left', left_on='content_id', right_on='question_id')

## Preprocess

In [14]:
# skills = train_df["content_id"].unique()
# Hard-coded size of the question-id vocabulary handed to the datasets and the
# model (presumably the number of rows in questions.csv -- TODO confirm).
n_skill = 13523
print("number skills", n_skill)

number skills 13523


In [15]:
# Elapsed-time buckets that actually occur in the training data (printed for inspection only).
ela_skill = train_df['prior_question_elapsed_time'].unique()
# get_elapsed_time caps buckets at 300, so the possible ids are 0..300 -> 301 values.
n_ela_skill = 301
print("number ela skills", n_ela_skill)
print("ela is:", ela_skill)

number ela skills 301
ela is: [ 24.   5.  17.  83.  21.  51.  48.  29.  30.  69. 101.  34.  31.  55.
  20.   6.  56.  42.  19.  27.  67.  36.  46.  25.  13.  26.  22.  50.
  61.  70.  14.  47.  52.   0.  64.  76.  28.  86.  41.  15.  33.   1.
  58.  23.  54.  82.  75.   3.  11.  59.   4.  45.  32.  37.   2.  16.
  38.  74.  18.  71.  10.   9.  63.  81.   7.  60.  53.  62.  35.  72.
  39.  65.  40.  68.  57.  43.  78.  12.  90.  49.  77.  93.   8.  87.
  89.  44.  84. 121.  91.  79.  94. 109. 162. 300. 187. 188. 155. 170.
 156. 160. 126. 100. 134. 217. 218.  73. 183. 216. 204.  80.  98. 108.
 129.  96. 154. 148. 141. 241. 221. 124.  95. 202. 186. 180.  97. 127.
 104.  85. 181. 208. 210. 172. 106. 266. 167. 232.  66. 115. 152. 143.
 244.  88. 297. 193. 132. 133. 110. 116. 196. 215. 179. 219. 139. 228.
 117. 105. 113.  99. 112. 102.  92. 225. 136. 174. 250. 159. 207. 138.
 251. 213. 201. 175. 114. 199. 197. 192. 128. 107. 264. 246. 171. 125.
 165. 120. 119. 149. 230. 135. 206. 111. 191. 1

In [16]:
# Fill missing validation elapsed times with the *training* mean, then bucket
# them exactly like the training data. The filled column is assigned back
# explicitly: an inplace fillna on a column selection is chained assignment and
# does not update the frame under pandas Copy-on-Write.
valid['prior_question_elapsed_time'] = (
    valid['prior_question_elapsed_time']
    .fillna(elapsed_mean)
    .apply(get_elapsed_time)
)

In [17]:
# Collapse the training frame into one entry per user: a tuple of four parallel
# arrays (content_id, answered_correctly, prior_question_elapsed_time, part).
group = (
    train_df[['user_id', 'part', 'content_id', 'answered_correctly', 'prior_question_elapsed_time']]
    .groupby('user_id')
    .apply(lambda rows: (
        rows['content_id'].values,
        rows['answered_correctly'].values,
        rows['prior_question_elapsed_time'].values,
        rows['part'].values,
    ))
)

# The flat frame is not used again in the cells that follow; release it.
del train_df
gc.collect()

0

In [18]:
#HDKIM
import random

# Seed every RNG the notebook relies on, not just Python's `random`:
# torch drives weight initialization, dropout and DataLoader shuffling, and
# DataLoader worker processes derive their own seeds from torch's generator,
# so seeding `random` alone does not make a run reproducible.
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
#HDKIMHDKIM

In [19]:
class SAKTDataset(Dataset):
    """Training dataset that yields one randomly cropped window per user.

    ``group`` maps a user_id to a 4-tuple of parallel arrays
    ``(content_id, answered_correctly, elapsed_bucket, part)``. Users with
    fewer than two interactions are skipped because a sample needs at least
    one (input, target) pair.
    """

    def __init__(self, group, n_skill,n_ela_skill, max_seq=MAX_SEQ): #HDKIM 100
        super(SAKTDataset, self).__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill
        self.n_ela_skill = n_ela_skill
        self.samples = group

        # Keep only users that have at least two interactions.
        self.user_ids = [
            user_id for user_id in group.index if len(group[user_id][0]) >= 2
        ]

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        """Return ``(x, target_id, ela_x, ela_target, part_x, part_target, ans_x, label)``.

        Every array has length ``max_seq - 1``; each ``*_x`` array is the
        window without its last element and each target array is the window
        without its first element. Short windows are left-padded with zeros.
        """
        q_, qa_, ela_, part_ = self.samples[self.user_ids[index]]
        total = len(q_)

        if total >= self.max_seq:
            # Long history: mostly a random full-length window, otherwise the
            # most recent `max_seq` interactions.
            if random.random() > 0.1:
                first = random.randint(0, total - self.max_seq)
            else:
                first = total - self.max_seq
            last = first + self.max_seq
        else:
            # Short history: mostly a random-length prefix (at least two
            # interactions), otherwise the whole history.
            first = 0
            last = random.randint(2, total) if random.random() > 0.1 else total
        kept = last - first

        def left_pad(values):
            # Right-align the chosen window inside a zero-filled int buffer.
            padded = np.zeros(self.max_seq, dtype=int)
            padded[-kept:] = values[first:last]
            return padded

        q, qa, ela, part = (left_pad(seq) for seq in (q_, qa_, ela_, part_))

        # Inputs are the window minus its last step; targets are the window
        # minus its first step, so position i of an input predicts step i + 1.
        return (
            q[:-1].copy(), q[1:],
            ela[:-1].copy(), ela[1:],
            part[:-1].copy(), part[1:],
            qa[:-1].copy(),
            qa[1:],
        )
    # x and target_id both come from q, offset by one: x is one position earlier than target_id.
    # So we can try splitting ela into ela_x and ela_target_id in the same way.

In [20]:
dataset = SAKTDataset(group, n_skill, n_ela_skill)
dataloader = DataLoader(
    dataset,
    batch_size=2048,
    shuffle=True,
    num_workers=16,
)

# Pull a single sample to check that indexing works.
item = dataset[5]


In [21]:
class ValidDataset(Dataset):
    """Validation dataset: one sample per row of ``test_df``.

    For each row, the user's history is looked up in ``samples`` (a mapping
    from user_id to a ``(content_id, answered_correctly, elapsed_bucket,
    part)`` tuple of arrays), truncated to the last ``max_seq`` interactions
    and left-padded with zeros. The history comes solely from ``samples``;
    earlier rows of ``test_df`` for the same user are not added to it.
    """

    def __init__(self, samples, test_df, n_skill,n_ela_skill, max_seq=MAX_SEQ): #HDKIM 100
        super(ValidDataset, self).__init__()
        self.samples = samples
        self.user_ids = [x for x in test_df["user_id"].unique()]
        self.test_df = test_df
        self.n_ela_skill = n_ela_skill
        self.n_skill = n_skill
        self.max_seq = max_seq

    def __len__(self):
        return self.test_df.shape[0]

    def _padded_history(self, user_id):
        """Return four int arrays of length ``max_seq`` (q, qa, ela, part).

        Unknown users get all-zero arrays. Histories are always copied into
        int buffers, as SAKTDataset does, so long and short histories produce
        the same dtype; an empty history is left as zeros instead of failing
        on a ``[-0:]`` slice assignment.
        """
        buffers = [np.zeros(self.max_seq, dtype=int) for _ in range(4)]
        if user_id in self.samples.index:
            for buf, values in zip(buffers, self.samples[user_id]):
                recent = values[-self.max_seq:]
                if len(recent):
                    buf[-len(recent):] = recent
        return buffers

    @staticmethod
    def _as_long(values):
        """Convert an array to a LongTensor via an explicit int64 cast."""
        return torch.LongTensor(np.asarray(values, dtype=np.int64))

    def __getitem__(self, index):
        """Return ``(x, questions, ela_x, ela_target, part_x, part_target, ans_x, label)``.

        The seven model inputs are LongTensors of length ``max_seq - 1``;
        ``label`` is a FloatTensor of shape ``(1,)`` for the row being scored.
        """
        test_info = self.test_df.iloc[index]
        q, qa, ela, part = self._padded_history(test_info["user_id"])

        def with_target(history, target_value):
            # Targets are the history shifted one step ahead of the inputs,
            # ending with the value of the row being predicted.
            return np.append(history[2:], [target_value])

        label = np.array([test_info['answered_correctly']], dtype=np.float32)

        return (
            self._as_long(q[1:]),
            self._as_long(with_target(q, test_info["content_id"])),
            self._as_long(ela[1:]),
            self._as_long(with_target(ela, test_info["prior_question_elapsed_time"])),
            self._as_long(part[1:]),
            self._as_long(with_target(part, test_info["part"])),
            self._as_long(qa[1:]),
            torch.FloatTensor(label),
        )

In [22]:
# Validation rows are scored in file order, so no shuffling.
test_dataset = ValidDataset(group, valid, n_skill, n_ela_skill)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=2048,
    shuffle=False,
    num_workers=8,
)

## Define model

In [23]:
class FFN(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> Dropout(0.2).

    Both linear layers map ``state_size`` features to ``state_size`` features,
    so the output has the same shape as the input.
    """

    def __init__(self, state_size=200):
        super().__init__()
        self.state_size = state_size

        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        hidden = self.relu(self.lr1(x))
        return self.dropout(self.lr2(hidden))

def future_mask(seq_length):
    """Return a (seq_length, seq_length) bool tensor that is True strictly above the diagonal.

    Entry [i, j] is True exactly when j > i, i.e. when position j lies in the
    future of position i.
    """
    upper = np.triu(np.ones((seq_length, seq_length), dtype=bool), k=1)
    return torch.from_numpy(upper)


class SAKTModel(nn.Module):
    """Single-block self-attentive knowledge-tracing model.

    Past interactions (question, elapsed bucket, part, answer, position) are
    embedded and summed to form the attention keys/values; the questions to be
    predicted (question, elapsed bucket, part) form the queries. The attention
    output goes through a residual + LayerNorm, a feed-forward block with a
    second residual + LayerNorm (the same LayerNorm module is reused), and a
    final linear layer that produces one logit per position.

    ``embedding`` and ``ela_ans_embedding`` are created but not used by
    ``forward``; they are still part of the module's parameters.
    """

    def __init__(self, n_skill,n_ela_skill,n_part_skill, max_seq=MAX_SEQ, embed_dim=128): #HDKIM 100
        super().__init__()
        self.n_skill = n_skill
        self.n_ela_skill = n_ela_skill
        self.n_part_skill = n_part_skill
        self.embed_dim = embed_dim

        # Question / position embeddings.
        self.embedding = nn.Embedding(2*n_skill+1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq-1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)

        # Elapsed-time embeddings.
        self.ela_ans_embedding = nn.Embedding(2*n_ela_skill+1, embed_dim)
        self.ela_embedding = nn.Embedding(n_ela_skill+1, embed_dim)

        # Part and answer-correctness embeddings.
        self.part_embedding = nn.Embedding(n_part_skill+1, embed_dim)
        self.ans_embedding = nn.Embedding(2, embed_dim)

        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)

        self.dropout = nn.Dropout(0.2)
        self.layer_normal = nn.LayerNorm(embed_dim)

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)

    def forward(self, x, question_ids, ela_x, ela_target_ids, part_x, part_target_ids,ans_x):
        """Return ``(logits, attention_weights)``.

        All inputs are integer id tensors laid out as [batch, seq_len];
        ``logits`` has one value per (batch, position).
        """
        device = x.device
        positions = torch.arange(x.size(1)).unsqueeze(0).to(device)

        # Keys/values: everything known about the past interactions.
        past = (
            self.e_embedding(x)
            + self.ela_embedding(ela_x)
            + self.pos_embedding(positions)
            + self.part_embedding(part_x)
            + self.ans_embedding(ans_x)
        )

        # Queries: the questions whose outcome is being predicted.
        query = (
            self.e_embedding(question_ids)
            + self.ela_embedding(ela_target_ids)
            + self.part_embedding(part_target_ids)
        )

        # nn.MultiheadAttention is used sequence-first here:
        # [bs, s_len, embed] => [s_len, bs, embed]
        past = past.permute(1, 0, 2)
        query = query.permute(1, 0, 2)
        att_mask = future_mask(past.size(0)).to(device)
        att_output, att_weight = self.multi_att(query, past, past, attn_mask=att_mask)
        # att_output is effectively an attention-weighted representation of x.
        att_output = self.layer_normal(att_output + query)
        att_output = att_output.permute(1, 0, 2)  # [s_len, bs, embed] => [bs, s_len, embed]

        hidden = self.ffn(att_output)
        hidden = self.layer_normal(hidden + att_output)
        logits = self.pred(hidden)

        return logits.squeeze(-1), att_weight

In [24]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# device = xm.xla_device()

model = SAKTModel(n_skill, n_ela_skill, n_part_skill, embed_dim=128)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.99, weight_decay=0.005)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# The model emits raw logits, so the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()

model.to(device)
criterion.to(device)

BCEWithLogitsLoss()

In [25]:
def train_epoch(model, train_iterator, optim, criterion, device="cpu"):
    """Run one optimization pass over ``train_iterator``.

    Each batch is an 8-item sequence: seven integer input tensors followed by
    the label tensor. The loss is computed over every sequence position, while
    accuracy and AUC are computed from the last position of each sequence only.

    Returns:
        (mean batch loss, last-position accuracy, last-position ROC AUC)
    """
    model.train()

    batch_losses = []
    n_correct = 0
    n_seen = 0
    all_labels = []
    all_logits = []

    progress = tqdm(train_iterator)
    for batch in progress:
        inputs = [batch[i].to(device).long() for i in range(7)]
        label = batch[7].to(device).float()

        optim.zero_grad()
        output, atten_weight = model(*inputs)
        loss = criterion(output, label)
        loss.backward()
        optim.step()
#         xm.mark_step()
        batch_losses.append(loss.item())

        # Metrics only look at the final position of every sequence.
        last_logit = output[:, -1]
        last_label = label[:, -1]
        pred = (torch.sigmoid(last_logit) >= 0.5).long()

        n_correct += (pred == last_label).sum().item()
        n_seen += len(last_label)

        all_labels.extend(last_label.view(-1).detach().cpu().numpy())
        all_logits.extend(last_logit.view(-1).detach().cpu().numpy())

        progress.set_description('loss - {:.4f}'.format(loss))

    acc = n_correct / n_seen
    auc = roc_auc_score(all_labels, all_logits)
    loss = np.mean(batch_losses)

    return loss, acc, auc

In [26]:
def evaluate_epoch(model, test_dataloader, criterion, device="cpu"):
    """Evaluate ``model`` on ``test_dataloader`` without tracking gradients.

    Each batch is an 8-item sequence: seven integer input tensors followed by
    the label tensor. Loss, accuracy and AUC are all computed from the last
    sequence position only.

    Returns:
        (mean batch loss, accuracy, ROC AUC)
    """
    model.eval()

    batch_losses = []
    n_correct = 0
    n_seen = 0
    all_labels = []
    all_logits = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            inputs = [batch[i].to(device).long() for i in range(7)]
            label = batch[7].to(device).float()

            output, atten_weight = model(*inputs)

            # Keep the trailing dimension for the loss so it lines up with `label`.
            loss = criterion(output[:, -1:], label)
            batch_losses.append(loss.item())

            last_logit = output[:, -1]
            last_label = label[:, -1]
            pred = (torch.sigmoid(last_logit) >= 0.5).long()

            n_correct += (pred == last_label).sum().item()
            n_seen += len(last_label)

            all_labels.extend(last_label.view(-1).detach().cpu().numpy())
            all_logits.extend(last_logit.view(-1).detach().cpu().numpy())

    acc = n_correct / n_seen
    auc = roc_auc_score(all_labels, all_logits)
    loss = np.mean(batch_losses)

    return loss, acc, auc

In [27]:
epochs = 60
best_auc = 0.0
for epoch in range(epochs):
    loss, acc, auc = train_epoch(model, dataloader, optimizer, criterion, device)
    print("epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, loss, acc, auc))

    # Validate every 10th epoch and always after the final one. The final epoch
    # is derived from `epochs` instead of a hard-coded 59, so changing `epochs`
    # cannot silently skip the last evaluation.
    if epoch % 10 == 0 or epoch == epochs - 1:
        eval_loss, eval_acc, eval_auc = evaluate_epoch(model, test_dataloader, criterion, device)
        print("epoch - {} eval_loss - {:.2f} eval acc - {:.3f} eval auc - {:.3f}".format(epoch, eval_loss, eval_acc, eval_auc))
        # Checkpoint only when the validation AUC improves.
        if best_auc < eval_auc:
            best_auc = eval_auc
            torch.save(model.state_dict(),'best_model.pt')


loss - 0.2014: 100%|██████████| 193/193 [01:09<00:00,  2.78it/s]
  0%|          | 0/385 [00:00<?, ?it/s]

epoch - 0 train_loss - 0.23 acc - 0.626 auc - 0.668


100%|██████████| 385/385 [04:55<00:00,  1.30it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 0 eval_loss - 0.63 eval acc - 0.645 eval auc - 0.704


loss - 0.1906: 100%|██████████| 193/193 [01:07<00:00,  2.84it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 1 train_loss - 0.21 acc - 0.676 auc - 0.738


loss - 0.2017: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 2 train_loss - 0.20 acc - 0.684 auc - 0.749


loss - 0.1857: 100%|██████████| 193/193 [01:08<00:00,  2.80it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 3 train_loss - 0.20 acc - 0.688 auc - 0.754


loss - 0.2027: 100%|██████████| 193/193 [01:08<00:00,  2.82it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 4 train_loss - 0.20 acc - 0.687 auc - 0.754


loss - 0.1903: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 5 train_loss - 0.20 acc - 0.688 auc - 0.754


loss - 0.1941: 100%|██████████| 193/193 [01:08<00:00,  2.80it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 6 train_loss - 0.20 acc - 0.689 auc - 0.755


loss - 0.1958: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 7 train_loss - 0.20 acc - 0.689 auc - 0.755


loss - 0.1902: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 8 train_loss - 0.20 acc - 0.690 auc - 0.757


loss - 0.2090: 100%|██████████| 193/193 [01:09<00:00,  2.77it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 9 train_loss - 0.20 acc - 0.691 auc - 0.759


loss - 0.1845: 100%|██████████| 193/193 [01:09<00:00,  2.79it/s]
  0%|          | 0/385 [00:00<?, ?it/s]

epoch - 10 train_loss - 0.20 acc - 0.690 auc - 0.757


100%|██████████| 385/385 [04:55<00:00,  1.30it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 10 eval_loss - 0.60 eval acc - 0.677 eval auc - 0.747


loss - 0.2010: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 11 train_loss - 0.20 acc - 0.691 auc - 0.758


loss - 0.1892: 100%|██████████| 193/193 [01:09<00:00,  2.79it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 12 train_loss - 0.20 acc - 0.691 auc - 0.758


loss - 0.2059: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 13 train_loss - 0.20 acc - 0.691 auc - 0.758


loss - 0.1924: 100%|██████████| 193/193 [01:08<00:00,  2.82it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 14 train_loss - 0.20 acc - 0.692 auc - 0.760


loss - 0.2074: 100%|██████████| 193/193 [01:09<00:00,  2.80it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 15 train_loss - 0.20 acc - 0.692 auc - 0.760


loss - 0.1944: 100%|██████████| 193/193 [01:09<00:00,  2.77it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 16 train_loss - 0.20 acc - 0.693 auc - 0.761


loss - 0.1783: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 17 train_loss - 0.20 acc - 0.693 auc - 0.760


loss - 0.2015: 100%|██████████| 193/193 [01:09<00:00,  2.77it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 18 train_loss - 0.20 acc - 0.692 auc - 0.760


loss - 0.2427: 100%|██████████| 193/193 [01:09<00:00,  2.79it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 19 train_loss - 0.20 acc - 0.694 auc - 0.761


loss - 0.1998: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/385 [00:00<?, ?it/s]

epoch - 20 train_loss - 0.20 acc - 0.694 auc - 0.761


100%|██████████| 385/385 [04:55<00:00,  1.30it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 20 eval_loss - 0.59 eval acc - 0.682 eval auc - 0.751


loss - 0.2146: 100%|██████████| 193/193 [01:09<00:00,  2.77it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 21 train_loss - 0.20 acc - 0.693 auc - 0.761


loss - 0.1860: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 22 train_loss - 0.20 acc - 0.694 auc - 0.761


loss - 0.1938: 100%|██████████| 193/193 [01:09<00:00,  2.78it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 23 train_loss - 0.20 acc - 0.694 auc - 0.761


loss - 0.2005: 100%|██████████| 193/193 [01:07<00:00,  2.84it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 24 train_loss - 0.20 acc - 0.693 auc - 0.761


loss - 0.1746: 100%|██████████| 193/193 [01:08<00:00,  2.80it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 25 train_loss - 0.20 acc - 0.691 auc - 0.760


loss - 0.2059: 100%|██████████| 193/193 [01:07<00:00,  2.84it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 26 train_loss - 0.20 acc - 0.695 auc - 0.763


loss - 0.1903: 100%|██████████| 193/193 [01:08<00:00,  2.83it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 27 train_loss - 0.20 acc - 0.695 auc - 0.762


loss - 0.1968: 100%|██████████| 193/193 [01:09<00:00,  2.78it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 28 train_loss - 0.20 acc - 0.693 auc - 0.761


loss - 0.2069: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 29 train_loss - 0.20 acc - 0.693 auc - 0.761


loss - 0.2036: 100%|██████████| 193/193 [01:09<00:00,  2.78it/s]
  0%|          | 0/385 [00:00<?, ?it/s]

epoch - 30 train_loss - 0.20 acc - 0.694 auc - 0.762


100%|██████████| 385/385 [04:55<00:00,  1.30it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 30 eval_loss - 0.60 eval acc - 0.677 eval auc - 0.752


loss - 0.2065: 100%|██████████| 193/193 [01:08<00:00,  2.83it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 31 train_loss - 0.20 acc - 0.693 auc - 0.761


loss - 0.1981: 100%|██████████| 193/193 [01:09<00:00,  2.78it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 32 train_loss - 0.20 acc - 0.694 auc - 0.762


loss - 0.2012: 100%|██████████| 193/193 [01:08<00:00,  2.82it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 33 train_loss - 0.20 acc - 0.695 auc - 0.763


loss - 0.1915: 100%|██████████| 193/193 [01:07<00:00,  2.84it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 34 train_loss - 0.20 acc - 0.695 auc - 0.763


loss - 0.1808: 100%|██████████| 193/193 [01:09<00:00,  2.79it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 35 train_loss - 0.20 acc - 0.694 auc - 0.762


loss - 0.1989: 100%|██████████| 193/193 [01:09<00:00,  2.79it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 36 train_loss - 0.20 acc - 0.694 auc - 0.762


loss - 0.2092: 100%|██████████| 193/193 [01:08<00:00,  2.80it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 37 train_loss - 0.20 acc - 0.696 auc - 0.764


loss - 0.1924: 100%|██████████| 193/193 [01:07<00:00,  2.84it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 38 train_loss - 0.20 acc - 0.695 auc - 0.763


loss - 0.1962: 100%|██████████| 193/193 [01:10<00:00,  2.76it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 39 train_loss - 0.20 acc - 0.696 auc - 0.764


loss - 0.1922: 100%|██████████| 193/193 [01:08<00:00,  2.83it/s]
  0%|          | 0/385 [00:00<?, ?it/s]

epoch - 40 train_loss - 0.20 acc - 0.695 auc - 0.764


100%|██████████| 385/385 [04:55<00:00,  1.30it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 40 eval_loss - 0.59 eval acc - 0.684 eval auc - 0.756


loss - 0.1997: 100%|██████████| 193/193 [01:08<00:00,  2.83it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 41 train_loss - 0.20 acc - 0.697 auc - 0.765


loss - 0.1991: 100%|██████████| 193/193 [01:09<00:00,  2.77it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 42 train_loss - 0.20 acc - 0.698 auc - 0.766


loss - 0.1905: 100%|██████████| 193/193 [01:10<00:00,  2.74it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 43 train_loss - 0.20 acc - 0.696 auc - 0.765


loss - 0.1593: 100%|██████████| 193/193 [01:08<00:00,  2.82it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 44 train_loss - 0.20 acc - 0.697 auc - 0.765


loss - 0.1806: 100%|██████████| 193/193 [01:08<00:00,  2.81it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 45 train_loss - 0.20 acc - 0.697 auc - 0.766


loss - 0.1974: 100%|██████████| 193/193 [01:09<00:00,  2.79it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 46 train_loss - 0.20 acc - 0.697 auc - 0.765


loss - 0.1866: 100%|██████████| 193/193 [01:08<00:00,  2.82it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 47 train_loss - 0.20 acc - 0.697 auc - 0.766


loss - 0.2029: 100%|██████████| 193/193 [01:09<00:00,  2.79it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 48 train_loss - 0.20 acc - 0.696 auc - 0.765


loss - 0.2104: 100%|██████████| 193/193 [01:08<00:00,  2.80it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 49 train_loss - 0.20 acc - 0.696 auc - 0.765


loss - 0.2119: 100%|██████████| 193/193 [01:10<00:00,  2.74it/s]
  0%|          | 0/385 [00:00<?, ?it/s]

epoch - 50 train_loss - 0.20 acc - 0.699 auc - 0.768


100%|██████████| 385/385 [04:59<00:00,  1.28it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 50 eval_loss - 0.58 eval acc - 0.689 eval auc - 0.759


loss - 0.1766: 100%|██████████| 193/193 [01:10<00:00,  2.73it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 51 train_loss - 0.20 acc - 0.698 auc - 0.767


loss - 0.1682: 100%|██████████| 193/193 [01:11<00:00,  2.72it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 52 train_loss - 0.20 acc - 0.698 auc - 0.768


loss - 0.2035: 100%|██████████| 193/193 [01:10<00:00,  2.72it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 53 train_loss - 0.20 acc - 0.699 auc - 0.768


loss - 0.2155: 100%|██████████| 193/193 [01:16<00:00,  2.52it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 54 train_loss - 0.20 acc - 0.699 auc - 0.768


loss - 0.2038: 100%|██████████| 193/193 [01:14<00:00,  2.60it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 55 train_loss - 0.20 acc - 0.698 auc - 0.767


loss - 0.1928: 100%|██████████| 193/193 [01:14<00:00,  2.59it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 56 train_loss - 0.20 acc - 0.699 auc - 0.768


loss - 0.1755: 100%|██████████| 193/193 [01:16<00:00,  2.54it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 57 train_loss - 0.20 acc - 0.699 auc - 0.768


loss - 0.1690: 100%|██████████| 193/193 [01:15<00:00,  2.56it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 58 train_loss - 0.20 acc - 0.699 auc - 0.768


loss - 0.1937: 100%|██████████| 193/193 [01:14<00:00,  2.58it/s]
  0%|          | 0/385 [00:00<?, ?it/s]

epoch - 59 train_loss - 0.20 acc - 0.699 auc - 0.768


100%|██████████| 385/385 [06:13<00:00,  1.03it/s]


epoch - 59 eval_loss - 0.59 eval acc - 0.687 eval auc - 0.760


In [28]:
import gc
# Drop the notebook-level name of the training dataset and run a collection pass.
# NOTE(review): `dataloader` was built from this dataset and is not deleted here,
# so it still references the object and the memory may not be released -- confirm.
del dataset
gc.collect()

20

## Test

In [29]:
class TestDataset(Dataset):
    """Inference-time dataset yielding one model input per row of ``test_df``.

    For every row, the stored history of the row's user is looked up in
    ``samples``, right-aligned into a window of ``max_seq`` steps (zero padded
    on the left, or truncated to the most recent ``max_seq`` steps) and
    combined with the row's own question so that the last position of the
    returned "target" sequences is the question to predict.

    Args:
        samples: object supporting ``user_id in samples.index`` and
            ``samples[user_id]``; each entry must unpack into four
            equal-length arrays ``(content ids, answers, elapsed-time values,
            parts)``.
        test_df: DataFrame with at least the columns ``user_id``,
            ``content_id``, ``prior_question_elapsed_time`` and ``part``.
        n_skill, n_ela_skill, n_part_skill: stored on the instance; not read
            by ``__getitem__``.
        max_seq: length of the history window.
    """

    def __init__(self, samples, test_df, n_skill, n_ela_skill,n_part_skill,max_seq=MAX_SEQ): #HDKIM 100
        super().__init__()
        self.samples = samples
        # Kept for backward compatibility; not used by the methods of this class.
        self.user_ids = list(test_df["user_id"].unique())
        self.test_df = test_df
        self.n_ela_skill = n_ela_skill
        self.n_part_skill = n_part_skill
        self.n_skill = n_skill
        self.max_seq = max_seq

    def __len__(self):
        """Return the number of rows in ``test_df`` (one item per row)."""
        return len(self.test_df)

    def __getitem__(self, index):
        """Build the model inputs for the ``index``-th row of ``test_df``.

        Returns:
            Tuple of seven ``torch.LongTensor`` objects, each of length
            ``max_seq - 1``:
            ``(x, questions, ela_x, ela_target, part_x, part_target, ans_x)``.
            ``x``/``ela_x``/``part_x``/``ans_x`` are the history window without
            its oldest step; ``questions``/``ela_target``/``part_target`` are
            the same window shifted one step further with the current row's
            value appended at the end.
        """
        test_info = self.test_df.iloc[index]

        user_id = test_info["user_id"]
        target_id = test_info["content_id"]
        ela_target_id = test_info["prior_question_elapsed_time"]
        part_target_id = test_info["part"]

        # Default: an all-zero window, used as-is for users without history.
        q, qa, ela, part = (np.zeros(self.max_seq, dtype=int) for _ in range(4))

        if user_id in self.samples.index:
            q_, qa_, ela_, part_ = self.samples[user_id]
            stored = (q_, qa_, ela_, part_)
            seq_len = len(q_)

            if seq_len >= self.max_seq:
                # Keep only the most recent max_seq interactions.
                q, qa, ela, part = (arr[-self.max_seq:] for arr in stored)
            elif seq_len > 0:
                # Right-align the shorter history; the left side stays zero.
                # (seq_len == 0 is skipped: a [-0:] slice would address the
                # whole window and the assignment would fail to broadcast.)
                for window, arr in zip((q, qa, ela, part), stored):
                    window[-seq_len:] = arr

        # Inputs drop the oldest step; targets drop the two oldest steps and
        # append the current row's value, so both have length max_seq - 1.
        x = q[1:].copy()
        questions = np.append(q[2:], [target_id])

        ela_x = ela[1:].copy()
        ela_target = np.append(ela[2:], [ela_target_id])

        part_x = part[1:].copy()
        part_target = np.append(part[2:], [part_target_id])

        # Past answers are returned as their own sequence.
        ans_x = qa[1:].copy()

        return torch.LongTensor(x), torch.LongTensor(questions),\
            torch.LongTensor(ela_x), torch.LongTensor(ela_target),\
            torch.LongTensor(part_x), torch.LongTensor(part_target),\
            torch.LongTensor(ans_x)


In [30]:
# Re-create the network and restore its weights from the checkpoint file.
# load_state_dict needs parameter shapes that match the checkpoint, so the
# constructor arguments here must agree with the ones used when it was saved.
model = SAKTModel(
    n_skill,
    n_ela_skill,
    n_part_skill,
    embed_dim=128,
)
model.load_state_dict(
    # map_location places the loaded tensors on `device`.
    torch.load('best_model.pt', map_location=device)
)
# Last expression of the cell: moves the model to `device` and displays it.
model.to(device)

SAKTModel(
  (embedding): Embedding(27047, 128)
  (pos_embedding): Embedding(159, 128)
  (e_embedding): Embedding(13524, 128)
  (ela_ans_embedding): Embedding(603, 128)
  (ela_embedding): Embedding(302, 128)
  (part_embedding): Embedding(8, 128)
  (ans_embedding): Embedding(2, 128)
  (multi_att): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=128, out_features=128, bias=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (layer_normal): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (ffn): FFN(
    (lr1): Linear(in_features=128, out_features=128, bias=True)
    (relu): ReLU()
    (lr2): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (pred): Linear(in_features=128, out_features=1, bias=True)
)

In [31]:
import riiideducation
import psutil

env = riiideducation.make_env()
iter_test = env.iter_test()
model.eval()  # evaluation mode (the model contains Dropout layers)

# The previous test group is kept so that its answers, which only arrive with
# the *next* group, can be appended to the per-user history in `group`.
prev_test_df = None

for (test_df, sample_prediction_df) in tqdm(iter_test):
    # ---- 1. Fold the previous group's answers into the per-user history ----
    # The update is skipped while RAM usage is at or above 90%; the skipped
    # group's interactions are then never added to the history.
    if prev_test_df is not None and psutil.virtual_memory().percent < 90:
        print(psutil.virtual_memory().percent)
        # NOTE(review): eval() executes whatever this string contains. If the
        # field is always a plain list literal, ast.literal_eval would be a
        # safer parser - confirm the format before swapping.
        prev_test_df['answered_correctly'] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prev_questions = prev_test_df[prev_test_df.content_type_id == False]
        prev_group = prev_questions[['user_id', 'part','content_id', 'answered_correctly','prior_question_elapsed_time']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values,
            r['prior_question_elapsed_time'].values,
            r['part'].values))
        for prev_user_id in prev_group.index:
            # (content ids, answers, elapsed-time values, parts) of this user
            # in the previous group.
            new_history = prev_group[prev_user_id]

            if prev_user_id in group.index:
                merged = tuple(np.append(old, new)
                               for old, new in zip(group[prev_user_id], new_history))
            else:
                merged = tuple(new_history)

            # Keep at most the MAX_SEQ most recent interactions per user.
            group[prev_user_id] = tuple(arr[-MAX_SEQ:] for arr in merged)

    # ---- 2. Prepare the current group's features ----
    # Assign the result back instead of a chained in-place fillna, which does
    # not reliably modify the parent frame.
    test_df['prior_question_elapsed_time'] = (
        test_df['prior_question_elapsed_time']
        .fillna(elapsed_mean)
        .apply(lambda x: get_elapsed_time(x))
    )

    test_df = pd.merge(test_df, question_df, left_on='content_id',right_on='question_id',how='left')
    prev_test_df = test_df.copy()

    # Only rows with content_type_id == False are scored. The explicit copy
    # makes the later column assignment operate on an independent frame.
    test_df = test_df[test_df.content_type_id == False].copy()

    # ---- 3. Predict ----
    test_dataset = TestDataset(group, test_df, n_skill, n_ela_skill, n_part_skill)
    test_dataloader = DataLoader(test_dataset, batch_size=51200,
                                 shuffle=False)

    outs = []

    for item in tqdm(test_dataloader):
        x, target_id, ela_x, ela_target, part_x, part_target, ans_x = (
            tensor.to(device).long() for tensor in item
        )

        with torch.no_grad():
            output, _ = model(x, target_id, ela_x, ela_target, part_x, part_target, ans_x)
            # Only the last sequence position corresponds to the current row.
            output = torch.sigmoid(output)[:, -1]

        outs.extend(output.view(-1).data.cpu().numpy())

    test_df['answered_correctly'] = outs

    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 37.94it/s]
1it [00:00,  5.21it/s]
100%|██████████| 1/1 [00:00<00:00, 37.27it/s]
2it [00:00,  5.37it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

29.0
29.0


100%|██████████| 1/1 [00:00<00:00, 46.93it/s]

100%|██████████| 1/1 [00:00<00:00, 38.18it/s]
4it [00:00,  6.55it/s]

29.0


4it [00:01,  3.62it/s]
