* Base Source: https://www.kaggle.com/wangsg/a-self-attentive-model-for-knowledge-tracing
* My First Work: https://www.kaggle.com/leadbest/sakt-self-attentive-knowledge-tracing-submitter

In [38]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [39]:
import gc
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [40]:
#HDKIM
# Fixed length of the interaction window. It is the default `max_seq` of
# SAKTDataset, ValidDataset and SAKTModel defined further down in this notebook.
MAX_SEQ = 100
#HDKIMHDKIM

## Load data

In [41]:
%%time
dtype = {'timestamp': 'int64', 'user_id': 'int32' ,'content_id': 'int16','content_type_id': 'int8','answered_correctly':'int8'}

train_df = pd.read_csv('/Users/hesu/Documents/KT/riiid/train.csv', usecols=[1, 2, 3,4,7], dtype=dtype)
train_df.head()

CPU times: user 29.5 s, sys: 4.28 s, total: 33.7 s
Wall time: 34.2 s


Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly
0,0,115,5692,0,1
1,56943,115,5716,0,1
2,118363,115,128,0,1
3,131167,115,7860,0,1
4,137965,115,7922,0,1


In [42]:
# Keep only the rows whose content_type_id compares equal to False (i.e. 0).
keep_rows = train_df["content_type_id"] == False
train_df = train_df.loc[keep_rows]

# Order the remaining rows by ascending timestamp, then renumber the index.
train_df = train_df.sort_values(by="timestamp")
train_df = train_df.reset_index(drop=True)

## Preprocess

In [43]:
# Distinct content ids left in the table; their count is later passed to the
# datasets and the model as `n_skill`.
skills = train_df.content_id.unique()
n_skill = skills.shape[0]
print("number skills", n_skill)

number skills 13523


In [44]:
def _user_sequences(user_rows):
    """Return one user's (content_id array, answered_correctly array) pair."""
    return (
        user_rows['content_id'].values,
        user_rows['answered_correctly'].values)


# One entry per user_id, holding that user's two arrays in row order.
interaction_cols = ['user_id', 'content_id', 'answered_correctly']
group = train_df[interaction_cols].groupby('user_id').apply(_user_sequences)

# The raw table is not used again in this notebook; drop it and collect.
del train_df
gc.collect()

0

In [45]:
# Preview the (content_id array, answered_correctly array) pairs of the first ten users.
group.head(10)

user_id
115      ([5692, 5716, 128, 7860, 7922, 156, 51, 50, 78...
124      ([7900, 7876, 175, 1278, 2065, 2063, 2064, 336...
2746     ([5273, 758, 5976, 236, 404, 382, 405, 873, 53...
5382     ([5000, 3944, 217, 5844, 5965, 4990, 5235, 605...
8623     ([3915, 4750, 6456, 3968, 6104, 5738, 6435, 54...
8701     ([3901, 6671, 4963, 6143, 8279, 3964, 4002, 75...
12741    ([5145, 9691, 9697, 5202, 4787, 5695, 7858, 56...
13134    ([3926, 564, 3865, 4231, 3684, 3988, 3968, 521...
24418    ([7900, 7876, 175, 1278, 2064, 2065, 2063, 336...
24600    ([7900, 7876, 175, 1278, 2065, 2063, 2064, 336...
dtype: object

In [46]:
class SAKTDataset(Dataset):
    """Training dataset that yields one fixed-length interaction window per user.

    Each item is a tuple ``(x, target_id, label)`` of integer arrays of length
    ``max_seq - 1``, built from the user's last ``max_seq`` interactions and
    left-padded with zeros when the user has fewer:

    * ``x``         - content id of each step, shifted by ``n_skill`` when that
                      step was answered correctly (``answered_correctly == 1``);
    * ``target_id`` - content id of the step that follows;
    * ``label``     - ``answered_correctly`` value of the step that follows.

    Args:
        group: pandas Series indexed by user id whose values are
            ``(content_ids, answered_correctly)`` array pairs. NOTE: entries
            longer than ``max_seq`` are truncated in place to save memory, so
            the caller's Series is modified.
        n_skill: offset added to a content id to mark a correct answer.
        max_seq: window length.
        min_seq: users with fewer interactions than this are skipped
            (defaults to 5).
    """

    def __init__(self, group, n_skill, max_seq=MAX_SEQ, min_seq=5): #HDKIM 100
        super(SAKTDataset, self).__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill
        self.min_seq = min_seq
        self.samples = group

        self.user_ids = []
        for user_id in group.index:
            q, qa = group[user_id]
            if len(q) < min_seq: #HDKIM 10
                continue
            self.user_ids.append(user_id)

            #HDKIM Memory reduction: only the newest max_seq steps are ever read.
            if len(q) > self.max_seq:
                group[user_id] = (q[-self.max_seq:], qa[-self.max_seq:])

    def __len__(self):
        """Return the number of users kept by the ``min_seq`` filter."""
        return len(self.user_ids)

    def __getitem__(self, index):
        """Build the ``(x, target_id, label)`` arrays for the user at ``index``."""
        user_id = self.user_ids[index]
        q_, qa_ = self.samples[user_id]

        # Only the newest max_seq interactions take part in the window.
        q_ = q_[-self.max_seq:]
        qa_ = qa_[-self.max_seq:]
        seq_len = len(q_)

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        if seq_len:
            # Right-align the history: it fills the last seq_len slots and the
            # leading slots stay zero. (The guard matters because `[-0:]` would
            # address the whole buffer.)
            q[-seq_len:] = q_
            qa[-seq_len:] = qa_

        # Targets are the sequence shifted one step ahead of the inputs.
        target_id = q[1:]   # content id sequence to predict
        label = qa[1:]      # answer sequence to predict

        # Input ids: the content id, plus n_skill where the answer was correct.
        x = q[:-1].copy()
        x += (qa[:-1] == 1) * self.n_skill
        return x, target_id, label

In [47]:
# Build the training set from the first 1000 users and wrap it in a shuffling loader.
train_subset = group.head(1000)
dataset = SAKTDataset(train_subset, n_skill)
dataloader = DataLoader(dataset, shuffle=True, batch_size=2048, num_workers=8)

# Pull out a single sample for inspection.
item = dataset[5]


In [48]:
# Show the three arrays of the inspected sample in order: x, target_id, label.
for part in item:
    print(part)

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0 17424
 20194 18486  6143 21802  3964  4002   754 14633   770 10688 14752 14910
 14859 10687 14708]
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0 

In [87]:
class ValidDataset(Dataset):
    """Validation dataset that yields one sample per row of ``test_df``.

    For the row at ``index`` the item is ``(x, questions, label)``:

    * ``x``         - ``max_seq - 1`` integer ids taken from the user's stored
                      history (zeros when there is none), each shifted by
                      ``n_skill`` where the stored answer was correct;
    * ``questions`` - the stored content ids shifted one step ahead, ending
                      with the row's own ``content_id``;
    * ``label``     - one-element array with the row's ``answered_correctly``.

    Args:
        samples: pandas Series indexed by user id holding
            ``(content_ids, answered_correctly)`` array pairs.
        test_df: DataFrame with ``user_id``, ``content_id`` and
            ``answered_correctly`` columns.
        skills: collection of known content ids; only its length is used.
        max_seq: window length.
    """

    def __init__(self, samples, test_df, skills, max_seq=MAX_SEQ): #HDKIM 100
        super(ValidDataset, self).__init__()
        self.samples = samples
        self.user_ids = list(test_df["user_id"].unique())
        self.test_df = test_df
        self.skills = skills
        self.n_skill = len(skills)
        self.max_seq = max_seq

    def __len__(self):
        """Return the number of rows in ``test_df``."""
        return self.test_df.shape[0]

    def __getitem__(self, index):
        """Build the ``(x, questions, label)`` sample for row ``index``."""
        test_info = self.test_df.iloc[index]

        user_id = test_info["user_id"]
        target_id = test_info["content_id"]
        label = test_info['answered_correctly']

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            q_, qa_ = self.samples[user_id]
            q_ = q_[-self.max_seq:]
            qa_ = qa_[-self.max_seq:]
            seq_len = len(q_)

            if seq_len:
                # Always copy INTO the int buffers instead of rebinding q/qa to
                # the stored slices: the stored arrays may use a narrow dtype,
                # in which `x += ... * n_skill` below could wrap around, and
                # long-history samples would differ in dtype from padded ones.
                q[-seq_len:] = q_
                qa[-seq_len:] = qa_

        # Newest max_seq - 1 interactions, shifted by n_skill where correct.
        x = q[1:].copy()
        x += (qa[1:] == 1) * self.n_skill

        # Query ids run one step ahead of x and end with the row being scored.
        questions = np.append(q[2:], [target_id])

        return x, questions, np.array([label])

In [88]:
test_df = pd.read_csv('/Users/hesu/Documents/KT/riiid/valid.csv')
# ValidDataset indexes `test_df` by column and by row, so it must receive a
# DataFrame. Passing the bound method `test_df.head` (no call parentheses)
# fails as soon as the dataset is built. Use `test_df.head(n)` here instead
# for a quick run on the first n rows.
test_dataset = ValidDataset(group, test_df, skills)
test_dataloader = DataLoader(test_dataset, batch_size=2048, shuffle=False)

## Define model

In [89]:
class FFN(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> Dropout(0.2).

    Both linear layers map ``state_size`` features to ``state_size`` features,
    so the output has the same shape as the input.
    """

    def __init__(self, state_size=200):
        super().__init__()
        self.state_size = state_size

        self.lr1 = nn.Linear(in_features=state_size, out_features=state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(in_features=state_size, out_features=state_size)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Run ``x`` through both linear layers and apply dropout to the result."""
        hidden = self.relu(self.lr1(x))
        projected = self.lr2(hidden)
        return self.dropout(projected)

def future_mask(seq_length):
    """Return a (seq_length, seq_length) boolean tensor that hides future steps.

    Entry ``[i, j]`` is True exactly when ``j > i``, i.e. strictly above the
    main diagonal. Used as ``attn_mask``, row ``i`` is therefore limited to
    positions ``0..i``: the last row may use every earlier position, the first
    row only position 0.
    """
    positions = np.arange(seq_length)
    # Column index greater than row index <=> strictly upper-triangular.
    blocked = positions[np.newaxis, :] > positions[:, np.newaxis]
    return torch.from_numpy(blocked)

class SAKTModel(nn.Module):
    """Self-attentive knowledge-tracing model with one attention block.

    The query is the embedding of the question to be answered; keys and values
    are the embeddings of the preceding interactions (question id combined
    with correctness) plus a position embedding. The forward pass returns one
    logit per sequence position together with the attention weights.
    """

    def __init__(self, n_skill, max_seq=MAX_SEQ, embed_dim=128): #HDKIM 100
        super().__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim

        # Interaction embedding: ids that encode the question AND whether it
        # was answered correctly.
        self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
        # Position embedding, one vector per input position.
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)
        # Embedding of the raw question ids.
        self.e_embedding = nn.Embedding(n_skill + 1, embed_dim)
        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)

        self.dropout = nn.Dropout(0.2)
        self.layer_normal = nn.LayerNorm(embed_dim)

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)

    def forward(self, x, question_ids):
        """Return ``(logits, attention_weights)`` for a batch.

        ``x`` holds the previous interactions (question id with the
        correctness offset applied), i.e. the step before ``question_ids``;
        ``question_ids`` holds the questions to predict.
        """
        device = x.device
        seq_len = x.size(1)

        # Positions are numbered front to back; padding gets no special
        # treatment here.
        positions = torch.arange(seq_len).unsqueeze(0).to(device)
        # Interaction embedding plus position embedding.
        interactions = self.embedding(x) + self.pos_embedding(positions)
        queries = self.e_embedding(question_ids)

        # [bs, s_len, embed] => [s_len, bs, embed]
        interactions = interactions.transpose(0, 1)
        queries = queries.transpose(0, 1)

        # Upper-triangular mask built by future_mask.
        causal_mask = future_mask(seq_len).to(device)
        # Query: the current question. Key/value: the past interactions.
        att_output, att_weight = self.multi_att(
            queries, interactions, interactions, attn_mask=causal_mask)
        # Residual connection with the question embedding, then layer norm.
        att_output = self.layer_normal(att_output + queries)
        # [s_len, bs, embed] => [bs, s_len, embed]
        att_output = att_output.transpose(0, 1)

        logits = self.ffn(att_output)
        logits = self.layer_normal(logits + att_output)
        logits = self.pred(logits)

        return logits.squeeze(-1), att_weight

In [90]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Put the model on its device BEFORE the optimizer is constructed, so the
# optimizer is built from the parameters that are actually trained.
model = SAKTModel(n_skill, embed_dim=128)
model.to(device)

# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.99, weight_decay=0.005)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

criterion.to(device)

BCEWithLogitsLoss()

In [97]:
def train_epoch(model, train_iterator, optim, criterion, device="cpu"):
    """Train ``model`` for one pass over ``train_iterator``.

    The loss is computed over every position of each sequence, while accuracy
    and AUC are measured on the last position of each sequence only.

    Args:
        model: called as ``model(x, target_id)``; must return
            ``(output, attention_weights)``.
        train_iterator: iterable of ``(x, target_id, label)`` batches.
        optim: optimizer stepped once per batch.
        criterion: loss applied to ``(output, label)``.
        device: device the batch tensors are moved to.

    Returns:
        Tuple ``(loss, acc, auc)``: mean batch loss, last-position accuracy at
        a 0.5 probability threshold, and last-position ROC AUC. ``loss`` and
        ``acc`` are NaN when the iterator yields nothing; ``auc`` is NaN when
        it is undefined (labels are not exactly {0, 1} or a score is not
        finite).
    """
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(train_iterator)
    for item in tbar:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()
        label = item[2].to(device).float()

        optim.zero_grad()
        output, _ = model(x, target_id)
        loss = criterion(output, label)
        loss.backward()
        optim.step()
        batch_loss = loss.item()
        train_loss.append(batch_loss)

        # Metrics only look at the final position; detach so the values can be
        # converted to numpy.
        output = output[:, -1].detach()
        label = label[:, -1]
        pred = (torch.sigmoid(output) >= 0.5).long()

        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).cpu().numpy())
        outs.extend(output.view(-1).cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(batch_loss))

    # Guard the aggregates so an empty iterator reports NaN instead of raising.
    acc = num_corrects / num_total if num_total else float("nan")
    loss = np.mean(train_loss) if train_loss else float("nan")

    # ROC AUC needs both classes and finite scores; report NaN otherwise
    # rather than letting roc_auc_score raise in the middle of training.
    scores = np.asarray(outs, dtype=float)
    if set(np.unique(labels).tolist()) == {0.0, 1.0} and np.isfinite(scores).all():
        auc = roc_auc_score(labels, scores)
    else:
        auc = float("nan")

    return loss, acc, auc

In [100]:
def evaluate_epoch(model, test_dataloader, criterion, device="cpu"):
    """Evaluate ``model`` on ``test_dataloader`` without updating it.

    Only the last position of each output sequence is scored, against a label
    tensor that is indexed as ``label[:, -1]`` (ValidDataset yields one label
    per sample).

    Args:
        model: called as ``model(x, target_id)``; must return
            ``(output, attention_weights)``.
        test_dataloader: iterable of ``(x, target_id, label)`` batches.
        criterion: loss applied to ``(output[:, -1:], label)``.
        device: device the batch tensors are moved to.

    Returns:
        Tuple ``(loss, acc, auc)``: mean batch loss, accuracy at a 0.5
        probability threshold, and ROC AUC. ``loss`` and ``acc`` are NaN when
        the loader yields nothing; ``auc`` is NaN when it is undefined (labels
        are not exactly {0, 1} or a score is not finite).
    """
    model.eval()
    eval_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    with torch.no_grad():
        for item in tqdm(test_dataloader):
            x = item[0].to(device).long()
            target_id = item[1].to(device).long()
            label = item[2].to(device).float()

            output, _ = model(x, target_id)

            # Keep the last position as a column so it lines up with `label`.
            loss = criterion(output[:, -1:], label)
            eval_loss.append(loss.item())

            output = output[:, -1]
            label = label[:, -1]
            pred = (torch.sigmoid(output) >= 0.5).long()

            num_corrects += (pred == label).sum().item()
            num_total += len(label)

            labels.extend(label.view(-1).cpu().numpy())
            outs.extend(output.view(-1).cpu().numpy())

    # Guard the aggregates so an empty loader reports NaN instead of raising.
    acc = num_corrects / num_total if num_total else float("nan")
    loss = np.mean(eval_loss) if eval_loss else float("nan")

    # ROC AUC needs both classes and finite scores; report NaN otherwise
    # rather than letting roc_auc_score raise.
    scores = np.asarray(outs, dtype=float)
    if set(np.unique(labels).tolist()) == {0.0, 1.0} and np.isfinite(scores).all():
        auc = roc_auc_score(labels, scores)
    else:
        auc = float("nan")

    return loss, acc, auc

In [101]:
epochs = 20
train_report = "epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}"
eval_report = "epoch - {} eval_loss - {:.2f} eval acc - {:.3f} eval auc - {:.3f}"

for epoch in range(epochs):
    # One optimisation pass over the training loader, then its metrics.
    loss, acc, auc = train_epoch(model, dataloader, optimizer, criterion, device)
    print(train_report.format(epoch, loss, acc, auc))

    # Score the validation loader with the weights just trained.
    eval_loss, eval_acc, eval_auc = evaluate_epoch(model, test_dataloader, criterion, device)
    print(eval_report.format(epoch, eval_loss, eval_acc, eval_auc))

    # Write one checkpoint file per epoch into the working directory.
    checkpoint_path = 'epoch-' + str(epoch) + '-model.pt'
    torch.save(model.state_dict(), checkpoint_path)

  0%|          | 0/1 [00:00<?, ?it/s]

In training x shape is:torch.Size([997, 99])	 taret_id shape is:torch.Size([997, 99])	 label shape is:torch.Size([997, 99])


loss - 0.4755: 100%|██████████| 1/1 [00:03<00:00,  3.79s/it]
100%|██████████| 50/50 [00:00<00:00, 285.54it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

In training labels is:[0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.

In training x shape is:torch.Size([997, 99])	 taret_id shape is:torch.Size([997, 99])	 label shape is:torch.Size([997, 99])


loss - 0.4631: 100%|██████████| 1/1 [00:04<00:00,  4.20s/it]
100%|██████████| 50/50 [00:00<00:00, 279.04it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

In training labels is:[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.

In training x shape is:torch.Size([997, 99])	 taret_id shape is:torch.Size([997, 99])	 label shape is:torch.Size([997, 99])


  0%|          | 0/1 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
import gc
# Remove the notebook-level name of the training dataset and run a collection pass.
# NOTE(review): `dataloader` was constructed from `dataset`, so it presumably
# still holds a reference to it - confirm whether this frees any memory.
del dataset
gc.collect()