In [13]:
import time
import math
import gc
import torch

import pandas as pd
import numpy as np
import torch.nn as nn 
import torch.nn.functional as F

from tqdm.notebook import tqdm
from collections import deque, defaultdict
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## 1. Import datasets

In [14]:
# Constants 
TRAIN_PATH = 'Data/riiid_train.pkl.gzip'

In [15]:
AMOUNT = 100 # Number of questions per user
PAD = 0 # Value for padding
BATCH_SIZE = 100
NUM_ENCODER = 4
NUM_DECODER = 4
MAX_SEQ = 100

EMBED_DIMS = 32
ENC_HEADS = 8
DEC_HEADS = 8
TOTAL_EXE = 13523 # number of unique questions
TOTAL_CAT = 10000

DEVICE = 'cpu' if torch.cuda.is_available() else 'cpu'

In [16]:
train_df = pd.read_pickle(TRAIN_PATH)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101230332 entries, 0 to 101230331
Data columns (total 10 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   row_id                          int64  
 1   timestamp                       int64  
 2   user_id                         int32  
 3   content_id                      int16  
 4   content_type_id                 bool   
 5   task_container_id               int16  
 6   user_answer                     int8   
 7   answered_correctly              int8   
 8   prior_question_elapsed_time     float32
 9   prior_question_had_explanation  object 
dtypes: bool(1), float32(1), int16(2), int32(1), int64(2), int8(2), object(1)
memory usage: 3.7+ GB


## 2. Data Pre-Processing

In [17]:
# Remove lectures
train_df = train_df[train_df.content_type_id == 0]

# Find unique skills
skills = train_df["content_id"].unique()
n_skill = len(skills)

"""
1. Fill NA elapsed time
2. Change unit to seconds
3. Crop out the top 300 elapsed times
"""
train_df.prior_question_elapsed_time.fillna(0, inplace=True)
train_df.prior_question_elapsed_time /= 1000
train_df.prior_question_elapsed_time = train_df.prior_question_elapsed_time.astype(np.int)
# df_train.prior_question_elapsed_time.clip(upper=300)

# Group by user
user_df = train_df[["user_id", "content_id", "answered_correctly", "prior_question_elapsed_time", "task_container_id"]].groupby('user_id').apply(lambda r: (
            r.content_id.values,
            r.answered_correctly.values,
            r.prior_question_elapsed_time.values, 
            r.task_container_id.values
            ))

del train_df
gc.collect()

0

## 3. Creating the dataset

In [18]:
class RiiidDataset(Dataset):
    def __init__(self, user_df, max_seq=100):
        super(RiiidDataset, self).__init__()
        self.user_df = user_df
        self.max_seq = max_seq
        self.user_ids = []

        for user_id in self.user_df.index:
            exercise_id, answered_correctly, elapsed_time, container_id = self.user_df[user_id]
            # Remove users who did less than 10 exercises
            ex_num = len(exercise_id) # number of exercises
            if ex_num >= 10:
                self.user_ids.append(user_id)
                # idx = min(ex_num, max_seq)
                # entry = (exercise_id[:idx], answered_correctly[:idx],elapsed_time[:idx], container_id[:idx])
                # self.dataset.append(entry)

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        user_id = self.user_ids[idx]
        exercise_id, answered_correctly, elapsed_time, container_id = self.user_df[user_id]

        ex_num = len(exercise_id)

        exercise_id_temp = np.zeros(self.max_seq, dtype=int)
        answered_correctly_temp = np.zeros(self.max_seq, dtype=int)
        elapsed_time_temp = np.zeros(self.max_seq, dtype=int)
        container_id_temp = np.zeros(self.max_seq, dtype=int)

        if ex_num >= self.max_seq:
            exercise_id_temp[:] = exercise_id[-self.max_seq:]
            answered_correctly_temp[:] = answered_correctly[-self.max_seq:]
            elapsed_time_temp[:] = elapsed_time[-self.max_seq:]
            container_id_temp[:] = container_id[-self.max_seq:]
        else:
            exercise_id_temp[-ex_num:] = exercise_id
            answered_correctly_temp[-ex_num:] = answered_correctly
            elapsed_time_temp[-ex_num:] = elapsed_time
            container_id_temp[-ex_num:] = container_id

        exercise_ids = exercise_id_temp
        labels = answered_correctly_temp
        elapsed_times = elapsed_time_temp
        container_ids = container_id_temp

        return exercise_id_temp, answered_correctly_temp, answered_correctly_temp, container_id_temp


In [19]:
train, val = train_test_split(user_df, test_size=0.2)

train_dataset = RiiidDataset(train, max_seq=MAX_SEQ)
val_dataset = RiiidDataset(val, max_seq=MAX_SEQ)
train_loader = DataLoader(train_dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=8,
                            shuffle=True)
val_loader = DataLoader(val_dataset,
                        batch_size=BATCH_SIZE,
                        num_workers=8,
                        shuffle=True)
del train, val, train_dataset, val_dataset
gc.collect()

29

## 4. Model

In [20]:
# Feed Forward Layer
class FFN(nn.Module):
    def __init__(self, in_features):
        super(FFN, self).__init__()
        self.linear1 = nn.Linear(in_features, in_features)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features, in_features)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return self.dropout(x)

In [21]:
"""
        SAINT+ Model has 3 input embeddings for the encoder part":
        1. Exercise ID
        2. Position
        3. Part (10000 unique task container id)
"""
class EncoderEmbedding(nn.Module):
    def __init__(self, n_exercises, n_categories, n_dims, seq_len):
        super(EncoderEmbedding, self).__init__()
        self.n_dims = n_dims
        self.seq_len = seq_len
        self.exercise_embedding = nn.Embedding(n_exercises, n_dims)
        self.position_embediing = nn.Embedding(seq_len, n_dims)
        self.part_embedding = nn.Embedding(n_categories, n_dims)

    def forward(self, exercises, categories):
        e = self.exercise_embedding(exercises)
        c = self.part_embedding(categories)
        seq = torch.arange(self.seq_len, device=device).unsqueeze(0)
        p = self.position_embediing(seq)
        return p + c + e

"""
        SAINT+ Model has 4 input embeddings for the decoder part":
        1. Correctness (response)
        2. Position
        3. Elapsed Time
        4. Lag Time (to be implemented)
"""
class DecoderEmbedding(nn.Module):
    def __init__(self, n_responses, n_dims, seq_len):
        super(DecoderEmbedding, self).__init__()
        self.n_dims = n_dims
        self.seq_len = seq_len
        self.response_embedding = nn.Embedding(n_responses, n_dims)
        self.elapsed_time_embedding = nn.Embedding(300, n_dims)
        self.position_embedding = nn.Embedding(seq_len, n_dims)

    def forward(self, responses, elapsed_times):
        e = self.response_embedding(responses)
        t = self.elapsed_time_embedding(elapsed_times)
        seq = torch.arange(self.seq_len, device=device).unsqueeze(0)
        p = self.position_embedding(seq)
        return p + e + t

In [22]:
# *Copied*
class StackedNMultiHeadAttention(nn.Module):
  def __init__(self,n_stacks,n_dims,n_heads,seq_len,n_multihead=1,dropout=0.2):
    super(StackedNMultiHeadAttention,self).__init__()
    self.n_stacks = n_stacks
    self.n_multihead = n_multihead
    self.n_dims = n_dims 
    self.norm_layers = nn.LayerNorm(n_dims)
    #n_stacks has n_multiheads each
    self.multihead_layers = nn.ModuleList(n_stacks*[nn.ModuleList(n_multihead*[nn.MultiheadAttention(embed_dim = n_dims,
                                                      num_heads = n_heads,
                                                        dropout = dropout),]),])
    self.ffn = nn.ModuleList(n_stacks*[FFN(n_dims)])
    self.mask = torch.triu(torch.ones(seq_len,seq_len),diagonal=1).to(dtype=torch.bool)
  
  def forward(self,input_q,input_k,input_v,encoder_output=None,break_layer=None):
    for stack in range(self.n_stacks):
        for multihead in range(self.n_multihead):
          norm_q = self.norm_layers(input_q)
          norm_k = self.norm_layers(input_k)
          norm_v = self.norm_layers(input_v) 
          heads_output,_ = self.multihead_layers[stack][multihead](query=norm_q.permute(1,0,2),
                                                                    key=norm_k.permute(1,0,2),
                                                                    value=norm_v.permute(1,0,2),
                                                                    attn_mask=self.mask.to(device))
          heads_output = heads_output.permute(1,0,2)
          #assert encoder_output != None and break_layer is not None     
          if encoder_output != None and multihead == break_layer:
            assert break_layer <= multihead, " break layer should be less than multihead layers and postive integer"
            input_k = input_v = encoder_output
            input_q =input_q + heads_output
          else:
            input_q =input_q+ heads_output
            input_k =input_k+ heads_output
            input_v =input_v +heads_output
        last_norm = self.norm_layers(heads_output)
        ffn_output = self.ffn[stack](last_norm)
        ffn_output =ffn_output+ heads_output
    return ffn_output

In [23]:
class SAINT(nn.Module):
    def __init__(self, n_skill, max_seq=MAX_SEQ, embed_dim=EMBED_DIMS):
        super(SAINT, self).__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim

        self.encoder_embedding = EncoderEmbedding(n_skill, 10000, embed_dim, max_seq)
        self.decoder_embedding = DecoderEmbedding(n_skill, embed_dim, max_seq)

        self.encoder_layer = StackedNMultiHeadAttention(n_stacks=NUM_DECODER,n_dims=EMBED_DIMS,n_heads=DEC_HEADS,seq_len=MAX_SEQ,n_multihead=1,dropout=0.2)
        self.decoder_layer = StackedNMultiHeadAttention(n_stacks=NUM_ENCODER,n_dims=EMBED_DIMS, n_heads=ENC_HEADS,seq_len=MAX_SEQ,n_multihead=2,dropout=0.2)

        self.dropout = nn.Dropout(0.2)
        self.layer_normal = nn.LayerNorm(embed_dim) 

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)

    def future_mask(self, seq_length):
        future_mask = np.triu(np.ones((seq_length, seq_length)), k=1).astype('bool')
        return torch.from_numpy(future_mask)
    
    def forward(self, exercise_id, label, elapsed_time, container_id):
        enc_emb = self.encoder_embedding(exercise_id, container_id)
        dec_emb = self.decoder_embedding(label, elapsed_time)

        # att_mask = self.future_mask(len(exercise_id)).to(device)
        # att_mask =att_mask
        enc_output = self.encoder_layer(enc_emb, enc_emb, enc_emb)

        dec_output = self.decoder_layer(dec_emb, dec_emb, dec_emb, encoder_output=enc_emb, break_layer=1)
        # dec_output = self.ffn(dec_output)

        output = self.pred(dec_output)

        return output

In [43]:
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")


model = SAINT(n_skill)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.99, weight_decay=0.005)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model.to(device)
criterion.to(device)
train_iterator = train_loader
optim = optimizer
model.train()

train_loss = []
num_corrects = 0
num_total = 0
labels = []
outs = []

tbar = tqdm(train_iterator)
for item in tbar:
    exercise_ids = item[0].to(device).long()
    label = item[1].to(device).long()
    elapsed_times = item[2].to(device).long()
    container_ids= item[3].to(device).long()

    target_mask = (exercise_ids != 0)

    optim.zero_grad()
    output = model(exercise_ids, label, elapsed_times, container_ids)
    
    outputs = torch.masked_select(output[:,-1], target_mask)
    label = torch.masked_select(label, target_mask)
    
    loss = criterion(outputs.float(), label.float())
    loss.backward()
    optim.step()
    train_loss.append(loss.item())
    pred = (torch.sigmoid(output) >= 0.5).long()
    
    num_corrects += (pred == label).sum().item()
    num_total += len(label)

    labels.extend(label.view(-1).data.cpu().numpy())
    outs.extend(output.view(-1).data.cpu().numpy())

    tbar.set_description('loss - {:.4f}'.format(loss))

acc = num_corrects / num_total
auc = roc_auc_score(labels, outs)
loss = np.average(train_loss)
print("epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, loss, acc, auc))


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3129.0), HTML(value='')))




ValueError: Found input variables with inconsistent numbers of samples: [17651563, 31280300]

In [26]:
def val_epoch(model, val_iterator, criterion, device="cpu"):
    model.eval()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(val_iterator)
    for item in tbar:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()
        label = item[2].to(device).float()
        target_mask = (target_id != 0)

        with torch.no_grad():
            output, atten_weight = model(x, target_id)
        
        output = torch.masked_select(output, target_mask)
        label = torch.masked_select(label, target_mask)

        loss = criterion(output, label)
        train_loss.append(loss.item())

        pred = (torch.sigmoid(output) >= 0.5).long()
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(loss))

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.average(train_loss)

    return loss, acc, auc

## 5. Inferencing

In [42]:
epochs = 20

over_fit = 0
last_auc = 0
for epoch in range(epochs):
    train_loss, train_acc, train_auc = train_epoch(model, train_loader, optimizer, criterion, device)
    print("epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, train_loss, train_acc, train_auc))
    
    val_loss, avl_acc, val_auc = val_epoch(model, val_loader, criterion, device)
    print("epoch - {} val_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, val_loss, avl_acc, val_auc))
    
    if val_auc > last_auc:
        last_auc = val_auc
        over_fit = 0
    else:
        over_fit += 1
        
    
    if over_fit >= 2:
        print("early stop epoch ", epoch)
        break

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1222.0), HTML(value='')))


epoch - 0 train_loss - 0.59 acc - 0.684 auc - 0.731


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=306.0), HTML(value='')))


epoch - 0 val_loss - 0.58 acc - 0.697 auc - 0.753


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1222.0), HTML(value='')))


epoch - 1 train_loss - 0.57 acc - 0.699 auc - 0.755


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=306.0), HTML(value='')))


epoch - 1 val_loss - 0.57 acc - 0.699 auc - 0.756


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1222.0), HTML(value='')))




KeyboardInterrupt: 