In [1]:
import math
import torch
import time

import pandas as pd
import numpy as np
import torch.nn as nn 
import torch.nn.functional as F

from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from collections import deque, defaultdict
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer

In [2]:
QUESTIONS_PATH = 'Data/questions.csv'
TRAIN_PATH = 'Data/riiid_train.pkl.gzip'

In [3]:
df_train = pd.read_pickle(TRAIN_PATH)

In [4]:
df_questions = pd.read_csv(QUESTIONS_PATH)

## Data preprocessing

In [5]:
# Get question parts
part_ids_map = dict(zip(df_questions.question_id, df_questions.part))
df_train['part_id'] = df_train['content_id'].map(part_ids_map)

In [7]:
# Remove lectures
df_train = df_train[df_train.content_type_id == 0]

# Calculate lag time
df_train['lagtime'] = df_train.groupby('user_id')['timestamp'].shift()
df_train['lagtime'] = df_train['timestamp'] - df_train['lagtime']
lagtime_mean = df_train['lagtime'].median()
df_train['lagtime'].fillna(lagtime_mean, inplace=True)
df_train['lagtime'] = df_train['lagtime']//1000
df_train["lagtime"].clip(upper=300, inplace=True)

# Fill NA values
df_train['prior_question_had_explanation'] = df_train['prior_question_had_explanation'].astype(np.float16).fillna(-1).astype(np.int8)
elapsed_time_mean = df_train['prior_question_elapsed_time'].median()
df_train["prior_question_elapsed_time"].fillna(elapsed_time_mean, inplace=True) # FIXME some random value fill in should it be like this?
df_train["prior_question_elapsed_time"] = df_train["prior_question_elapsed_time"] // 1000
df_train["prior_question_elapsed_time"].clip(upper=300, inplace=True)

In [9]:
dtype = {'user_id':'int32',
         'content_id':'int16',
         'answered_correctly':'int8',
         'prior_question_elapsed_time':'int16',
         'task_container_id':'int8',
         'part_id':'int8',
         'lagtime':'int16'}
df_train = df_train.astype(dtype)

In [10]:
df_train

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part_id,lagtime
0,0,0,115,5692,False,1,3,1,21,-1,5,34
1,1,56943,115,5716,False,2,2,1,37,0,5,56
2,2,118363,115,128,False,0,0,1,55,0,1,61
3,3,131167,115,7860,False,3,0,1,19,0,1,12
4,4,137965,115,7922,False,4,1,1,11,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...
101230327,101230327,428564420,2147482888,3586,False,22,0,1,18,1,5,21
101230328,101230328,428585000,2147482888,6341,False,23,3,1,14,1,5,20
101230329,101230329,428613475,2147482888,4212,False,24,3,1,14,1,5,28
101230330,101230330,428649406,2147482888,6343,False,25,1,0,22,1,5,35


In [11]:
skills = df_train['content_id']
n_skill = len(skills)

In [12]:
AMOUNT = 100 # This is the parameter that gets the final AMOUNT questions of each user
PAD = 0 # value to use in the padding
def dataset_transform(dataset):
  final_dataset = {}
  user_id_to_idx = {}
  grp = dataset.groupby('user_id').tail(AMOUNT)
  
  for idx, row in tqdm(grp.groupby("user_id").agg({"content_id":list, 
                "answered_correctly":list, 
                "task_container_id":list, 
                "part_id":list, 
                "lagtime":list, 
                "prior_question_elapsed_time":list}).reset_index().iterrows()):

    # pad the required rows to have AMOUNT values
    if (len(row['content_id']) >= AMOUNT):
      final_dataset[idx] = {
            "user_id": row["user_id"],
            "content_id" : deque(row["content_id"], maxlen=AMOUNT),
            "answered_correctly" : deque(row["answered_correctly"], maxlen=AMOUNT),
            "task_container_id" : deque(row["task_container_id"], maxlen=AMOUNT),
            "lagtime" : deque(row["lagtime"], maxlen=AMOUNT),
            "prior_question_elapsed_time" : deque(row["prior_question_elapsed_time"], maxlen=AMOUNT),
            "part_id": deque(row["part_id"], maxlen=AMOUNT),
            "padded" : deque([False]*100, maxlen=AMOUNT)
        }
    else: # need to pad
        final_dataset[idx] = {
            "user_id": row["user_id"],
            "content_id" : deque([PAD]*(AMOUNT-len(row["content_id"])) + row["content_id"], maxlen=AMOUNT),
            "answered_correctly" : deque([PAD]*(AMOUNT-len(row["content_id"])) + row["answered_correctly"], maxlen=AMOUNT),
            "task_container_id" : deque([PAD]*(AMOUNT-len(row["content_id"])) + row["task_container_id"], maxlen=AMOUNT),
            "lagtime" : deque([PAD]*(AMOUNT-len(row["content_id"])) + row["lagtime"], maxlen=AMOUNT),
            "prior_question_elapsed_time" : deque([PAD]*(AMOUNT-len(row["content_id"])) + row["prior_question_elapsed_time"], maxlen=AMOUNT),
            "part_id": deque([PAD]*(AMOUNT-len(row["content_id"])) + row["part_id"], maxlen=AMOUNT),
            "padded" : deque([True]*(AMOUNT-len(row["content_id"])) + [False]*len(row["content_id"]), maxlen=AMOUNT)
        }

    user_id_to_idx[row['user_id']] = idx
  # FIXME new users? 
  return final_dataset, user_id_to_idx 

In [13]:
class RIIID(torch.utils.data.Dataset):

  def generate(idx):
        return {
            "content_id" : deque([PAD]*AMOUNT, maxlen=AMOUNT),
            "answered_correctly" : deque([PAD]*AMOUNT, maxlen=AMOUNT),
            "task_container_id" : deque([PAD]*AMOUNT, maxlen=AMOUNT),
            "lagtime" : deque([PAD]*AMOUNT, maxlen=AMOUNT),
            "prior_question_elapsed_time" : deque([PAD]*AMOUNT, maxlen=AMOUNT),
            "part_id": deque([PAD]*AMOUNT, maxlen=AMOUNT),
            "padded" : deque([True]*AMOUNT, maxlen=AMOUNT),
        }

  def __init__(self, data):
    self._data = data

  def __len__(self):
    return len(self._data)

  def __getitem__(self, idx):
    # if idx not in self._data:
    #   # new user
    #   self._data[idx] = RIIID.generate_base(idx)
    
    data = self._data[idx]
    return data['user_id'], data['content_id'], data['task_container_id'], data['part_id'], data['prior_question_elapsed_time'], data['padded'], data['answered_correctly'], data['lagtime']


def collate_fn(batch):
    _, content_id, task_id, part_id, prior_question_elapsed_time, padded, labels, lagtime = zip(*batch)
    content_id = torch.Tensor(content_id).long()
    task_id = torch.Tensor(task_id).long()
    part_id = torch.Tensor(part_id).long()
    lagtime = torch.Tensor(lagtime).long()
    prior_question_elapsed_time = torch.Tensor(prior_question_elapsed_time).long()
    padded = torch.Tensor(padded).bool()
    labels = torch.Tensor(labels)
    # remember the order
    return content_id, task_id, part_id, prior_question_elapsed_time, padded, labels, lagtime

In [14]:
df_user, df_idx = dataset_transform(df_train)
dataset = RIIID(data=df_user)
del df_train

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




## SAINT+ Model

In [15]:
# adam optimizer
LEARNING_RATE = 0.001
BETA_1 = 0.9
BETA_2 = 0.999
EPSILON = 1e-8
WARMUP = 4000

#SAINT
N_LAYERS = 4
WINDOW_SIZE = 100
MODEL_DIM = 512
DROPOUT = 0 # maybe we can try later 
BATCH_SIZE = 64

In [16]:
import torch
import torch.nn as nn

class SAINT(nn.Module):
  def __init__(self, d_model, nhead, num_encoder_layers, num_decoder_layers, dropout, dim_feedforward, device='cpu'):
    super(SAINT, self).__init__()
    self.model = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dropout=dropout, dim_feedforward=dim_feedforward).to(device)
    
    # Encoder embeddings
    self.exercise_embeddings = nn.Embedding(num_embeddings=13523, embedding_dim=d_model) # exercise_id
    self.enc_pos_embedding = nn.Embedding(d_model, d_model) # positional embeddings
    self.part_embeddings = nn.Embedding(num_embeddings=7+1, embedding_dim=d_model) # part_id_embeddings
    
    # Decoder embeddings
    self.prior_question_elapsed_time = nn.Embedding(num_embeddings=302, embedding_dim=d_model, padding_idx=301) # prior_question_elapsed_time
    self.dec_pos_embedding = nn.Embedding(d_model+1, d_model, padding_idx=d_model) # positional embeddings
    self.correctness_embeddings = nn.Embedding(num_embeddings=3, embedding_dim=d_model, padding_idx=2) # Correctness embeddings
    self.lagtime = nn.Embedding(num_embeddings=302, embedding_dim=d_model, padding_idx=301) #lag time embedding
    
    self.linear = nn.Linear(d_model, 1)

    self.device = device
    self.init_weights()

  def init_weights(self):
    initrange = 0.1
    # init embeddings
    self.exercise_embeddings.weight.data.uniform_(-initrange, initrange)
    self.part_embeddings.weight.data.uniform_(-initrange, initrange)
    self.prior_question_elapsed_time.weight.data.uniform_(-initrange, initrange)
    self.lagtime.weight.data.uniform_(-initrange, initrange)
    self.correctness_embeddings.weight.data.uniform_(-initrange, initrange)
    self.enc_pos_embedding.weight.data.uniform_(-initrange, initrange)
    self.dec_pos_embedding.weight.data.uniform_(-initrange, initrange)

  def forward(self, encoder_exercises, encoder_position, encoder_part, encoder_padding, decoder_correctness, decoder_position, decoder_elapsed_time, decoder_padding, decoder_lagtime):
    encoder_exercises = encoder_exercises.to(self.device)
    encoder_position = encoder_position.to(self.device)
    encoder_part = encoder_part.to(self.device)
    decoder_correctness = decoder_correctness.to(self.device)
    decoder_position = decoder_position.to(self.device)
    decoder_elapsed_time = decoder_elapsed_time.to(self.device)
    decoder_lagtime = decoder_lagtime.to(self.device)

    embedding_size = encoder_exercises.shape[1] # S / T
    mask_src = self.model.generate_square_subsequent_mask(sz=embedding_size).to(self.device)
    mask_tgt = self.model.generate_square_subsequent_mask(sz=embedding_size).to(self.device)
    mem_mask = self.model.generate_square_subsequent_mask(sz=embedding_size).to(self.device)
    
    # padded positions are masked from the self attention (when True)
    encoder_padding = encoder_padding.bool().to(self.device)
    decoder_padding = decoder_padding.bool().to(self.device)
    #Memory padding mask is the same as the src

    # Generate embeddings according to paper
    embedded_src = self.exercise_embeddings(encoder_exercises) + \
                   self.enc_pos_embedding(encoder_position) + \
                   self.part_embeddings(encoder_part)
    embedded_src = embedded_src.transpose(0, 1) # (S, N, E)

    embedded_dcdr = self.correctness_embeddings(decoder_correctness) + \
                    self.dec_pos_embedding(decoder_position) + \
                    self.prior_question_elapsed_time(decoder_elapsed_time) + \
                    self.lagtime(decoder_lagtime)

    embedded_dcdr = embedded_dcdr.transpose(0, 1) # (S, N, E)

    output = self.model(src=embedded_src, 
                        tgt=embedded_dcdr, 
                        src_mask = mask_src, 
                        tgt_mask = mask_tgt, 
                        memory_mask = mem_mask)
                        # src_key_padding_mask = encoder_padding,
                        # tgt_key_padding_mask = decoder_padding,
                        # memory_key_padding_mask = encoder_padding) # TODO add padding masks

    output = self.linear(output.transpose(1, 0))

    return output

In [17]:
import torch.nn as nn
device="cpu" if not torch.cuda.is_available() else "cuda" 
model = SAINT(d_model=WINDOW_SIZE, nhead=5, num_encoder_layers=N_LAYERS, num_decoder_layers=N_LAYERS, dropout=DROPOUT, dim_feedforward=MODEL_DIM, device=device).to(device)
model.model

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=512, bias=True)
        (dropout): Dropout(p=0, inplace=False)
        (linear2): Linear(in_features=512, out_features=100, bias=True)
        (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0, inplace=False)
        (dropout2): Dropout(p=0, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=512, bias=True)
        (dropout): Dropout(p=0, inplace=False)
        (linear2): Linear(in_f

In [18]:
CORRECTNESS_DEFAULT_TOKEN = 2
POSITION_DEFAULT_TOKEN = 100
ELAPSED_TIME_DEFAULT_TOKEN = 301
LAG_TIME_DEFAULT_TOKEN = 301

def get_batch_embeddings(content, part, correctness, elapsed_time, lagtime, padding, device="cpu"):
  #encoder has size n, decoder has default + n-1 

  size_x = content.shape[1]
  size_y = content.shape[0]

  # Encoder
  encoder_exercises = content.long()
  encoder_position = torch.arange(0, size_x).to(device).unsqueeze(0).repeat(size_y, 1).long()
  encoder_part = part.long()
  encoder_key_padding = padding.bool()

  # Decoder
  default_correct = torch.Tensor([CORRECTNESS_DEFAULT_TOKEN]).unsqueeze(0).repeat(size_y, 1).to(device)
  default_position = torch.Tensor([POSITION_DEFAULT_TOKEN]).unsqueeze(0).repeat(size_y, 1).to(device)
  default_elapsed_time = torch.Tensor([ELAPSED_TIME_DEFAULT_TOKEN]).unsqueeze(0).repeat(size_y, 1).to(device)
  default_lagtime = torch.Tensor([LAG_TIME_DEFAULT_TOKEN]).unsqueeze(0).repeat(size_y, 1).to(device)
  default_padding = torch.Tensor([True]).unsqueeze(0).repeat(size_y, 1).to(device)

  decoder_correctness = torch.cat((default_correct, correctness[:,:size_x-1]), -1).long()
  decoder_position = torch.cat((default_position, torch.arange(0, size_x-1).to(device).unsqueeze(0).repeat(size_y, 1)), -1).long()
  decoder_elapsed_time = torch.cat((default_elapsed_time, elapsed_time[:,:size_x-1]), -1).long()
  decoder_lagtime = torch.cat((default_lagtime, lagtime[:,:size_x-1]), -1).long()
  decoder_key_padding = torch.cat((default_padding, padding[:,:size_x-1]), -1).bool()

  return encoder_exercises, encoder_position, encoder_part, encoder_key_padding, decoder_correctness, decoder_position, decoder_elapsed_time, decoder_key_padding, decoder_lagtime

In [19]:
def train(model, dataloader, optimizer, criterion, device = 'cpu'):
  model.train()

  train_loss = []
  all_labels = []
  preds = []
  num_corrects = 0.0
  num_total = 0.0


  for idx, batch in tqdm(enumerate(dataloader)):
    # extract data
    content_id, task_id, part_id, prior_question_elapsed_time, mask, labels, lagtime = batch
    content_id = content_id.to(device)
    task_id = task_id.to(device)
    part_id = part_id.to(device)
    prior_question_elapsed_time = prior_question_elapsed_time.to(device)
    lagtime = lagtime.to(device)
    mask = mask.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()

    # get embeddings
    encoder_exercises, encoder_position, encoder_part, encoder_padding, decoder_correctness, decoder_position, decoder_elapsed_time, decoder_padding, decoder_lagtime = get_batch_embeddings(content_id, part_id, labels, prior_question_elapsed_time, lagtime, mask, device=device)

    # run model 
    output = model(encoder_exercises, encoder_position, encoder_part, encoder_padding, decoder_correctness, decoder_position, decoder_elapsed_time, decoder_padding, decoder_lagtime)  #FIXME masks!

    # get loss
    labels = labels.unsqueeze(2)

    loss = criterion(output, labels)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    

    # store data
    train_loss.append(loss.detach().cpu().numpy())
    m = nn.Sigmoid()
    output = (m(output) >= 0.5).float().squeeze()
    labels = labels.squeeze()

    num_corrects += (output == labels).sum().item()
    num_total += np.prod(labels.shape)
    all_labels.extend(labels.cpu().numpy())
    preds.extend(output.cpu().numpy())

  acc = num_corrects / num_total
  auc = roc_auc_score(all_labels, preds)
  loss = np.mean(train_loss)

  return loss, acc, auc


In [20]:
def val(model, dataloader, criterion, device="cpu"):
  model.eval()

  train_loss = []
  all_labels = []
  preds = []
  num_corrects = 0.0
  num_total = 0.0


  for idx, batch in tqdm(enumerate(dataloader)):
    # extract data
    content_id, task_id, part_id, prior_question_elapsed_time, mask, labels, lagtime = batch
    content_id = content_id.to(device)
    task_id = task_id.to(device)
    part_id = part_id.to(device)
    prior_question_elapsed_time = prior_question_elapsed_time.to(device)
    lagtime = lagtime.to(device)
    mask = mask.to(device)
    labels = labels.to(device)

    # get embeddings
    encoder_exercises, encoder_position, encoder_part, encoder_padding, decoder_correctness, decoder_position, decoder_elapsed_time, decoder_padding, decoder_lagtime = get_batch_embeddings(content_id, part_id, labels, prior_question_elapsed_time, lagtime, mask, device=device)

    # run model 
    output = model(encoder_exercises, encoder_position, encoder_part, encoder_padding, decoder_correctness, decoder_position, decoder_elapsed_time, decoder_padding, decoder_lagtime)  #FIXME masks!

    # get loss
    labels = labels.unsqueeze(2)
    loss = criterion(output, labels)
    
    # store data
    train_loss.append(loss.detach().cpu().numpy())
    m = nn.Sigmoid()
    output = (m(output) >= 0.5).float().squeeze()
    labels = labels.squeeze()

    num_corrects += (output == labels).sum().item()
    num_total += np.prod(labels.shape)
    
    all_labels.extend(labels.cpu().numpy())
    preds.extend(output.cpu().numpy())

  acc = num_corrects / num_total
  auc = roc_auc_score(all_labels, preds)
  loss = np.mean(train_loss)

  return loss, acc, auc


## Training and validation

In [21]:
train_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=8)

In [22]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, betas=(BETA_1, BETA_2), eps=EPSILON) #FIXME warmup
criterion = nn.BCEWithLogitsLoss()
criterion.to(device)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) # FIXME paper is Noam Scheme but it's not available this one is from the tutorial itself

In [23]:
best_val_loss = float("inf")
epochs = 20 
best_model = None

train_set, val_set = train_test_split(dataset, test_size=0.2, random_state=56)

for epoch in range(1, epochs + 1):
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=8)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=8)

    epoch_start_time = time.time()
    print('Training')
    train_loss, train_acc, train_auc = train(model=model, dataloader=train_loader, optimizer=optimizer, criterion=criterion, device=device)
    print("epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, train_loss, train_acc, train_auc))
    print('Validating')
    val_loss, val_acc, val_auc = val(model=model, dataloader=val_loader, criterion=criterion, device=device)
    print("epoch - {} val_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, val_loss, val_acc, val_auc))
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
        print('Updated model!')

    scheduler.step() 


Training


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


epoch - 1 train_loss - 0.33 acc - 0.824 auc - 0.819
Validating


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


epoch - 1 val_loss - 0.32 acc - 0.832 auc - 0.820
Updated model!
Training


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


epoch - 2 train_loss - 0.33 acc - 0.826 auc - 0.820
Validating


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


epoch - 2 val_loss - 0.32 acc - 0.833 auc - 0.821
Updated model!
Training


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




KeyboardInterrupt: 

In [None]:
# Save the model
SAVE_PATH = "saint_model.pt"
torch.save(model.state_dict(), SAVE_PATH)