In [57]:
import time
import math
import gc
import torch

import pandas as pd
import numpy as np
import torch.nn as nn 
import torch.nn.functional as F

from tqdm.notebook import tqdm
from collections import deque, defaultdict
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## 1. Import datasets

In [58]:
# Constants 
TRAIN_PATH = 'Data/riiid_train.pkl.gzip'

AMOUNT = 100 # Number of questions per user
PAD = 0 # Value for padding
BATCH_SIZE = 100
NUM_ENCODER = 4
NUM_DECODER = 4
MAX_SEQ = 100

EMBED_DIMS = 32
ENC_HEADS = 8
DEC_HEADS = 8
TOTAL_EXE = 13523 # number of unique questions
TOTAL_CAT = 10000

DEVICE = 'cpu' if torch.cuda.is_available() else 'cpu'

In [60]:
train_df = pd.read_pickle(TRAIN_PATH)
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,False,1,3,1,,
1,1,56943,115,5716,False,2,2,1,37000.0,False
2,2,118363,115,128,False,0,0,1,55000.0,False
3,3,131167,115,7860,False,3,0,1,19000.0,False
4,4,137965,115,7922,False,4,1,1,11000.0,False
...,...,...,...,...,...,...,...,...,...,...
101230327,101230327,428564420,2147482888,3586,False,22,0,1,18000.0,True
101230328,101230328,428585000,2147482888,6341,False,23,3,1,14000.0,True
101230329,101230329,428613475,2147482888,4212,False,24,3,1,14000.0,True
101230330,101230330,428649406,2147482888,6343,False,25,1,0,22000.0,True


## 2. Data Pre-Processing

In [61]:
# Remove lectures
train_df = train_df[train_df.content_type_id == 0]

# Find unique skills
skills = train_df["content_id"].unique()
n_skill = len(skills)

"""
1. Fill NA elapsed time
2. Change unit to seconds
3. Crop out the top 300 elapsed times
"""
train_df.prior_question_elapsed_time.fillna(0, inplace=True)
train_df.prior_question_elapsed_time /= 1000
train_df.prior_question_elapsed_time = train_df.prior_question_elapsed_time.astype(np.int)
# df_train.prior_question_elapsed_time.clip(upper=300)

# Group by user
user_df = train_df[["user_id", "content_id", "answered_correctly", "prior_question_elapsed_time", "task_container_id"]].groupby('user_id').apply(lambda r: (
            r.content_id.values,
            r.answered_correctly.values,
            r.prior_question_elapsed_time.values, 
            r.task_container_id.values
            ))

del train_df
gc.collect()

import pickle
with open('user_df.pickle', 'wb') as f:
  pickle.dump(user_df,f)

## 3. Creating the dataset

In [48]:
class RiiidDataset(Dataset):
    def __init__(self, user_df, max_seq=100):
        super(RiiidDataset, self).__init__()
        self.user_df = user_df
        self.max_seq = max_seq
        self.user_ids = []

        for user_id in self.user_df.index:
            exercise_id, answered_correctly, elapsed_time, container_id = self.user_df[user_id]
            # Remove users who did less than 10 exercises
            ex_num = len(exercise_id) # number of exercises
            if ex_num >= 10:
                self.user_ids.append(user_id)
                # idx = min(ex_num, max_seq)
                # entry = (exercise_id[:idx], answered_correctly[:idx],elapsed_time[:idx], container_id[:idx])
                # self.dataset.append(entry)

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        user_id = self.user_ids[idx]
        exercise_id, answered_correctly, elapsed_time, container_id = self.user_df[user_id]

        ex_num = len(exercise_id)

        exercise_id_temp = np.zeros(self.max_seq, dtype=int)
        answered_correctly_temp = np.zeros(self.max_seq, dtype=int)
        elapsed_time_temp = np.zeros(self.max_seq, dtype=int)
        container_id_temp = np.zeros(self.max_seq, dtype=int)

        if ex_num >= self.max_seq:
            exercise_id_temp= exercise_id[-self.max_seq:]
            answered_correctly_temp = answered_correctly[-self.max_seq:]
            elapsed_time_temp = elapsed_time[-self.max_seq:]
            container_id_temp= container_id[-self.max_seq:]
        else:
            exercise_id_temp[-ex_num:] = exercise_id
            answered_correctly_temp[-ex_num:] = answered_correctly
            elapsed_time_temp[-ex_num:] = elapsed_time
            container_id_temp[-ex_num:] = container_id

        return exercise_id_temp, answered_correctly_temp, elapsed_time_temp, container_id_temp


In [49]:
train, val = train_test_split(user_df, test_size=0.2)

train_dataset = RiiidDataset(train, max_seq=MAX_SEQ)
val_dataset = RiiidDataset(val, max_seq=MAX_SEQ)
train_loader = DataLoader(train_dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=8,
                            shuffle=True)
val_loader = DataLoader(val_dataset,
                        batch_size=BATCH_SIZE,
                        num_workers=8,
                        shuffle=True)
del train, val, train_dataset, val_dataset
gc.collect()

388

## 4. Model

In [50]:
# Feed Forward Layer
class FFN(nn.Module):
    def __init__(self, in_features):
        super(FFN, self).__init__()
        self.linear1 = nn.Linear(in_features, in_features)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features, in_features)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return self.dropout(x)
"""
        SAINT+ Model has 3 input embeddings for the encoder part":
        1. Exercise ID
        2. Position
        3. Part (10000 unique task container id)
"""
class EncoderEmbedding(nn.Module):
    def __init__(self, n_exercises, n_categories, n_dims, seq_len):
        super(EncoderEmbedding, self).__init__()
        self.n_dims = n_dims
        self.seq_len = seq_len
        self.exercise_embedding = nn.Embedding(n_exercises, n_dims)
        self.position_embediing = nn.Embedding(seq_len, n_dims)
        self.part_embedding = nn.Embedding(n_categories, n_dims)

    def forward(self, exercises, categories):
        e = self.exercise_embedding(exercises)
        c = self.part_embedding(categories)
        seq = torch.arange(self.seq_len, device=device).unsqueeze(0)
        p = self.position_embediing(seq)
        return p + c + e

"""
        SAINT+ Model has 4 input embeddings for the decoder part":
        1. Correctness (response)
        2. Position
        3. Elapsed Time
        4. Lag Time (to be implemented)
"""
class DecoderEmbedding(nn.Module):
    def __init__(self, n_responses, n_dims, seq_len):
        super(DecoderEmbedding, self).__init__()
        self.n_dims = n_dims
        self.seq_len = seq_len
        self.response_embedding = nn.Embedding(n_responses, n_dims)
        self.elapsed_time_embedding = nn.Linear(1,n_dims,bias=False)
        self.position_embedding = nn.Embedding(seq_len, n_dims)

    def forward(self, responses, elapsed_times):
        e = self.response_embedding(responses)
        # t = self.elapsed_time_embedding(elapsed_times)
        seq = torch.arange(self.seq_len, device=device).unsqueeze(0)
        p = self.position_embedding(seq)
        return p + e

In [51]:
class SAINT(nn.Module):
    def __init__(self, n_skill, max_seq=MAX_SEQ, embed_dim=EMBED_DIMS):
        super(SAINT, self).__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim

        self.encoder_embedding = EncoderEmbedding(n_skill, 10000, embed_dim, max_seq)
        self.decoder_embedding = DecoderEmbedding(n_skill, embed_dim, max_seq)

        # self.encoder_layer = StackedNMultiHeadAttention(n_stacks=NUM_DECODER,n_dims=EMBED_DIMS,n_heads=DEC_HEADS,seq_len=MAX_SEQ,n_multihead=1,dropout=0.2)
        # self.decoder_layer = StackedNMultiHeadAttention(n_stacks=NUM_ENCODER,n_dims=EMBED_DIMS, n_heads=ENC_HEADS,seq_len=MAX_SEQ,n_multihead=2,dropout=0.2)
        self.encoder_layer = nn.MultiheadAttention(embed_dim, num_heads=ENC_HEADS, dropout=0.2)
        self.decoder_layer = nn.MultiheadAttention(embed_dim, num_heads=DEC_HEADS, dropout=0.2)

        self.dropout = nn.Dropout(0.2)
        self.layer_normal = nn.LayerNorm(embed_dim) 

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)

    def future_mask(self, seq_length):
        future_mask = np.triu(np.ones((seq_length, seq_length)), k=1).astype('bool')
        return torch.from_numpy(future_mask)
    
    def forward(self, exercise_id, label, elapsed_time, container_id):
        enc_emb = self.encoder_embedding(exercise_id, container_id)

        self.elapsed_time = nn.Linear(1,EMBED_DIMS)
        elapsed_time=elapsed_time.unsqueeze(-1).float()
        ela_time = self.elapsed_time(elapsed_time)
        
        dec_emb = self.decoder_embedding(label, elapsed_time)
        dec_emb = dec_emb + ela_time

        attn_mask = self.future_mask(len(exercise_id)).to(device)
        
        enc_output, enc_weights = self.encoder_layer(enc_emb, enc_emb, enc_emb, attn_mask =attn_mask)
        enc_output = enc_output.permute(1,0,2)
        dec_output, dec_weights = self.decoder_layer(enc_output, dec_emb, dec_emb)
        # dec_output = self.ffn(dec_output)

        output = self.pred(dec_output)

        return output

In [52]:
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")


model = SAINT(n_skill)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.99, weight_decay=0.005)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model.to(device)


SAINT(
  (encoder_embedding): EncoderEmbedding(
    (exercise_embedding): Embedding(13523, 32)
    (position_embediing): Embedding(100, 32)
    (part_embedding): Embedding(10000, 32)
  )
  (decoder_embedding): DecoderEmbedding(
    (response_embedding): Embedding(13523, 32)
    (elapsed_time_embedding): Linear(in_features=1, out_features=32, bias=False)
    (position_embedding): Embedding(100, 32)
  )
  (encoder_layer): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=32, out_features=32, bias=True)
  )
  (decoder_layer): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=32, out_features=32, bias=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (layer_normal): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  (ffn): FFN(
    (linear1): Linear(in_features=32, out_features=32, bias=True)
    (relu): ReLU()
    (linear2): Linear(in_features=32, out_features=32, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (pred): Linear(in_features=

In [53]:
def train_epoch(model, train_iterator, optim,  criterion, device="cpu"):
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(train_iterator)
    for item in tbar:
        exercise_ids = item[0].to(device).long()
        label = item[1].to(device).long()
        elapsed_times = item[2].to(device).long()
        container_ids= item[3].to(device).long()

        target_mask = (exercise_ids != 0)

        optim.zero_grad()
        output = model(exercise_ids, label, elapsed_times, container_ids)
        
        outputs = torch.masked_select(output.permute(1,0,2)[:,-1], target_mask)
        label = torch.masked_select(label, target_mask)
        
        loss = criterion(outputs.float(), label.float())
        loss.backward()
        optim.step()
        train_loss.append(loss.item())
        pred = (torch.sigmoid(output) >= 0.5).long()
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(outputs.view(-1).data.cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(loss))

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.average(train_loss)
    return loss, acc, auc

In [54]:
def val_epoch(model, val_iterator, criterion, device="cpu"):
    model.eval()

    val_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(val_iterator)
    for item in tbar:
        exercise_ids = item[0].to(device).long()
        label = item[1].to(device).long()
        elapsed_times = item[2].to(device).long()
        container_ids= item[3].to(device).long()

        target_mask = (exercise_ids != 0)

        output = model(exercise_ids, label, elapsed_times, container_ids)
        
        outputs = torch.masked_select(output.permute(1,0,2)[:,-1], target_mask)
        label = torch.masked_select(label, target_mask)

        loss = criterion(outputs.float(), label.float())
        loss.backward()
        val_loss.append(loss.item())
        pred = (torch.sigmoid(output) >= 0.5).long()
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(outputs.view(-1).data.cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(loss))

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.average(val_loss)
    return loss, acc, auc

## 5. Inferencing

In [56]:
epochs = 20

over_fit = 0
last_auc = 0
for epoch in range(epochs):
    train_loss, train_acc, train_auc = train_epoch(model, train_loader, optimizer, criterion, device)
    print("epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, train_loss, train_acc, train_auc))
    
    val_loss, avl_acc, val_auc = val_epoch(model, val_loader, criterion, device)
    print("epoch - {} val_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, val_loss, avl_acc, val_auc))
    
    if val_auc > last_auc:
        last_auc = val_auc
        over_fit = 0
    else:
        over_fit += 1
        
    
    if over_fit >= 2:
        print("early stop epoch ", epoch)
        break

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3129.0), HTML(value='')))


epoch - 0 train_loss - 0.68 acc - 5905.603 auc - 0.500


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))


epoch - 0 val_loss - 0.68 acc - 5905.434 auc - 0.500


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3129.0), HTML(value='')))


epoch - 1 train_loss - 0.68 acc - 5910.787 auc - 0.501


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))




KeyboardInterrupt: 