In [50]:
import gc
import time
import math
import torch

import pandas as pd
import numpy as np
import torch.nn as nn 
import torch.nn.functional as F

from tqdm.notebook import tqdm
from collections import deque, defaultdict
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [51]:
TRAIN_PATH = 'Data/riiid_train.pkl.gzip'
QUESTIONS_PATH = 'Data/questions.csv'

In [52]:
df_train = pd.read_pickle(TRAIN_PATH)

In [53]:
df_questions = pd.read_csv(QUESTIONS_PATH)

In [54]:
df_train['prior_question_had_explanation'] = df_train['prior_question_had_explanation'].astype(np.float16).fillna(-1).astype(np.int8)

part_ids_map = dict(zip(df_questions.question_id, df_questions.part))
df_train['part_id'] = df_train['content_id'].map(part_ids_map)

In [55]:
df_train = df_train[df_train.content_type_id == 0]

In [56]:
df_train["prior_question_elapsed_time"].fillna(26000, inplace=True) # FIXME some random value fill in should it be like this?
df_train["prior_question_elapsed_time"] = df_train["prior_question_elapsed_time"] // 1000
df_train["prior_question_elapsed_time"].clip(upper=300)

0            26.0
1            37.0
2            55.0
3            19.0
4            11.0
             ... 
101230327    18.0
101230328    14.0
101230329    14.0
101230330    22.0
101230331    29.0
Name: prior_question_elapsed_time, Length: 99271300, dtype: float32

In [57]:
AMOUNT = 100 # This is the parameter that gets the final AMOUNT questions of each user
PAD = 0 # value to use in the padding

In [58]:
from collections import deque, defaultdict
def dataset_transform(dataset):
  final_dataset = {}
  user_id_to_idx = {}
  grp = dataset.groupby('user_id').tail(AMOUNT)
  
  for idx, row in tqdm(grp.groupby("user_id").agg({"content_id":list, 
                "answered_correctly":list, 
                "task_container_id":list, 
                "part_id":list, 
                "prior_question_elapsed_time":list}).reset_index().iterrows()):

    # pad the required rows to have AMOUNT values
    if (len(row['content_id']) >= AMOUNT):
      final_dataset[idx] = {
            "user_id": row["user_id"],
            "content_id" : deque(row["content_id"], maxlen=AMOUNT),
            "answered_correctly" : deque(row["answered_correctly"], maxlen=AMOUNT),
            "task_container_id" : deque(row["task_container_id"], maxlen=AMOUNT),
            "prior_question_elapsed_time" : deque(row["prior_question_elapsed_time"], maxlen=AMOUNT),
            "part_id": deque(row["part_id"], maxlen=AMOUNT),
            "padded" : deque([False]*100, maxlen=AMOUNT)
        }
    else: # need to pad
        final_dataset[idx] = {
            "user_id": row["user_id"],
            "content_id" : deque([PAD]*(AMOUNT-len(row["content_id"])) + row["content_id"], maxlen=AMOUNT),
            "answered_correctly" : deque([PAD]*(AMOUNT-len(row["content_id"])) + row["answered_correctly"], maxlen=AMOUNT),
            "task_container_id" : deque([PAD]*(AMOUNT-len(row["content_id"])) + row["task_container_id"], maxlen=AMOUNT),
            "prior_question_elapsed_time" : deque([PAD]*(AMOUNT-len(row["content_id"])) + row["prior_question_elapsed_time"], maxlen=AMOUNT),
            "part_id": deque([PAD]*(AMOUNT-len(row["content_id"])) + row["part_id"], maxlen=AMOUNT),
            "padded" : deque([True]*(AMOUNT-len(row["content_id"])) + [False]*len(row["content_id"]), maxlen=AMOUNT)
        }

    user_id_to_idx[row['user_id']] = idx
  # FIXME new users? 
  return final_dataset, user_id_to_idx 

In [59]:
%%time 
df_user, df_idx = dataset_transform(df_train)
del df_train

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


CPU times: user 1min 24s, sys: 3.34 s, total: 1min 27s
Wall time: 1min 27s


In [60]:
dic = {1:2, 2: 3,  3:4}
if (2 in dic):
    print("hello")

hello


# Model

In [61]:
import torch
import torch.nn as nn

class SAINT(nn.Module):
  def __init__(self, d_model, nhead, num_encoder_layers, num_decoder_layers, dropout, dim_feedforward, device='cpu'):
    super(SAINT, self).__init__()
    self.model = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dropout=dropout, dim_feedforward=dim_feedforward).to(device)
    
    # Encoder embeddings
    self.exercise_embeddings = nn.Embedding(num_embeddings=13523, embedding_dim=d_model) # exercise_id
    self.enc_pos_embedding = nn.Embedding(d_model, d_model) # positional embeddings
    self.part_embeddings = nn.Embedding(num_embeddings=7+1, embedding_dim=d_model) # part_id_embeddings
    
    # Decoder embeddings
    self.prior_question_elapsed_time = nn.Embedding(num_embeddings=301, embedding_dim=d_model) # prior_question_elapsed_time
    self.dec_pos_embedding = nn.Embedding(d_model, d_model) # positional embeddings
    self.correctness_embeddings = nn.Embedding(num_embeddings=2, embedding_dim=d_model) # Correctness embeddings
    # self.lag_time_embeddings = nn.Embedding(num_embeddings=?, embedding_dim=?) TODO
    
    self.linear = nn.Linear(d_model, 1)

    self.device = device
    self.init_weights()

  def init_weights(self):
    initrange = 0.1
    # init embeddings
    # FIXME should be Xavier uniform acording to paper
    self.exercise_embeddings.weight.data.uniform_(-initrange, initrange)
    self.part_embeddings.weight.data.uniform_(-initrange, initrange)
    self.prior_question_elapsed_time.weight.data.uniform_(-initrange, initrange)
    self.correctness_embeddings.weight.data.uniform_(-initrange, initrange)
    self.enc_pos_embedding.weight.data.uniform_(-initrange, initrange)
    self.dec_pos_embedding.weight.data.uniform_(-initrange, initrange)

  def forward(self, encoder_exercises, encoder_position, encoder_part, decoder_correctness, decoder_position, decoder_elapsed_time, mask_src=None, mask_tgt=None):
    encoder_exercises = encoder_exercises.to(self.device)
    encoder_position = encoder_position.to(self.device)
    encoder_part = encoder_part.to(self.device)
    decoder_correctness = decoder_correctness.to(self.device)
    decoder_position = decoder_position.to(self.device)
    decoder_elapsed_time = decoder_elapsed_time.to(self.device)

    mask_src = self.model.generate_square_subsequent_mask(sz=encoder_exercises.shape[1]).to(self.device)
    mask_tgt = self.model.generate_square_subsequent_mask(sz=encoder_exercises.shape[1]).to(self.device)
    mem_mask = self.model.generate_square_subsequent_mask(sz=encoder_exercises.shape[1]).to(self.device)

    

    # Generate embeddings according to paper
    embedded_src = self.exercise_embeddings(encoder_exercises) + \
                   self.enc_pos_embedding(encoder_position) + \
                   self.part_embeddings(encoder_part)
    embedded_src = embedded_src.transpose(0, 1) # (S, N, E)

    embedded_dcdr = self.correctness_embeddings(decoder_correctness) + \
                    self.dec_pos_embedding(decoder_position) + \
                    self.prior_question_elapsed_time(decoder_elapsed_time) 
                    # TODO add lag time embeddings
    embedded_dcdr = embedded_dcdr.transpose(0, 1) # (S, N, E)

    output = self.model(src=embedded_src, tgt=embedded_dcdr, src_mask = mask_src, tgt_mask = mask_tgt, memory_mask = mem_mask)
    output = self.linear(output).transpose(1, 0)

    return output

CORRECTNESS_DEFAULT_TOKEN = 0
POSITION_DEFAULT_TOKEN = 1
ELAPSED_TIME_DEFAULT_TOKEN = 2
LAG_TIME_DEFAULT_TOKEN = 3

def get_batch_embeddings(content, part, correctness, elapsed_time, device="cpu"):
  #encoder has size n, decoder has default + n-1 

  size_x = content.shape[1]
  size_y = content.shape[0]

  # Encoder
  encoder_exercises = content.long()
  encoder_position = torch.arange(0, size_x).to(device).unsqueeze(0).repeat(size_y, 1).long()
  encoder_part = part.long()

  # Decoder
  default_correct = torch.Tensor([CORRECTNESS_DEFAULT_TOKEN]).unsqueeze(0).repeat(size_y, 1).to(device)
  default_position = torch.Tensor([POSITION_DEFAULT_TOKEN]).unsqueeze(0).repeat(size_y, 1).to(device)
  default_elapsed_time = torch.Tensor([ELAPSED_TIME_DEFAULT_TOKEN]).unsqueeze(0).repeat(size_y, 1).to(device)

  decoder_correctness = torch.cat((default_correct, correctness[:,:size_x-1]), -1).long()
  decoder_position = torch.cat((default_position, torch.arange(0, size_x-1).to(device).unsqueeze(0).repeat(size_y, 1)), -1).long()
  decoder_elapsed_time = torch.cat((default_elapsed_time, elapsed_time[:,:size_x-1]), -1).long()
  # TODO lag time

  return encoder_exercises, encoder_position, encoder_part, decoder_correctness, decoder_position, decoder_elapsed_time

In [62]:
class Riiid(torch.utils.data.Dataset):
    
    def __init__(self, d):
        self.d = d
    
    def __len__(self):
        return len(self.d)
    
    def __getitem__(self, idx):
        # you can return a dict of these as well etc etc...
        # remember the order
        return idx, self.d[idx]["content_id"], self.d[idx]["task_container_id"], \
    self.d[idx]["part_id"], self.d[idx]["prior_question_elapsed_time"], self.d[idx]["padded"], \
    self.d[idx]["answered_correctly"]

def collate_fn(batch):
    _, content_id, task_id, part_id, prior_question_elapsed_time, padded, labels = zip(*batch)
    content_id = torch.Tensor(content_id).long()
    task_id = torch.Tensor(task_id).long()
    part_id = torch.Tensor(part_id).long()
    prior_question_elapsed_time = torch.Tensor(prior_question_elapsed_time).long()
    padded = torch.Tensor(padded).bool()
    labels = torch.Tensor(labels)
    # remember the order
    return content_id, task_id, part_id, prior_question_elapsed_time, padded, labels

## Create dataset

In [63]:
dataset = Riiid(d=df_user)

In [64]:
def train(model, dataloader, optimizer, criterion, device = 'cpu'):
  model.train()

  train_loss = []
  all_labels = []
  preds = []
  num_corrects = 0.0
  num_total = 0.0

  #src_mask = model.generate_square_subsequent_mask()
  # TODO what amount of num_workers should we use?
  print(f'Training in device {device}')
  for idx, batch in tqdm(enumerate(dataloader)):

    # need to get the values by order as in the collate_fn
    content_id, task_id, part_id, prior_question_elapsed_time, mask, labels = batch
    content_id = content_id.to(device)
    task_id = task_id.to(device)
    part_id = part_id.to(device)
    prior_question_elapsed_time = prior_question_elapsed_time.to(device)
    mask = mask.to(device)
    labels = labels.to(device)

    
    optimizer.zero_grad()
    
    #mask = model.generate_square_subsequent_mask(BATCH_SIZE)
    #print(mask.shape)
    output = model(content_id, part_id, labels, prior_question_elapsed_time, mask)

    loss = criterion(output[:,:,1], labels)
    loss.backward()
    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) Tutorial does this... but why?
    optimizer.step()
    train_loss.append(loss.detach().data.cpu().numpy())

    output_prob = torch.sigmoid(output)[:,:,1]
    pred = output_prob >= 0.5
    # The num_corrects and num_total logic seems wrong
    num_corrects += (pred == labels).sum().item()
    num_total += np.prod(labels.shape)


    all_labels.extend(labels.cpu().numpy())
    preds.extend(pred.cpu().numpy())

  acc = num_corrects / num_total
  auc = roc_auc_score(all_labels, preds)
  loss = np.mean(train_loss)

  return loss, acc, auc
# train_loss, train_acc, train_auc = train(model, dataset)

In [65]:
# TODO do evaluate function on dataset 
def val(model, dataloader, criterion, device="cpu"):
  model.eval()

  val_loss = []
  all_labels = []
  preds = []
  num_corrects = 0.0
  num_total = 0.0

  print(f'Evaluating in device {device}')
  for idx, batch in tqdm(enumerate(dataloader)):
    # need to get the values by order as in the collate_fn
    content_id, task_id, part_id, prior_question_elapsed_time, mask, labels = batch
    content_id = content_id.to(device)
    task_id = task_id.to(device)
    part_id = part_id.to(device)
    prior_question_elapsed_time = prior_question_elapsed_time.to(device)
    mask = mask.to(device)
    labels = labels.to(device)

    #mask = model.generate_square_subsequent_mask(BATCH_SIZE)
    #print(mask.shape)
    output = model(content_id, part_id, labels, prior_question_elapsed_time, mask)

    loss = criterion(output[:,:,1], labels)
    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) Tutorial does this... but why?
    val_loss.append(loss.detach().data.cpu().numpy())

    output_prob = torch.sigmoid(output)[:,:,1]
    pred = output_prob >= 0.5
    # The num_corrects and num_total logic seems wrong
    num_corrects += (pred == labels).sum().item()
    num_total += np.prod(labels.shape)

    all_labels.extend(labels.cpu().numpy())
    preds.extend(pred.cpu().numpy())
    
  acc = num_corrects / num_total
  auc = roc_auc_score(all_labels, preds)
  loss = np.mean(val_loss)

  return loss, acc, auc

In [66]:
# adam optimizer
LEARNING_RATE = 0.001
BETA_1 = 0.9
BETA_2 = 0.999
EPSILON = 1e-8
WARMUP = 4000

#SAINT
N_LAYERS = 4
WINDOW_SIZE = 100
MODEL_DIM = 512
DROPOUT = 0.2
BATCH_SIZE = 64
BATCH_SIZE = AMOUNT

criterion = nn.BCEWithLogitsLoss() #CrossEntropy is bad. but why this?
lr = 1e-3 # learning rate 

device = torch.device("cpu" if not torch.cuda.is_available() else 'cuda')

model = SAINT(d_model=WINDOW_SIZE, nhead=5, num_encoder_layers=N_LAYERS, num_decoder_layers=N_LAYERS, dropout=DROPOUT, dim_feedforward=MODEL_DIM, device=device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) # FIXME paper is Noam Scheme but it's not available this one is from the tutorial itself

criterion.to(device)
model.to(device)

SAINT(
  (model): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
          )
          (linear1): Linear(in_features=100, out_features=512, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear2): Linear(in_features=512, out_features=100, bias=True)
          (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.2, inplace=False)
          (dropout2): Dropout(p=0.2, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
          )
          (linear1): Linear(in_features=100, out_features=512, bias=True)
          (drop

In [67]:
# best_val_loss = float("inf")
best_auc = 0
epochs = 100 # The number of epochs
best_model = None
overfitted = 0

import time

for epoch in range(1, epochs + 1):
    train_set, val_set = train_test_split(dataset, test_size=0.2)

    train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=8)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=8)

    epoch_start_time = time.time()
    print('Training')
    train_loss, train_acc, train_auc = train(model=model, dataloader=train_loader, optimizer=optimizer, criterion=criterion, device=device)
    print("epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, train_loss, train_acc, train_auc))
    print('Validating')
    val_loss, val_acc, val_auc = val(model=model, dataloader=val_loader, criterion=criterion, device=device)
    print("epoch - {} val_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, val_loss, val_acc, val_auc))
    if val_auc > best_auc:
        best_auc = val_auc
        best_model = model
        overfitted = 0
    else:
        overfitted +=1
    if (overfitted >1):
        print("Model overfitted")
        break

    scheduler.step() #changes learning rate...


Training
Training in device cuda


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




TypeError: forward() missing 1 required positional argument: 'decoder_elapsed_time'

In [46]:
SAVE_PATH = "SA(M)INT.pt"
torch.save(best_model.state_dict(), SAVE_PATH)

In [49]:
best_model

SAINT(
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=64, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=64, out_features=100, bias=True)
        (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=64, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): 

In [48]:
BATCH_SIZE = 100
criterion = nn.BCEWithLogitsLoss() #CrossEntropy is bad. but why this?
lr = 1e-3 # learning rate 

model = SAINT()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) # FIXME paper is Noam Scheme but it's not available this one is from the tutorial itself

device = "cpu" if not torch.cuda.is_available() else torch.device('cuda:0')
criterion.to(device)
model.to(device)
PATH = 'SA(M)INT.pt'
model.load_state_dict(torch.load(PATH))

RuntimeError: Error(s) in loading state_dict for SAINT:
	size mismatch for transformer_encoder.layers.0.self_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_encoder.layers.0.self_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_encoder.layers.0.self_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_encoder.layers.0.self_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.0.linear1.weight: copying a param with shape torch.Size([64, 100]) from checkpoint, the shape in current model is torch.Size([64, 32]).
	size mismatch for transformer_encoder.layers.0.linear2.weight: copying a param with shape torch.Size([100, 64]) from checkpoint, the shape in current model is torch.Size([32, 64]).
	size mismatch for transformer_encoder.layers.0.linear2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.0.norm1.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.0.norm1.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.0.norm2.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.0.norm2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.self_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_encoder.layers.1.self_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_encoder.layers.1.self_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_encoder.layers.1.self_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.linear1.weight: copying a param with shape torch.Size([64, 100]) from checkpoint, the shape in current model is torch.Size([64, 32]).
	size mismatch for transformer_encoder.layers.1.linear2.weight: copying a param with shape torch.Size([100, 64]) from checkpoint, the shape in current model is torch.Size([32, 64]).
	size mismatch for transformer_encoder.layers.1.linear2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.norm1.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.norm1.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.norm2.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.norm2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.self_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_decoder.layers.0.self_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_decoder.layers.0.self_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_decoder.layers.0.self_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.multihead_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_decoder.layers.0.multihead_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_decoder.layers.0.multihead_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_decoder.layers.0.multihead_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.linear1.weight: copying a param with shape torch.Size([64, 100]) from checkpoint, the shape in current model is torch.Size([64, 32]).
	size mismatch for transformer_decoder.layers.0.linear2.weight: copying a param with shape torch.Size([100, 64]) from checkpoint, the shape in current model is torch.Size([32, 64]).
	size mismatch for transformer_decoder.layers.0.linear2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm1.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm1.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm2.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm3.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm3.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.self_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_decoder.layers.1.self_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_decoder.layers.1.self_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_decoder.layers.1.self_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.multihead_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_decoder.layers.1.multihead_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_decoder.layers.1.multihead_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_decoder.layers.1.multihead_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.linear1.weight: copying a param with shape torch.Size([64, 100]) from checkpoint, the shape in current model is torch.Size([64, 32]).
	size mismatch for transformer_decoder.layers.1.linear2.weight: copying a param with shape torch.Size([100, 64]) from checkpoint, the shape in current model is torch.Size([32, 64]).
	size mismatch for transformer_decoder.layers.1.linear2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm1.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm1.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm2.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm3.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm3.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for exercise_embeddings.weight: copying a param with shape torch.Size([13523, 100]) from checkpoint, the shape in current model is torch.Size([13523, 32]).
	size mismatch for enc_pos_embedding.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for part_embeddings.weight: copying a param with shape torch.Size([8, 100]) from checkpoint, the shape in current model is torch.Size([8, 32]).
	size mismatch for prior_question_elapsed_time.weight: copying a param with shape torch.Size([301, 100]) from checkpoint, the shape in current model is torch.Size([301, 32]).
	size mismatch for dec_pos_embedding.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for correctness_embeddings.weight: copying a param with shape torch.Size([2, 100]) from checkpoint, the shape in current model is torch.Size([2, 32]).
	size mismatch for decoder.weight: copying a param with shape torch.Size([2, 100]) from checkpoint, the shape in current model is torch.Size([2, 32]).

In [24]:
train_set, val_set = train_test_split(dataset, test_size=0.2)

dataloader = torch.utils.data.DataLoader(val_set, batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=8)

for idx, batch in tqdm(enumerate(dataloader)):
    # need to get the values by order as in the collate_fn
    content_id, task_id, part_id, prior_question_elapsed_time, mask, labels = batch
    content_id = content_id.to(device)
    task_id = task_id.to(device)
    part_id = part_id.to(device)
    prior_question_elapsed_time = prior_question_elapsed_time.to(device)
    mask = mask.to(device)
    labels = labels.to(device)
    output = model(content_id, part_id, prior_question_elapsed_time, mask)

    output_prob = output[:,:,1]
    pred = output_prob >= 0.50

    print(pred)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

 ..., False,  True, False],
        [False, False, False,  ..., False, False,  True],
        ...,
        [ True,  True,  True,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [ True, False,  True,  ..., False, False, False]], device='cuda:0')
tensor([[False, False, False,  ..., False,  True, False],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ..., False, False, False],
        ...,
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]], device='cuda:0')
tensor([[ True, False,  True,  ...,  True, False, False],
        [ True,  True, False,  ..., False,  True, False],
        [ True,  True,  True,  ..., False, False, False],
        ...,
        [ True, False, False,  ..., False, False, False],
        [ True, False, False,  ..., False, False, False],
        [ True, False, Fals

In [47]:
model1 = SAINT()
device = "cpu" if not torch.cuda.is_available() else torch.device('cuda')
model1 = model1.to(device)
model1.device = device
model1.load_state_dict(torch.load(SAVE_PATH, map_location=torch.device(device)))

RuntimeError: Error(s) in loading state_dict for SAINT:
	size mismatch for transformer_encoder.layers.0.self_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_encoder.layers.0.self_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_encoder.layers.0.self_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_encoder.layers.0.self_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.0.linear1.weight: copying a param with shape torch.Size([64, 100]) from checkpoint, the shape in current model is torch.Size([64, 32]).
	size mismatch for transformer_encoder.layers.0.linear2.weight: copying a param with shape torch.Size([100, 64]) from checkpoint, the shape in current model is torch.Size([32, 64]).
	size mismatch for transformer_encoder.layers.0.linear2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.0.norm1.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.0.norm1.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.0.norm2.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.0.norm2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.self_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_encoder.layers.1.self_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_encoder.layers.1.self_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_encoder.layers.1.self_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.linear1.weight: copying a param with shape torch.Size([64, 100]) from checkpoint, the shape in current model is torch.Size([64, 32]).
	size mismatch for transformer_encoder.layers.1.linear2.weight: copying a param with shape torch.Size([100, 64]) from checkpoint, the shape in current model is torch.Size([32, 64]).
	size mismatch for transformer_encoder.layers.1.linear2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.norm1.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.norm1.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.norm2.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_encoder.layers.1.norm2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.self_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_decoder.layers.0.self_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_decoder.layers.0.self_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_decoder.layers.0.self_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.multihead_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_decoder.layers.0.multihead_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_decoder.layers.0.multihead_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_decoder.layers.0.multihead_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.linear1.weight: copying a param with shape torch.Size([64, 100]) from checkpoint, the shape in current model is torch.Size([64, 32]).
	size mismatch for transformer_decoder.layers.0.linear2.weight: copying a param with shape torch.Size([100, 64]) from checkpoint, the shape in current model is torch.Size([32, 64]).
	size mismatch for transformer_decoder.layers.0.linear2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm1.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm1.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm2.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm3.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.0.norm3.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.self_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_decoder.layers.1.self_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_decoder.layers.1.self_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_decoder.layers.1.self_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.multihead_attn.in_proj_weight: copying a param with shape torch.Size([300, 100]) from checkpoint, the shape in current model is torch.Size([96, 32]).
	size mismatch for transformer_decoder.layers.1.multihead_attn.in_proj_bias: copying a param with shape torch.Size([300]) from checkpoint, the shape in current model is torch.Size([96]).
	size mismatch for transformer_decoder.layers.1.multihead_attn.out_proj.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for transformer_decoder.layers.1.multihead_attn.out_proj.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.linear1.weight: copying a param with shape torch.Size([64, 100]) from checkpoint, the shape in current model is torch.Size([64, 32]).
	size mismatch for transformer_decoder.layers.1.linear2.weight: copying a param with shape torch.Size([100, 64]) from checkpoint, the shape in current model is torch.Size([32, 64]).
	size mismatch for transformer_decoder.layers.1.linear2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm1.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm1.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm2.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm2.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm3.weight: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for transformer_decoder.layers.1.norm3.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([32]).
	size mismatch for exercise_embeddings.weight: copying a param with shape torch.Size([13523, 100]) from checkpoint, the shape in current model is torch.Size([13523, 32]).
	size mismatch for enc_pos_embedding.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for part_embeddings.weight: copying a param with shape torch.Size([8, 100]) from checkpoint, the shape in current model is torch.Size([8, 32]).
	size mismatch for prior_question_elapsed_time.weight: copying a param with shape torch.Size([301, 100]) from checkpoint, the shape in current model is torch.Size([301, 32]).
	size mismatch for dec_pos_embedding.weight: copying a param with shape torch.Size([100, 100]) from checkpoint, the shape in current model is torch.Size([32, 32]).
	size mismatch for correctness_embeddings.weight: copying a param with shape torch.Size([2, 100]) from checkpoint, the shape in current model is torch.Size([2, 32]).
	size mismatch for decoder.weight: copying a param with shape torch.Size([2, 100]) from checkpoint, the shape in current model is torch.Size([2, 32]).