# Riiid Answer Correctness Competition

**A Number : A02230980** 

**Name : Tyler Lott**

## Method
For my submission I used the general architecture set out in Google's paper ["Attention is all you need."](https://arxiv.org/abs/1706.03762)
In that paper, a transformer with multi-head attention is used for NLP translation. In this notebook the model is adapted to 
take in the interaction sequences and predict the probability that a question is answered correctly.

In [1]:
# Imports 
import ast
import gc
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import riiideducation
import tensorflow as tf

ModuleNotFoundError: No module named 'riiideducation.competition'

## Load the data
Define the column data types up front so that loading the CSV files uses less memory.

In [None]:
# Column -> dtype map for train.csv. The keys double as the `usecols` list when the
# file is read, so commented-out entries are columns that are not loaded at all.
dtype = {
    'answered_correctly': 'int8',
    # 'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    # 'content_type_id': 'int8',
    'task_container_id': 'int16',
    # 'user_answer': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
}

# Column -> dtype map for questions.csv, used the same way as `dtype` above.
dtype_questions = {
    'question_id': 'int32',
    # 'bundle_id': 'int32',
    'correct_answer': 'int8',
    'part': 'int8',
    # 'tags': 'object',
}

Read the training data from the competition

In [None]:
# Load only the columns named in `dtype`, with the compact dtypes declared there.
df = pd.read_csv(
    # '/kaggle/input/riiid-test-answer-prediction/train.csv',
    'data/train.csv',
    usecols=dtype.keys(),
    dtype=dtype,
    # nrows=10.123*10**7
    # nrows=10**7
)
# Drop rows whose answered_correctly is -1 (presumably the non-question rows -- confirm
# against the competition data description).
df = df[df.answered_correctly!=-1]
# NOTE: head(2000) keeps the FIRST 2000 remaining rows of each user (in file order),
# not the last 2000.
df = df.groupby('user_id').head(2000)

# Question metadata, indexed by question_id so it can be joined on content_id later.
questions = pd.read_csv(
    # '/kaggle/input/riiid-test-answer-prediction/questions.csv',
    'data/questions.csv',
    dtype=dtype_questions,
    usecols=dtype_questions.keys(),
    index_col='question_id'
)

## Transform the data

Some methods to transform the data into usable forms: we sort the timestamps into categories and normalize the 
elapsed time. We also add 1 to the id columns that can contain zero, because zero is used as the padding token later. 

In [None]:
def transform_questions(questions):
  """Return the questions table untouched, plus the size of the `part` vocabulary.

  The vocabulary size is the largest `part` value plus one, so that every
  value from 0 up to the maximum has a slot.
  """
  n_parts = questions.part.max() + 1
  return questions, n_parts


def transform_df(df, questions):
    """Turn raw interaction rows into model features.

    The first four feature columns are rewritten on the frame that is passed
    in; the question metadata is then joined on, which produces the new frame
    that is returned.

    Returns:
        (frame, content_ids, task_container_ids) where the two sizes are the
        vocabulary sizes for the content and task-container id columns.
    """
    # Continuous feature: missing values become 0, then scale by a fixed constant.
    elapsed = df['prior_question_elapsed_time'].fillna(0).astype(np.float32)
    df['prior_question_elapsed_time'] = elapsed / 300000
    # df['timestamp'] = df.timestamp.diff().fillna(0)

    # Timestamp: scale, then bucket into 20 right-closed bins of width 0.1.
    # Values that fall outside every bin (including exactly 0) end up in bucket 1.
    df['timestamp'] = df['timestamp'].fillna(0).astype(np.float32) / 87425000000
    edges = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2]
    bucket_ids = list(range(1, 21))
    df['timestamp'] = pd.cut(df['timestamp'], bins=edges, labels=bucket_ids).fillna(1)

    # Missing "had explanation" flags count as False; store as 0/1.
    had_explanation = df['prior_question_had_explanation'].fillna(False)
    df['prior_question_had_explanation'] = had_explanation.astype(int)

    # +2: one slot for the shift applied below and one because ids start at 0.
    content_ids = questions.index.max() + 2
    task_container_ids = 10001

    # Join on the original content_id first, then shift the ids so 0 is free for padding.
    merged = df.join(questions, on='content_id')
    merged['content_id'] += 1
    merged['task_container_id'] += 1
    merged['user_ids'] = merged['user_id'].astype('category').cat.codes
    return merged, content_ids, task_container_ids

In [None]:
# Run both transforms; the returned sizes are later passed to get_series_model as
# embedding vocabulary sizes.
questions, part_ids = transform_questions(questions)
df, content_ids, task_container_ids = transform_df(df, questions)
# Notebook display: list the columns present after the transform.
df.columns

## Group rows by user id and put them in a hashtable

In [None]:
# Rebind `df` to a dict mapping each user_id to that user's rows (user_id column dropped).
df = {uid: u.drop(columns='user_id') for uid, u in df.groupby('user_id')}

## Data generator

In [None]:
# Sliding-window helper adapted (with small changes) from a StackOverflow snippet.
# 1-D analogy: [1,2,3,4] with w = 2 -> [[1,2], [2,3], [3,4]]; here the input is 2-D
# and the output 3-D.
def rolling_window(a, w):
    """Return every run of `w` consecutive rows of the 2-D array `a`.

    The result has shape (rows - w + 1, w, columns) and is built with
    `as_strided`, so it reuses the memory of `a` instead of copying it.
    """
    n_rows, n_cols = a.shape
    row_stride, col_stride = a.strides
    window_count = n_rows - w + 1
    # Stepping to the next window and to the next row inside a window both
    # advance by one row of `a`, hence the repeated row stride.
    return np.lib.stride_tricks.as_strided(
        a,
        shape=(window_count, w, n_cols),
        strides=(row_stride, row_stride, col_stride),
    )

def make_time_series(x, windows_size):
  """Build one window of `windows_size` rows ending at each row of `x`.

  `x` is padded with `windows_size - 1` all-zero rows at the top first, so the
  number of windows equals the number of input rows.
  """
  left_padded = np.pad(x, [[windows_size - 1, 0], [0, 0]], constant_values=0)
  return rolling_window(left_padded, windows_size)

In [None]:
def add_features_to_user(user):
    """Replace `answered_correctly` with the previous row's answer, shifted by +1.

    The column is rewritten on `user` itself, and `user` is returned.
    After the call the values mean: 3 = start of sequence (first row),
    otherwise previous answer + 1. Zero stays free for padding.
    """
    # The first row has no predecessor; fill it with 2 so that it becomes 3 after the +1.
    previous_answer = user['answered_correctly'].shift(periods=1, fill_value=2)
    user['answered_correctly'] = previous_answer + 1
    return user

Extending the keras Sequence class to create our dataset of users

In [None]:
class RiidSequence(tf.keras.utils.Sequence):
  """Keras ``Sequence`` that serves one user per item and keeps user histories for inference.

  ``users`` maps a user id to that user's interaction rows. ``__getitem__``
  serves the users at positions ``start`` .. ``end - 1`` of the mapping's key
  order. The inference helpers read and update the same ``users`` mapping.
  """

  def __init__(self, users, windows_size, start=0, end=None):
    """
    Args:
      users: mapping of user id -> interaction table containing an
        'answered_correctly' column.
      windows_size: number of time steps in every window handed to the model.
      start: position of the first user served by this sequence.
      end: position one past the last user served; ``None`` means "all users".
    """
    super().__init__()
    self.users = users
    self.windows_size = windows_size
    # Converts positional indices into the keys of `users`.
    self.mapper = dict(enumerate(users.keys()))
    # start/end make it easy to carve training and validation ranges out of one mapping.
    self.start = start
    # Compare against None explicitly so that end=0 means an empty range, not "all users".
    self.end = len(users) if end is None else end
    # Position of the answered_correctly column among the feature columns.
    self.answered_correctly_index = list(self.user_example().columns).index('answered_correctly')

  def __len__(self):
    """Number of users served by this sequence."""
    return self.end - self.start

  def __getitem__(self, idx):
    """Return ``(windows, labels)`` for the user at position ``idx``."""
    uid = self.mapper[idx + self.start]
    user = self.users[uid].copy()
    # Labels are read before the column is turned into a "previous answer" feature.
    y = user['answered_correctly'].to_numpy().copy()
    x = add_features_to_user(user)
    return make_time_series(x, self.windows_size), y

  def user_example(self):
    """Return the first served user with features added, to check that things work."""
    uid = self.mapper[self.start]
    return add_features_to_user(self.users[uid].copy())

  # INFERENCE PART
  def get_user_for_inference(self, user_row):
    """Build one model window for a new interaction row.

    ``user_row`` is a 1-D array whose slot at ``answered_correctly_index``
    carries the user id. The row is appended to the stored history of that
    user (if any), the answer column is encoded as in training, and the result
    is padded with zero rows at the top or cut to the last ``windows_size`` rows.
    """
    col = self.answered_correctly_index
    uid = user_row[col]
    user_row[col] = 2  # start-of-sequence marker; becomes 3 after the +1 below
    user_row = user_row[np.newaxis, ...]
    if uid in self.users:
      x = np.concatenate([self.users[uid], user_row])
    else:
      x = user_row
    # Same encoding as in training: every row carries the previous answer plus one,
    # and the oldest row carries the start-of-sequence token. Applying it in both
    # branches fixes unseen users, whose single row was previously left at 2
    # (the code for "previous answer correct") instead of 3.
    x[:, col] = np.roll(x[:, col], 1) + 1

    missing = self.windows_size - x.shape[0]
    if missing > 0:
      return np.pad(x, [[missing, 0], [0, 0]])
    return x[-self.windows_size:]

  def update_user(self, uid, user):
    """Append the rows in ``user`` to the stored history of ``uid``.

    Only the most recent ``windows_size`` rows are kept, for new users as well.
    """
    if uid in self.users:
      history = np.concatenate([self.users[uid], user])
    else:
      history = user
    self.users[uid] = history[-self.windows_size:]

Test out the sequence generator.

In [None]:
# Preview the first 20 rows of one user after add_features_to_user has been applied.
RiidSequence(df, 64).user_example().head(20)

In [None]:
# Item 0 is the first user: x holds one window per interaction, y the matching labels.
x, y = RiidSequence(df, 100)[0]
# print(x)
x.shape, y.shape  # x: (interactions, window size, features); y: (interactions,)

## Transformer Model 

In [None]:
# POSITION ENCODING

def get_angles(pos, i, d_model):
  """Return the angles pos / 10000^(2*(i//2)/d_model) used by the positional encoding.

  Dimensions 2k and 2k+1 share the same rate because of the `i // 2`.
  """
  exponent = (2 * (i // 2)) / np.float32(d_model)
  rate = 1 / np.power(10000, exponent)
  return pos * rate


def positional_encoding(position, d_model):
  """Return the sinusoidal position table as a float32 tensor of shape (1, position, d_model).

  Even feature dimensions hold the sine of the angle, odd ones the cosine.
  """
  positions = np.arange(position)[:, np.newaxis]
  dims = np.arange(d_model)[np.newaxis, :]
  table = get_angles(positions, dims, d_model)

  table[:, 0::2] = np.sin(table[:, 0::2])
  table[:, 1::2] = np.cos(table[:, 1::2])

  # Leading axis of size 1 so the table broadcasts over the batch.
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

## Masking

I'm unsure whether a look-ahead mask is needed in our case, since we are only predicting one output rather than a sequence, but I 
used it anyway and it seemed to work.

In [None]:
def create_look_ahead_mask(size):
  """Return a (size, size) matrix with 1 strictly above the diagonal and 0 elsewhere."""
  # band_part(-1, 0) keeps the lower triangle including the diagonal.
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle  # (seq_len, seq_len)

def create_padding_mask(seqs):
  """Return a float mask that is 1 for every time step whose features are all zero.

  Only steps in which every feature equals 0 count as padding. Two singleton
  axes are inserted so the mask broadcasts against attention logits.
  """
  is_zero = tf.math.equal(seqs, 0)
  step_is_padding = tf.reduce_all(is_zero, axis=-1)
  padding = tf.cast(step_is_padding, tf.float32)
  return padding[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

### Things for the multihead attention

In [None]:
# Basic attention: softmax(q k^T / sqrt(d_k)) v
def scaled_dot_product_attention(q, k, v, mask):
  """Return ``(output, attention_weights)`` of scaled dot-product attention.

  Where `mask` is 1 the logit is pushed to a large negative value, so the
  softmax gives that position (almost) no weight. `mask` may be None.
  """
  key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
  logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)
  if mask is not None:
    logits += (mask * -1e9)
  weights = tf.nn.softmax(logits, axis=-1)
  return tf.matmul(weights, v), weights

    
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention: project q/k/v, attend per head, merge, project again."""

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    # Every head gets an equal slice of the model dimension.
    assert d_model % self.num_heads == 0
    self.depth = d_model // self.num_heads

    # Input projections for query, key and value.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # Output projection applied after the heads are merged.
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Return ``(output, attention_weights)``; note the argument order v, k, q."""
    batch_size = tf.shape(q)[0]

    # Project, then split each projection into heads.
    q_heads = self.split_heads(self.wq(q), batch_size)
    k_heads = self.split_heads(self.wk(k), batch_size)
    v_heads = self.split_heads(self.wv(v), batch_size)

    attended, attention_weights = scaled_dot_product_attention(
        q_heads, k_heads, v_heads, mask)

    # Put the head axis next to depth again and merge the two into d_model.
    attended = tf.transpose(attended, perm=[0, 2, 1, 3])
    merged = tf.reshape(attended, (batch_size, -1, self.d_model))

    return self.dense(merged), attention_weights

# Feed-forward block placed after the attention sub-layer.
def point_wise_feed_forward_network(d_model, dff):
  """Return a two-layer network: Dense(dff, relu) followed by Dense(d_model)."""
  hidden = tf.keras.layers.Dense(dff, activation='relu')
  projection = tf.keras.layers.Dense(d_model)
  return tf.keras.Sequential([hidden, projection])

In [None]:
# Encoder layer that the question-relevant features go through.
class EncoderLayer(tf.keras.layers.Layer):
  """Self-attention followed by a feed-forward network, each with dropout, residual and layer norm."""

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    # Self-attention: x serves as value, key and query.
    attended, _ = self.mha(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    normed_attention = self.layernorm1(x + attended)

    # Feed-forward sub-layer with its own residual connection.
    transformed = self.ffn(normed_attention)
    transformed = self.dropout2(transformed, training=training)
    return self.layernorm2(normed_attention + transformed)
  

In [None]:
# Decoder layer: combines its own input sequence with the encoder output.
class DecoderLayer(tf.keras.layers.Layer):
  """Self-attention, attention over the encoder output, then a feed-forward network.

  Each of the three sub-layers is followed by dropout, a residual connection
  and layer normalization.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training, mask, look_ahead_mask=None):
    """Return ``(output, self_attention_weights, encoder_attention_weights)``."""
    # Self-attention over the decoder input.
    self_attended, self_weights = self.mha1(x, x, x, look_ahead_mask)
    self_attended = self.dropout1(self_attended, training=training)
    normed_self = self.layernorm1(self_attended + x)

    # Cross-attention: the encoder output supplies value and key, the decoder
    # state supplies the query.
    cross_attended, cross_weights = self.mha2(
        enc_output, enc_output, normed_self, mask)
    cross_attended = self.dropout2(cross_attended, training=training)
    normed_cross = self.layernorm2(cross_attended + normed_self)

    # Feed-forward sub-layer on the combined representation.
    transformed = self.ffn(normed_cross)
    transformed = self.dropout3(transformed, training=training)
    output = self.layernorm3(transformed + normed_cross)

    return output, self_weights, cross_weights

In [None]:
def get_series_model(n_features, content_ids, task_container_ids, part_ids, windows_size=100, d_model=32, num_heads=4, 
                     n_encoder_layers = 2):
    """Build the Keras model that maps one interaction window to a single probability.

    Args:
        n_features: number of feature columns per time step.
        content_ids, task_container_ids, part_ids: sizes of the embedding tables
            for the corresponding id columns.
        windows_size: number of time steps per window.
        d_model: width of every embedding and of the attention layers.
        num_heads: number of attention heads.
        n_encoder_layers: number of encoder layers; the decoder loop uses the same count.

    Returns:
        A ``tf.keras.Model`` named 'Transformer' with one sigmoid output.
    """
    # Input
    inputs = tf.keras.Input(shape=(windows_size, n_features), name='inputs')
    mask = create_padding_mask(inputs)
    # NOTE(review): look_mask is never used below -- the decoder layers are called
    # with look_ahead_mask=None.
    look_mask = create_look_ahead_mask(tf.shape(inputs)[1])
    pos_enc = positional_encoding(windows_size, d_model)    
    
    # Divide branches   
    # Feature positions along the last axis are hard-coded; they assume the column
    # order of the per-user tables built earlier in the notebook -- TODO confirm
    # whenever columns are added or reordered.
    timestamp = inputs[..., 0]
    content_id = inputs[..., 1]
    task_container_id = inputs[..., 2]
    elapsed_time = inputs[..., 4]
    answered_correctly = inputs[..., 3]
    prior_exp = inputs[..., 5]
    correct_answer = inputs[..., 6]
    part = inputs[..., 7]
    
    # Create embeddings
    
    # encoder-side features: content id, container id, part id, correct answer, prior explanation
    content_embeddings = tf.keras.layers.Embedding(content_ids, d_model)(content_id)
    task_embeddings = tf.keras.layers.Embedding(task_container_ids, d_model)(task_container_id)
    part_embeddings = tf.keras.layers.Embedding(part_ids, d_model)(part)
    answer_embeddings = tf.keras.layers.Embedding(4, d_model)(correct_answer)
    prior_exp_embeddings = tf.keras.layers.Embedding(2, d_model)(prior_exp)
    
    # decoder-side features: answered correctly, elapsed time, timestamp bucket
    answered_correctly_embeddings = tf.keras.layers.Embedding(4, d_model)(answered_correctly)
    # A dense layer is used here because the value is continuous.
    # NOTE(review): elapsed_time has shape (batch, windows_size) at this point, so the
    # Dense layer acts on the window axis rather than on each time step -- confirm
    # that this is intended.
    elapsed_time_embeddings = tf.keras.layers.Dense(d_model, use_bias=False)(elapsed_time) # use a dense layer to embed this because its continuous
    # NOTE(review): transform_df labels the timestamp buckets 1..20, while this table
    # has 20 rows (indices 0..19).
    timestamp_embeddings = tf.keras.layers.Embedding(20, d_model)(timestamp)
    
    # Add embeddings
    x = tf.keras.layers.Add()([
        pos_enc,
        content_embeddings,
        task_embeddings,
        part_embeddings,
        prior_exp_embeddings,
        answer_embeddings])
    print(x.shape)
    y = tf.keras.layers.Add()([
        pos_enc,
        answered_correctly_embeddings,
        elapsed_time_embeddings,
        timestamp_embeddings])

    # encoder layers: each layer consumes the output of the previous one
    for _ in range(n_encoder_layers):
        x = EncoderLayer(d_model=d_model, num_heads=num_heads, dff=d_model*4, rate=0.1)(x, mask=mask)

    # decoder layers
    # NOTE(review): every iteration is fed `y` (not the previous layer's output) and
    # overwrites `k`, so only the last DecoderLayer's output reaches the pooling below.
    for _ in range(n_encoder_layers):
        k, _, _ = DecoderLayer(d_model=d_model, num_heads=num_heads, dff=d_model*4, rate=0.1)(y, enc_output=x, mask=mask, look_ahead_mask=None)
    
    # Average over the time axis, then regularize before the output layer.
    k = tf.keras.layers.GlobalAveragePooling1D()(k)
    k = tf.keras.layers.Dropout(0.1)(k)
    
    # single output for our prediction
    output = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(k)
    return tf.keras.Model(inputs, output, name='Transformer')

## Set Hyperparameters and Train

In [None]:
# `df` is the per-user dict here, so this is the number of users that go to training (80%).
train_idx = int(len(df)*0.8)
windows_size = 100
epochs = 100
patience = 2  # NOTE(review): not referenced anywhere in the visible part of the notebook
d_model = 64
num_heads = 8
n_encoder_layers = 3

### Create a custom learning rate schedule to adjust the learning rate as the model converges

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up learning-rate schedule.

  lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

  The rate grows linearly for the first `warmup_steps` steps and then decays
  with the inverse square root of the step number.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    # Keep the original value for get_config; the float tensor is used in the formula.
    self._d_model_config = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    # The optimizer may pass an integer step counter; rsqrt needs a float.
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized."""
    return {'d_model': self._d_model_config, 'warmup_steps': self.warmup_steps}

In [None]:
# Learning rate follows the warm-up schedule defined above, sized by d_model.
learning_rate = CustomSchedule(d_model)

# Adam driven by that schedule instead of a fixed learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.99, epsilon=1e-9)

### plot of the learning rate

In [None]:
# Separate schedule instance used only for plotting.
temp_learning_rate_schedule = CustomSchedule(d_model)

# Evaluate the schedule for steps 0..39999 and plot the resulting curve.
plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

### Callback to save the best weights

In [None]:
# Checkpoint file name carries the epoch budget and the current Unix time.
SAVEPATH = f'checkpoints/Transformer_ep-{epochs}_{int(time.time())}.h5'
# Save weights only, and only when the monitored 'val_AUC' improves (mode='max').
# 'val_AUC' is the validation value of the metric named 'AUC' in model.compile.
weights_callback = tf.keras.callbacks.ModelCheckpoint(filepath=SAVEPATH, 
                                                      save_weights_only=True, 
                                                      monitor='val_AUC', 
                                                      mode='max', save_best_only=True)

#### create train and validation sets and set the model parameters

In [None]:
# Users at positions [0, train_idx) go to training, the remaining ones to validation.
s_train = RiidSequence(df, windows_size, start=0, end=train_idx)
s_val = RiidSequence(df, windows_size, start=train_idx)

# The feature count is read from the last axis of the first training item.
n_features = s_train[0][0].shape[-1]
print(n_features)
print(content_ids)
print(task_container_ids)
print(part_ids)

### Create model and compile

In [None]:
# Build the network with the hyperparameters and vocabulary sizes computed above.
model = get_series_model(
        n_features,
        content_ids,
        task_container_ids,
        part_ids,
        windows_size=windows_size,
        d_model=d_model,
        num_heads=num_heads,
        n_encoder_layers=n_encoder_layers)

# Binary cross-entropy loss; the metric name 'AUC' is what the checkpoint callback
# monitors (as 'val_AUC').
model.compile(
    optimizer=optimizer, loss='binary_crossentropy',  metrics=[tf.keras.metrics.AUC(name='AUC'), tf.keras.metrics.BinaryAccuracy(name='acc')])

In [None]:
model.summary()
# Load weights from an earlier run to continue training from them. The file must
# come from a model with the same layers as the one built above.
model.load_weights('checkpoints/Transformer_ep-100_1606742208.h5')

In [None]:
# Train on the per-user sequences. Each item of s_train / s_val is one user, so one
# training step corresponds to one user. `callbacks` is passed as a list:
# `(weights_callback)` without a trailing comma is just the callback object itself,
# not a one-element tuple.
model.fit(
    s_train, validation_data=s_val, epochs=epochs, shuffle=True, callbacks=[weights_callback], verbose=2)


## Make predictions and submit

The API for submitting is funky so a lot of this is heavily influenced by other notebooks from the competition

In [None]:
# Delete stuff we don't need.
# NOTE(review): s_train still holds a reference to the dict that `df` was bound to
# (it stores it as `users`), so `del df` only removes the name here.
del s_val
del df
gc.collect()

In [None]:
# Read in our data again, this time to keep the most recent rows per user.
df = pd.read_csv(
    # '../input/riiid-test-answer-prediction/train.csv',
    'data/train.csv',
    usecols=dtype.keys(),
    dtype=dtype,
    # nrows=10**6
)
# Same filter as for training: drop rows whose answered_correctly is -1.
df = df[df.answered_correctly!=-1]
# Keep the LAST windows_size rows of every user (in file order).
df = df.groupby('user_id').tail(windows_size)

In [None]:
# Transform our data with the same pipeline as for training, then rebuild the
# user_id -> rows dict.
# NOTE(review): s_train was built from the earlier dict and is not rebuilt from this
# one, while the inference loop reads s_train.users. As written, these
# tail(windows_size) histories appear to be used only for the column listing below
# -- confirm whether s_train should be recreated from this dict.
df, _, _ = transform_df(df, questions)
df = {uid: u.drop(columns='user_id') for uid, u in df.groupby('user_id')}


In [None]:
# Create the competition API environment and the iterator over test batches.
# Each item yielded by iter_test is unpacked as (test, sample_prediction) below.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Build the list of raw test columns to select, in the same order as the training
# features. The answered_correctly slot is filled with user_id (the inference code
# reads the user id from that position), and row_id is appended for the submission.
columns = list(RiidSequence(df, 64).user_example().columns)
columns[columns.index('answered_correctly')] = 'user_id'
# Columns that transform_df creates itself must not be requested from the raw test
# frame: the question metadata comes from the join, and 'user_ids' from the category
# codes. Asking `.loc` for a column that does not exist raises a KeyError.
derived_columns = set(questions.columns) | {'user_ids'}
columns = [c for c in columns if c not in derived_columns] + ['row_id']
columns

In [None]:
# Rows of the previous batch, kept until their answers arrive with the next batch.
# Initialised here so the first iteration cannot hit an undefined name.
prior_test = None

for test, sample_prediction in iter_test:

    # The answers to the previous batch arrive as the text of a list in the first row.
    # ast.literal_eval only accepts Python literals, so text supplied from outside is
    # never executed as code (unlike eval).
    try:
        prior_correct = ast.literal_eval(test['prior_group_answers_correct'].iloc[0])
        prior_correct = [a for a in prior_correct if a != -1]
    except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
        # Missing, empty or unparsable field: treat as "no previous answers".
        prior_correct = []

    # Add prior correct to the previous batch and update the stored users
    if prior_correct and prior_test is not None:
        prior_test.insert(s_train.answered_correctly_index, 'answered_correctly', prior_correct)
        for uid, user in prior_test.groupby('user_id'):
            s_train.update_user(
                uid, user.drop(columns='user_id').to_numpy())

    # Filter test: keep only rows with content_type_id == 0 and the needed columns.
    # Copy so that the in-place edits of transform_df do not write into a slice of
    # the frame handed out by the API.
    test = test.loc[
        test['content_type_id'] == 0,
        columns
    ].copy()

    # Add global features
    test, _, _ = transform_df(test, questions)

    # Save test for later
    prior_test = test.drop(columns='row_id').copy()

    # Make x: one window per test row, built from the stored history of its user
    x = np.apply_along_axis(
        s_train.get_user_for_inference,
        1,
        test.drop(columns='row_id').to_numpy()
    )

    # Predict
    test['answered_correctly'] = model.predict(x, batch_size=x.shape[0])

    env.predict(test[['row_id', 'answered_correctly']])
