<a href="https://colab.research.google.com/github/TuckerArrants/knowledge-tracing/blob/main/riiid-seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import ast
import collections
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Basic Idea

**The idea is simple. Consider the classic machine translation problem, where we train a model to translate English sentences into German ones. Instead of translating sequences of English words into sequences of German words, we translate sequences of questions into sequences of answers. We can embed the questions the same way we embed words for NLP tasks: with one-hot encodings, or with denser representations such as those generated by fastText or those we train ourselves (more [here](https://www.kaggle.com/tuckerarrants/riiid-fasttext-embeddings)).**

# Helper Functions

In [None]:
def to_seq(test_df, mode='train'):
    """Collapse an interaction log into one row per user.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must contain ``user_id``, ``content_id`` and the answer column for the
        selected mode. The frame is not modified.
    mode : {'train', 'inference'}
        ``'train'`` collects ``answered_correctly``; ``'inference'`` collects
        ``prior_group_answers_correct``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``content_id`` (the user's content ids joined by
        single spaces, in row order) and the answer column (a NumPy array of
        that user's values, in row order). Users appear in order of first
        occurrence.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values.
    """
    answer_columns = {
        'train': 'answered_correctly',
        'inference': 'prior_group_answers_correct',
    }
    # Compare by value: identity (`is`) on strings only works by accident of
    # interning and silently falls through for strings built at runtime.
    if mode not in answer_columns:
        raise ValueError(
            f"mode must be one of {sorted(answer_columns)}, got {mode!r}")
    answer_col = answer_columns[mode]

    # A single groupby pass replaces the per-user boolean filtering and the
    # row-by-row DataFrame.append; sort=False keeps first-occurrence order.
    records = [
        {
            'user_id': user,
            'content_id': ' '.join(group['content_id'].astype(str)),
            answer_col: group[answer_col].values,
        }
        for user, group in test_df.groupby('user_id', sort=False)
    ]

    return pd.DataFrame(records, columns=['user_id', 'content_id', answer_col])

In [None]:
def tokenize(questions, answers=None, mode='inference', tokenizer=None, maxlen=None):
    """Turn space-separated question strings into zero-padded id matrices.

    Parameters
    ----------
    questions : list of str
        One string of space-separated content ids per sequence.
    answers : list of sequences, optional
        Per-sequence answer labels; required unless ``mode == 'inference'``.
    mode : str
        ``'inference'`` returns only the padded questions; any other value
        also returns the tokenizer and the padded answers.
    tokenizer : tf.keras.preprocessing.text.Tokenizer, optional
        An already fitted tokenizer to reuse. When omitted, a new tokenizer is
        fitted on ``questions``. Pass the training tokenizer at inference time
        so that ids line up with the trained embedding.
    maxlen : int, optional
        Forwarded to ``pad_sequences``; ``None`` pads to the longest sequence.

    Returns
    -------
    numpy.ndarray
        Padded question ids when ``mode == 'inference'``.
    tuple
        ``(questions, tokenizer, answers)`` otherwise.

    Raises
    ------
    ValueError
        If ``answers`` is missing in a non-inference mode.
    """
    pad = tf.keras.preprocessing.sequence.pad_sequences

    if tokenizer is None:
        # filters='' keeps every character of the id tokens intact.
        tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
        tokenizer.fit_on_texts(questions)

    questions = tokenizer.texts_to_sequences(questions)
    questions = pad(questions, maxlen=maxlen, padding='post')

    # Compare by value; `is` on string literals is an identity test.
    if mode == 'inference':
        return questions

    if answers is None:
        raise ValueError("answers must be provided when mode is not 'inference'")

    # NOTE(review): answers are padded with 0, the same value as an incorrect
    # answer, so padded positions are indistinguishable from wrong answers
    # unless the caller masks or weights them.
    answers = pad(answers, maxlen=maxlen, padding='post')

    return questions, tokenizer, answers

# Get Data

In [None]:
# Sequence-length bounds, counted in questions per user.
# MAX_SEQ_LEN is the default input and prediction length of build_model.
# NOTE(review): MIN_SEQ_LEN is not referenced in the visible cells; presumably
# it describes how train.csv was sampled. Confirm or remove.
MIN_SEQ_LEN = 50
MAX_SEQ_LEN = 60

In [None]:
# Load the interaction log from the working directory and preview the first rows.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,bundle_id,correct_answer,part,tags
0,8339,25379,24600,7876,0,1,2,0,24000.0,False,7876.0,3.0,1.0,10
1,8340,50137,24600,175,0,2,2,1,23000.0,False,175.0,2.0,1.0,9
2,8341,70181,24600,1278,0,3,3,1,22000.0,False,1278.0,3.0,2.0,143
3,8342,148601,24600,2064,0,4,2,0,18000.0,False,2063.0,1.0,3.0,157
4,8343,148601,24600,2065,0,4,2,1,18000.0,False,2063.0,2.0,3.0,136


In [None]:
# Question metadata, keyed by content_id so it can be joined onto the interaction log.
questions_df = pd.read_csv('questions.csv').rename({'question_id':'content_id'},
                                                   axis=1)

# Only bring in metadata columns that `train` does not already have. Merging
# columns that are already present yields duplicated *_x / *_y columns and
# makes `train` wider every time this cell is re-run.
new_cols = [col for col in questions_df.columns
            if col == 'content_id' or col not in train.columns]

train = pd.merge(train, questions_df[new_cols], how='left', on='content_id')
print(train.shape)

(1200659, 18)


In [None]:
# Collapse the interaction log into one row per user: a space-separated string
# of content ids plus the matching array of answered_correctly values.
seq_train = to_seq(train, mode='train')
# Number of distinct users, i.e. the number of training sequences.
print(seq_train['user_id'].nunique())
seq_train

22803


Unnamed: 0,user_id,content_id,answered_correctly
0,24600,7876 175 1278 2064 2065 2063 3364 3365 3363 29...,"[0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,137455,7876 175 1278 2065 2063 2064 3365 3364 3363 29...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,146023,7876 175 1278 2063 2064 2065 3365 3364 3363 29...,"[1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, ..."
3,157207,7876 175 1278 2065 2063 2064 3363 3364 3365 29...,"[0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, ..."
4,178445,7876 175 1278 2063 2065 2064 3363 3364 3365 29...,"[1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, ..."
...,...,...,...
22798,2147005852,3661 3968 3827 3593 9693 5580 5276 5371 5920 8...,"[0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, ..."
22799,2147025654,5045 6443 8321 6118 9428 8333 6060 9740 6286 9...,"[1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, ..."
22800,2147151535,907 6230 5210 5652 5889 5738 189 4535 5352 610...,"[0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, ..."
22801,2147464207,7876 175 1278 2063 2065 2064 3364 3363 3365 29...,"[1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Plain Python lists (one entry per user) in the form the tokenizer expects.
questions = list(seq_train['content_id'])
answers = list(seq_train['answered_correctly'])

# Preprocessing

In [None]:
# Fit the question tokenizer and build zero-padded id / label matrices.
# `question_tokenizer` is also read by build_model to size the embedding.
questions_, question_tokenizer, answers_ = tokenize(questions, answers, mode='train')

# Model Training

**This model is almost the same as the model used in [this notebook](https://www.kaggle.com/tuckerarrants/openvaccine-gru-lstm) from the [OpenVaccine competition](https://www.kaggle.com/c/stanford-covid-vaccine).**

In [None]:
# Staged training schedule: the two lists are zipped by the training loop, so
# stage i runs EPOCH_LIST[i] epochs with batch size BATCH_SIZE_LIST[i].
EPOCH_LIST = [3, 2, 1]
BATCH_SIZE_LIST = [8, 32, 64]
# Passed to model.fit as `verbose`.
VERBOSE = 2

In [None]:
def gru_layer(hidden_dim, dropout):
    """Create a GRU layer that emits an output for every timestep.

    ``hidden_dim`` is the number of units and ``dropout`` the input dropout
    rate; the kernel uses orthogonal initialization.
    """
    gru_options = dict(
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.GRU(hidden_dim, **gru_options)


def build_model(convolve=False, conv_dim=512, 
                dropout=.4, sp_dropout=.2, embed_dim=100,
                hidden_dim=256, layers=2,
                seq_len=MAX_SEQ_LEN, pred_len=MAX_SEQ_LEN):
    """Build and compile the question-sequence to answer-sequence model.

    Pipeline: token ids -> embedding (id 0 masked) -> spatial dropout ->
    optional Conv1D -> ``layers`` stacked GRUs -> first ``pred_len`` timesteps
    -> per-timestep sigmoid.

    The vocabulary size is read from the module-level ``question_tokenizer``,
    so the tokenizer must be fitted before this function is called.

    Returns a model compiled with Adam, binary cross-entropy and AUC.
    """
    # +1 because id 0 is reserved for padding.
    vocab_size = len(question_tokenizer.word_index) + 1

    # ---- Inputs -------------------------------------------------------
    inputs = tf.keras.layers.Input(shape=(seq_len, ))

    embedding = tf.keras.layers.Embedding(input_dim=vocab_size,
                                          output_dim=embed_dim,
                                          mask_zero=True)
    hidden = embedding(inputs)
    hidden = tf.keras.layers.SpatialDropout1D(sp_dropout)(hidden)

    # ---- Sequence encoder ----------------------------------------------
    if convolve:
        conv = tf.keras.layers.Conv1D(conv_dim, 5, padding='same',
                                      activation=tf.keras.activations.swish)
        hidden = conv(hidden)

    for _ in range(layers):
        hidden = gru_layer(hidden_dim, dropout)(hidden)

    # ---- Per-timestep prediction head ----------------------------------
    out = hidden[:, :pred_len]
    out = tf.keras.layers.Dense(1, activation='sigmoid')(out)

    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(optimizer=tf.optimizers.Adam(),
                  loss='binary_crossentropy',
                  metrics=['AUC'])

    return model

# Instantiate the model with the default hyperparameters and show its layers.
model = build_model()
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 60)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 60, 100)           1183400   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 60, 100)           0         
_________________________________________________________________
gru (GRU)                    (None, 60, 256)           274944    
_________________________________________________________________
gru_1 (GRU)                  (None, 60, 256)           394752    
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 60, 256)]         0         
_________________________________________________________________
dense (Dense)                (None, 60, 1)            

In [None]:
# Staged training: each stage continues from the previous stage's weights with
# its own epoch count and batch size. 20% of the samples are held out for
# validation. Only the last stage's History object is kept in `history`.
for epoch, batch_size in zip(EPOCH_LIST, BATCH_SIZE_LIST):

    # `epoch` is the number of epochs for this stage, not an epoch index.
    history = model.fit(
        x=questions_,
        y=answers_,
        batch_size=batch_size,
        epochs=epoch,
        validation_split=.2,
        verbose=VERBOSE,
    )

Epoch 1/3
2281/2281 - 52s - loss: 0.5736 - auc: 0.7519 - val_loss: 0.5313 - val_auc: 0.7812
Epoch 2/3
2281/2281 - 51s - loss: 0.5214 - auc: 0.7854 - val_loss: 0.5137 - val_auc: 0.7859
Epoch 3/3
2281/2281 - 50s - loss: 0.5065 - auc: 0.7915 - val_loss: 0.5069 - val_auc: 0.7863
Epoch 1/2
571/571 - 14s - loss: 0.4937 - auc: 0.8004 - val_loss: 0.5063 - val_auc: 0.7858
Epoch 2/2
571/571 - 14s - loss: 0.4892 - auc: 0.8017 - val_loss: 0.5043 - val_auc: 0.7858
286/286 - 7s - loss: 0.4831 - auc: 0.8051 - val_loss: 0.5060 - val_auc: 0.7846


# Inference

In [None]:
# Competition API setup. The inference cell below reads both `iter_test` and
# `env`, so these lines must be enabled (or equivalents defined) before it runs.
# NOTE(review): presumably disabled because `riiideducation` is only available
# inside the Kaggle competition environment — confirm.
#import riiideducation
#env = riiideducation.make_env()
#iter_test = env.iter_test()

In [None]:
# Requires `iter_test` and `env` from the competition API, the trained `model`
# and the `question_tokenizer` fitted during preprocessing.
TEST_DFS = []

# user_id -> content ids (as strings) seen in earlier test groups, oldest first.
user_history = collections.defaultdict(list)

# Feed sequences of exactly the length the model was built with.
seq_len = model.input_shape[1] or MAX_SEQ_LEN

for t, (test_df, sample_prediction_df) in enumerate(iter_test):

    print(f'Iteration {t} beginning...')

#################################################
#### Update DataFrames
#################################################

    test_df['content_id'] = test_df['content_id'].apply(str)

    # The first row of a group carries the previous group's outcomes as
    # string-encoded lists. ast.literal_eval parses them without executing
    # arbitrary code (this replaces a bare eval on external data).
    prior_correct = ast.literal_eval(test_df.iloc[0]['prior_group_answers_correct'])

    if len(prior_correct) > 0 and TEST_DFS:
        TEST_DFS[-1]['prior_group_answers_correct'] = prior_correct
        TEST_DFS[-1]['prior_group_responses'] = ast.literal_eval(
            test_df.iloc[0]['prior_group_responses'])

#################################################
#### Format Sequentially
#################################################

    # One sequence per row: the user's questions from earlier groups followed
    # by the question being predicted.
    users = test_df['user_id'].tolist()
    current_questions = test_df['content_id'].tolist()
    question_texts = [' '.join(user_history[user] + [question])
                      for user, question in zip(users, current_questions)]

    # Record this group only after all of its sequences are built, and keep at
    # most `seq_len` items per user since older ones would be truncated anyway.
    for user, question in zip(users, current_questions):
        seen = user_history[user]
        seen.append(question)
        del seen[:-seq_len]

    TEST_DFS.append(test_df)

    # Reuse the training tokenizer so ids match the trained embedding; fitting
    # a new tokenizer per batch would assign unrelated ids. Questions that were
    # never seen in training are dropped by texts_to_sequences.
    question_ids = question_tokenizer.texts_to_sequences(question_texts)
    # truncating='pre' keeps the most recent questions, padding='post' keeps the
    # real tokens contiguous at the start of each row.
    question_data = tf.keras.preprocessing.sequence.pad_sequences(
        question_ids, maxlen=seq_len, padding='post', truncating='pre')

#################################################
#### Inference Part
#################################################

    preds = model.predict(question_data)[:, :, 0]

    # Prediction at the last non-padding position of every row.
    lengths = (question_data != 0).sum(axis=1)
    last_idx = np.maximum(lengths - 1, 0)
    row_preds = preds[np.arange(len(preds)), last_idx]

    # Rows with no known question at all get a neutral 0.5 instead of raising.
    test_df['answered_correctly'] = np.where(lengths > 0, row_preds, 0.5)

    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

print('Inference Complete')