In [None]:
import riiideducation
import pandas as pd

# Create the competition environment object; later cells use it to iterate the
# test batches (env.iter_test) and to submit predictions (env.predict).
# You can only call make_env() once, so don't lose it!
env = riiideducation.make_env()

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Only the first `max_num` rows of train.csv are read.
max_num = 1000000

# Explicit dtype for every train.csv column that is loaded.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    low_memory=True,
    nrows=max_num,
    dtype=train_dtypes,
)

# Lecture information is deliberately left out: the author considers it
# unavailable in the real scenario and a source of noise.
# Question metadata, joined onto the training rows in a later cell.
questions_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv', low_memory=True)

# Keep only rows with content_type_id == 0, i.e. events where a question was
# posed to the user; these are the cases that matter for prediction.
is_question = train_df['content_type_id'] == 0
train_df = train_df.loc[is_question]

# The target is expected to be binary (1 / 0); show the distinct values present.
print(train_df['answered_correctly'].unique())

In [None]:
# Display the question metadata table; it is joined onto the training rows
# (content_id -> question_id) in a later cell.
questions_df

In [None]:
# Display the training examples (question rows only, after the filter above).
train_df

In [None]:
# Build the full dataset by attaching question metadata to each interaction:
# content_id on the training side is matched against question_id.
# A left join keeps every training row.
train_df = train_df.merge(
    questions_df,
    how='left',
    left_on='content_id',
    right_on='question_id',
    copy=False,
)
train_df

In [None]:
# Prune the frame down to the model inputs, the extra question columns and the
# target column (answered_correctly, kept last).
selected_columns = [
    'timestamp',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'correct_answer',
    'part',
    'tags',
    'answered_correctly',
]
train_df = (
    train_df[selected_columns]
    # remove noisy rows (any NaN / null value)
    .dropna()
    # convert 'prior_question_had_explanation' to a numerical value
    .assign(
        prior_question_had_explanation=lambda frame: frame["prior_question_had_explanation"].astype(int)
    )
)
# These constitute the engineered features to learn from.
train_df

In [None]:
import transformers
from transformers import DistilBertForSequenceClassification, DistilBertConfig, set_seed, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score


from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# BPE tokenizer that is trained from scratch (in a later cell) on the
# stringified feature rows. `unk_token` is set so the model has an explicit
# fallback token for symbols that never appeared in the tokenizer-training
# corpus (e.g. unexpected values in the test batches).
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Split on whitespace before BPE so that each feature value is tokenized separately.
tokenizer.pre_tokenizer = Whitespace()
# NOTE: the name `trainer` is rebound to the Hugging Face `Trainer` in a later
# cell, so the tokenizer-training cell must run before that one.
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

import torch
import numpy as np
import logging
from transformers.trainer_utils import is_main_process

# Hyper-parameters for the Hugging Face Trainer.
# The best checkpoint (highest "f1" from compute_metrics) is reloaded when
# training ends. Checkpoints are saved on the same schedule as evaluation:
# `load_best_model_at_end` needs a checkpoint for every evaluated step, and the
# default save_steps (500) is not aligned with eval_steps=2000.
training_args = TrainingArguments(
    output_dir='/kaggle/working/results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size for evaluation
    logging_dir='/kaggle/working/logs',            # directory for storing logs
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    evaluation_strategy="steps",
    eval_steps=2000,
    save_steps=2000,                 # keep in sync with eval_steps
)

logger = logging.getLogger(__name__)

# Setup logging: INFO on the main process, WARN on every other process.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
)

# Log on each process the small summary.
# The first fragment ends with ", " so the two f-strings do not run together
# (previously rendered as "n_gpu: <N>distributed training: ...").
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
    + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
# Set the verbosity to info of the Transformers logger (on main process only):
if is_main_process(training_args.local_rank):
    transformers.utils.logging.set_verbosity_info()
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
logger.info("Training/evaluation parameters %s", training_args)
# Fix the random seed for reproducibility.
set_seed(0)

# Numbers are abstracted into strings: every row becomes one line of
# space-separated feature values (the author expects this encoding to help).
feature_names = (
    "timestamp",
    "prior_question_elapsed_time",
    "prior_question_had_explanation",
    "correct_answer",
    "part",
    "tags",
)
train_texts = []
train_labels = []
# Walk the frame row by row, producing one text and one integer label per row.
for _, row in train_df.iterrows():
    train_texts.append(" ".join(str(row[name]) for name in feature_names))
    train_labels.append(int(row.answered_correctly))

# Dump every text (before splitting) to a file, one example per line; this file
# is the corpus the custom tokenizer is trained on.
vocab_file = "/kaggle/working/riid.train.raw"
with open(vocab_file, 'w') as corpus:
    corpus.writelines(text + "\n" for text in train_texts)

# Split data (5 % dev and 95 % train).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=.05,
)

In [None]:
# Peek at the first ten stringified training examples.
train_texts[:10]

In [None]:
# Peek at the labels that belong to the first ten training examples.
train_labels[:10]

In [None]:
# change to a non-internet version
# Train the BPE tokenizer on the corpus file written earlier. Keyword arguments
# are used because the positional order of `files` and `trainer` is not the
# same in every tokenizers release.
tokenizer.train(files=[vocab_file], trainer=trainer)
# Encode both splits with the freshly trained tokenizer.
train_encodings = tokenizer.encode_batch(train_texts)
val_encodings = tokenizer.encode_batch(val_texts)

In [None]:
# Show token ids and attention mask of the first ten training encodings,
# each followed by blank lines as a visual separator.
for encoding in train_encodings[:10]:
    print(encoding.ids, encoding.attention_mask, "\n", sep="\n")

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that yields fixed-length tensors for sequence classification.

    Args:
        encodings: sequence of objects exposing ``ids`` and ``attention_mask``
            lists, one per example.
        labels: sequence of integer class labels aligned with ``encodings``.
        max_len: exact length of every returned ``input_ids`` /
            ``attention_mask`` tensor.
    """

    def __init__(self, encodings, labels, max_len):
        self.encodings = encodings
        self.labels = labels
        self.max_len = max_len

    def _fit(self, values):
        """Return ``values`` cut or right-padded with zeros to exactly ``max_len`` items."""
        # Truncate first: an encoding longer than max_len would otherwise give a
        # negative pad count (no padding at all) and a tensor of the wrong length,
        # which breaks batching.
        values = list(values[:self.max_len])
        return values + [0] * (self.max_len - len(values))

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors of example ``idx``."""
        encoding = self.encodings[idx]
        return {
            'input_ids': torch.tensor(self._fit(encoding.ids)),
            'attention_mask': torch.tensor(self._fit(encoding.attention_mask)),
            'labels': torch.tensor(self.labels[idx]),
        }

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

# Longest token sequence over both splits; every example is padded to this length.
max_len = max(
    len(encoding.ids)
    for split in (train_encodings, val_encodings)
    for encoding in split
)
train_dataset = Dataset(train_encodings, train_labels, max_len)
val_dataset = Dataset(val_encodings, val_labels, max_len)

In [None]:
# Print the first two padded training items as a sanity check.
for sample_idx in range(2):
    print(train_dataset[sample_idx])

In [None]:
# Print the first two padded validation items as a sanity check.
for sample_idx in range(2):
    print(val_dataset[sample_idx])

In [None]:
# change to a non-internet version
# Build a DistilBERT sequence classifier directly from a configuration object
# (no pretrained checkpoint is loaded here). The vocabulary size is taken from
# the custom tokenizer; every other setting keeps the DistilBertConfig default.
configuration = DistilBertConfig(vocab_size=tokenizer.get_vocab_size())
print(configuration)
model = DistilBertForSequenceClassification(configuration)

def compute_metrics(p):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds one row of
            class scores per example; ``labels`` holds the reference class ids.

    Returns:
        dict with the keys ``roc_auc_score``, ``accuracy_score``,
        ``precision``, ``recall`` and ``f1``. A metric that cannot be computed
        for the given inputs (``ValueError``) is reported as 0; the other
        metrics are still returned.
    """
    scores, labels = p
    scores = np.asarray(scores)
    predictions = np.argmax(scores, axis=1)
    # ROC AUC is a ranking metric and needs a continuous score, not hard labels.
    # The score margin between class 1 and class 0 ranks examples exactly like
    # the softmax probability of class 1 (assumes two output classes).
    positive_margin = scores[:, 1] - scores[:, 0]

    def _safe(metric, y_pred):
        # Only ValueError is tolerated (e.g. ROC AUC when `labels` contains a
        # single class); unexpected errors are no longer silently swallowed.
        try:
            return metric(labels, y_pred)
        except ValueError:
            return 0

    return {
        "roc_auc_score": _safe(roc_auc_score, positive_margin),
        "accuracy_score": _safe(accuracy_score, predictions),
        "precision": _safe(precision_score, predictions),
        "recall": _safe(recall_score, predictions),
        "f1": _safe(f1_score, predictions),
    }

# Wire the model, hyper-parameters, datasets and metric function into a Trainer.
# NOTE: this rebinds `trainer`, which until now referred to the BpeTrainer used
# for tokenizer training; re-running the tokenizer-training cell after this one
# would pass the wrong object.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics
)


# Run training, then save the resulting model.
trainer.train()
trainer.save_model()

In [None]:
# You can only iterate through a result from `env.iter_test()` once
# so be careful not to lose it once you start iterating.
# The prediction loop in the next cell consumes this iterator.
iter_test = env.iter_test()

In [None]:
for (test_df, sample_prediction_df) in iter_test:
    # Each iteration handles one prediction batch: build the same text encoding
    # that was used for training, score it with the model and submit the result.

    # First drop the lectures. `.copy()` gives an independent frame so that the
    # prediction column can be added below without writing into a slice.
    test_df = test_df.loc[test_df['content_type_id'] == 0].copy()

    # The environment expects a predict() call for the batch even when it holds
    # no question rows; submit an empty frame with the expected columns.
    if test_df.empty:
        test_df['answered_correctly'] = 0.5
        env.predict(test_df[['row_id', 'answered_correctly']])
        continue

    # Numbers are abstracted into strings, exactly as for the training rows.
    tmp_df = test_df.merge(questions_df, left_on='content_id', right_on='question_id', how='left')
    # Training rows rendered prior_question_had_explanation as 0/1; do the same
    # here (a missing value is treated as "no explanation") so that the test
    # texts use the vocabulary the tokenizer and model were trained on.
    tmp_df['prior_question_had_explanation'] = (
        tmp_df['prior_question_had_explanation'].astype('boolean').fillna(False).astype(int)
    )

    test_texts = []
    test_labels = []
    # Iterate over each row to generate the dataset
    for index, rows in tmp_df.iterrows():
        # Create text for current numeric values
        tmp_txt = str(rows.timestamp)+" "+str(rows.prior_question_elapsed_time)+" "+str(rows.prior_question_had_explanation)+" "+str(rows.correct_answer)+" "+str(rows.part) + " "+str(rows.tags)
        test_labels.append(0)  # unknown; placeholder required by Dataset
        test_texts.append(tmp_txt)

    # Encode THIS batch (the original encoded `train_texts`, so every submitted
    # prediction belonged to a training example instead of the test row).
    test_encodings = tokenizer.encode_batch(test_texts)
    test_dataset = Dataset(test_encodings, test_labels, max_len)
    predictions, labels, metrics = trainer.predict(test_dataset)
    # Submit the probability of class 1 rather than a hard 0/1 label: the target
    # is scored by ranking quality, which hard labels throw away.
    logits = torch.from_numpy(np.asarray(predictions, dtype=np.float32))
    test_df['answered_correctly'] = torch.softmax(logits, dim=1)[:, 1].numpy()
    print(test_df[['row_id', 'answered_correctly']])
    env.predict(test_df[['row_id', 'answered_correctly']])