# Imports

Uncomment and execute in Colab Notebooks

In [None]:
# # execute this cell when loading the notebook for the first time
# ! pip install transformers
# ! pip install datasets

# ! git clone https://github.com/SamTheMar/Question_Answering_CoQA.git

# from google.colab import drive
# drive.mount('/content/drive')

# drive_folder = '/content/drive/MyDrive'

In [None]:
# # execute this cell each time the runtime is restarted

# %cd -0
# %cd Question_Answering_CoQA

# import os
# os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [None]:
import pandas as pd
import json
import numpy as np
import tensorflow as tf
import random

from datasets import Dataset, DatasetDict

from functools import partial

%load_ext autoreload
%autoreload 2

# Data preprocessing

## Setting Seed

In [None]:
def set_reproducibility(seed):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Value handed unchanged to each library's seeding function.
    """
    seeders = (random.seed, np.random.seed, tf.random.set_seed)
    for seed_fn in seeders:
        seed_fn(seed)
    #os.environ['TF_DETERMINISTIC_OPS'] = '1'

seed = 42
set_reproducibility(seed)

## Dataset Download

In [None]:
from utils.download import download_data

# Source files of the CoQA release
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
# The dev file is stored under the 'test' suffix and serves as the test split in this notebook
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"

# Fetch the train file first, then the test file, both into ./coqa
for split_suffix, split_url in (('train', train_url), ('test', test_url)):
    download_data(data_path='./coqa', url_path=split_url, suffix=split_suffix)

## Creating DataFrame

In [None]:
def create_df(url):
    """Flatten a CoQA JSON file into one DataFrame row per question/answer pair.

    Args:
        url: Path of a local CoQA JSON file. Despite its name the value is
            opened with ``open``; nothing is fetched over the network here.

    Returns:
        pandas.DataFrame with the columns ``story``, ``question``, ``answer``,
        ``span_text``, ``span_start`` and ``span_end``. The columns are
        present even when the file contains no question at all.

    Raises:
        KeyError: If an expected key is missing from the JSON structure.
    """
    columns = ["story", "question", "answer", "span_text", "span_start", "span_end"]

    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(url, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)['data']

    # Questions and answers are paired positionally inside each story entry
    dataframe_rows = [
        {
            "story": x['story'],
            "question": q['input_text'],
            "answer": a['input_text'],
            "span_text": a['span_text'],
            "span_start": a['span_start'],
            "span_end": a['span_end'],
        }
        for x in data
        for q, a in zip(x['questions'], x['answers'])
    ]

    return pd.DataFrame(dataframe_rows, columns=columns)

In [None]:
# Load both files and tag every row with the split it was read from
df_train = create_df('./coqa/train.json').assign(split='train')
df_test = create_df('./coqa/test.json').assign(split='test')

# Stack the two frames under a fresh 0..n-1 index
df = pd.concat((df_train, df_test), ignore_index=True)

## Remove unanswerable questions

In [None]:
# Rows whose free-form answer is exactly 'unknown'
df[df['answer'].eq('unknown')]

In some cases, the correct answer is the word 'unknown'.

In [None]:
# 'unknown' answers that nevertheless come with a span other than 'unknown'
df[df['answer'].eq('unknown') & df['span_text'].ne('unknown')]

Therefore, to really remove the unanswerable questions, we only drop the rows in which the span text is also 'unknown'.

In [None]:
# A question is unanswerable when both the answer and its span are 'unknown'
unanswerable = df['answer'].eq('unknown') & df['span_text'].eq('unknown')
index = df.index[unanswerable.to_numpy()]

df = df.drop(index=index).reset_index(drop=True)

Finally, we convert all text to lowercase.

In [None]:
# Lowercase every free-text column, one column at a time
text_columns = ['story', 'question', 'answer', 'span_text']
for key in text_columns:
    df[key] = df[key].map(lambda text: text.lower())
df

## Data Inspection

Let's see what our preprocessed data looks like.

In [None]:
# Quick overview of the preprocessed frame: shape, column names and the first rows
print(f"Dataset size: {df.shape}")
print(f"Dataset columns: {df.columns.values}")
print(f"Some examples:")
# .loc slices by label and includes the end label, so labels 0 through 5 are shown
df.loc[:5]

We create a new dataframe just for analysis purposes. We want to see which words and bigrams most commonly begin a question.

In [None]:
# Work on a copy so the analysis columns never reach the training data
df_analysis = df.copy()
questions_lower = df_analysis['question'].str.lower()

# New column name -> regex whose single capture group is stored in that column
opening_patterns = {
    'q_first_word': r'(\w+)',
    'q_first_two_words': r'^((?:\S+\s+){1}\S+).*',
}
for column_name, pattern in opening_patterns.items():
    df_analysis[column_name] = questions_lower.str.extract(pattern)

Top ranking first word in question

In [None]:
# Number of questions per opening word, 15 most frequent first
first_word_counts = df_analysis.groupby('q_first_word').size()
first_word_counts.sort_values(ascending=False).head(15)

Top ranking first bigrams in question

In [None]:
# Number of questions per opening bigram, 15 most frequent first
first_bigram_counts = df_analysis.groupby('q_first_two_words').size()
first_bigram_counts.sort_values(ascending=False).head(15)

Percentage of re-elaborated versus non-re-elaborated answers. An answer counts as non-re-elaborated when its text appears verbatim inside the span of the story delimited by ``span_start`` and ``span_end``.

In [None]:
# True where the answer text occurs verbatim inside its supporting span
sia = [answer in span for answer, span in zip(df["answer"], df["span_text"])]

n_answers = len(sia)
print(f'Percentage of rielaborated answers: {sia.count(False)/n_answers*100:.2f}%')
print(f'Percentage of not rielaborated answers: {sia.count(True)/n_answers*100:.2f}%')


## Train, Validation and Test splits

Since the provided dataset only has train and test splits, we need to take part of the original train split and make it the validation split.

We choose an 80/20 split for train/validation.

In [None]:
from sklearn.model_selection import train_test_split

# The split is made on unique stories (not on rows), so every question about
# a story ends up in the same split.
is_test = df['split'] == 'test'
stories = df.loc[~is_test, 'story'].unique()

story_train, story_val = train_test_split(stories, test_size=0.2, random_state=seed)

# np.select keeps the first matching condition: test rows are matched first so
# a test story that also occurs in the train file can never be relabelled.
conditions = [
    is_test,
    df['story'].isin(story_val),
    df['story'].isin(story_train),
]
choices = ['test', 'val', 'train']
# An explicit string default keeps choices and default of the same dtype
# (the implicit integer default 0 cannot be combined with string choices on recent NumPy).
df['split'] = np.select(conditions, choices, default='unassigned')

# Every row must have landed in one of the three splits
assert (df['split'] != 'unassigned').all(), "some rows were not assigned to a split"

df

In [None]:
# verifying the train/val split ratio
item_counts = df["split"].value_counts()
print("Value counts")
print(item_counts, '\n')

# Rows that belong to either train or val form the denominator
len_train_val = int(df['split'].isin(['train', 'val']).sum())
train_share = item_counts['train'] / len_train_val
val_share = item_counts['val'] / len_train_val
print(f"Train split {train_share:.2f}")
print(f"Val split {val_share:.2f}")

Now we split the dataframe in 3 and put it in a DatasetDict object.

In [None]:
# One frame per split, each re-indexed from zero
train_data, val_data, test_data = (
    df[df['split'] == split_name].reset_index(drop=True)
    for split_name in ('train', 'val', 'test')
)
split_frames = {'train': train_data, 'val': val_data, 'test': test_data}

datasets = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in split_frames.items()
})

# datasets_small is only used for debugging: the first rows of each split
debug_rows = {'train': 2, 'val': 1, 'test': 1}
datasets_small = DatasetDict({
    split_name: Dataset.from_pandas(split_frames[split_name][:n_rows])
    for split_name, n_rows in debug_rows.items()
})

# Span Detection

## Model definition

Define the pre-trained model checkpoint.

In [None]:
# Pre-trained checkpoint from which both the tokenizer and the span-detection model are loaded
model_checkpoint = "distilbert-base-cased"

# Feature-preparation settings for the span model.
# Presumably the maximum number of tokens per feature and the overlap between
# consecutive windows of a long story — TODO confirm in utils.preprocessing
max_length = 380
doc_stride = 128

Obtain the tokenizer and prepare the training features.

In [None]:
from utils.preprocessing import prepare_train_features_span
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Bind the tokenizer and the configured limits once, so the values set in the
# configuration cell are the ones actually used for every mapping below.
prepare_span_features = partial(
    prepare_train_features_span,
    tokenizer=tokenizer,
    max_length=max_length,
    doc_stride=doc_stride,
)

# Features for the full train/val/test splits — this is what training uses
tokenized_datasets = datasets.map(
    prepare_span_features,
    batched=True,
    remove_columns=datasets["train"].column_names,
    num_proc=3,
)

# Debug-only features built from datasets_small. They are kept under their own
# name so they no longer overwrite the full `tokenized_datasets`.
tokenized_datasets_small = datasets_small.map(
    prepare_span_features,
    batched=True,
    batch_size=1,
    remove_columns=datasets_small["train"].column_names,
    num_proc=1,
)

## Training

Set the pre-trained weights.

In [None]:
from transformers import TFAutoModelForQuestionAnswering

# Load the question-answering model from the checkpoint named by `model_checkpoint`.
# NOTE(review): the Keras mixed-precision policy is set in the fine-tuning cell,
# i.e. after this model object has been created — confirm the policy still applies to it.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Fine-tune the pre-trained model.

In [None]:
import os

import tensorflow as tf
from tensorflow import keras
# Import the callback from the same Keras that is used for the optimizer and
# the policy below, instead of mixing in the stand-alone `keras` package.
from tensorflow.keras.callbacks import ModelCheckpoint
from utils.training_utils_tf import MyHistory, plot_history

# set keras to use mixed precision
# NOTE(review): the model was instantiated in an earlier cell — confirm that a
# policy set at this point still affects it.
keras.mixed_precision.set_global_policy("mixed_float16")


# define callbacks
# Checkpoints go below the Google Drive folder when it was mounted (Colab setup
# cell executed), otherwise below the current working directory.
try:
    checkpoint_root = drive_folder
except NameError:
    checkpoint_root = ""
checkpoint_folder = os.path.join(checkpoint_root, "checkpoints", model_checkpoint)
# Make sure the target folder exists before any callback writes into it
os.makedirs(checkpoint_folder, exist_ok=True)

checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.hdf5")
history_path = os.path.join(checkpoint_folder, "history.npy")

checkpoint_callback = ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss')
hist_callback = MyHistory(history_path)


# train the model
optimizer = keras.optimizers.Adam(learning_rate=5e-5)
# No loss is passed to compile() here
model.compile(optimizer=optimizer)

# NOTE(review): the Hugging Face datasets are passed to fit() directly in numpy
# format — confirm that the installed Keras/transformers versions accept this input.
history = model.fit(tokenized_datasets["train"].with_format("numpy"),
                    batch_size=8,
                    epochs=3,
                    validation_data=tokenized_datasets["val"].with_format("numpy"),
                    callbacks=[checkpoint_callback, hist_callback])

Test on some story and question.

In [None]:
# inputs = tokenizer([str(datasets["test"]['story'])], [str(datasets["test"]['question'])], return_tensors="np")
story = """Keras is an API designed for human beings, not machines. Keras follows best
practices for reducing cognitive load: it offers consistent & simple APIs, it minimizes
the number of user actions required for common use cases, and it provides clear &
actionable error messages. It also has extensive documentation and developer guides. """
question = "What is Keras?"

# A batch with a single (story, question) pair
inputs = tokenizer([story], [question], return_tensors="np")

outputs = model(inputs)
# argmax along axis 1 yields one index per batch item
start_position = tf.argmax(outputs.start_logits, axis=1)
end_position = tf.argmax(outputs.end_logits, axis=1)
# Index the single batch item explicitly for both values instead of relying on
# the implicit (deprecated in NumPy) conversion of a one-element array to int.
print(int(start_position[0]), int(end_position[0]))

# Sequence2Sequence

## Model definition

In [None]:
# Checkpoint used to initialise the tokenizer as well as both the encoder and the decoder
model_checkpoint = "prajjwal1/bert-tiny"

# Length limits handed to the sequence-to-sequence feature preparation;
# decoder_max_length is also used as the model's generation max_length.
encoder_max_length = 512
decoder_max_length = 128

Obtain the tokenizer and prepare the training features


In [None]:
from utils.preprocessing import prepare_train_features_sequence_to_sequence
from transformers import BertTokenizerFast

# TODO: maybe it could be AutoTokenizer, to be tested
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

# Bind the tokenizer and both length limits once, then map over every split
seq2seq_feature_fn = partial(
    prepare_train_features_sequence_to_sequence,
    tokenizer=tokenizer,
    encoder_max_length=encoder_max_length,
    decoder_max_length=decoder_max_length,
)
original_columns = datasets["train"].column_names

tokenized_datasets = datasets.map(
    seq2seq_feature_fn,
    batched=True,
    remove_columns=original_columns,
    num_proc=3,
)

# Items are returned as torch tensors from here on
tokenized_datasets.set_format(type="torch")

## Training

Set the pre-trained weights and configure the model.

In [None]:
from transformers import EncoderDecoderModel

# The same checkpoint initialises the encoder and the decoder
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_checkpoint, model_checkpoint)

model.config.decoder.is_decoder = True
model.config.decoder.add_cross_attention = True

# Special tokens: decoding starts at [CLS] and stops at [SEP].
# Without an end-of-sequence id the generated answers cannot terminate early.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

# Generation defaults stored on the model configuration
model.config.max_length = decoder_max_length
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 1
model.config.length_penalty = 2.0
model.config.repetition_penalty = 3.0
model.config.num_beams = 10

Set the training arguments

In [None]:
# `os` is imported here as well so this section also runs when the
# span-detection cells were skipped.
import os

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from utils.eval import compute_metrics

batch_size = 8

checkpoint_folder = os.path.join("checkpoints", "seq2seq", model_checkpoint)
# save checkpoint in google drive folder if on Colab
try:
    checkpoint_folder = os.path.join(drive_folder, checkpoint_folder)
except NameError:
    # drive_folder only exists when the Colab setup cell was executed;
    # keeping the relative folder is the intended fallback.
    pass

training_args = Seq2SeqTrainingArguments(
    output_dir=checkpoint_folder,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    # Evaluation schedule; the deprecated `evaluate_during_training` flag is no
    # longer passed because it duplicates (and can conflict with) this setting.
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
    logging_steps=1024,
    save_steps=2048,
    warmup_steps=1024,
    #max_steps=1500, # delete for full training
    num_train_epochs = 3, #TRAIN_EPOCHS
    overwrite_output_dir=True,
    save_total_limit=1,
    fp16=True,
)

# instantiate trainer
# The trainer must receive the tokenized features, not the raw pandas frames
# `train_data` / `val_data`.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=partial(compute_metrics, tokenizer=tokenizer),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
)

trainer.train()

Test on some story and question.

In [None]:
context = """Keras is an API designed for human beings, not machines. Keras follows best
practices for reducing cognitive load: it offers consistent & simple APIs, it minimizes
the number of user actions required for common use cases, and it provides clear &
actionable error messages. It also has extensive documentation and developer guides. """
question = "What is Keras?"

# Question first, context second; only the context may be truncated
inputs = tokenizer(question, context, padding="max_length", truncation = "only_second", max_length=encoder_max_length, return_tensors="pt")
# Move the inputs to whichever device the model is on instead of assuming a GPU
input_ids = inputs.input_ids.to(model.device)
attention_mask = inputs.attention_mask.to(model.device)

outputs = model.generate(input_ids,
                         attention_mask=attention_mask,
                         num_beams=15,
                         repetition_penalty=3.0,
                         length_penalty=2.0,
                         num_return_sequences = 1
)

# special tokens are stripped from the decoded text
output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print('risposta: ', output_str)