# Imports

In [None]:
# Base output folder; the commented-out Colab/Kaggle cells below reassign it when enabled.
save_folder = './'

Uncomment and execute in Colab

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# save_folder = '/content/gdrive/MyDrive'

In [None]:
# !pip install allennlp_models
# !pip install transformers
# !pip install datasets

Uncomment and execute in Kaggle

In [None]:
# save_folder = './'

# !pip install allennlp_models

In [None]:
import pandas as pd
import numpy as np

import collections
import random
import torch
import json
import os

from functools import partial
from tqdm.auto import tqdm

from transformers import AutoModelForQuestionAnswering, EncoderDecoderModel
from transformers import TrainingArguments, Trainer
from transformers import default_data_collator
from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split
from allennlp_models.rc.tools import squad
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader

## Define Seed setting

In [None]:
def set_seed(seed: int = 42) -> None:
    """Seed NumPy, ``random`` and PyTorch and request deterministic cuDNN behaviour.

    Parameters
    ----------
    seed : int, optional
        value handed to every random number generator, by default 42.
    """
    # Export the hash seed through the environment.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every seeding function takes the same integer, so apply them in one pass.
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Two further switches are needed when running on the cuDNN backend.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Data preprocessing

A thorough explanation on how the CoQA dataset was created can be found in this [paper](https://arxiv.org/pdf/1808.07042.pdf).

## Dataset Download

In [None]:
import os
import urllib.request
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    """tqdm bar whose ``update_to`` method is passed as the ``reporthook`` of ``urlretrieve``."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to ``b * bsize`` and, when given, set its total to ``tsize``."""
        if tsize is not None:
            self.total = tsize
        # ``update`` expects an increment, so subtract what is already displayed.
        already_shown = self.n
        self.update(b * bsize - already_shown)

def download_url(url, output_path):
    """Fetch ``url`` into ``output_path`` while showing a progress bar.

    The bar is labelled with the last path component of ``url``.
    """
    file_label = url.split('/')[-1]
    progress_bar = DownloadProgressBar(unit='B', unit_scale=True,
                                       miniters=1, desc=file_label)
    with progress_bar as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def download_data(data_path, url_path, suffix):
    """Download one CoQA split into ``data_path`` unless it is already present.

    Parameters
    ----------
    data_path : str
        folder in which the file is stored; created when missing.
    url_path : str
        URL of the JSON file to download.
    suffix : str
        name of the split; the file is saved as ``<suffix>.json``.
    """
    os.makedirs(data_path, exist_ok=True)

    target_path = os.path.join(data_path, f'{suffix}.json')
    if os.path.exists(target_path):
        return

    print(f"Downloading CoQA {suffix} data split... (it may take a while)")
    # Download next to the target and rename only on success, so that an interrupted
    # transfer never leaves a truncated ``<suffix>.json`` that a later run would
    # mistake for a complete file.
    partial_path = f'{target_path}.part'
    try:
        download_url(url=url_path, output_path=partial_path)
        os.replace(partial_path, target_path)
    except BaseException:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print("Download completed!")

# The file at the "dev" URL is stored locally under the name "test".
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"

for split_name, split_url in (('train', train_url), ('test', test_url)):
    download_data(data_path='./coqa', url_path=split_url, suffix=split_name)


## Creating DataFrame

We create a DataFrame with the following attributes:

 - ``source``: the original source of the story.
 - ``story``: text onto which the model will perform question answering
 - ``question``
 - ``answer``: ground truth for the answer
 - ``span_text``: span of the story in which the answer is to be found
 - ``span_start``: index of the first character of ``span_text``
 - ``span_end``: index of the last character of ``span_text``

In [None]:
def _history_prefix(entry, turn_id):
    """Return the earlier turns of ``entry`` that precede ``turn_id`` as one string.

    Every kept turn contributes ``" <question> <answer>."``. Turns whose answer is the
    whole story are left out, mirroring the filter applied to the rows themselves.
    """
    story = entry['story']
    pieces = []
    for i in range(turn_id - 1):
        past_answer = entry['answers'][i]['input_text']
        if story == past_answer:
            continue
        pieces.append(f" {entry['questions'][i]['input_text']} {past_answer}.")
    return "".join(pieces)


def create_df(url, history=False):
    """Load a CoQA JSON file into a DataFrame with one row per question/answer pair.

    Parameters
    ----------
    url : str
        path of the local JSON file (despite the name, it is opened as a file).
    history : bool, optional
        when True, add a ``history`` column holding the previous questions and answers
        of the same story followed by the current question, by default False.

    Returns
    -------
    pandas.DataFrame
        columns ``source``, ``story``, ``question``, ``answer``, ``span_text``,
        ``span_start``, ``span_end`` and, if requested, ``history``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
    with open(url, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)['data']

    dataframe_rows = []

    for x in data:
        story = x['story']
        source = x['source']

        for q, a in zip(x['questions'], x['answers']):
            # Skip pairs whose answer is the entire story.
            if story == a['input_text']:
                continue

            question = q['input_text']
            dataframe_row = {
                "source": source,
                "story": story,
                "question": question,
                "answer": a['input_text'],
                "span_text": a['span_text'],
                "span_start": a['span_start'],
                "span_end": a['span_end'],
            }

            if history:
                # ``turn_id`` is used as a 1-based position into the question/answer lists.
                prefix = _history_prefix(x, q["turn_id"])
                dataframe_row["history"] = f"{prefix} {question}".lstrip()

            dataframe_rows.append(dataframe_row)

    return pd.DataFrame(dataframe_rows)


We create the train and test DataFrames and merge them into a single DataFrame to perform preprocessing, such as removing unanswerable questions. CoQA only provides train and validation sets, so we use the original validation set as our test set and later split the original train set to obtain our validation set.

In [None]:
# Tag every row with the file it came from before stacking the two frames.
df_train = create_df('./coqa/train.json', history=True).assign(split='train')
df_test = create_df('./coqa/test.json', history=True).assign(split='test')

df = pd.concat([df_train, df_test], ignore_index=True)

## Remove unanswerable questions

In [None]:
# Rows whose answer is literally the word 'unknown'.
df[df['answer'].eq('unknown')]

In some cases, the correct answer is the word ``'unknown'``.

In [None]:
# 'unknown' answers that nevertheless come with a real span.
df[df['answer'].eq('unknown') & df['span_text'].ne('unknown')]

Therefore, to really remove the unanswerable questions, we only drop the rows in which the ``span_text`` is also ``'unknown'``.

In [None]:
# A row is unanswerable only when both the answer and its span are 'unknown'.
unanswerable_mask = df['answer'].eq('unknown') & df['span_text'].eq('unknown')
index = df.index[unanswerable_mask]

df = df.drop(index).reset_index(drop=True)

Finally, we convert all text to lowercase.

In [None]:
# Lowercase every text column with the vectorised string accessor.
for key in ['story', 'question', 'answer', 'span_text', 'history']:
    df[key] = df[key].str.lower()
df

## Data Inspection

Let's see what our preprocessed data looks like.

In [None]:
print(
    f"Dataset size: {df.shape}",
    f"Dataset columns: {df.columns.values}",
    "Some examples:",
    sep="\n",
)
# ``.loc`` slices by label and includes the end label.
df.loc[:5]

We create a new DataFrame just for analysis purposes. We want to see what the most common words and bigrams for beginning questions are.

In [None]:
df_analysis = df.copy()
questions = df_analysis['question']
# First run of word characters, and the first two whitespace-separated tokens.
df_analysis['q_first_word'] = questions.str.extract(r'(\w+)')
df_analysis['q_first_two_words'] = questions.str.extract(r'^((?:\S+\s+){1}\S+).*')

Top ranking first word in question

In [None]:
first_word_counts = df_analysis.groupby('q_first_word').size()
first_word_counts.sort_values(ascending=False).head(15)

Top ranking first bigrams in question

In [None]:
first_bigram_counts = df_analysis.groupby('q_first_two_words').size()
first_bigram_counts.sort_values(ascending=False).head(15)

Percentage of reworded ("rielaborated") and non-reworded answers. An answer counts as non-reworded when it appears verbatim inside ``span_text``, the span of the story delimited by ``span_start`` and ``span_end``.

In [None]:
# True where the answer occurs verbatim inside its span.
sia = [answer in span for answer, span in zip(df["answer"], df["span_text"])]
n_pairs = len(sia)
n_verbatim = sia.count(True)
print(f'Percentage of rielaborated answers: {(n_pairs - n_verbatim)/n_pairs*100:.2f}%')
print(f'Percentage of not rielaborated answers: {n_verbatim/n_pairs*100:.2f}%')

## Train, Validation and Test splits

As mentioned before, since the provided dataset only has train and test splits, we need to take part of the original train split and make it the validation split.

The split is done by story.

We choose an 80/20 split for train/validation.

In [None]:
def split(df):
    """Relabel the non-test rows of ``df`` as ``'train'`` or ``'val'``, splitting by story.

    The unique stories of the non-test rows are divided 80/20 with a fixed random state,
    so every question of a story lands in the same split. Rows already labelled
    ``'test'`` are never touched, even if their story text also occurs elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        must contain the columns ``story`` and ``split``.

    Returns
    -------
    pandas.DataFrame
        the same object, with its ``split`` column updated in place.
    """
    not_test = df['split'] != 'test'
    stories = df.loc[not_test, 'story'].unique()

    story_train, story_val = train_test_split(stories, test_size=0.2, random_state=42)

    # Plain masked assignment: no fallback value is needed and the existing 'test'
    # labels take precedence over any story-based match.
    df.loc[not_test & df['story'].isin(story_train), 'split'] = 'train'
    df.loc[not_test & df['story'].isin(story_val), 'split'] = 'val'

    return df

# Relabel the rows; split() assigns the ``split`` column and returns the same frame.
df = split(df)

We verify that the split ratio applied to the stories matches the split ratio of the question-answer pairs.

In [None]:
item_counts = df["split"].value_counts()
print("Value counts")
print(item_counts, '\n')

# Ratios are computed over the train and validation rows only.
len_train_val = int(df['split'].isin(['train', 'val']).sum())
for label, key in (("Train", 'train'), ("Val", 'val')):
    print(f"{label} split {item_counts[key]/len_train_val:.2f}")

Now we split the DataFrame in 3 and put it in a ``DatasetDict`` object from the Huggingface library.

In [None]:
def to_DatasetDict(df):
    """Turn the labelled DataFrame into a ``DatasetDict`` with train/val/test entries.

    ``reset_index()`` is called without ``drop``, so each subset gets a fresh 0-based
    index while the previous index is kept as a regular ``index`` column.
    """
    per_split = {}
    for split_name in ('train', 'val', 'test'):
        subset = df.loc[df['split'] == split_name].reset_index()
        per_split[split_name] = Dataset.from_pandas(subset)

    return DatasetDict(per_split)

# Model Definition

For this assignment we train 2 models:
 - the first model is a span extractor. It finds the text span in which the answer is located
 - the second model performs a sequence-to-sequence transformation, taking as input the span extracted by the first model

We also need to tokenize the input text. Huggingface already provides a large set of tokenizers built from pretrained model vocabularies.

In [None]:
def get_model(model_name:str='roberta'):
    """Return the tokenizer, the span extractor model and the sequence-to-sequence model.

    Parameters
    ----------
    model_name : str, optional
        either ``'roberta'`` or ``'bert-tiny'``, by default ``'roberta'``.

    Returns
    -------
    tuple
        ``(tokenizer, span_model, s2s_model)``, all loaded from the same checkpoint.

    Raises
    ------
    ValueError
        if ``model_name`` is not one of the supported names. Raising (instead of
        printing and returning ``None``) keeps callers that unpack the result from
        failing later with an unrelated error.
    """
    checkpoints = {
        'roberta': "distilroberta-base",
        'bert-tiny': "prajjwal1/bert-tiny",
    }
    if model_name not in checkpoints:
        raise ValueError(
            f'{model_name} is not a valid model. Valid Model: "roberta", "bert-tiny"'
        )
    model_checkpoint = checkpoints[model_name]

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # Span model
    span_model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

    # Seq2Seq model: the same checkpoint initialises both the encoder and the decoder.
    s2s_model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_checkpoint, model_checkpoint)
    s2s_model.config.decoder.is_decoder = True
    s2s_model.config.decoder.add_cross_attention = True
    s2s_model.config.decoder_start_token_id = tokenizer.cls_token_id
    s2s_model.config.pad_token_id = tokenizer.pad_token_id
    s2s_model.config.vocab_size = s2s_model.config.encoder.vocab_size

    return tokenizer, span_model, s2s_model


# Question generation with text passage $P$ and question $Q$

## Span Detection

### Tokenize data

We need to prepare the training features of the span detector. At first we tokenize the input text.
For short stories, we add padding tokens to the right.
Since the model we will be using can only process a limited number of tokens at one time, we need to truncate the input text if it is too long.
In order not to lose any data, instead of simply truncating the stories after a given length, we use a sliding window to split them into multiple parts.

In [None]:
def prepare_train_features_span(examples, tokenizer, max_length: int = 380, doc_stride: int = 128, history: bool = False):
    """Tokenize our examples with truncation and padding, but keep the overflows using a
    stride, and label every resulting feature with the token span of the answer.

    This results in one example possibly resulting in several features when a story is
    longer than ``max_length``.
    Each of those new features has a story that overlaps with the story of the previous one.

    To be used in the map method of ``datasets.DatasetDict``. Before using it, fix all the
    parameters using ``functools.partial``.

    Adapted from https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb

    Parameters
    ----------
    examples : datasets.Dataset
        batch providing ``story``, ``span_start``, ``span_end`` and either ``question``
        or ``history``.
    tokenizer : transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerFast
        pretrained tokenizer for the model for which to use the features.
    max_length : int, optional
        maximum length of the tokenization, by default 380.
    doc_stride : int, optional
        passed to the tokenizer as ``stride``: the authorized overlap between two parts
        of the story when splitting, by default 128.
    history : bool, optional
        if True, ``examples["history"]`` is used as the first sequence instead of
        ``examples["question"]``, by default False.

    Returns
    -------
    datasets.Dataset | datasets.DatasetDict
        depending where map was called from tokenized examples.
        At the end, the additional features will be:
         - ``'input_ids'``
         - ``'attention_mask'``
         - ``'start_positions'``: index of the starting token of the answer span
         - ``'end_positions'``: index of the final token of the answer span

        Features that do not contain the whole answer span get the index of the CLS
        token for both positions.
    """
    question = examples["history"] if history else examples["question"]

    # Only the story (second sequence) may be truncated; the question is kept whole.
    tokenized_examples = tokenizer(
        question,
        examples["story"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a
    # map from a feature to its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original
    # context. This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what
        # is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this
        # span of text.
        sample_index = sample_mapping[i]

        # Start/end character index of the answer in the text.
        start_char = examples["span_start"][sample_index]
        end_char = examples["span_end"][sample_index]

        # Start token index of the current span in the text
        # (first token whose sequence id is 1, i.e. the story).
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        # End token index of the current span in the text
        # (last token whose sequence id is 1, skipping padding and special tokens).
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Detect if the answer is out of the span (in which case this feature is labeled with the
        # CLS index).
        if offsets[token_start_index][0] > start_char or offsets[token_end_index][1] < end_char:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Otherwise move the token_start_index and token_end_index to the two ends of the
            # answer.
            # Note: we could go after the last offset if the answer is the last word (edge
            # case).
            while (
                token_start_index < len(offsets)
                and offsets[token_start_index][0] <= start_char
            ):
                token_start_index += 1
            # The loop stops one token past the answer start, hence the ``- 1``.
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            # Symmetrically, the loop stops one token before the answer end.
            tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples


### Remove unanswerable questions

Questions whose answer span was truncated away from the story window have become unanswerable. Therefore, we need to remove them from the dataset.

In [None]:
def remove_unanswerable_question(dataset:Dataset) -> Dataset:
    """Keep only the features whose start and end positions are both non-zero.

    NOTE(review): position 0 is treated as "answer not in this feature"; this assumes
    the CLS token, used as the label for such features, sits at index 0 - confirm for
    the tokenizer in use.

    Parameters
    ----------
    dataset : datasets.Dataset
        tokenized features with ``start_positions`` and ``end_positions`` columns.

    Returns
    -------
    datasets.Dataset
        the selected subset of ``dataset``.
    """
    # Read the two label columns once instead of materialising every full row twice.
    start_positions = dataset['start_positions']
    end_positions = dataset['end_positions']
    label_pairs = tqdm(zip(start_positions, end_positions), total=len(dataset))

    answerable_question = [i for i, (start, end) in enumerate(label_pairs)
                           if start != 0 and end != 0]
    return dataset.select(answerable_question)

### Train Span

Here we define the training loop for the span extraction model.

In [None]:
def span_trainer(
    model,
    train_dataset,
    val_dataset,
    tokenizer,
    folder,
    learning_rate=5e-5,
    weight_decay=0.01,
    epochs=3,
    max_length=380,
    doc_stride=128,
    batch_size=8,
    history=False
):
    """Tokenize the data, drop unanswerable features and train the span extractor.

    Parameters
    ----------
    model
        question-answering model to fine-tune.
    train_dataset, val_dataset : datasets.Dataset
        raw (untokenized) training and validation splits.
    tokenizer
        tokenizer matching ``model``.
    folder : str
        output directory handed to ``TrainingArguments``.
    learning_rate, weight_decay, epochs, batch_size
        optimisation settings forwarded to ``TrainingArguments``.
    max_length, doc_stride, history
        forwarded to ``prepare_train_features_span``.

    Returns
    -------
    transformers.Trainer
        the trainer after ``train()`` has completed.
    """
    # The same tokenization settings are applied to both splits.
    tokenize = partial(prepare_train_features_span, tokenizer=tokenizer,
                       max_length=max_length, doc_stride=doc_stride, history=history)

    print('--------------START TOKENIZETION--------------')
    train_dataset = train_dataset.map(
        tokenize,
        batched=True,
        num_proc=3,
        remove_columns=train_dataset.column_names,
    )
    print('--------------TRAIN DATA TOKENIZED--------------')

    # Remove the validation split's own columns rather than assuming they match train.
    val_dataset = val_dataset.map(
        tokenize,
        batched=True,
        num_proc=3,
        remove_columns=val_dataset.column_names,
    )
    print('--------------VAL DATA TOKENIZED--------------')

    print('--------------START CLEANING DATA--------------')
    train_dataset = remove_unanswerable_question(train_dataset)
    print('--------------TRAIN DATA CLEANED--------------')
    val_dataset = remove_unanswerable_question(val_dataset)
    print('--------------VAL DATA CLEANED--------------')

    args = TrainingArguments(
        folder,
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        save_total_limit = 2,
        save_strategy = 'no',
        load_best_model_at_end=False,
        overwrite_output_dir=True,
        )

    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=default_data_collator,
        tokenizer=tokenizer,
    )

    print('--------------START TRAINING SPAN--------------')
    trainer.train()
    print('--------------END TRAINING SPAN--------------')

    return trainer


### Tokenize test features

For the test set the model cannot access the ground truth, so it is not possible to know a priori whether the true span is in a given truncated story or not. Therefore, we keep all the truncated stories. For this reason, the preprocessing must be carried out differently.

For each pair of story and question $(P,Q)$, we can potentially have multiple answers. In the post-processing phase, the answer with the highest likelihood is chosen.

In [None]:
def prepare_test_features(examples, tokenizer, max_length=380, doc_stride=128, history=False):
    """Tokenize examples for prediction, keeping every overflowing window of the story.

    Each feature gets an ``example_id`` (the ``index`` value of the example it was cut
    from) and keeps its ``offset_mapping``, in which every token that is not part of
    the story is replaced by ``None``.

    Parameters
    ----------
    examples : datasets.Dataset
        batch providing ``story``, ``index`` and either ``question`` or ``history``.
    tokenizer
        pretrained fast tokenizer of the span model.
    max_length : int, optional
        maximum length of the tokenization, by default 380.
    doc_stride : int, optional
        overlap between consecutive windows of the story, by default 128.
    history : bool, optional
        use ``examples["history"]`` instead of ``examples["question"]``, by default False.
    """
    first_sequence = examples["history"] if history else examples["question"]

    tokenized_examples = tokenizer(
        first_sequence,
        examples["story"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # Maps every feature back to the example it originates from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # The story is the second sequence handed to the tokenizer.
    context_index = 1
    example_ids = []

    for i, sample_index in enumerate(sample_mapping):
        sequence_ids = tokenized_examples.sequence_ids(i)
        example_ids.append(examples["index"][sample_index])

        # Blank out the offsets of non-story tokens so that it is easy to tell whether
        # a token position belongs to the story.
        feature_offsets = tokenized_examples["offset_mapping"][i]
        tokenized_examples["offset_mapping"][i] = [
            pair if seq_id == context_index else None
            for seq_id, pair in zip(sequence_ids, feature_offsets)
        ]

    tokenized_examples["example_id"] = example_ids

    return tokenized_examples

### Post process Data

This is the part where the answer with the highest likelihood is chosen.

In [None]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=50):
    """Pick, for every example, the best-scoring text span over all of its features.

    Parameters
    ----------
    examples
        original examples, providing ``index`` and ``story``.
    features
        tokenized features, providing ``example_id`` and ``offset_mapping``.
    raw_predictions
        pair ``(all_start_logits, all_end_logits)``, indexed like ``features``.
    n_best_size : int, optional
        number of top start and top end logits combined per feature, by default 20.
    max_answer_length : int, optional
        maximum answer length in tokens, by default 50.

    Returns
    -------
    collections.OrderedDict
        maps the ``index`` of each example to its predicted span text (``""`` when no
        candidate is valid).
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group the feature rows by the position of the example they were cut from.
    position_of_example = {example_id: pos for pos, example_id in enumerate(examples["index"])}
    features_per_example = collections.defaultdict(list)
    for feature_row, feature in enumerate(features):
        features_per_example[position_of_example[feature["example_id"]]].append(feature_row)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        context = example["story"]
        candidates = []

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Token -> character offsets in the story; ``None`` marks non-story tokens.
            offset_mapping = features[feature_index]["offset_mapping"]
            n_tokens = len(offset_mapping)

            # Indices of the ``n_best_size`` largest logits, best first.
            top_starts = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            top_ends = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in top_starts:
                # A start outside the story invalidates every pair built on it.
                if (
                    start_index >= n_tokens
                    or offset_mapping[start_index] is None
                    or len(offset_mapping[start_index]) == 0
                ):
                    continue

                for end_index in top_ends:
                    if (
                        end_index >= n_tokens
                        or offset_mapping[end_index] is None
                        or len(offset_mapping[end_index]) == 0
                    ):
                        continue
                    # Reject reversed spans and spans longer than ``max_answer_length``.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    candidates.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if candidates:
            best_answer = max(candidates, key=lambda candidate: candidate["score"])
        else:
            # No valid candidate at all: fall back to an empty prediction.
            best_answer = {"text": "", "score": 0.0}

        predictions[example["index"]] = best_answer["text"]

    return predictions

### Span generation

Here we put everything together.

In [None]:
def span_generator(dataset, test_dataset, tokenizer, trainer, max_length=380, doc_stride=128, history=False):
    """Tokenize ``test_dataset``, run the trained span model and return the predicted spans.

    ``dataset`` is the untokenized data handed to the post-processing step.
    Returns a list with the values of the post-processed predictions.
    """
    featurize = partial(prepare_test_features, tokenizer=tokenizer,
                        max_length=max_length, doc_stride=doc_stride, history=history)
    raw_columns = test_dataset.column_names

    print('--------------START TOKENIZETION--------------')
    test_dataset = test_dataset.map(
        featurize,
        batched=True,
        num_proc=2,
        remove_columns=raw_columns)
    print('--------------TEST DATA TOKENIZETION-------------')

    print('--------------START GENERATION SPAN--------------')
    logits = trainer.predict(test_dataset).predictions

    print('--------------START POST-PROCESSING--------------')
    final_predictions = postprocess_qa_predictions(dataset, test_dataset, logits)

    return list(final_predictions.values())


## Seq2Seq

### Tokenize Data

In [None]:
def prepare_train_features_s2s(examples, tokenizer, encoder_max_length=512, decoder_max_length=128, history=False):
    """Tokenize question/span pairs and their answers for the encoder-decoder model.

    The span is simply truncated when the pair exceeds ``encoder_max_length``.

    To be used in the map method of ``datasets.DatasetDict``. Before using it, fix all the
    parameters using ``functools.partial``.

    Parameters
    ----------
    examples : datasets.Dataset
        batch providing ``span_text``, ``answer`` and either ``question`` or ``history``.
    tokenizer : transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerFast
        pretrained tokenizer for the model for which to use the features.
    encoder_max_length : int, optional
        maximum length of the tokenization of the encoder input, by default 512.
    decoder_max_length : int, optional
        maximum length of the tokenization of the decoder input, by default 128.
    history : bool, optional
        use ``examples["history"]`` instead of ``examples["question"]``, by default False.

    Returns
    -------
    datasets.Dataset | datasets.DatasetDict
        the examples with these additional features:
         - ``'input_ids'``
         - ``'attention_mask'``
         - ``'decoder_input_ids'``
         - ``'decoder_attention_mask'``
         - ``'labels'``
    """
    first_sequence = examples["history"] if history else examples["question"]

    # Encoder side: question (or history) followed by the span.
    encoder_inputs = tokenizer(first_sequence, examples["span_text"], truncation="only_second",
                               max_length=encoder_max_length, padding="max_length")

    # Decoder side: the answer.
    decoder_targets = tokenizer(examples["answer"], truncation=True,
                                max_length=decoder_max_length, padding="max_length")

    examples["input_ids"] = encoder_inputs.input_ids
    examples["attention_mask"] = encoder_inputs.attention_mask
    examples["decoder_input_ids"] = decoder_targets.input_ids
    examples["decoder_attention_mask"] = decoder_targets.attention_mask

    # The labels are the decoder ids with every PAD token replaced by -100.
    pad_id = tokenizer.pad_token_id
    examples["labels"] = [
        [-100 if token == pad_id else token for token in target_ids]
        for target_ids in decoder_targets.input_ids
    ]

    return examples


### Metrics

The metric we want to compute is the average f1 score. It is used during validation.

In [None]:
def compute_metrics(tokenizer, labels_ids, pred_ids):
    """Training metrics for sequence-to-sequence encoder-decoder.
    The tokenizer must be fixed, e.g. using ``functools.partial``.

    Parameters
    ----------
    tokenizer
        tokenizer used to decode both id sequences.
    labels_ids
        label ids in which ignored positions are marked with -100.
    pred_ids
        generated token ids.

    Returns
    -------
    list
        one F1 score per (prediction, label) pair.
    """
    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # Work on a copy so the caller's labels keep their -100 markers.
    if torch.is_tensor(labels_ids):
        labels_ids = labels_ids.clone()
    else:
        labels_ids = np.array(labels_ids)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    return [squad.compute_f1(pred, lbl) for pred, lbl in zip(pred_str, label_str)]


### Train Seq2Seq

Training loop for sequence to sequence

In [None]:
def s2s_trainer(
    model,
    train_dataset,
    val_dataset,
    tokenizer,
    folder,
    encoder_max_length=512,#128
    decoder_max_length=64,
    epochs=3,
    batch_size=8,
    learning_rate=5e-5,
    history=False
):
    """Fine-tune the encoder-decoder model and save its weights.

    Parameters
    ----------
    model
        encoder-decoder model to train.
    train_dataset, val_dataset : datasets.Dataset
        raw (untokenized) training and validation splits.
    tokenizer
        tokenizer matching ``model``.
    folder : str
        path of the file the final ``state_dict`` is written to with ``torch.save``.
    encoder_max_length, decoder_max_length, history
        forwarded to ``prepare_train_features_s2s``.
    epochs, batch_size, learning_rate
        optimisation settings (Adam optimizer).

    Notes
    -----
    The model runs on CUDA when available and on the CPU otherwise. The validation F1
    is averaged over examples, so a smaller final batch does not skew it.
    """
    tokenize = partial(prepare_train_features_s2s,
                       tokenizer=tokenizer,
                       encoder_max_length=encoder_max_length,
                       decoder_max_length=decoder_max_length,
                       history=history)

    print('--------------START TOKENIZETION--------------')
    train_dataset = train_dataset.map(
        tokenize,
        batched=True,
        remove_columns=train_dataset.column_names,
        num_proc=3,
    )
    print('--------------TRAIN DATA TOKENIZED--------------')

    train_dataset.set_format(type="torch")

    # Remove the validation split's own columns rather than assuming they match train.
    val_dataset = val_dataset.map(
        tokenize,
        batched=True,
        remove_columns=val_dataset.column_names,
        num_proc=3,
    )
    print('--------------VAL DATA TOKENIZED--------------')

    val_dataset.set_format(type="torch")

    ## Load with a DataLoader...
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Fall back to the CPU instead of crashing when no GPU is present.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print('--------------START TRAINING ENCODER DECODER--------------')
    for epoch in range(1, epochs+1):

        # Train
        model.train()

        train_loss = 0 # cumulative loss

        loop = tqdm(train_loader)
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            # Forward Pass
            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           labels=labels)
            # Find the Loss
            loss = output.loss
            # Clear the gradients
            optimizer.zero_grad()
            # Calculate gradients
            loss.backward()
            # Update Weights
            optimizer.step()
            # Calculate Loss
            train_loss += loss.item()

            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())

        # Compute average loss per epoch (guarding against an empty loader)
        avg_train_loss = train_loss / max(len(train_loader), 1)

        # Validate
        model.eval()
        valid_loss = 0
        f1_scores = []  # one entry per validation example
        loop = tqdm(val_loader)
        with torch.no_grad():
            for batch in loop:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                # Forward Pass
                output = model(input_ids=input_ids,
                               attention_mask=attention_mask,
                               labels=labels)
                # Make predictions
                pred_ids = model.generate(input_ids=input_ids,
                                          attention_mask=attention_mask)
                # Find the Loss
                loss = output.loss
                # Calculate Loss
                valid_loss += loss.item()
                # Metric: compute f1-score
                batch_scores = compute_metrics(tokenizer, labels, pred_ids)
                f1_scores.extend(batch_scores)
                batch_f1 = np.mean(batch_scores)

                loop.set_description(f'Valid {epoch}')
                loop.set_postfix(loss=loss.item(), f1=batch_f1.item())

        avg_val_loss = valid_loss / max(len(val_loader), 1)
        avg_f1_score = float(np.mean(f1_scores)) if f1_scores else 0.0

        print(f'Epoch {epoch}:\t train-loss = {avg_train_loss:.2f}\t val-loss = {avg_val_loss:.2f}\t val-f1 = {avg_f1_score:.3f} ({avg_f1_score*100:.1f}%)')

    print('--------------END TRAINING ENCODER DECODER--------------')
    # ``torch.save`` does not create missing parent directories.
    parent_dir = os.path.dirname(folder)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save(model.state_dict(), folder)


### Generate answer

In [None]:
def generate(batch, tokenizer, model, max_length=512, history=False):
    """Generate an answer for every example of ``batch`` and store it under ``"pred"``.

    Parameters
    ----------
    batch : mapping
        batch of examples; ``"span_text"`` and either ``"history"`` or ``"question"`` are read.
    tokenizer : callable
        tokenizer used both to encode the inputs and to decode the generated ids.
    model : object
        model exposing ``generate(input_ids, attention_mask=...)``.
    max_length : int
        length passed to the tokenizer (``padding="max_length"``). Default is 512.
    history : bool
        if True the ``"history"`` column is used in place of ``"question"``. Default is False.

    Returns
    -------
    The same ``batch`` with the decoded generations added as ``batch["pred"]``.
    """
    question_key = "history" if history else "question"

    encoded = tokenizer(batch[question_key], batch['span_text'],
                        padding="max_length", truncation='only_second',
                        max_length=max_length, return_tensors="pt")

    # Both tensors are moved to the GPU before being handed to the model
    generated_ids = model.generate(encoded.input_ids.to("cuda"),
                                   attention_mask=encoded.attention_mask.to("cuda"))

    # skip_special_tokens=True removes all the special tokens from the decoded text
    batch["pred"] = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    return batch

# Train and Evaluation

### Define training loop

In [None]:
def train_generate(model_name,
                   dataset_test,
                   train_dataset,
                   val_dataset,
                   test_dataset,
                   max_length,
                   doc_stride,
                   encoder_max_length,
                   decoder_max_length,
                   batch_size,
                   map_batch_size,
                   history,
                   path_name):
    """Train the span model and the seq2seq model, then generate the test answers.

    The steps run in this order: load tokenizer and models with ``get_model``,
    train the span model (``span_trainer``), train the seq2seq model
    (``s2s_trainer``), extract the test spans (``span_generator``) and finally
    map ``generate`` over the test set.

    Parameters
    ----------
    model_name : str
        name forwarded to ``get_model``.
    dataset_test, test_dataset
        both are forwarded to ``span_generator``; ``test_dataset`` is also the
        dataset the answers are generated for.
    train_dataset, val_dataset
        datasets forwarded to both trainers.
    max_length, doc_stride : int
        forwarded to ``span_trainer`` and ``span_generator``.
    encoder_max_length, decoder_max_length : int
        forwarded to ``s2s_trainer``.
    batch_size : int
        batch size forwarded to both trainers.
    map_batch_size : int
        batch size of the final ``map`` call.
    history : bool
        forwarded to every step.
    path_name : str
        base path; the trainers receive ``<path_name>/span`` and ``<path_name>/seq2seq``.

    Returns
    -------
    The test dataset with the ``span_generated`` and ``pred`` columns added and the
    ``index``, ``span_start``, ``span_end``, ``history`` and ``split`` columns removed.
    """
    tokenizer, span_model, s2s_model = get_model(model_name)

    # Step 1: span extraction model
    trainer = span_trainer(
        model=span_model,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        tokenizer=tokenizer,
        max_length=max_length,
        doc_stride=doc_stride,
        folder=f'{path_name}/span',
        batch_size=batch_size,
        history=history,
    )

    # Step 2: sequence-to-sequence model
    s2s_trainer(
        model=s2s_model,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        tokenizer=tokenizer,
        encoder_max_length=encoder_max_length,
        decoder_max_length=decoder_max_length,
        folder=f'{path_name}/seq2seq',
        batch_size=batch_size,
        history=history,
    )

    # Step 3: spans predicted on the test set by the trained span model
    span = span_generator(
        dataset=dataset_test,
        test_dataset=test_dataset,
        tokenizer=tokenizer,
        max_length=max_length,
        doc_stride=doc_stride,
        trainer=trainer,
        history=history,
    )

    print('--------------START GENERATION ANSWERS--------------')

    # Step 4: answer generation.
    # NOTE(review): `generate` tokenizes batch['span_text'], not the
    # 'span_generated' column added here, and it is called with its default
    # max_length rather than encoder_max_length -- confirm this is intended.
    with_spans = test_dataset.add_column("span_generated", span)
    answer_fn = partial(generate,
                        tokenizer=tokenizer,
                        model=s2s_model,
                        history=history)
    results = with_spans.map(
        answer_fn,
        batched=True,
        batch_size=map_batch_size,
        remove_columns=['index', 'span_start', 'span_end', 'history', 'split'],
    )
    print('--------------ANSWERS GENERATED --------------')

    return results

## General setup

Here we set the random seed and the model to use.
As seeds we use:
 - 42
 - 2022
 - 1337

As models we use:
 - roberta
 - bert-tiny

We run each combination of every seed and model, for a total of 6 different runs.

The model is trained both with and without history.

In [None]:
# Seed of the current run (the experiments use 42, 2022 and 1337)
seed = 42
set_seed(seed)

# Model of the current run: swap the commented line to train the other model
#model_name = 'roberta'
model_name = 'bert-tiny'

In [None]:
# `split` and `to_DatasetDict` are defined earlier in the notebook
df = split(df)
datasets = to_DatasetDict(df)

# Handles on the three splits used by the training and generation cells
train_dataset = datasets['train']
val_dataset = datasets['val']
test_dataset = datasets['test']

### Train and generate without history

In [None]:
# Everything produced by this run goes under <save_folder>/<model_name>_<seed>
results_path = os.path.join(save_folder, f'{model_name}_{seed}')

# Train both models and generate the test answers WITHOUT the conversation history
results = train_generate(
    model_name=model_name,
    dataset_test=datasets['test'],
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    max_length=380,
    doc_stride=128,
    encoder_max_length=128,
    decoder_max_length=64,
    batch_size=16,
    map_batch_size=128,
    history=False,
    path_name=results_path,
)

# Mean F1 (as a percentage) between the predictions and the reference answers
f1 = np.mean(list(map(squad.compute_f1, results['pred'], results['answer'])))*100
results.to_json(os.path.join(results_path, 'output.json'))
f1

### Train and generate with history

In [None]:
# Everything produced by this run goes under <save_folder>/<model_name>_history_<seed>
results_path = os.path.join(save_folder, f'{model_name}_history_{seed}')

# Train both models and generate the test answers WITH the conversation history
results = train_generate(
    model_name=model_name,
    dataset_test=datasets['test'],
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    max_length=512,
    doc_stride=80,
    encoder_max_length=512,
    decoder_max_length=64,
    batch_size=16,
    map_batch_size=128,
    history=True,
    path_name=results_path,
)

# Mean F1 (as a percentage) between the predictions and the reference answers
f1 = np.mean(list(map(squad.compute_f1, results['pred'], results['answer'])))*100
results.to_json(os.path.join(results_path, 'output.json'))
f1

# Download output

In [None]:
# Rewrite the generated file as a single JSON array: the to_json method of the Huggingface library writes JSON Lines (one JSON object per line), which is not one valid JSON document

def read_output_json(filepath: str):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of the file is parsed as an independent JSON
    document. Blank lines are skipped instead of raising a
    ``json.JSONDecodeError``.

    Parameters
    ----------
    filepath : str
        path of the file to read.

    Returns
    -------
    list
        one parsed object per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        if a non-blank line is not valid JSON.
    """
    # Explicit encoding so the result does not depend on the platform default
    with open(filepath, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# Parse the line-by-line records of the last generated output file...
output = read_output_json(os.path.join(results_path, 'output.json'))

# ...and overwrite that file with one indented JSON array
with open(os.path.join(results_path, 'output.json'), 'w') as f:
    json.dump(output, f, indent=2)

Download the generated ``output.json``:

 - <a href="./roberta_42/output.json"> Roberta 42 </a>
 - <a href="./roberta_2022/output.json"> Roberta 2022 </a>
 - <a href="./roberta_1337/output.json"> Roberta 1337 </a>
 - <a href="./roberta_history_42/output.json"> Roberta history 42 </a>
 - <a href="./roberta_history_2022/output.json"> Roberta history 2022 </a>
 - <a href="./roberta_history_1337/output.json"> Roberta history 1337 </a>

 - <a href="./bert-tiny_42/output.json"> bert-tiny 42 </a>
 - <a href="./bert-tiny_2022/output.json"> bert-tiny 2022 </a>
 - <a href="./bert-tiny_1337/output.json"> bert-tiny 1337 </a>
 - <a href="./bert-tiny_history_42/output.json"> bert-tiny history 42 </a>
 - <a href="./bert-tiny_history_2022/output.json"> bert-tiny history 2022 </a>
 - <a href="./bert-tiny_history_1337/output.json"> bert-tiny history 1337 </a>

You can also download the already generated outputs <a href="https://github.com/SamTheMar/Question_Answering_CoQA/tree/dev/outputs/"> here </a>.



# Performance Analysis

In [None]:
import pandas as pd
import numpy as np

In [None]:
folder = 'outputs/'
dfs = {}
# os.listdir returns the entries in arbitrary order: sorting makes the order of
# `dfs` (and of every table built from it) reproducible across machines.
for filename in sorted(os.listdir(folder)):
    # Ignore anything that is not an output file (checkpoint folders, hidden files, ...)
    if not filename.endswith('.json'):
        continue

    # transform the filename from e.g. "BT_output_42.json" to the key "bert_42"
    # transform the filename from e.g. "BT_output_history_42.json" to the key "bert_history_42"
    key = filename.split('.')[0]
    key = key.split('_')
    name = 'bert' if key[0] == 'BT' else 'roberta'
    key = '_'.join([name] + key[2:])

    dfs[key] = pd.read_json(os.path.join(folder, filename))


Here we add a column in which we assign a unique index to each story. This is useful for printing out the answers.

In [None]:
# The reference list of stories does not depend on the dataframe being
# processed, so it is computed once, together with a story -> position lookup.
stories = dfs['bert_42']['story'].unique()
story_position = {story: position for position, story in enumerate(stories)}

for df in dfs.values():
    # A story missing from the reference run raises a KeyError
    df['story_index'] = [story_position[story] for story in df['story']]

And now we compute the F1 score for each predicted answer.

In [None]:
# Execute this if you downloaded from the GitHub repo
from utils.evaluate_v2 import compute_f1

# One F1 score per (prediction, reference answer) pair, for every loaded run
for df in dfs.values():
    df['f1'] = list(map(compute_f1, df['pred'], df['answer']))

In [None]:
# Execute this if you have allennlp_models installed

# One F1 score per (prediction, reference answer) pair, for every loaded run
for df in dfs.values():
    df['f1'] = list(map(squad.compute_f1, df['pred'], df['answer']))

This function pretty-prints the answers. You can specify either the stories or the questions you want to print. Multiple dataframes for different models can be passed, in which case the answers of each model will be printed.

In [None]:
def print_answers(*df, story_index=None, question_index=None):
    """Print the answers of the model, grouped by story.

    The first dataframe provides the stories, the questions and the true
    answers; the predicted answer and the F1 score are printed for every
    dataframe that is passed.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe with the output results. Pass multiple dataframes to have more predicted answers.
        The columns ``story``, ``story_index``, ``source``, ``question``, ``answer``,
        ``pred`` and ``f1`` are read.
    story_index : int, array or array-like
        indices of the stories to print. Pass None to print all the stories. Default is None.
    question_index : int, array or array-like
        positions of the questions to print. Overrides ``story_index``. Default is None.
    """
    reference = df[0]
    stories = reference['story'].unique()

    if question_index is not None:
        # With a bare integer ``iloc`` would return a scalar, which has no ``unique``
        if np.isscalar(question_index):
            question_index = [question_index]
        story_index = reference['story_index'].iloc[question_index].unique()

    if story_index is not None:
        # Wrap any scalar (Python or NumPy integer): indexing with it would
        # return a single string, and the loop below would iterate its characters
        if np.isscalar(story_index):
            story_index = [story_index]
        stories = stories[story_index]

    for story in stories:
        # Row selection already returns new objects, so the inputs are never modified
        frames = df
        if question_index is not None:
            frames = [d.iloc[question_index].reset_index() for d in frames]
        frames = [d.loc[d['story'] == story].reset_index() for d in frames]
        first = frames[0]

        print(55*'=', 'Story', 60*'=')
        print(story)
        print()
        print(f"Story index: {first['story_index'].values[0]}")
        print(f"Source: {first['source'][0]}")

        print(50*'-', 'Question answering', 50*'-')
        for i in first.index:
            print(f"\nQuestion {first['index'][i]}:".ljust(18), first['question'][i])
            print("True answer:", ' '*4, first['answer'][i])
            for frame in frames:
                print("\nPred answer:", ' '*4, frame['pred'][i])
                print("F1 score:", ' '*7, f"{frame['f1'][i]:.2g}")
            print()

## Group by story

Here we print the 5 worst answers for each source.

In [None]:
def get_worst_answers_by_source_index(df, worst=5):
    """Collect the index labels of the ``worst`` lowest-F1 rows of every source.

    Sources are visited in order of first appearance in ``df['source']``; within
    a source the labels are ordered by increasing ``f1``.
    """
    picked = []
    for source in df['source'].unique().tolist():
        same_source = df[df['source'] == source]
        ranked = same_source.sort_values(by=['f1'], ascending=True)
        picked.extend(ranked[:worst].index)
    return picked

def get_worst_answers_by_source(df, worst=5):
    """Return ``df`` reindexed on the labels of its ``worst`` lowest-F1 rows per source."""
    worst_labels = get_worst_answers_by_source_index(df, worst=worst)
    return df.reindex(worst_labels)

In [None]:
# The 5 lowest-F1 answers of every source, according to the roberta run with seed 42
question_index = get_worst_answers_by_source_index(dfs['roberta_42'], worst=5)

# Print those questions with the predictions of the three roberta runs
print_answers(
    *(dfs[run] for run in ('roberta_42', 'roberta_1337', 'roberta_2022')),
    question_index=question_index,
)

## Mean F1 score

We print the mean F1 score for all the models and all the seeds. In addition, we also evaluate the mean F1 score for the questions with yes/no answers.

In [None]:
# Positions of the questions whose true answer is exactly 'yes' or 'no'.
# A single mask is used because pandas' Index.append returns a new Index
# instead of extending in place: appending the 'no' rows without storing the
# result would silently leave them out of the yes/no score.
yn_index = np.flatnonzero(dfs['roberta_42']['answer'].isin(['yes', 'no']))

results = []
for key, df in dfs.items():
    # Mean F1 over all the questions and over the yes/no questions only
    f1 = df['f1'].mean()
    f1_yn = df.iloc[yn_index]['f1'].mean()
    results.append({'model': key, 'f1': f1, 'f1_yn': f1_yn})

df_f1 = pd.DataFrame(results)
df_f1

Here we show the mean F1 score across different training runs for the same model.

In [None]:
model_names = ['bert', 'bert_history', 'roberta', 'roberta_history']

# Keys look like "bert_42" or "bert_history_42": dropping the trailing
# "_<seed>" gives the model each run belongs to. Grouping on it does not
# depend on the order of the rows of df_f1 nor on the number of seeds.
run_model = df_f1['model'].str.rsplit('_', n=1).str[0]
scores_by_model = df_f1.drop('model', axis=1).groupby(run_model)

# Mean and (sample) standard deviation across the runs of the same model
f1_mean = scores_by_model.mean().reindex(model_names)
f1_std = scores_by_model.std().reindex(model_names).rename(lambda x: x + '_std', axis='columns')

df_f1_agg = f1_mean.join(f1_std).rename_axis('model').reset_index()

df_f1_agg

In [None]:
import matplotlib.pyplot as plt
plt.style.use('default')

# Error bars (f1_std around the mean f1), drawn without a visible marker
plt.errorbar(
    x=model_names,
    y=df_f1_agg['f1'],
    yerr=df_f1_agg['f1_std'],
    fmt='k,',
    capsize=2,
    markersize=0,
)
# One bar per model with the mean f1
plt.bar(x=model_names, height=df_f1_agg['f1'])
plt.show()

In [None]:
# Print the aggregated table as LaTeX, floats formatted with '%.3f' and without the index column
print(df_f1_agg.to_latex(float_format='%.3f', index=False))