# Text Mining Project
### Question Answering on SQUAD

- autor:
Samuele Marino

install library for colab

In [5]:
!pip install transformers datasets accelerate --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.5/191.5 KB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h

## Import

In [27]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, default_data_collator, get_scheduler
from datasets import load_dataset, load_metric, ClassLabel, Sequence, DatasetDict
from IPython.display import display, HTML
from torch.utils.data import DataLoader
from accelerate import Accelerator
from torch.optim import AdamW
from functools import partial
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import collections
import random
import torch
import os

## Datasets

Stanford Question Answering Dataset (__SQuAD__) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.
- __SQuAD1.1__, the previous version of the SQuAD dataset, contains 100,000+ question-answer pairs on 500+ articles.
- __SQuAD2.0__ combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering.

Download dataset

In [7]:
datasets = load_dataset("squad")
datasets_v2 = load_dataset("squad_v2")

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Have a quick look on the data

In [8]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [9]:
datasets_v2

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

Only the train and validation sets are provided. I will use the original validation set as the test set, and then I will split the test set to obtain our validation set.

In [10]:
train_valid = datasets['train'].train_test_split(test_size=0.2)
squad = DatasetDict({
    'train': train_valid['train'],
    'validation': train_valid['test'],
    'test': datasets['validation']})
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 70079
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 17520
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [11]:
train_valid_v2 = datasets_v2['train'].train_test_split(test_size=0.2)
squad_v2 = DatasetDict({
    'train': train_valid_v2['train'],
    'validation': train_valid_v2['test'],
    'test': datasets_v2['validation']})
squad_v2

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 104255
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 26064
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

Show some elements of the datasets

In [12]:
def show_random_elements(dataset, num_examples=5):
    # Can't pick more elements than there are in the dataset
    assert num_examples <= len(dataset)
    
    picks =  random.sample(range(1, len(dataset)-1), num_examples)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [13]:
show_random_elements(squad["train"])

Unnamed: 0,id,title,context,question,answers
0,572800512ca10214002d9b0b,Strasbourg,"Between the German invasion of Poland on 1 September 1939 and the Anglo-French declaration of War against the German Reich on 3 September 1939, the entire city (a total of 120,000 people) was evacuated, like other border towns as well. Until the arrival of the Wehrmacht troops mid-June 1940, the city was, for ten months, completely empty, with the exception of the garrisoned soldiers. The Jews of Strasbourg had been evacuated to Périgueux and Limoges, the University had been evacuated to Clermont-Ferrand.",How many people were evacuated during the invasion?,"{'text': ['120,000'], 'answer_start': [172]}"
1,5730b5b4069b531400832296,Sumer,"Sumerian religion seems to have been founded upon two separate cosmogenic myths. The first saw creation as the result of a series of hieros gami or sacred marriages, involving the reconciliation of opposites, postulated as a coming together of male and female divine beings; the gods. This continued to influence the whole Mesopotamian mythos. Thus in the Enuma Elish the creation was seen as the union of fresh and salt water; as male Abzu, and female Tiamat. The product of that union, Lahm and Lahmu, ""the muddy ones"", were titles given to the gate keepers of the E-Abzu temple of Enki, in Eridu, the first Sumerian city. Describing the way that muddy islands emerge from the confluence of fresh and salty water at the mouth of the Euphrates, where the river deposited its load of silt, a second hieros gamos supposedly created Anshar and Kishar, the ""sky-pivot"" or axle, and the ""earth pivot"", parents in turn of Anu (the sky) and Ki (the earth). Another important Sumerian hieros gamos was that between Ki, here known as Ninhursag or ""Lady Sacred Mountain"", and Enki of Eridu, the god of fresh water which brought forth greenery and pasture.",What does one myth see creation as being the result of?,"{'text': ['a series of hieros gami'], 'answer_start': [121]}"
2,5727eae03acd2414000defdd,Northwestern_University,"Northwestern requires that all new buildings be LEED-certified. Silverman Hall on the Evanston campus was awarded Gold LEED Certification in 2010; Wieboldt Hall on the Chicago campus was awarded Gold LEED Certification in 2007, and the Ford Motor Company Engineering Design Center on the Evanston campus was awarded Silver LEED Certification in 2006. New construction and renovation projects will be designed to provide at least a 20% improvement over energy code requirements where technically feasible. The university also released at the beginning of the 2008–09 academic year the Evanston Campus Framework Plan, which outlines plans for future development of the Evanston Campus. The plan not only emphasizes the sustainable construction of buildings, but also discusses improving transportation by optimizing pedestrian and bicycle access. Northwestern has had a comprehensive recycling program in place since 1990. Annually more than 1,500 tons are recycled at Northwestern, which represents 30% of the waste produced on campus. Additionally, all landscape waste at the university is composted.",What does the Evanston Campus Framework Plan outline?,"{'text': ['future development of the Evanston Campus'], 'answer_start': [641]}"
3,56d07aa6234ae51400d9c314,Solar_energy,"In the last two decades, photovoltaics (PV), also known as solar PV, has evolved from a pure niche market of small scale applications towards becoming a mainstream electricity source. A solar cell is a device that converts light directly into electricity using the photoelectric effect. The first solar cell was constructed by Charles Fritts in the 1880s. In 1931 a German engineer, Dr Bruno Lange, developed a photo cell using silver selenide in place of copper oxide. Although the prototype selenium cells converted less than 1% of incident light into electricity, both Ernst Werner von Siemens and James Clerk Maxwell recognized the importance of this discovery. Following the work of Russell Ohl in the 1940s, researchers Gerald Pearson, Calvin Fuller and Daryl Chapin created the crystalline silicon solar cell in 1954. These early solar cells cost 286 USD/watt and reached efficiencies of 4.5–6%. By 2012 available efficiencies exceed 20% and the maximum efficiency of research photovoltaics is over 40%.",What has happened to photovoltaic in the past 20 years?,"{'text': ['evolved from a pure niche market of small scale applications towards becoming a mainstream electricity source'], 'answer_start': [73]}"
4,57261e2bec44d21400f3d90f,Royal_Dutch_Shell,"In February 2010 Shell and Cosan formed a 50:50 joint-venture, Raízen, comprising all of Cosan's Brazilian ethanol, energy generation, fuel distribution and sugar activities, and all of Shell's Brazilian retail fuel and aviation distribution businesses. In March 2010, Shell announced the sale of some of its assets, including its liquid petroleum gas (LPG) business, to meet the cost of a planned $28bn capital spending programme. Shell invited buyers to submit indicative bids, due by 22 March, with a plan to raise $2–3bn from the sale. In June 2010, Royal Dutch Shell agreed to acquire all the business of East Resources for a cash consideration of $4.7 billion. The transaction included East Resources' tight gas fields.",In what year did Shell and Cosan form a 50:50 joint venture?,"{'text': ['2010'], 'answer_start': [12]}"


In [14]:
show_random_elements(squad_v2["train"])

Unnamed: 0,id,title,context,question,answers
0,57335f23d058e614000b595e,Alfred_North_Whitehead,"In higher organisms (like people), these two modes of perception combine into what Whitehead terms ""symbolic reference"", which links appearance with causation in a process that is so automatic that both people and animals have difficulty refraining from it. By way of illustration, Whitehead uses the example of a person's encounter with a chair. An ordinary person looks up, sees a colored shape, and immediately infers that it is a chair. However, an artist, Whitehead supposes, ""might not have jumped to the notion of a chair"", but instead ""might have stopped at the mere contemplation of a beautiful color and a beautiful shape."" This is not the normal human reaction; most people place objects in categories by habit and instinct, without even thinking about it. Moreover, animals do the same thing. Using the same example, Whitehead points out that a dog ""would have acted immediately on the hypothesis of a chair and would have jumped onto it by way of using it as such."" In this way symbolic reference is a fusion of pure sense perceptions on the one hand and causal relations on the other, and that it is in fact the causal relationships that dominate the more basic mentality (as the dog illustrates), while it is the sense perceptions which indicate a higher grade mentality (as the artist illustrates).",How might an artist view a chair differently than a typical person?,"{'text': ['""might have stopped at the mere contemplation of a beautiful color and a beautiful shape.""'], 'answer_start': [543]}"
1,573636bf9c79961900ff7e09,Hunting,"In contrast, Botswana has recently been forced to ban trophy hunting following a precipitous wildlife decline. The numbers of antelope plummeted across Botswana, with a resultant decline in predator numbers, while elephant numbers remained stable and hippopotamus numbers rose. According to the government of Botswana, trophy hunting is at least partly to blame for this, but many other factors, such as poaching, drought and habitat loss are also to blame. Uganda recently did the same, arguing that ""the share of benefits of sport hunting were lopsided and unlikely to deter poaching or improve [Uganda's] capacity to manage the wildlife reserves.""",What animal numbers remain stable in Botswana?,"{'text': ['elephant'], 'answer_start': [214]}"
2,5727d08a2ca10214002d9734,On_the_Origin_of_Species,"Evolutionary ideas, although not natural selection, were accepted by German biologists accustomed to ideas of homology in morphology from Goethe's Metamorphosis of Plants and from their long tradition of comparative anatomy. Bronn's alterations in his German translation added to the misgivings of conservatives, but enthused political radicals. Ernst Haeckel was particularly ardent, aiming to synthesise Darwin's ideas with those of Lamarck and Goethe while still reflecting the spirit of Naturphilosophie. Their ambitious programme to reconstruct the evolutionary history of life was joined by Huxley and supported by discoveries in palaeontology. Haeckel used embryology extensively in his recapitulation theory, which embodied a progressive, almost linear model of evolution. Darwin was cautious about such histories, and had already noted that von Baer's laws of embryology supported his idea of complex branching.","While evolutionary ideas were accepted by German biologists, what was not?","{'text': ['natural selection'], 'answer_start': [33]}"
3,570e6b020b85d914000d7eb9,Sanskrit,"In order to explain the common features shared by Sanskrit and other Indo-European languages, many scholars have proposed the Indo-Aryan migration theory, asserting that the original speakers of what became Sanskrit arrived in what is now India and Pakistan from the north-west some time during the early second millennium BCE. Evidence for such a theory includes the close relationship between the Indo-Iranian tongues and the Baltic and Slavic languages, vocabulary exchange with the non-Indo-European Uralic languages, and the nature of the attested Indo-European words for flora and fauna.",What is the theory called dealing with the transfer of Sanskrit to India?,"{'text': ['Indo-Aryan migration theory'], 'answer_start': [126]}"
4,5a79ed7f17ab25001a8a01ec,Association_football,"The governing bodies in each country operate league systems in a domestic season, normally comprising several divisions, in which the teams gain points throughout the season depending on results. Teams are placed into tables, placing them in order according to points accrued. Most commonly, each team plays every other team in its league at home and away in each season, in a round-robin tournament. At the end of a season, the top team is declared the champion. The top few teams may be promoted to a higher division, and one or more of the teams finishing at the bottom are relegated to a lower division.",Who avoid league systems?,"{'text': [], 'answer_start': []}"


## Model definition

In [15]:
model_checkpoint = "distilroberta-base"
#model_checkpoint = 'bert-base-uncased'
#model_checkpoint = "prajjwal1/bert-mini"
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
#Span Model
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
print(f"\nROBERTA is trained for sequences up to {model.config.max_position_embeddings} tokens")

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be 


ROBERTA is trained for sequences up to 514 tokens


## Prepare features 

In [16]:
def prepare_train_features(examples, max_length=384, doc_stride=128):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [17]:
def prepare_test_features(examples, max_length=384, doc_stride=128):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

## Post process and Metric

In [18]:
def postprocess_qa_predictions(all_start_logits, all_end_logits, examples, features, n_best_size=20, max_answer_length=30, squad_v2=False):
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                        or len(offset_mapping[start_index]) == 0
                        or len(offset_mapping[end_index]) == 0
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer

    return predictions

In [19]:
def compute_metrics(predictions, examples, squad_v2=False):
    if squad_v2:
        formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()]
    else:
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    
    metric = load_metric("squad_v2" if squad_v2 else "squad")
    
    return metric.compute(predictions=formatted_predictions, references=theoretical_answers)

## Train and Evaluation


In [40]:
def train(model,
          tokenized_train_dataset, 
          tokenized_val_dataset, 
          raw_val_dataset,
          folder,
          num_train_epochs=3,
          learning_rate=2e-5,
          batch_size=64,
          squad_v2=False):

    tokenized_train_dataset.set_format("torch")
    val_for_model = tokenized_val_dataset.remove_columns(["example_id", "offset_mapping"])
    val_for_model.set_format("torch")

    train_dataloader = DataLoader(tokenized_train_dataset,
                                  shuffle=True,
                                  collate_fn=default_data_collator,
                                  batch_size=batch_size)
    
    eval_dataloader = DataLoader(val_for_model,
                                 collate_fn=default_data_collator,
                                 batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-08)

    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

    lr_scheduler = get_scheduler("linear",
                                 optimizer=optimizer,
                                 num_warmup_steps=0,
                                 num_training_steps=num_train_epochs*len(train_dataloader))

    for epoch in range(1, num_train_epochs + 1):
        # Training
        model.train()
        train_loss = 0 # cumulative loss
        loop = tqdm(train_dataloader)
        for batch in loop:
            # Forward Pass
            outputs = model(**batch)
            # Find the Loss
            loss = outputs.loss
            # Calculate gradients 
            accelerator.backward(loss)
            # Update Weights
            optimizer.step()
            lr_scheduler.step()
            # Clear the gradients
            optimizer.zero_grad()
            # Calculate Loss
            train_loss += loss.item()

            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())

        # Compute average loss per epoch
        avg_train_loss = train_loss / len(train_dataloader)

        # Evaluation
        model.eval()
        exact_match = 0
        f1 = 0 
        start_logits = []
        end_logits = []

        loop = tqdm(eval_dataloader)
        for batch in loop:
            with torch.no_grad():
                # Forward Pass
                outputs = model(**batch)
                loop.set_description(f'Valid {epoch}')

            start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
            end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())


        start_logits = np.concatenate(start_logits)
        end_logits = np.concatenate(end_logits)

        prediction = postprocess_qa_predictions(start_logits, end_logits, raw_val_dataset, tokenized_val_dataset, squad_v2=squad_v2)

        metrics = compute_metrics(prediction, raw_val_dataset, squad_v2)
        f1_score = metrics['f1']
        exact_match_score = metrics['exact'] if squad_v2 else metrics['exact_match']

        print(f'Epoch {epoch}:\t train-loss = {avg_train_loss:.2f}\t val-f1 = {f1_score:.2f}\t exact_match = {exact_match_score:.2f}')


        # Save and upload
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        path = ''.join([folder, '/', str(epoch), '/'])
        if not os.path.exists(path):
            os.makedirs(path)
        unwrapped_model.save_pretrained(path, save_function=accelerator.save)

In [21]:
def generate(model, tokenized_test, raw_test_dataset, batch_size=128, squad_v2=False):
  test_for_model = tokenized_test.remove_columns(["example_id", "offset_mapping"])
  test_for_model.set_format("torch")

  test_dataloader = DataLoader(test_for_model, 
                               collate_fn=default_data_collator, 
                               batch_size=batch_size)

  model, test_dataloader = Accelerator().prepare(model, test_dataloader)

  model.eval()

  start_logits = []
  end_logits = []

  for batch in tqdm(test_dataloader):
      with torch.no_grad():
          outputs = model(**batch)

      start_logits.append((outputs.start_logits).cpu().numpy())
      end_logits.append((outputs.end_logits).cpu().numpy())


  start_logits = np.concatenate(start_logits)
  end_logits = np.concatenate(end_logits)

  return postprocess_qa_predictions(start_logits, end_logits, raw_test_dataset, tokenized_test, squad_v2=squad_v2)

#metrics = compute_metrics(prediction, squad['test'], squad_v2)

## Result

In [22]:
max_length = 256 # The maximum length of a feature (question and context)
doc_stride = 64 # The authorized overlap between two part of the context when splitting it is needed.
batch_size = 32
path='/content/squad'
path_v2='/content/squad_v2'

### SQuAD

In [23]:
tokenized_train_dataset = squad["train"].map(
    partial(prepare_train_features, max_length=max_length, doc_stride=doc_stride),
    batched=True,
    num_proc=3,
    remove_columns=squad["train"].column_names,
)

tokenized_val_dataset = squad["validation"].map(
    partial(prepare_test_features, max_length=max_length, doc_stride=doc_stride),
    batched=True,
    num_proc=3,
    remove_columns=squad["validation"].column_names
)

      

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

      

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

In [41]:
train(model,
      tokenized_train_dataset, 
      tokenized_val_dataset, 
      squad["validation"],
      num_train_epochs=3,
      folder=path,
      learning_rate=2e-5,
      batch_size=batch_size,
      squad_v2=False)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Post-processing 10 example predictions split into 13 features.


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:	 train-loss = 4.67	 val-f1 = 2.82	 exact_match = 0.00


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Post-processing 10 example predictions split into 13 features.


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 2:	 train-loss = 4.54	 val-f1 = 5.32	 exact_match = 0.00


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Post-processing 10 example predictions split into 13 features.


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 3:	 train-loss = 4.41	 val-f1 = 5.32	 exact_match = 0.00


In [None]:
torch.save(model.state_dict(), '/content/squad.pth')

In [None]:
tokenized_test_dataset = squad['test'].map(
    partial(prepare_test_features, max_length=max_length, doc_stride=doc_stride),
    batched=True,
    remove_columns=squad["test"].column_names,
)

  0%|          | 0/11 [00:00<?, ?ba/s]

In [None]:
prediction = generate(model, tokenized_test_dataset, squad['test'], batch_size=256)

  0%|          | 0/47 [00:00<?, ?it/s]

Post-processing 10570 example predictions split into 11912 features.


  0%|          | 0/10570 [00:00<?, ?it/s]

In [None]:
compute_metrics(prediction, squad['test'], squad_v2=False)

{'exact_match': 75.4399243140965, 'f1': 84.07224904047398}

### SQuAD2

In [None]:
tokenized_train_dataset_v2 = squad_v2["train"].map(
    partial(prepare_train_features, max_length=max_length, doc_stride=doc_stride),
    batched=True,
    num_proc=3,
    remove_columns=squad_v2["train"].column_names,
)

tokenized_val_dataset_v2 = squad_v2["validation"].map(
    partial(prepare_test_features, max_length=max_length, doc_stride=doc_stride),
    batched=True,
    num_proc=3,
    remove_columns=squad_v2["validation"].column_names
)

      

#1:   0%|          | 0/35 [00:00<?, ?ba/s]

#2:   0%|          | 0/35 [00:00<?, ?ba/s]

#0:   0%|          | 0/35 [00:00<?, ?ba/s]

      

#1:   0%|          | 0/9 [00:00<?, ?ba/s]

#0:   0%|          | 0/9 [00:00<?, ?ba/s]

#2:   0%|          | 0/9 [00:00<?, ?ba/s]

In [None]:
train(model,
      tokenized_train_dataset_v2, 
      tokenized_val_dataset_v2, 
      squad_v2["validation"],
      folder=path_v2,
      num_train_epochs=3,
      learning_rate=2e-5,
      batch_size=batch_size,
      squad_v2=True)

  0%|          | 0/3604 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

Post-processing 26064 example predictions split into 28774 features.


  0%|          | 0/26064 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Epoch 1:	 train-loss = 1.17	 val-f1 = 70.76	 exact_match = 63.33


In [None]:
torch.save(model.state_dict(), '/content/squad_v2.pth')

In [None]:
tokenized_test_dataset_v2 = squad_v2['test'].map(
    partial(prepare_test_features, max_length=max_length, doc_stride=doc_stride),
    batched=True,
    remove_columns=squad_v2["test"].column_names,
)

  0%|          | 0/12 [00:00<?, ?ba/s]

In [None]:
prediction_v2 = generate(model, tokenized_test_dataset_v2, squad_v2['test'], batch_size=256, squad_v2=True)

  0%|          | 0/53 [00:00<?, ?it/s]

Post-processing 11873 example predictions split into 13502 features.


  0%|          | 0/11873 [00:00<?, ?it/s]

In [None]:
compute_metrics(prediction_v2, squad_v2['test'], squad_v2=True)

In [None]:
compute_metrics(prediction_v2, squad_v2['test'], squad_v2=True)

{'exact': 65.57736039754064,
 'f1': 68.72817217548223,
 'total': 11873,
 'HasAns_exact': 67.81376518218623,
 'HasAns_f1': 74.12442446685198,
 'HasAns_total': 5928,
 'NoAns_exact': 63.34735071488646,
 'NoAns_f1': 63.34735071488646,
 'NoAns_total': 5945,
 'best_exact': 65.57736039754064,
 'best_exact_thresh': 0.0,
 'best_f1': 68.72817217548241,
 'best_f1_thresh': 0.0}