In [1]:
!pip install transformers datasets
!pip install accelerate -U
!pip install transformers[torch]



# Question answering

Question answering tasks return an answer given a question. If you’ve ever asked a virtual assistant like Alexa, Siri or Google what the weather is, then you’ve used a question answering model before. There are two common types of question answering tasks:

- Extractive: extract the answer from the given context;
- Abstractive: generate an answer from the context that correctly answers the question.

In [2]:
from datasets import load_dataset

from transformers import AutoTokenizer, \
                         DefaultDataCollator, \
                         AutoModelForQuestionAnswering, \
                         TrainingArguments, Trainer

In [3]:
#squad = load_dataset("squad")
squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2)
squad



DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1000
    })
})

In [4]:
squad['train'][0]

{'id': '56bf9b57a10cfb14005511b3',
 'title': 'Beyoncé',
 'context': 'At the 52nd Annual Grammy Awards, Beyoncé received ten nominations, including Album of the Year for I Am... Sasha Fierce, Record of the Year for "Halo", and Song of the Year for "Single Ladies (Put a Ring on It)", among others. She tied with Lauryn Hill for most Grammy nominations in a single year by a female artist. In 2010, Beyoncé was featured on Lady Gaga\'s single "Telephone" and its music video. The song topped the US Pop Songs chart, becoming the sixth number-one for both Beyoncé and Gaga, tying them with Mariah Carey for most number-ones since the Nielsen Top 40 airplay chart launched in 1992. "Telephone" received a Grammy Award nomination for Best Pop Collaboration with Vocals.',
 'question': 'Who else appeared with Beyonce in Telephone?',
 'answers': {'text': ['Lady Gaga'], 'answer_start': [352]}}

Now we have to load the tokenizer to process the question and context fields

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

There are a few preprocessing steps particular to question answering tasks you should be aware of:

- Some examples in a dataset may have a very long context that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the context by setting `truncation="only_second"`;
- Next, map the start and end positions of the answer to the original context by setting `return_offset_mapping=True`;
- With the mapping in hand, now you can find the start and end tokens of the answer. Use the `sequence_ids` method to find which part of the offset corresponds to the question and which corresponds to the context.

In [6]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]  # Stripping the question field
    inputs = tokenizer(
        questions,
        examples["context"],
        # max_length=384,
        max_length=None,
        truncation="only_second",  # Truncate to a maximum length specified by the max_length argument or the maximum length accepted by the model if no max_length is provided
        return_offsets_mapping=True,  # Maps the start and end of the answer
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")  # Getting the start and end of the answer
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    # Function to get the start and the end of the answer based on the start position of
    # The answer and the len of the answer field
    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
tokenized_squad = squad.map(preprocess_function,
                            batched=True,
                            remove_columns=squad["train"].column_names)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1000
    })
})

Creating a batch of examples using DefaultDataCollator. Unlike other data collators Transformers, the `DefaultDataCollator` does not apply any additional preprocessing such as padding.

In [9]:
data_collator = DefaultDataCollator()

In [10]:
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The next steps are:

1. Define your training hyperparameters in `TrainingArguments`. The only required parameter is `output_dir` which specifies where to save your model. You can push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model);
2. Pass the training arguments to Trainer along with the model, dataset, tokenizer, and data collator;
3. Call `train()` to finetune your model.

In [11]:
training_args = TrainingArguments(
    output_dir="outputs/my_awesome_qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # push_to_hub=True,
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,2.409247
2,2.828900,1.844275
3,2.828900,1.726899


TrainOutput(global_step=750, training_loss=2.3873059895833335, metrics={'train_runtime': 676.17, 'train_samples_per_second': 17.747, 'train_steps_per_second': 1.109, 'total_flos': 1567837200384000.0, 'train_loss': 2.3873059895833335, 'epoch': 3.0})

To inference a model, the simplest way is using `transformers.pipeline`, as it is necessary to do some postprocessing.

- The model outputs a list of possible initial indexes and end indexes
- It is necessary to take just the output with the highest probability
- It is necessary to take the initial and end indexes and search in the context to make the output

To make these operations in Tensorflow or Pytorch, there is a reference: https://huggingface.co/docs/transformers/tasks/question_answering

In [23]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="outputs/my_awesome_qa_model/checkpoint-500")

{'score': 0.4729115664958954, 'start': 147, 'end': 153, 'answer': 'Mattel'}

In [27]:
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [28]:
question_answerer(question=question, context=context)

{'score': 0.17290298640727997,
 'start': 10,
 'end': 95,
 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}