<a href="https://colab.research.google.com/github/SaudIqbal-IITM/Best-README-Template/blob/master/bert_question_answering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip3 install pyarrow>=6.0.0 datasets transformers

In [3]:
import os

import collections
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import pyarrow as pa

import transformers
import datasets
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, default_data_collator

In [4]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Data

In [5]:
raw_data = pd.read_csv("/content/drive/MyDrive/Question-answering.csv", encoding='ISO-8859-1')

In [6]:
raw_data_training, raw_data_validation, raw_data_test = raw_data.iloc[6001:7000+1], raw_data.iloc[9400:9599+1], raw_data.iloc[9600:9749+1]

In [7]:
raw_data_training

Unnamed: 0,Context,QuestionAnswerSets,Title,ValidationRole
6001,Anthropologists have most frequently employed ...,"{<|""Question"" -> ""What term have Anthropologis...",Identity (social science),Training
6002,Boundaries can be inclusive or exclusive depen...,"{<|""Question"" -> ""What are the two types of gr...",Identity (social science),Training
6003,"The ""Neo-Eriksonian"" identity status paradigm ...","{<|""Question"" -> ""What identity status paradig...",Identity (social science),Training
6004,Many people gain a sense of positive self-este...,"{<|""Question"" -> ""What do many people gain fro...",Identity (social science),Training
6005,The first favours a primordialist approach whi...,"{<|""Question"" -> ""What approach takes the sens...",Identity (social science),Training
...,...,...,...,...
6996,Melbourne has an integrated public transport s...,"{<|""Question"" -> ""Which rail station was the w...",Melbourne,Training
6997,Water storage and supply for Melbourne is mana...,"{<|""Question"" -> ""Who manages the water storag...",Melbourne,Training
6998,The discovery of gold in Victoria in mid 1851 ...,"{<|""Question"" -> ""Melbourne experienced rapid ...",Melbourne,Training
6999,At the time of Australia's federation on 1 Jan...,"{<|""Question"" -> ""Where was the first federal ...",Melbourne,Training


In [8]:
def preprocess_data(raw_data):
    data = []
    for i, entry in raw_data.iterrows():
        title = entry["Title"]

        context = entry["Context"]

        question_answer_sets = entry["QuestionAnswerSets"].split("|")[1:-1]

        question_answer_sets = [question_answer for question_answer in question_answer_sets if (question_answer != "{<" and question_answer != ">, <" and question_answer != ">}")]
        for question_answer in question_answer_sets:        
            question_idx = question_answer.find("Question")
            answers_idx = question_answer.find("Answer")
            answer_positions_idx = question_answer.find("AnswerPositions")
            question_id_idx = question_answer.find("QuestionID")

            question = question_answer[question_idx+14:answers_idx-4]
            answers = question_answer[answers_idx+11:answer_positions_idx-3]
            answer_positions = question_answer[answer_positions_idx+20:question_id_idx-3]
            question_id = question_answer[question_id_idx+16:-2]

            answers, answer_positions = answers[1:-1].split(', '), answer_positions[1:-1].split(', ')

            answers, answer_positions = [answer[2:-1] for answer in answers], [int(answer_position) for answer_position in answer_positions]

            data.append(
                {
                    'answers': 
                        {
                            'answer_start': answer_positions, 
                            'text': answers
                        },
                     'context': context,
                     'id': question_id,
                     'question': question,
                     'title': title
                }
            )
                        
    return datasets.Dataset(pa.Table.from_pandas(pd.DataFrame(data)))

In [9]:
dataset = datasets.DatasetDict(
    {
        "train": preprocess_data(raw_data_training),
        "validation": preprocess_data(raw_data_validation),
        "test": preprocess_data(raw_data_test)
    }
)

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title'],
        num_rows: 4781
    })
    validation: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title'],
        num_rows: 1323
    })
    test: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title'],
        num_rows: 641
    })
})

In [11]:
dataset["train"][0]

{'answers': {'answer_start': [57], 'text': ['identity']},
 'context': "Anthropologists have most frequently employed the term 'identity' to refer to this idea of selfhood in a loosely Eriksonian way (Erikson 1972) properties based on the uniqueness and individuality which makes a person distinct from others. Identity became of more interest to anthropologists with the emergence of modern concerns with ethnicity and social movements in the 1970s. This was reinforced by an appreciation, following the trend in sociological thought, of the manner in which the individual is affected by and contributes to the overall social context. At the same time, the Eriksonian approach to identity remained in force, with the result that identity has continued until recently to be used in a largely socio-historical way to refer to qualities of sameness in relation to a person's connection to others and to a particular group of people.",
 'id': '570966c3ed30961900e840a',
 'question': 'What term have Anthr

In [12]:
model_checkpoint = "distilbert-base-uncased"

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [14]:
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [15]:
max_length = 512
doc_stride = 128

In [16]:
pad_on_right = tokenizer.padding_side == "right"

In [17]:
def prepare_training_features(examples):
    examples["question"] = [q.lstrip() for q in examples["question"]]

    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [18]:
tokenized_datasets = dataset.map(prepare_training_features, batched=True, remove_columns=dataset["train"].column_names)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Fine-Tuning

In [19]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [20]:
model_name = model_checkpoint.split("/")[-1]

In [21]:
batch_size = 16
lr = 2e-5
wd = 1e-2
epochs = 3

In [22]:
args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy="epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=wd
)

In [23]:
data_collator = default_data_collator

In [24]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [25]:
trainer.train()

***** Running training *****
  Num examples = 4786
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 900


Epoch,Training Loss,Validation Loss
1,No log,2.860033
2,3.194700,2.642044
3,3.194700,2.657029


***** Running Evaluation *****
  Num examples = 1323
  Batch size = 16
Saving model checkpoint to distilbert-base-uncased-finetuned-squad/checkpoint-500
Configuration saved in distilbert-base-uncased-finetuned-squad/checkpoint-500/config.json
Model weights saved in distilbert-base-uncased-finetuned-squad/checkpoint-500/pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-squad/checkpoint-500/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-squad/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1323
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1323
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=900, training_loss=2.722464667426215, metrics={'train_runtime': 828.8778, 'train_samples_per_second': 17.322, 'train_steps_per_second': 1.086, 'total_flos': 1875917210259456.0, 'train_loss': 2.722464667426215, 'epoch': 3.0})

In [26]:
trainer.save_model("test-squad-trained")

Saving model checkpoint to test-squad-trained
Configuration saved in test-squad-trained/config.json
Model weights saved in test-squad-trained/pytorch_model.bin
tokenizer config file saved in test-squad-trained/tokenizer_config.json
Special tokens file saved in test-squad-trained/special_tokens_map.json


# Evaluation

In [27]:
def prepare_validation_features(examples):
    examples["question"] = [q.lstrip() for q in examples["question"]]
    
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    tokenized_examples["example_id"] = []
    for i in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(i)
        
        context_index = 1 if pad_on_right else 0

        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [28]:
validation_features = dataset["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=dataset["validation"].column_names
)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [29]:
raw_predictions = trainer.predict(validation_features)

The following columns in the test set don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `DistilBertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1323
  Batch size = 16


In [30]:
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

In [31]:
max_answer_length = 30

In [32]:
examples = dataset["validation"]
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}

features_per_example = collections.defaultdict(list)

for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

In [33]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=max_answer_length):
    all_start_logits, all_end_logits = raw_predictions
    
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]

        min_null_score = None 
        valid_answers = []
        
        context = example["context"]
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            offset_mapping = features[feature_index]["offset_mapping"]

            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue

                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}
        
        predictions[example["id"]] = best_answer["text"]

    return predictions

In [34]:
final_predictions = postprocess_qa_predictions(dataset["validation"], validation_features, raw_predictions.predictions)

Post-processing 1323 example predictions split into 1323 features.


  0%|          | 0/1323 [00:00<?, ?it/s]

In [35]:
metric = load_metric("squad")

In [36]:
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]

references = [{"id": ex["id"], "answers": ex["answers"]} for ex in dataset["validation"]]

In [37]:
df1 = pd.DataFrame(formatted_predictions)
df2 = pd.DataFrame(references)

df1 = df1.astype({"id": "str"})
df2 = df2.astype({"id": "str"})

df = pd.merge(df1, df2, on='id', how="inner")

formatted_predictions = df[["id", "prediction_text"]].to_dict("records")
references = df[["id", "answers"]].to_dict("records")

In [38]:
metric.compute(predictions=formatted_predictions, references=references)

{'exact_match': 8.238851095993953, 'f1': 17.5557404960312}