# Install Libraries

In [1]:
!pip install datasets
!pip install transfromers
!pip install accelerate
!pip install evaluate
!pip install stanza

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset

## Load QUAC Dataset

In [3]:
from datasets import load_dataset, DatasetDict, Dataset

In [4]:
qa_raw_dataset = load_dataset("quac")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/31.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.60M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11567 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Unwrap nested lists

In [5]:
from tqdm.auto import tqdm
cleaned_dataset = DatasetDict()

def unwrap_lists(dataset: Dataset):
    result = []
    for item in tqdm(dataset):
        for i in range(len(item["questions"])):
            result_dict = {
                "context": item["context"],
                "question": item["questions"][i],
                "answer": [{
                    "text": item["answers"]["texts"][i][0],
                    "answer_start": item["answers"]["answer_starts"][i][0]
                }]
            }
            result.append(result_dict)

    return Dataset.from_list(result)

In [6]:
qa_raw_dataset["train"] = unwrap_lists(qa_raw_dataset["train"])
qa_raw_dataset["validation"] = unwrap_lists(qa_raw_dataset["validation"])

qa_raw_dataset["validation"] = qa_raw_dataset["validation"].shuffle()

  0%|          | 0/11567 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [7]:
qa_raw_dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 83568
    })
    validation: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 7354
    })
})

## Divide validation samples into validation and test sets

In [8]:
cleaned_dataset["train"] = qa_raw_dataset["train"]
cleaned_dataset["validation"] = qa_raw_dataset["validation"]

splitted_validation_set = cleaned_dataset["validation"].train_test_split(test_size=0.4)
cleaned_dataset["validation"] = splitted_validation_set["test"]
cleaned_dataset["test"] = splitted_validation_set["train"]

In [9]:
cleaned_dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 83568
    })
    validation: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 2942
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 4412
    })
})

## Decrease number of train samples (too many samples) and shuffle

In [10]:
cleaned_dataset["train"] = cleaned_dataset["train"].shuffle()

In [11]:
cleaned_dataset["train"] = cleaned_dataset["train"].train_test_split(train_size=0.25)["train"]

In [12]:
cleaned_dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 20892
    })
    validation: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 2942
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 4412
    })
})

# Model

## Load DistilBERT Model (light version of BERT)

In [13]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load RoBERTa model

In [None]:
tokenizer_roberta = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model_roberta = AutoModelForQuestionAnswering.from_pretrained("FacebookAI/roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Preprocess data

In [None]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]

    # Change tokenizer for different models
    inputs = tokenizer_roberta(
        questions,
        examples["context"],
        max_length=386,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answer"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i][0]
        start_char = answer["answer_start"]
        end_char = answer["answer_start"] + len(answer["text"])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
tokenized_train_dataset = cleaned_dataset["train"].map(preprocess_function, batched=True, remove_columns=cleaned_dataset["train"].column_names)
tokenized_validation_dataset = cleaned_dataset["validation"].map(preprocess_function, batched=True, remove_columns=cleaned_dataset["validation"].column_names)

Map:   0%|          | 0/20892 [00:00<?, ? examples/s]

Map:   0%|          | 0/2942 [00:00<?, ? examples/s]

## Models Training

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

### DistilBERT Training

In [None]:
train_args = TrainingArguments(
    output_dir="quac_dataset_model",
    evaluation_strategy="epoch",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
)

In [None]:
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,3.064098
2,3.372300,2.894895


Epoch,Training Loss,Validation Loss
1,No log,3.064098
2,3.372300,2.894895
3,3.372300,2.855173


TrainOutput(global_step=981, training_loss=3.1056985806495288, metrics={'train_runtime': 2395.9319, 'train_samples_per_second': 26.159, 'train_steps_per_second': 0.409, 'total_flos': 6173597826710496.0, 'train_loss': 3.1056985806495288, 'epoch': 3.0})

In [None]:
trainer.save_model("/content/drive/MyDrive/quac_dataset_model_2")

### RoBERTa Training

In [None]:
train_args_roberta = TrainingArguments(
    output_dir="quac_dataset_model_roberta",
    evaluation_strategy="epoch",
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    num_train_epochs=3,
)

In [None]:
trainer_roberta = Trainer(
    model=model_roberta,
    args=train_args_roberta,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer_roberta,
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer_roberta.train()
trainer_roberta.save_model("/content/drive/MyDrive/quac_dataset_model_roberta_2")

Epoch,Training Loss,Validation Loss
1,2.9107,2.366757
2,2.3169,2.327497
3,1.8506,2.518314


# Evaluation

In [14]:
from transformers import pipeline

## Loading Stanza

In [15]:
import stanza

In [16]:
nlp = stanza.Pipeline(lang='en', processors='tokenize', use_gpu=True)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/tokenize/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/mwt/combined.pt:   0%|         …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Done loading processors!


## Loading Self Finetuned Models

In [19]:
bert_qa_pipeline = pipeline("question-answering", model="/content/drive/MyDrive/quac_dataset_model_2", device=-1)
roberta_qa_pipeline = pipeline("question-answering", model="/content/drive/MyDrive/quac_dataset_model_roberta_2", device=-1)

## Loading Already Finetuned Models

In [20]:
deepset_model = pipeline("question-answering", model="deepset/roberta-base-squad2", device=-1)
google_model = pipeline("question-answering", model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", device=-1)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Calculate predictions and extract ground truth

In [21]:
def normalize_text(s):
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokenized_predictions(dataset: Dataset, model_pipeline):
    predictions = []
    for item in tqdm(dataset):
        model_prediction = model_pipeline(question=item["question"], context=item["context"])

        tokenized_prediction = nlp(normalize_text(model_prediction["answer"]))
        dict_prediction = tokenized_prediction.to_dict()

        prediction = []
        for sentence in dict_prediction:
            prediction.extend(sentence)

        for i in range(len(prediction)):
            prediction[i] = prediction[i]["text"]

        predictions.append(prediction)

    return predictions

In [22]:
bert_predictions = get_tokenized_predictions(cleaned_dataset["test"], bert_qa_pipeline)
roberta_predictions = get_tokenized_predictions(cleaned_dataset["test"], roberta_qa_pipeline)
deepset_predictions = get_tokenized_predictions(cleaned_dataset["test"], deepset_model)
google_predictions = get_tokenized_predictions(cleaned_dataset["test"], google_model)

  0%|          | 0/4412 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
ground_truth = []
for item in tqdm(cleaned_dataset["test"]):
    tokenized_ground_truth = nlp(normalize_text(item["answer"][0]["text"]))
    dict_ground_truth = tokenized_ground_truth.to_dict()
    ground_t = []
    for sentence in dict_ground_truth:
        ground_t.extend(sentence)

    for i in range(len(ground_t)):
        ground_t[i] = ground_t[i]["text"]

    ground_truth.append(ground_t)

  0%|          | 0/4412 [00:00<?, ?it/s]

In [None]:
import json

def save_to_file(d, filename):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(d, f, ensure_ascii=False, indent=2)

In [None]:
save_to_file(bert_predictions, "bert_predictions.json")
save_to_file(roberta_predictions, "roberta_predictions.json")
save_to_file(deepset_predictions, "deepset_predictions.json")
save_to_file(google_predictions, "google_predictions.json")
save_to_file(ground_truth, "ground_truth.json")

## Load Metrics

In [None]:
def compute_em(prediction, reference):
    return int(prediction == reference)

def compute_f1(prediction, truth):
    pred_tokens = prediction
    truth_tokens = truth

    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

## Load Data

In [None]:
def load_from_file(filename):
    with open(filename, "r", encoding="utf-8") as f:
        json_str = "".join(f.readlines())
        return json.loads(json_str)

In [None]:
bert_predictions = load_from_file("bert_predictions.json")
roberta_predictions = load_from_file("roberta_predictions.json")
deepset_predictions = load_from_file("deepset_predictions.json")
google_predictions = load_from_file("google_predictions.json")
ground_truth = load_from_file("ground_truth.json")

## Evaluate on Exact Match Metric

In [None]:
def get_em_score(predictions, references):
    em_avg = 0

    for i in tqdm(range(len(predictions))):
        em_avg += compute_em(predictions[i], references[i])

    return em_avg / len(predictions)

In [None]:
get_em_score(bert_predictions, ground_truth)

  0%|          | 0/4412 [00:00<?, ?it/s]

0.19990933816863102

In [None]:
get_em_score(roberta_predictions, ground_truth)

  0%|          | 0/4412 [00:00<?, ?it/s]

0.20444242973708068

In [None]:
get_em_score(deepset_predictions, ground_truth)

  0%|          | 0/4412 [00:00<?, ?it/s]

0.050090661831368996

In [None]:
get_em_score(google_predictions, ground_truth)

  0%|          | 0/4412 [00:00<?, ?it/s]

0.05485040797824116

## Evaluate on F1 Metric

In [None]:
def get_f1_score(predictions, references):
    f1_scores = []

    for i in tqdm(range(len(predictions))):
        if "cannotanswer" not in references[i]:
            f1_scores.append(compute_f1(predictions[i], references[i]))

    return sum(f1_scores) / len(f1_scores)

In [None]:
get_f1_score(bert_predictions, ground_truth)

  0%|          | 0/4412 [00:00<?, ?it/s]

0.04369853992999201

In [None]:
get_f1_score(roberta_predictions, ground_truth)

  0%|          | 0/4412 [00:00<?, ?it/s]

0.1501495054582614

In [None]:
get_f1_score(deepset_predictions, ground_truth)

  0%|          | 0/4412 [00:00<?, ?it/s]

0.21006289493999736

In [None]:
get_f1_score(google_predictions, ground_truth)

  0%|          | 0/4412 [00:00<?, ?it/s]

0.2097017394814848