# Install Libraries

In [None]:
!pip install datasets
!pip install transfromers
!pip install accelerate
!pip install evaluate

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (only available inside Google Colab).
drive.mount('/content/drive')

# Dataset

## Load SQuAD v2 Dataset

In [None]:
from datasets import load_dataset, DatasetDict, Dataset

In [None]:
# Load SQuAD v2 from the "rajpurkar/squad_v2" dataset repository. The cells
# below read its "train" and "validation" splits.
qa_raw_dataset = load_dataset("rajpurkar/squad_v2")

In [None]:
# Notebook display: show the loaded dataset object.
qa_raw_dataset

In [None]:
from tqdm.auto import tqdm

# Seed passed to every train_test_split call so the subsampling is repeatable.
seed = 31

# Number of rows in the original training split.
org_train_samples_num = qa_raw_dataset["train"].num_rows
# Number of training examples to keep.
num_train_samples = 15000
# Share of the overall (train + validation + test) collection taken by the
# training set; num_train_samples / train_portion is the implied total size.
train_portion = 0.7
# Share of that total used for validation; the remainder
# (1 - train_portion - validation_portion) becomes the test set.
validation_portion = 0.1

## Divide validation samples into validation and test sets

In [None]:
# Carve the original validation split into our validation and test sets.
# Both sizes are expressed as fractions of the original validation split:
# the implied total size (num_train_samples / train_portion) times the
# target share, divided by the number of available validation rows.
splitted_raw_dataset = qa_raw_dataset["validation"].train_test_split(
    train_size=(
        num_train_samples / train_portion * validation_portion
        / qa_raw_dataset["validation"].num_rows
    ),
    test_size=(
        num_train_samples / train_portion * (1 - (train_portion + validation_portion))
        / qa_raw_dataset["validation"].num_rows
    ),
    seed=seed,
)

# Assemble the three working splits; the training set is a random subsample
# of the original training split holding num_train_samples rows' worth.
cleaned_dataset = DatasetDict(
    {
        "train": qa_raw_dataset["train"].train_test_split(
            train_size=num_train_samples / org_train_samples_num,
            seed=seed,
        )["train"],
        "validation": splitted_raw_dataset["train"],
        "test": splitted_raw_dataset["test"],
    }
)



In [None]:
# Notebook display: show the train/validation/test splits built above.
cleaned_dataset

# Model

## Load DistilBERT Model (light version of BERT)

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

In [None]:
# DistilBERT tokenizer and question-answering model, both loaded from the
# "distilbert/distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

## Load RoBERTa model

In [None]:
# RoBERTa tokenizer and question-answering model, both loaded from the
# "FacebookAI/roberta-base" checkpoint.
tokenizer_roberta = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model_roberta = AutoModelForQuestionAnswering.from_pretrained("FacebookAI/roberta-base")

## Preprocess data

In [None]:
def _find_answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Map a character span in the context to (start, end) token indices.

    Args:
        offsets: Per-token (char_start, char_end) pairs for one example.
        sequence_ids: Per-token sequence ids for the same example; context
            tokens are the ones marked ``1``.
        start_char: Character offset where the answer starts in the context.
        end_char: Character offset just past the end of the answer.

    Returns:
        ``(start_token, end_token)``, or ``(0, 0)`` when the answer is not
        fully contained in the (possibly truncated) context tokens.
    """
    # First and last token that belong to the context.
    context_start = sequence_ids.index(1)
    context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

    # The answer must lie entirely inside the tokenized context. A partially
    # truncated answer would otherwise get an end label on the special token
    # that follows the context.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token whose start is at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token (scanning backwards) whose end reaches the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_function(examples, tokenizer=None, max_length=386):
    """Tokenize a batch of QA examples and attach answer token labels.

    Args:
        examples: Batch with "question", "context" and "answers" columns;
            each answers entry holds "answer_start" and "text" lists.
        tokenizer: Tokenizer to encode with. Defaults to the module-level
            ``tokenizer_roberta``; pass the tokenizer that matches the model
            being trained.
        max_length: Length every encoded question/context pair is padded or
            truncated to (only the context is truncated).

    Returns:
        The tokenizer output without "offset_mapping" and with
        "start_positions" / "end_positions" added. Examples with no answer,
        or whose answer is not fully inside the kept context, are labelled
        ``(0, 0)``.
    """
    if tokenizer is None:
        tokenizer = tokenizer_roberta

    questions = [q.strip() for q in examples["question"]]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed to compute labels; they are not returned.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        if len(answer["answer_start"]) == 0:
            # Unanswerable question: label with position 0.
            start_token, end_token = 0, 0
        else:
            # Only the first listed answer is used for the labels.
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            start_token, end_token = _find_answer_token_span(
                offset, inputs.sequence_ids(i), start_char, end_char
            )

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize the train and validation splits in batches, dropping the original
# columns so only the model inputs and labels remain.
# NOTE(review): preprocess_function encodes with tokenizer_roberta, so these
# datasets carry RoBERTa token ids; both trainers below consume them. Re-run
# this cell with the matching tokenizer before training a different model.
tokenized_train_dataset = cleaned_dataset["train"].map(preprocess_function, batched=True, remove_columns=cleaned_dataset["train"].column_names)
tokenized_validation_dataset = cleaned_dataset["validation"].map(preprocess_function, batched=True, remove_columns=cleaned_dataset["validation"].column_names)

## Models Training

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
from transformers import DefaultDataCollator

# Collator shared by both trainers below.
data_collator = DefaultDataCollator()

### DistilBERT Training

In [None]:
# DistilBERT training configuration: 3 epochs, batch size 8 per device,
# evaluation once per epoch; checkpoints go to "squad_bert_model_temp".
# NOTE(review): recent transformers releases rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
train_args = TrainingArguments(
    output_dir="squad_bert_model_temp",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
)

In [None]:
# Trainer for the DistilBERT model.
# NOTE(review): the datasets passed here come from preprocess_function, which
# tokenizes with tokenizer_roberta, while this trainer's model and tokenizer
# are DistilBERT — regenerate the datasets with the DistilBERT tokenizer
# before running this.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Fine-tune DistilBERT on the tokenized training set.
trainer.train()

In [None]:
# Persist the fine-tuned DistilBERT model; the evaluation section loads it
# from this directory name.
trainer.save_model("squad_bert_model")

### RoBERTa Training

In [None]:
# RoBERTa training configuration: 3 epochs, batch size 4 per device,
# evaluation once per epoch; checkpoints go to "squad_roberta_model_temp".
# NOTE(review): recent transformers releases rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
train_args_roberta = TrainingArguments(
    output_dir="squad_roberta_model_temp",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
)

In [None]:
# Trainer for the RoBERTa model; it uses the same tokenized datasets and
# collator as the DistilBERT trainer.
trainer_roberta = Trainer(
    model=model_roberta,
    args=train_args_roberta,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer_roberta,
    data_collator=data_collator,
)

In [None]:
import torch

# Clear PyTorch's CUDA allocator cache and the autocast cache before the
# next training run (presumably to free GPU memory left over from the
# DistilBERT run).
torch.cuda.empty_cache()
torch.clear_autocast_cache()

In [None]:
# Fine-tune RoBERTa, then persist it; the evaluation section loads it from
# this directory name.
trainer_roberta.train()
trainer_roberta.save_model("squad_roberta_model")

# Evaluation

In [None]:
from transformers import pipeline

## Loading Our Own Fine-Tuned Models

In [None]:
# Question-answering pipelines over the two models saved by the training
# section. device=0 requests the first GPU, so a CUDA device is required.
bert_qa_pipeline = pipeline("question-answering", model="squad_bert_model", device=0)
roberta_qa_pipeline = pipeline("question-answering", model="squad_roberta_model", device=0)

## Loading Publicly Available Fine-Tuned Models

In [None]:
# Reference pipelines built from checkpoints that were already fine-tuned on
# SQuAD, used as baselines for comparison. device=0 requests the first GPU.
deepset_model = pipeline("question-answering", model="deepset/roberta-base-squad2", device=0)
google_model = pipeline("question-answering", model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", device=0)

## Calculate predictions and extract ground truth

In [None]:
def get_predictions(dataset: Dataset, model_pipeline):
    """Run a question-answering pipeline over every example in ``dataset``.

    Args:
        dataset: Examples exposing "id", "question" and "context" fields.
        model_pipeline: Callable invoked as
            ``model_pipeline(question=..., context=...)`` that returns a
            mapping with an "answer" entry.

    Returns:
        A list with one dict per example holding "prediction_text", "id" and
        "no_answer_probability", the prediction format the ``squad_v2``
        metric expects.
    """
    predictions = []
    for item in tqdm(dataset):
        model_prediction = model_pipeline(question=item["question"], context=item["context"])

        predictions.append(
            {
                "prediction_text": model_prediction["answer"],
                "id": item["id"],
                # The squad_v2 metric reads this field from every prediction.
                # The pipeline output used here carries no such probability,
                # so report 0.0, which never crosses the no-answer threshold
                # and leaves the predicted text to be scored as is.
                "no_answer_probability": 0.0,
            }
        )

    return predictions

In [None]:
# Predict on the held-out English test split with all four pipelines.
bert_predictions = get_predictions(cleaned_dataset["test"], bert_qa_pipeline)
roberta_predictions = get_predictions(cleaned_dataset["test"], roberta_qa_pipeline)
deepset_predictions = get_predictions(cleaned_dataset["test"], deepset_model)
google_predictions = get_predictions(cleaned_dataset["test"], google_model)

In [None]:
# Notebook display: inspect the first prediction record.
bert_predictions[0]

In [None]:
def get_ground_truth(dataset: Dataset):
    """Collect the reference answers for every example in ``dataset``.

    Returns:
        A list with one ``{"id": ..., "answers": ...}`` dict per example, in
        dataset order, copied from the example's own "id" and "answers".
    """
    return [
        {"id": example["id"], "answers": example["answers"]}
        for example in tqdm(dataset)
    ]

In [None]:
# Reference answers for the English test split, in the same order as the
# predictions above.
ground_truth = get_ground_truth(cleaned_dataset["test"])

In [None]:
# Notebook display: inspect the first reference record.
ground_truth[0]

In [None]:
import json

def save_to_file(d, filename):
    """Write ``d`` to ``filename`` as indented UTF-8 JSON.

    Missing parent directories (for example ``en/`` or ``fr/``) are created
    first, so saving into a fresh working directory does not fail.

    Args:
        d: Any JSON-serializable value.
        filename: Destination path; an existing file is overwritten.
    """
    from pathlib import Path

    path = Path(filename)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(d, f, ensure_ascii=False, indent=2)

In [None]:
# Persist the English predictions and references under the relative "en/"
# directory so the metrics can be recomputed without rerunning inference.
save_to_file(bert_predictions, "en/bert_predictions_squad.json")
save_to_file(roberta_predictions, "en/roberta_predictions_squad.json")
save_to_file(deepset_predictions, "en/deepset_predictions_squad.json")
save_to_file(google_predictions, "en/google_predictions_squad.json")
save_to_file(ground_truth, "en/ground_truth_squad.json")

## Load Metrics

In [None]:
import evaluate

# SQuAD v2 metric loaded through the evaluate library; the variable name
# suggests it is used for exact-match and F1 scores.
em_f1_metric = evaluate.load("evaluate-metric/squad_v2")

## Load Data

In [None]:
def load_from_file(filename):
    """Read ``filename`` as UTF-8 and return the parsed JSON value.

    Args:
        filename: Path of a JSON file.

    Returns:
        The decoded JSON content (dict, list, or scalar).
    """
    with open(filename, "r", encoding="utf-8") as f:
        # Parse straight from the file handle instead of joining the lines
        # into one string first.
        return json.load(f)

In [None]:
# Reload the saved English predictions and references from disk, replacing
# the in-memory values of the same names.
bert_predictions = load_from_file("en/bert_predictions_squad.json")
roberta_predictions = load_from_file("en/roberta_predictions_squad.json")
deepset_predictions = load_from_file("en/deepset_predictions_squad.json")
google_predictions = load_from_file("en/google_predictions_squad.json")
ground_truth = load_from_file("en/ground_truth_squad.json")

## Compute Metrics

In [None]:
# Threshold on a prediction's no-answer probability above which the metric
# treats the question as unanswerable. 1.0 is the metric's own default.
no_ans = 1.0

# Score each model's English predictions against the shared references.
bert_eval_result = em_f1_metric.compute(predictions=bert_predictions, references=ground_truth, no_answer_threshold=no_ans)
roberta_eval_result = em_f1_metric.compute(predictions=roberta_predictions, references=ground_truth, no_answer_threshold=no_ans)
deepset_eval_result = em_f1_metric.compute(predictions=deepset_predictions, references=ground_truth, no_answer_threshold=no_ans)
google_eval_result = em_f1_metric.compute(predictions=google_predictions, references=ground_truth, no_answer_threshold=no_ans)

In [None]:
# Persist the English metric results, one JSON file per model.
save_to_file(bert_eval_result, "en/bert_eval.json")
save_to_file(roberta_eval_result, "en/roberta_eval.json")
save_to_file(deepset_eval_result, "en/deepset_eval.json")
save_to_file(google_eval_result, "en/google_eval.json")

## Evaluate on Another Language (French)

In [None]:
# Validation split of the "qwant/squad_fr" dataset, used to test the
# English-trained models on another language.
fr_qa_dataset = load_dataset("qwant/squad_fr", split="validation")

In [None]:
# Notebook display: show the loaded French dataset object.
fr_qa_dataset

In [None]:
# Predict on the French validation split with the same four pipelines.
fr_bert_predictions = get_predictions(fr_qa_dataset, bert_qa_pipeline)
fr_roberta_predictions = get_predictions(fr_qa_dataset, roberta_qa_pipeline)
fr_deepset_predictions = get_predictions(fr_qa_dataset, deepset_model)
fr_google_predictions = get_predictions(fr_qa_dataset, google_model)

In [None]:
# Reference answers for the French split, in the same order as the
# predictions above.
fr_ground_truth = get_ground_truth(fr_qa_dataset)

In [None]:
# Persist the French predictions and references under the relative "fr/"
# directory.
save_to_file(fr_bert_predictions, "fr/bert_predictions_squad.json")
save_to_file(fr_roberta_predictions, "fr/roberta_predictions_squad.json")
save_to_file(fr_deepset_predictions, "fr/deepset_predictions_squad.json")
save_to_file(fr_google_predictions, "fr/google_predictions_squad.json")
save_to_file(fr_ground_truth, "fr/ground_truth_squad.json")

In [None]:
# Score each model's French predictions against the French references.
# No no_answer_threshold is passed here, so the metric's default applies.
fr_bert_eval_result = em_f1_metric.compute(predictions=fr_bert_predictions, references=fr_ground_truth)
fr_roberta_eval_result = em_f1_metric.compute(predictions=fr_roberta_predictions, references=fr_ground_truth)
fr_deepset_eval_result = em_f1_metric.compute(predictions=fr_deepset_predictions, references=fr_ground_truth)
fr_google_eval_result = em_f1_metric.compute(predictions=fr_google_predictions, references=fr_ground_truth)

In [None]:
# Persist the French metric results, one JSON file per model.
save_to_file(fr_bert_eval_result, "fr/bert_eval.json")
save_to_file(fr_roberta_eval_result, "fr/roberta_eval.json")
save_to_file(fr_deepset_eval_result, "fr/deepset_eval.json")
save_to_file(fr_google_eval_result, "fr/google_eval.json")