In [None]:
!pip install transformers
!pip install accelerate
!pip install datasets
!pip install evaluate

# 1. Training

In [None]:
# Name of the pretrained checkpoint the tokenizer and QA model are loaded from.
MODEL = "xlm-roberta-base"
# Output directory of the Trainer; also the directory scanned for checkpoints during evaluation.
REPO = "xlm-roberta-base-uqa"
# Learning rate passed to TrainingArguments.
LEARNING_RATE = 2e-5
# Number of training epochs passed to TrainingArguments.
EPOCHS = 6

### 1.1. Load Dataset

In [None]:
def filter_function(example):
    """Predicate for `Dataset.filter`: keep an example unless it is flagged as impossible.

    Args:
        example: A mapping with an 'is_impossible' entry.

    Returns:
        bool: False when 'is_impossible' is truthy, True otherwise.
    """
    flagged_impossible = example['is_impossible']
    return not flagged_impossible

In [None]:
# `load_dataset` is the function this cell calls, so it must be imported;
# previously only `load_from_disk` was imported and the call raised NameError.
from datasets import load_dataset, load_from_disk

# NOTE(review): if "UQA" is a local directory written by `save_to_disk`,
# `load_from_disk("UQA")` is the matching loader - confirm which one is intended.
raw_datasets = load_dataset("UQA")

# Drop examples flagged as impossible from both splits (see `filter_function`).
raw_datasets["train"] = raw_datasets["train"].filter(filter_function)
raw_datasets["validation"] = raw_datasets["validation"].filter(filter_function)

In [None]:
# Display the filtered dataset object to inspect its splits.
raw_datasets

### 1.2. Load Model

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load the tokenizer and the question-answering model from the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL)

### 1.3. Process Dataset

In [None]:
# Length every tokenized feature is truncated/padded to (passed to the tokenizer as max_length).
max_length = 384
# Passed to the tokenizer as `stride` together with return_overflowing_tokens=True,
# i.e. the overlap between consecutive windows of a long context.
stride = 128

def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and attach answer token positions.

    Each question/context pair is tokenized with only the context being
    truncated; overflowing context is returned as additional features, so one
    example may produce several features. For every feature the answer's
    character span is converted to start/end token indices. Features whose
    context window does not fully contain the answer are labelled (0, 0).

    Args:
        examples: A batch (dict of lists) with "question", "context",
            "answer" and "answer_start" columns.

    Returns:
        The tokenizer output, without "offset_mapping" and
        "overflow_to_sample_mapping", plus "start_positions" and
        "end_positions" lists (one entry per feature).
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # These two entries are only needed to compute the labels below.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answer"]
    answer_starts = examples["answer_start"]
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(offset_mapping):
        # Map the feature back to the example it was cut from.
        source_idx = sample_map[feature_idx]
        answer_text = answers[source_idx]
        answer_begin = answer_starts[source_idx]
        answer_finish = answer_begin + len(answer_text)
        seq_ids = inputs.sequence_ids(feature_idx)

        # Locate the first and last token that belong to the context (sequence id 1).
        first_ctx = 0
        while seq_ids[first_ctx] != 1:
            first_ctx += 1
        last_ctx = first_ctx
        while seq_ids[last_ctx] == 1:
            last_ctx += 1
        last_ctx -= 1

        answer_in_window = (
            offsets[first_ctx][0] <= answer_begin
            and offsets[last_ctx][1] >= answer_finish
        )
        if not answer_in_window:
            # The answer is not fully inside this window: label it (0, 0).
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last token that starts at or before the answer start.
        cursor = first_ctx
        while cursor <= last_ctx and offsets[cursor][0] <= answer_begin:
            cursor += 1
        start_positions.append(cursor - 1)

        # Walk backward to the first token that ends at or after the answer end.
        cursor = last_ctx
        while cursor >= first_ctx and offsets[cursor][1] >= answer_finish:
            cursor -= 1
        end_positions.append(cursor + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize and label the training split in batches. The original columns are
# removed so that only the columns returned by the preprocessing function remain.
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
# Number of examples before vs. number of features after preprocessing.
len(raw_datasets["train"]), len(train_dataset)

In [None]:
# Apply the same preprocessing to the validation split; it is passed to the
# Trainer as eval_dataset below.
validation_dataset = raw_datasets["validation"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
# Number of examples before vs. number of features after preprocessing.
len(raw_datasets["validation"]), len(validation_dataset)

### 1.4. Start Training

In [None]:
from transformers import default_data_collator, TrainingArguments, Trainer

# Training configuration: evaluate and save a checkpoint once per epoch.
# NOTE(review): newer transformers releases name this argument `eval_strategy`;
# since the install above is unpinned, confirm `evaluation_strategy` is still accepted.
args = TrainingArguments(
    output_dir=REPO,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    # 8 accumulation steps x 16 examples per device = 128 examples per optimizer step per device.
    gradient_accumulation_steps=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # Mixed-precision training. NOTE(review): presumably requires a GPU - confirm.
    fp16=True,
    # NOTE(review): presumably requires being logged in to the Hugging Face Hub - confirm.
    push_to_hub=True
)

# Features were already padded to max_length during preprocessing, so the
# default collator is used as-is.
data_collator = default_data_collator

In [None]:
# Build the Trainer from the model, the training arguments and the
# preprocessed train/validation features.
# NOTE(review): passing the tokenizer presumably makes it get saved with each
# checkpoint (the evaluation section loads a tokenizer from checkpoint
# directories) - confirm.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

# 2. Evaluation

### 2.1. Prepare Dataset

In [None]:
from datasets import Dataset, DatasetDict

def merge_duplicate_ids(dataset):
    """Collapse rows that share an id into one record with lists of answers.

    The first row seen for an id supplies 'title', 'context', 'question' and
    'is_impossible'; the 'answer' and 'answer_start' values of every row with
    that id are collected, in row order, into lists.

    Args:
        dataset: An object whose `to_dict()` returns a dict of equally long
            columns containing 'id', 'title', 'context', 'question',
            'is_impossible', 'answer' and 'answer_start'.

    Returns:
        list[dict]: One record per distinct id, in order of first appearance.
    """
    columns = dataset.to_dict()
    records_by_id = {}

    for row, record_id in enumerate(columns['id']):
        record = records_by_id.get(record_id)
        if record is None:
            # First row for this id: create the record with single-element lists.
            records_by_id[record_id] = {
                'id': record_id,
                'title': columns['title'][row],
                'context': columns['context'][row],
                'question': columns['question'][row],
                'is_impossible': columns['is_impossible'][row],
                'answer': [columns['answer'][row]],
                'answer_start': [columns['answer_start'][row]]
            }
            continue
        # Repeated id: only the answer and its start offset are appended.
        record['answer'].append(columns['answer'][row])
        record['answer_start'].append(columns['answer_start'][row])

    return list(records_by_id.values())

def _records_to_dataset(records):
    """Turn a non-empty list of row dicts into a Dataset; columns come from the first row's keys."""
    column_names = records[0]
    columns = {}
    for name in column_names:
        columns[name] = [record[name] for record in records]
    return Dataset.from_dict(columns)


# Group rows that share an id so each question carries all of its answers.
merged_train_data = merge_duplicate_ids(raw_datasets['train'])
merged_validation_data = merge_duplicate_ids(raw_datasets['validation'])

merged_train_dataset = _records_to_dataset(merged_train_data)
merged_validation_dataset = _records_to_dataset(merged_validation_data)

# Dataset used by the evaluation section below.
dataset = DatasetDict({
    'train': merged_train_dataset,
    'validation': merged_validation_dataset
})

dataset

### 2.2. Load Evaluation Functions

In [None]:
# Import the library under an alias: the notebook later defines a function
# that is also named `evaluate`, which rebinds that name. With the alias this
# cell can be re-run at any time without hitting the function instead of the
# module.
import evaluate as hf_evaluate

# SQuAD metric, used to score predictions in the evaluation loop.
metric = hf_evaluate.load("squad")

### 2.3. Evaluate

In [None]:
import os
from tqdm import tqdm
from glob import glob
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [None]:
def evaluate_batch(model, tokenizer, batch):
    """Predict one answer string per example in a batch.

    Each question/context pair is encoded as a single sequence of up to 512
    tokens (longer inputs are truncated; no sliding window is applied). The
    predicted span runs from the arg-max start logit to the arg-max end logit.

    Args:
        model: A question-answering model exposing a `device` attribute and
            returning `start_logits` / `end_logits`.
        tokenizer: The tokenizer matching `model`.
        batch: A dict with "question" and "context" lists of equal length.

    Returns:
        list[str]: The decoded predicted answers, in batch order. An empty
        string is returned when the predicted end precedes the predicted start.
    """
    # Follow the model's device instead of assuming CUDA.
    device = model.device
    inputs = tokenizer(
        batch["question"],
        batch["context"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        output = model(**inputs)
    start_logits, end_logits = output.start_logits, output.end_logits

    answers = []
    for i in range(len(start_logits)):
        start_idx = int(torch.argmax(start_logits[i]))
        end_idx = int(torch.argmax(end_logits[i]))
        if end_idx < start_idx:
            # Start and end are chosen independently, so the span can be
            # reversed; treat that as an empty prediction.
            answers.append("")
            continue
        span_ids = inputs["input_ids"][i][start_idx:end_idx + 1]
        # Skip special tokens so markers such as the sequence-start token do
        # not leak into the predicted text.
        answers.append(tokenizer.decode(span_ids, skip_special_tokens=True))

    return answers

In [None]:
def evaluate(model_dir, dataset):
    """Score every saved checkpoint on the validation split with the SQuAD metric.

    For each `checkpoint-*` directory under `model_dir` (visited in ascending
    step order) the tokenizer and model are loaded, answers are predicted for
    `dataset["validation"]` in batches of 16 with `evaluate_batch`, and the
    checkpoint path and metric result are printed.

    Args:
        model_dir: Directory containing `checkpoint-*` sub-directories.
        dataset: A mapping with a "validation" split whose rows have "id",
            "question", "context", and list-valued "answer" / "answer_start".

    Returns:
        None. Results are printed.
    """
    def _step_of(path):
        # Order "checkpoint-<step>" numerically; unexpected suffixes go last.
        suffix = path.rsplit("-", 1)[-1]
        return int(suffix) if suffix.isdigit() else float("inf")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    checkpoints = sorted(glob(f"{model_dir}/checkpoint-*"), key=_step_of)
    for checkpoint in checkpoints:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForQuestionAnswering.from_pretrained(checkpoint).to(device)

        predictions = []
        references = []

        batch_size = 16
        for i in tqdm(range(0, len(dataset["validation"]), batch_size)):
            batch = dataset["validation"][i: i + batch_size]
            # Previously called `evaluate_batch_filtered`, which is not defined
            # in this notebook; `evaluate_batch` is the prediction helper.
            preds = evaluate_batch(model, tokenizer, batch)
            for j in range(len(preds)):
                # One reference entry per gold answer of the question.
                gold_answers = [
                    {"text": text, "answer_start": start}
                    for text, start in zip(batch["answer"][j], batch["answer_start"][j])
                ]
                references.append({"id": batch["id"][j], "answers": gold_answers})
                predictions.append({"id": batch["id"][j], "prediction_text": preds[j]})
        print(checkpoint)
        print(metric.compute(predictions=predictions, references=references))

In [None]:
# Evaluate every checkpoint saved under REPO on the merged validation split.
evaluate(REPO, dataset)