**Installing requirements**

In [None]:
!pip install -q transformers accelerate evaluate tqdm

**Importing libraries**

In [26]:
import json, torch
from pathlib import Path
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
)
from evaluate import load as load_metric

**Using or overriding the postprocess**

In [27]:
try:
    from transformers.utils.question_answering import postprocess_qa_predictions
except Exception:
    import torch
    def postprocess_qa_predictions(
        examples,
        features,
        all_start_logits,
        all_end_logits,
        n_best_size=20,
        max_answer_length=30,
        **kwargs
    ):
        ex           = examples[0]
        offsets      = features[0]["offset_mapping"]
        start_scores = all_start_logits[0]
        end_scores   = all_end_logits[0]

        start_idx = torch.argmax(start_scores).item()
        end_idx   = torch.argmax(end_scores).item()

        if end_idx < start_idx or end_idx - start_idx + 1 > max_answer_length:
            return {ex["id"]: ""}

        start_char = offsets[start_idx][0]
        end_char   = offsets[end_idx][1]
        return {ex["id"]: ex["context"][start_char:end_char]}

**Loading the model and data**

In [28]:
MODEL_NAME   = "pedramyazdipoor/persian_xlm_roberta_large"
TEST_FILE    = Path("/kaggle/input/test-set/pqa_test.json")
DEVICE       = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN      = 512
N_BEST       = 20
MAX_ANS_LEN  = 30

print(f"Running on: {DEVICE}")

Running on: cuda


In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(DEVICE).eval()

In [30]:
with TEST_FILE.open(encoding="utf-8") as f:
    dataset = json.load(f)["data"]

**Evaluation**

In [None]:
preds, refs = [], []
metric = load_metric("squad_v2")
for art in tqdm(dataset, desc="Articles"):
    for para in art["paragraphs"]:
        ctx = para["context"]
        for qa in para["qas"]:
            qid, question = str(qa["id"]), qa["question"]

            tok = tokenizer(
                question, ctx,
                return_offsets_mapping=True,
                truncation=True,
                max_length=MAX_LEN,
                return_tensors="pt"
            )


            offset_mapping = tok.pop("offset_mapping")
            seq_ids = tok.sequence_ids(0)
            context_mask = torch.tensor(
                [sid == 1 for sid in seq_ids], dtype=torch.bool
            )

            tok = {k: v.to(DEVICE) for k, v in tok.items()}
            context_mask = context_mask.to(DEVICE)

            with torch.no_grad():
                out = model(**tok)

            out.start_logits[0][~context_mask] = -1e9
            out.end_logits[0][~context_mask]   = -1e9

            pred_dict = postprocess_qa_predictions(
                examples=[{"id": qid, "context": ctx, "question": question}],
                features=[{"offset_mapping": offset_mapping[0]}],
                all_start_logits=out.start_logits,
                all_end_logits=out.end_logits,
                n_best_size=N_BEST,
                max_answer_length=MAX_ANS_LEN
            )
            pred_span = pred_dict[qid]

            preds.append({
                "id": qid,
                "prediction_text": pred_span,
                "no_answer_probability": 0.0
            })
            refs.append({
                "id": qid,
                "answers": {
                    "text":         [a["text"] for a in qa["answers"]],
                    "answer_start": [a["answer_start"] for a in qa["answers"]]
                }
            })

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Articles:   0%|          | 0/93 [00:00<?, ?it/s]

**Results**

In [32]:
scores = metric.compute(predictions=preds, references=refs)
print(f"\nExact Match: {scores['exact']:.2f}")
print(f"F1 Score   : {scores['f1']:.2f}")


Exact Match: 51.08
F1 Score   : 65.08
