**Installing requirements**

In [1]:
!pip install -q transformers datasets evaluate tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 req

**Importing libraries**

In [3]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from datasets import load_dataset
import json, torch, evaluate
from tqdm.auto import tqdm
from pathlib import Path

2025-07-20 02:26:14.955141: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752978375.167140      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752978375.228886      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


**Model and data loading**

In [5]:
MODEL_NAME = "pedramyazdipoor/parsbert_question_answering_PQuAD"
TEST_JSON  = Path("/kaggle/input/test-set/pqa_test.json")

**Using GPU**

In [6]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device ⇒", DEVICE)

Device ⇒ cuda


**Loading requirements and making configuration**

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME, ignore_mismatched_sizes=True).to(DEVICE)

qa_pipe = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=0 if DEVICE.type=="cuda" else -1,
    max_seq_len=512,
    doc_stride=128
)

tokenizer_config.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/651M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/651M [00:00<?, ?B/s]

Some weights of the model checkpoint at pedramyazdipoor/parsbert_question_answering_PQuAD were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at pedramyazdipoor/parsbert_question_answering_PQuAD and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be 

In [8]:
with TEST_JSON.open(encoding="utf-8") as f:
    raw = json.load(f)

**References making**

In [9]:
refs_dict = {}
for art in raw["data"]:
    for para in art["paragraphs"]:
        ctx = para["context"]
        for qa in para["qas"]:
            qid = str(qa["id"])
            if qa["is_impossible"] or not qa["answers"]:
                refs_dict[qid] = {
                    "context": ctx,
                    "question": qa["question"],
                    "answers": {"text": [""], "answer_start": [-1]}
                }
            else:
                texts  = [a["text"] for a in qa["answers"]]
                starts = [a["answer_start"] for a in qa["answers"]]
                refs_dict[qid] = {
                    "context": ctx,
                    "question": qa["question"],
                    "answers": {"text": texts, "answer_start": starts}
                }

**Prediction**

In [10]:
preds, refs = [], []
for qid, item in tqdm(refs_dict.items(), desc="Predicting"):
    res = qa_pipe({"context": item["context"], "question": item["question"]})
    preds.append({
        "id": qid,
        "prediction_text": res.get("answer", ""),
        "no_answer_probability": 0.0
    })
    refs.append({
        "id": qid,
        "answers": item["answers"]
    })

Predicting:   0%|          | 0/930 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


**Results**

In [None]:
metric = evaluate.load("squad_v2")
scores = metric.compute(predictions=preds, references=refs)
print(f"Exact Match: {scores['exact']:.2f}")
print(f"F1 Score   : {scores['f1']:.2f}")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Exact Match: 0.00
F1 Score   : 5.79
