# **Benchmarking DeBERTa-v3-large-squad2 fine tuned on hotpotQA for full answer (4 Layer Unfreeze)**

In [1]:
!pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow, evaluate
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 19.0.1
    Uninstalling pyarrow-19.0.1:
      Successfully uninstalled pyarrow-19.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigfr

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import evaluate

# ------------------------------
# Load model
# ------------------------------
model_path = "/kaggle/input/deberta-v3-large-squad2-deepset-hotpot-fullanswer/kaggle/working/deberta-v3-large-squad-v2-hotpot"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

qa = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    handle_impossible_answer=False,   # HotpotQA always has an answer
    batch_size=16
)

# ------------------------------
# Load HotpotQA dev set
# ------------------------------
dataset = load_dataset("hotpot_qa", "fullwiki")
hotpot_val = dataset["validation"]

predictions = []
references = []

for ex in hotpot_val:
    # Concat paragraphs only
    paragraphs = [p[1] for p in ex["context"]]
    context = "\n".join(paragraphs)

    question = ex["question"]
    id_ = ex["id"]

    # Run QA model
    try:
        pred = qa({
            "context": context,
            "question": question
        })
        predicted_answer = pred.get("answer", "").strip()
    except:
        predicted_answer = ""

    # Prediction format
    predictions.append({
        "id": id_,
        "prediction_text": predicted_answer,
        "no_answer_probability": 0.0   # not used but expected by squad_v2 metric
    })

    # Reference format
    references.append({
        "id": id_,
        "answers": {
            "text": [ex["answer"]],
            "answer_start": [0]       # dummy start (HotpotQA does not provide span)
        }
    })

# ------------------------------
# Compute metrics (use squad_v2)
# ------------------------------
metric = evaluate.load("squad_v2")
results = metric.compute(predictions=predictions, references=references)

print("HotpotQA Exact Match (EM):", results["exact"])
print("HotpotQA F1:", results["f1"])


2025-11-14 15:49:31.979327: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763135372.471973      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763135372.583231      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Device set to use cuda:0


README.md: 0.00B [00:00, ?B/s]

fullwiki/train-00000-of-00002.parquet:   0%|          | 0.00/166M [00:00<?, ?B/s]

fullwiki/train-00001-of-00002.parquet:   0%|          | 0.00/166M [00:00<?, ?B/s]

fullwiki/validation-00000-of-00001.parqu(…):   0%|          | 0.00/28.0M [00:00<?, ?B/s]

fullwiki/test-00000-of-00001.parquet:   0%|          | 0.00/27.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7405 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

HotpotQA Exact Match (EM): 0.0
HotpotQA F1: 0.1438456994321474


# **Benchmarking DeBERTa-v3-large-squad2 fine tuned on hotpotQA for gold standard Paragraph**

In [3]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import evaluate

model_path = "/kaggle/input/deberta-v3-large-squad2-finetune-hotpot-goldpara/debarta-squad-hotpot-final/checkpoint-2000"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

qa = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    handle_impossible_answer=False,   # HotpotQA has no null answers
    batch_size=16
)

dataset = load_dataset("hotpot_qa", "fullwiki")
hotpot_val = dataset["validation"]

predictions = []
references = []

for ex in hotpot_val:
    paragraphs = [p[1] for p in ex["context"]]
    context = "\n".join(paragraphs)

    question = ex["question"]
    id_ = ex["id"]

    try:
        pred = qa({
            "context": context,
            "question": question
        })
        predicted_answer = pred.get("answer", "").strip()
    except:
        predicted_answer = ""

    # PREDICTION FORMAT
    predictions.append({
        "id": id_,
        "prediction_text": predicted_answer,
        "no_answer_probability": 0.0
    })

    # REFERENCE FORMAT — MUST include no_answer_probability
    references.append({
        "id": id_,
        "answers": {
            "text": [ex["answer"]],
            "answer_start": [0]
        }
    })

metric = evaluate.load("squad_v2")
results = metric.compute(predictions=predictions, references=references)

print("HotpotQA Exact Match (EM):", results["exact"])
print("HotpotQA F1:", results["f1"])


Device set to use cuda:0


HotpotQA Exact Match (EM): 0.0
HotpotQA F1: 0.11803478987813897


# **Benchmarking DeBERTa-v3-large fine tune on SQuAD 2.0 (3/4)**

**For only .pt file model**

In [6]:
import torch
from transformers import AutoTokenizer, DebertaV2ForQuestionAnswering

# Load saved state dict
state = torch.load("/kaggle/input/deberta-v3-large-finetune-squad2-0/step24648_epoch0.pt")

# Load base model and tokenizer
model = DebertaV2ForQuestionAnswering.from_pretrained("microsoft/deberta-v3-large")
model.load_state_dict(state["model"], strict=False)

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")


qa = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    handle_impossible_answer=False,      # HotpotQA always has an answer
    batch_size=16
)

# ------------------------------
# Load HotpotQA dev set
# ------------------------------
dataset = load_dataset("hotpot_qa", "fullwiki")
hotpot_val = dataset["validation"]

predictions = []
references = []

for ex in hotpot_val:
    # HotpotQA context is list of [title, paragraph] pairs → concat only paragraphs
    paragraphs = [p[1] for p in ex["context"]]
    context = "\n".join(paragraphs)

    question = ex["question"]
    id_ = ex["id"]

    # Run QA model
    try:
        pred = qa({
            "context": context,
            "question": question
        })
        predicted_answer = pred.get("answer", "").strip()
    except:
        predicted_answer = ""

    predictions.append({
        "id": id_,
        "prediction_text": predicted_answer,
        "no_answer_probability": 0.0
    })

    references.append({
        "id": id_,
        "answers": {"text": [ex["answer"]], "answer_start": [0]}
    })

# ------------------------------
# Compute metrics
# ------------------------------
metric = evaluate.load("squad_v2")
results = metric.compute(predictions=predictions, references=references)

print("HotpotQA Exact Match:", results["exact"])
print("HotpotQA F1:", results["f1"])



Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


HotpotQA Exact Match: 0.0
HotpotQA F1: 0.1408875951911262


# **DeBERTa-v3-large-squad2**

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import evaluate

# ------------------------------
# Load model
# ------------------------------
model_path = "deepset/deberta-v3-large-squad2"   # or your fine-tuned checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

qa = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    handle_impossible_answer=False,      # HotpotQA always has an answer
    batch_size=16
)

# ------------------------------
# Load HotpotQA dev set
# ------------------------------
dataset = load_dataset("hotpot_qa", "fullwiki")
hotpot_val = dataset["validation"]

predictions = []
references = []

for ex in hotpot_val:
    # HotpotQA context is list of [title, paragraph] pairs → concat only paragraphs
    paragraphs = [p[1] for p in ex["context"]]
    context = "\n".join(paragraphs)

    question = ex["question"]
    id_ = ex["id"]

    # Run QA model
    try:
        pred = qa({
            "context": context,
            "question": question
        })
        predicted_answer = pred.get("answer", "").strip()
    except:
        predicted_answer = ""

    predictions.append({
        "id": id_,
        "prediction_text": predicted_answer
    })

    references.append({
        "id": id_,
        "answers": {"text": [ex["answer"]], "answer_start": [0]}
    })

# ------------------------------
# Compute metrics
# ------------------------------
metric = evaluate.load("hotpot_qa")
results = metric.compute(predictions=predictions, references=references)

print("HotpotQA Exact Match:", results["exact_match"])
print("HotpotQA F1:", results["f1"])


# **BigBird-RoBERTa-large fine tuned on MASHQA**

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import evaluate

# ------------------------------
# Load model
# ------------------------------
model_path = "/kaggle/input/bigbird-finetuned-mashqa/bigbird-roberta-large-mashqa"   # or your fine-tuned checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

qa = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    handle_impossible_answer=False,      # HotpotQA always has an answer
    batch_size=16
)

# ------------------------------
# Load HotpotQA dev set
# ------------------------------
dataset = load_dataset("hotpot_qa", "fullwiki")
hotpot_val = dataset["validation"]

predictions = []
references = []

for ex in hotpot_val:
    # HotpotQA context is list of [title, paragraph] pairs → concat only paragraphs
    paragraphs = [p[1] for p in ex["context"]]
    context = "\n".join(paragraphs)

    question = ex["question"]
    id_ = ex["id"]

    # Run QA model
    try:
        pred = qa({
            "context": context,
            "question": question
        })
        predicted_answer = pred.get("answer", "").strip()
    except:
        predicted_answer = ""

    predictions.append({
        "id": id_,
        "prediction_text": predicted_answer
    })

    references.append({
        "id": id_,
        "answers": {"text": [ex["answer"]], "answer_start": [0]}
    })

# ------------------------------
# Compute metrics
# ------------------------------
metric = evaluate.load("hotpot_qa")
results = metric.compute(predictions=predictions, references=references)

print("HotpotQA Exact Match:", results["exact_match"])
print("HotpotQA F1:", results["f1"])


# **BigBird-RoBERTa-large**

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import evaluate

# ------------------------------
# Load model
# ------------------------------
model_path = "google/bigbird-roberta-large"   # or your fine-tuned checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

qa = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    handle_impossible_answer=False,      # HotpotQA always has an answer
    batch_size=16
)

# ------------------------------
# Load HotpotQA dev set
# ------------------------------
dataset = load_dataset("hotpot_qa", "fullwiki")
hotpot_val = dataset["validation"]

predictions = []
references = []

for ex in hotpot_val:
    # HotpotQA context is list of [title, paragraph] pairs → concat only paragraphs
    paragraphs = [p[1] for p in ex["context"]]
    context = "\n".join(paragraphs)

    question = ex["question"]
    id_ = ex["id"]

    # Run QA model
    try:
        pred = qa({
            "context": context,
            "question": question
        })
        predicted_answer = pred.get("answer", "").strip()
    except:
        predicted_answer = ""

    predictions.append({
        "id": id_,
        "prediction_text": predicted_answer
    })

    references.append({
        "id": id_,
        "answers": {"text": [ex["answer"]], "answer_start": [0]}
    })

# ------------------------------
# Compute metrics
# ------------------------------
metric = evaluate.load("hotpot_qa")
results = metric.compute(predictions=predictions, references=references)

print("HotpotQA Exact Match:", results["exact_match"])
print("HotpotQA F1:", results["f1"])
