1. Load the Data and Split It into Train and Test Sets

In [None]:
from datasets import load_dataset

# Load the "pqa_labeled" configuration of pubmed_qa, take its "train" split, and
# divide it 70/30 into train and test subsets. The fixed seed keeps the split
# identical across runs.
dataset = load_dataset("pubmed_qa", "pqa_labeled")["train"].train_test_split(test_size=0.3, seed=42)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Sanity check: report how many examples landed in each subset.
print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")


2. Load Pre-trained Model and Evaluate on Test Subset

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import evaluate

# Load the tokenizer and a question-answering model from the same checkpoint.
# NOTE(review): this checkpoint name looks like a base (pre-training) encoder, so
# the QA head is presumably newly initialised rather than fine-tuned — confirm
# before trusting the "initial" scores computed from it.
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Wrap model + tokenizer in an extractive question-answering pipeline.
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Metrics used later to score predicted class ids against reference class ids.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


Quick evaluation on 300 samples from the test set:



In [None]:
import random

def map_answer_to_label(answer: str) -> str:
    """Map a free-text answer span onto a yes/no/maybe decision label.

    Cue words are matched as whole words (case-insensitively) rather than as raw
    substrings, so that e.g. "unknown" or "normal" is not read as "no" and
    "significant" is not read as "can". Negative cues take precedence over
    positive ones, so "does not" resolves to "no".

    Args:
        answer: Answer text produced by the QA pipeline. ``None`` or an empty
            string is treated as carrying no cue.

    Returns:
        "no" if a negative cue is present, otherwise "yes" if a positive cue is
        present, otherwise "maybe".
    """
    import re  # local import so the function does not rely on another cell

    text = (answer or "").lower()

    # "cannot" and contractions ("doesn't", "isn't") are negations too; the plain
    # word-boundary alternatives would otherwise miss them.
    negative_cues = r"\b(?:no|not|none|negative|cannot)\b|n't\b"
    # "reduce[sd]?" keeps the inflections the old substring test used to catch.
    positive_cues = r"\b(?:yes|does|can|reduce[sd]?|associated with|positive)\b"

    if re.search(negative_cues, text):
        return "no"
    if re.search(positive_cues, text):
        return "yes"
    return "maybe"

# Evaluate the QA pipeline on a reproducible sample of the test split.
N_EVAL_SAMPLES = 300
EVAL_SEED = 42

# A dedicated seeded RNG makes the sample identical across runs; capping at the
# split size keeps random.sample from raising ValueError on a smaller test set.
test_examples = list(test_dataset)
sampled_data = random.Random(EVAL_SEED).sample(
    test_examples, min(N_EVAL_SAMPLES, len(test_examples))
)
preds, refs = [], []
n_failed = 0

for example in sampled_data:
    question, label = example["question"], example["final_decision"]

    # The dataset's "context" field is a dict whose "contexts" entry is a list of
    # passages; the pipeline needs one plain string, so join the passages.
    context = example["context"]
    if isinstance(context, dict):
        context = " ".join(context["contexts"])

    try:
        result = qa_pipeline(question=question, context=context)
        pred_label = map_answer_to_label(result["answer"])
    except Exception as exc:
        # Best-effort fallback, but surface the failure instead of hiding it: a
        # silent fallback would make every prediction "maybe" without any hint.
        n_failed += 1
        if n_failed == 1:
            print(f"QA pipeline failed ({type(exc).__name__}: {exc}); falling back to 'maybe'")
        pred_label = "maybe"
    preds.append(pred_label)
    refs.append(label)

if n_failed:
    print(f"{n_failed}/{len(sampled_data)} examples fell back to 'maybe' after a pipeline error")

label2id = {"yes": 0, "no": 1, "maybe": 2}
id2label = {v: k for k, v in label2id.items()}
# Convert string labels to class ids; anything unexpected defaults to 'maybe'.
preds_ids = [label2id.get(p, 2) for p in preds]
refs_ids = [label2id.get(r, 2) for r in refs]

acc = accuracy_metric.compute(predictions=preds_ids, references=refs_ids)["accuracy"]
f1 = f1_metric.compute(predictions=preds_ids, references=refs_ids, average="macro")["f1"]

print(f"Initial Accuracy: {acc:.2f}, Initial F1: {f1:.2f}")


Evaluate using semantic similarity

In [None]:
from sentence_transformers import SentenceTransformer, util

model_Sentence = SentenceTransformer("all-MiniLM-L6-v2")

# preds and refs hold only a handful of distinct label strings, so embed each
# distinct string once in a single batch instead of re-encoding it for every pair.
unique_texts = sorted(set(preds) | set(refs))
text_embeddings = {}
if unique_texts:
    encoded = model_Sentence.encode(unique_texts, convert_to_tensor=True)
    text_embeddings = dict(zip(unique_texts, encoded))

# Cosine similarity between the predicted and reference label embeddings, per pair.
sim_scores = [
    util.cos_sim(text_embeddings[pred], text_embeddings[ref]).item()
    for pred, ref in zip(preds, refs)
]

# Guard against an empty evaluation set rather than dividing by zero.
avg_sim = sum(sim_scores) / len(sim_scores) if sim_scores else float("nan")
print(f"Average Semantic Similarity: {avg_sim:.2f}")


Train the model

In [None]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import torch


def preprocess(examples):
    """Tokenize a batch of question/context pairs to fixed-length inputs.

    Each example's context entry carries a list of passages under "contexts";
    the passages are joined with single spaces to form the second sequence of
    the pair. Pairs are truncated and padded to exactly 512 tokens.

    Args:
        examples: Batched mapping with "question" and "context" columns.

    Returns:
        The output of the module-level ``tokenizer`` for the batch.
    """
    questions = examples["question"]

    # One flattened passage string per example in the batch.
    passages = []
    for entry in examples["context"]:
        passages.append(' '.join(entry['contexts']))

    return tokenizer(
        questions,
        passages,
        truncation=True,
        padding="max_length",
        max_length=512,
    )



# Tokenize both splits in batches; map() adds the tokenizer outputs as new columns
# alongside the existing ones.
tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_test = test_dataset.map(preprocess, batched=True)

# Dummy labels for QA (start/end tokens) to make it minimally trainable
def add_dummy_labels(example):
    """Attach constant placeholder span labels to a single example.

    Sets ``start_positions`` to 0 and ``end_positions`` to 1 on the given
    example (modifying it in place) and returns that same object.
    """
    placeholder_span = (("start_positions", 0), ("end_positions", 1))
    for field, position in placeholder_span:
        example[field] = position
    return example

# Add the placeholder start/end labels to every example of both tokenized splits
# (applied one example at a time, since batched is not set here).
tokenized_train = tokenized_train.map(add_dummy_labels)
tokenized_test = tokenized_test.map(add_dummy_labels)



In [None]:
# Display the tokenized training set to inspect the columns present at this point.
tokenized_train

In [None]:
# Drop the raw columns so that only the tokenizer outputs and the start/end labels
# remain; required because the training arguments set remove_unused_columns=False.
train_token_dataset = tokenized_train.remove_columns(["question", "final_decision", "context", "long_answer", "pubid"])


In [None]:
# Training arguments: a single epoch with small batches and no evaluation during
# training.
training_args = TrainingArguments(
    output_dir="./results",  # where checkpoints and logs are written
    per_device_train_batch_size=4,
    num_train_epochs=1,
    eval_strategy="no",  # no evaluation pass is run while training
    logging_steps=10,
    save_steps=10000,
    save_total_limit=1,
    remove_unused_columns=False  # columns are pruned by hand before training
)





In [None]:
# Build the Trainer over the pruned training set; no eval dataset is supplied.
# NOTE(review): newer transformers releases appear to prefer `processing_class=`
# over `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_token_dataset,
    tokenizer=tokenizer
)



In [None]:
# Run the fine-tuning loop with the arguments configured above. Note that the
# training labels are the constant placeholder start/end positions, not real
# answer spans.
trainer.train()