In [40]:
from datasets import load_dataset
from rank_bm25 import BM25Okapi        # still used in preprocessing
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from nltk.tokenize import TreebankWordTokenizer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch

### NEW for semantic similarity
from sentence_transformers import SentenceTransformer, util

In [41]:
# Fix torch's global RNG seed (32) once, before any model is constructed.
torch.manual_seed(32)
# SentenceTransformer instances already loaded in this kernel, keyed by model
# name, so repeated evaluations do not reload the weights on every call.
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(name="all-MiniLM-L6-v2"):
    """Return the SentenceTransformer called `name`, loading it on first use only."""
    if name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[name] = SentenceTransformer(name)
    return _SENTENCE_MODEL_CACHE[name]


def evaluate_semantic_similarity(preds, refs, model=None):
    """Print and return the mean cosine similarity between paired texts.

    Each prediction is embedded together with the reference at the same
    position, and the cosine similarity of every pair is averaged.

    Args:
        preds: Iterable of predicted strings.
        refs: Iterable of reference strings, aligned with `preds`.
        model: Optional encoder exposing
            `encode(texts, convert_to_tensor=True)`. When omitted, the
            "all-MiniLM-L6-v2" SentenceTransformer is loaded once and reused.

    Returns:
        The average similarity as a Python float (also printed with two
        decimals).

    Raises:
        ValueError: If `preds` and `refs` differ in length or are empty.
    """
    preds, refs = list(preds), list(refs)
    # Validate before touching the encoder so bad input fails fast and cheaply.
    if len(preds) != len(refs):
        raise ValueError(
            f"preds and refs must have the same length (got {len(preds)} and {len(refs)})"
        )
    if not preds:
        raise ValueError("cannot average semantic similarity over zero pairs")

    encoder = _get_sentence_model() if model is None else model
    # Encode each side as one batch instead of one forward pass per string.
    pred_emb = encoder.encode(preds, convert_to_tensor=True)
    ref_emb = encoder.encode(refs, convert_to_tensor=True)
    # Row-wise cosine: pair i compares preds[i] with refs[i] only.
    sim_scores = torch.nn.functional.cosine_similarity(pred_emb, ref_emb, dim=1).tolist()
    avg_sim = sum(sim_scores) / len(sim_scores)
    print(f"Average Semantic Similarity: {avg_sim:.2f}")
    return avg_sim

In [42]:
# ——— load & split
# Take the "train" split of the pqa_labeled config and shuffle it with a fixed seed.
dataset = load_dataset("pubmed_qa", "pqa_labeled")["train"]
dataset = dataset.shuffle(seed=42)
# Hold out 20% of the shuffled rows for testing (seeded split).
split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
test_dataset = split["test"]


In [43]:
# ——— BM25 retriever (unchanged)
# One retrieval document per training example: its context passages joined by spaces.
corpus = list(' '.join(entry['contexts']) for entry in train_dataset["context"])
tokenizer_bm25 = TreebankWordTokenizer()
# BM25 index built over the Treebank-tokenised documents.
tokenized_corpus = list(map(tokenizer_bm25.tokenize, corpus))
bm25 = BM25Okapi(tokenized_corpus)
def retrieve_with_bm25(q, k=1):
    """Return the `k` documents from `corpus` that score highest for query `q`.

    The query is tokenised with the same tokenizer used to build the index,
    every document is scored, and the best-scoring documents are returned in
    descending score order (ties keep their corpus order).
    """
    query_tokens = tokenizer_bm25.tokenize(q)
    doc_scores = bm25.get_scores(query_tokens)
    # Stable sort on (index, score) pairs, highest score first.
    ranked = sorted(enumerate(doc_scores), key=lambda pair: pair[1], reverse=True)
    return [corpus[doc_idx] for doc_idx, _score in ranked[:k]]

In [44]:
# ——— preprocessing
# Class names listed in id order: the position of each name is its integer id.
_LABEL_NAMES = ('no', 'yes', 'maybe')
label2id = {name: idx for idx, name in enumerate(_LABEL_NAMES)}
id2label = dict(enumerate(_LABEL_NAMES))
# Tokenizer for the PubMedBERT checkpoint; the classifiers later in the notebook load the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

In [45]:
def preprocess(examples):
    """Tokenise (question, retrieved context) pairs and convert labels to ids.

    For every question the top BM25 document is fetched and encoded together
    with the question, truncated and padded to 512 tokens. Questions for which
    retrieval returns nothing are skipped.

    Returns:
        Dict with parallel lists under 'input_ids', 'attention_mask' and
        'labels'.
    """
    encoded = {'input_ids': [], 'attention_mask': [], 'labels': []}
    questions = examples['question']
    decisions = examples['final_decision']
    for question, decision in zip(questions, decisions):
        retrieved = retrieve_with_bm25(question)
        if retrieved:
            pair = tokenizer(
                question, retrieved[0],
                truncation=True, padding='max_length', max_length=512,
            )
            encoded['input_ids'].append(pair['input_ids'])
            encoded['attention_mask'].append(pair['attention_mask'])
            # Labels are lower-cased before lookup in label2id.
            encoded['labels'].append(label2id[decision.lower()])
    return encoded

# Encode both splits with the same pipeline (train first, then test).
train_enc, test_enc = preprocess(train_dataset), preprocess(test_dataset)

In [46]:
class PubMedQADataset(torch.utils.data.Dataset):
    """Torch dataset over a dict of per-example lists, indexed by position."""

    def __init__(self, enc):
        # `enc` maps a field name to a list holding one entry per example.
        self.enc = enc

    def __len__(self):
        # The 'labels' list defines how many examples there are.
        return len(self.enc['labels'])

    def __getitem__(self, i):
        # Build one example: every field's i-th entry converted to a tensor.
        item = {}
        for field, column in self.enc.items():
            item[field] = torch.tensor(column[i])
        return item

# Wrap each encoded split in the torch dataset class defined above.
train_ds = PubMedQADataset(train_enc)
test_ds = PubMedQADataset(test_enc)



In [47]:
# ——— classification metrics
def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 from a prediction object.

    `pred.predictions` holds per-class scores (argmax over axis 1 gives the
    predicted class id) and `pred.label_ids` holds the reference ids.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='macro'),
    }


In [48]:
# — PART 1: pretrained evaluation
print("=== PART 1: Pretrained evaluation ===")
# The checkpoint has no classification head, so from_pretrained creates one
# with random weights (hence the "newly initialized" warning). Re-seed right
# before construction so re-running this cell always builds the same baseline
# head, regardless of what earlier cells did to the RNG state.
torch.manual_seed(32)
pre_model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    num_labels=3, id2label=id2label, label2id=label2id
)
pre_trainer = Trainer(model=pre_model, compute_metrics=compute_metrics, eval_dataset=test_ds)

# A single inference pass provides both the metrics and the raw predictions.
# metric_key_prefix="eval" keeps the 'eval_accuracy' / 'eval_f1' keys that the
# final comparison cell reads from pre_res.
pred_out = pre_trainer.predict(test_ds, metric_key_prefix="eval")
pre_res = pred_out.metrics
print(pre_res)

# get raw preds & refs as texts
pred_ids = np.argmax(pred_out.predictions, axis=1)
ref_ids  = pred_out.label_ids
pred_texts = [id2label[i] for i in pred_ids]
ref_texts  = [id2label[i] for i in ref_ids]

print("Semantic similarity (pretrained):")
# Trailing ';' stops the notebook from echoing any return value of the call.
evaluate_semantic_similarity(pred_texts, ref_texts);

=== PART 1: Pretrained evaluation ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.2734869718551636, 'eval_model_preparation_time': 0.0, 'eval_accuracy': 0.175, 'eval_f1': 0.15578002244668912, 'eval_runtime': 147.167, 'eval_samples_per_second': 1.359, 'eval_steps_per_second': 0.17}




Semantic similarity (pretrained):
Average Semantic Similarity: 0.59


In [49]:
# — PART 2: finetune & re-evaluate
print("\n=== PART 2: Finetune & evaluate ===")
# The classification head is created with random weights by from_pretrained.
# Re-seed right before construction so re-running this cell starts fine-tuning
# from the same initial head instead of depending on prior RNG consumption.
torch.manual_seed(32)
ft_model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    num_labels=3, id2label=id2label, label2id=label2id
)
training_args = TrainingArguments(
    output_dir='./results', learning_rate=2e-5,
    per_device_train_batch_size=8, per_device_eval_batch_size=8,
    num_train_epochs=3, weight_decay=0.01, logging_steps=50
)
ft_trainer = Trainer(
    model=ft_model, args=training_args,
    train_dataset=train_ds, compute_metrics=compute_metrics
)
ft_trainer.train()

# A single inference pass provides both the metrics and the raw predictions.
# metric_key_prefix="eval" keeps the 'eval_accuracy' / 'eval_f1' keys that the
# final comparison cell reads from ft_res.
pred_out2 = ft_trainer.predict(test_ds, metric_key_prefix="eval")
ft_res = pred_out2.metrics
print(ft_res)

# get finetuned preds & refs
pred_ids2 = np.argmax(pred_out2.predictions, axis=1)
ref_ids2  = pred_out2.label_ids
pred_texts2 = [id2label[i] for i in pred_ids2]
ref_texts2  = [id2label[i] for i in ref_ids2]

print("Semantic similarity (finetuned):")
# Trailing ';' stops the notebook from echoing any return value of the call.
evaluate_semantic_similarity(pred_texts2, ref_texts2);



=== PART 2: Finetune & evaluate ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.9751
100,0.9596
150,0.922
200,0.8573
250,0.7603
300,0.7829




{'eval_loss': 1.138144850730896, 'eval_accuracy': 0.475, 'eval_f1': 0.3110209601081812, 'eval_runtime': 141.1494, 'eval_samples_per_second': 1.417, 'eval_steps_per_second': 0.177, 'epoch': 3.0}




Semantic similarity (finetuned):
Average Semantic Similarity: 0.83


In [11]:
# — FINAL comparison
# Side-by-side summary of both runs. Reads pre_res / pred_texts / ref_texts
# from the PART 1 cell and ft_res / pred_texts2 / ref_texts2 from the PART 2
# cell, so both must have been executed earlier in the same kernel session.
print("\n=== Comparison on test set ===")
print(f"Accuracy → pretrained: {pre_res['eval_accuracy']:.4f}, finetuned: {ft_res['eval_accuracy']:.4f}")
print(f"   F1    → pretrained: {pre_res['eval_f1']:.4f}, finetuned: {ft_res['eval_f1']:.4f}")

# evaluate_semantic_similarity prints the average itself; the assignments
# below keep whatever the call returns.
print("Average semantic similarity -> pretrained:")
semantic_pretrained = evaluate_semantic_similarity(pred_texts, ref_texts)

print("Average semantic similarity -> finetuned:")
semantic_finetuned = evaluate_semantic_similarity(pred_texts2, ref_texts2)





=== Comparison on test set ===
Accuracy → pretrained: 0.0850, finetuned: 0.4750
   F1    → pretrained: 0.0586, finetuned: 0.3110
Average semantic similarity -> pretrained:
Average Semantic Similarity: 0.47
Average semantic similarity -> finetuned:
Average Semantic Similarity: 0.83
