### This notebook should be used for extracting the test data, where each question has only the attributes `id`, `body`, and `type`

In [1]:
from Bio import Entrez
from keybert import KeyBERT
import json
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import xml.etree.ElementTree as ET
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import random
import time
from datetime import datetime

# Path of the BioASQ test-set JSON file that is passed to load_question_from_file.
data_file_path = "../datasets/test/batch_3/BioASQ-task13bPhaseA-testset3"
# Prefix that parse_pubmed_abstracts_from_xml prepends to every PMID to build the "pid" value.
base_pid_url = 'http://www.ncbi.nlm.nih.gov/pubmed/' 
# Contact address set on Biopython's Entrez module for the NCBI requests made below.
Entrez.email = "kasapovic.m@hotmail.com"  # must be a valid email

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_question_from_file(file_path):
    """
    Read a BioASQ-style JSON file and return the questions it contains.

    Loading is best-effort: any failure (unreadable file, invalid JSON,
    unexpected top-level structure) is reported on stdout and an empty list
    is returned instead of raising.

    Parameters:
        file_path (str): Path to the JSON file.

    Returns:
        List[dict]: The value stored under the top-level "questions" key, or
        an empty list when that key is absent or loading failed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            payload = json.loads(source.read())
        loaded = payload.get("questions", [])
        print(f"Loaded {len(loaded)} questions from {file_path}")
    except Exception as err:
        print(f"Failed to load questions: {err}")
        return []
    return loaded

In [3]:
# Load the test-set questions, then display the first one to check its structure.
# load_question_from_file returns [] on failure, in which case the index below raises IndexError.
test_questions = load_question_from_file(data_file_path)
test_questions[0]

Loaded 85 questions from ../datasets/test/batch_3/BioASQ-task13bPhaseA-testset3


{'id': '67e6ce6e18b1e36f2e0000cf',
 'type': 'factoid',
 'body': 'How many primary genetic associations were identified through pQTL mapping within the Pharma Proteomics Project?'}

In [4]:
def get_questions_text(questions):
    """
    Reduce every question to just its identifier and its text.

    Parameters:
        questions (List[dict]): Question dictionaries, each containing at
            least the keys "id" and "body".

    Returns:
        List[dict]: One {"id": ..., "body": ...} dictionary per input
        question, in input order; all other keys are dropped.
    """
    trimmed = []
    for entry in questions:
        trimmed.append({'id': entry['id'], 'body': entry['body']})
    return trimmed

In [5]:
def save_to_json(data, file_path):
    """
    Write ``data`` to ``file_path`` as UTF-8 JSON with a 4-space indent.

    Non-ASCII characters are written as-is rather than escaped. The outcome
    (success or failure) is reported on stdout; errors are not raised.

    Parameters:
        data (any): The JSON-serialisable data to save.
        file_path (str): Path to the JSON file.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    try:
        with open(file_path, 'w', encoding='utf-8') as sink:
            json.dump(data, sink, **dump_options)
        print(f"Saved data to {file_path}")
    except Exception as err:
        print(f"Failed to save data: {err}")

In [6]:
def parse_pubmed_abstracts_from_xml(xml_string, base_url=None):
    """
    Parse PubMed XML and extract PMID, title, and abstract.

    Title and abstract text is gathered with ``itertext()`` so that inline
    markup inside those elements (e.g. <i>, <sub>, <sup>) does not cut the
    text off at the first nested tag, which is what reading ``.text`` alone
    does.

    Args:
        xml_string (str | bytes): XML data from Entrez.efetch(..., retmode="xml")
        base_url (str | None): Prefix prepended to each PMID. When None
            (the default) the module-level ``base_pid_url`` is used.

    Returns:
        List[Dict]: List of articles with keys: 'pid', 'title', 'abstract'.
        'title' is None when the article has no ArticleTitle element and
        'abstract' is an empty string when it has no abstract text.
        Articles without a PMID are skipped.
    """
    prefix = base_pid_url if base_url is None else base_url
    root = ET.fromstring(xml_string)
    articles = []

    for article in root.findall(".//PubmedArticle"):
        pid = article.findtext(".//PMID")
        if not pid:
            # Without a PMID no "pid" URL can be built; skip this record
            # instead of failing the whole batch on `prefix + None`.
            continue

        title_node = article.find(".//ArticleTitle")
        title = "".join(title_node.itertext()) if title_node is not None else None

        # Handle multiple AbstractText parts (can have labels, e.g. "BACKGROUND", "METHODS", etc.)
        part_texts = (
            "".join(part.itertext())
            for part in article.findall(".//Abstract/AbstractText")
        )
        abstract = " ".join(text for text in part_texts if text)

        articles.append({
            "pid": prefix + pid,
            "title": title,
            "abstract": abstract
        })

    return articles

In [7]:
# Generate semantic queries
def generate_queries_BERT(question, model, keyphrase_ngram_range=(1, 3), top_n=7):
    """
    Extract keyphrases from a question to use as search queries.

    Parameters:
        question (str): The question text.
        model: Object exposing ``extract_keywords`` (a KeyBERT model in this notebook).
        keyphrase_ngram_range (tuple): Forwarded to ``extract_keywords``.
        top_n (int): Forwarded to ``extract_keywords``.

    Returns:
        List[str]: The first item of each pair returned by
        ``extract_keywords``, in the order returned; the second item of each
        pair is discarded.
    """
    scored_phrases = model.extract_keywords(
        question,
        keyphrase_ngram_range=keyphrase_ngram_range,
        stop_words='english',
        top_n=top_n,
    )
    phrases = []
    for phrase, _score in scored_phrases:
        phrases.append(phrase)
    return phrases

# Search PubMed for each query
def search_pubmed(query, retmax=500):
    """
    Run an ESearch against the "pubmed" database and return the matching IDs.

    Parameters:
        query (str): Search term passed to Entrez.esearch as ``term``.
        retmax (int): Passed to Entrez.esearch as ``retmax``.

    Returns:
        The "IdList" entry of the parsed ESearch record.
    """
    handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    return record["IdList"]

# Fetch article details as XML and parse them
def fetch_details(id_list):
    """
    Fetch PubMed records for the given IDs and parse them into article dicts.

    Parameters:
        id_list (list): PubMed IDs; each is converted with ``str`` and the
            IDs are sent as one comma-separated ``id`` value.

    Returns:
        List[Dict]: Result of parse_pubmed_abstracts_from_xml on the fetched
        XML, or an empty list when ``id_list`` is empty (no request is made).
    """
    if not id_list:
        # Nothing to fetch: skip the request instead of sending an empty id.
        return []

    ids = ",".join(str(pmid) for pmid in id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="xml")
    try:
        xml_data = handle.read()
    finally:
        # Close the handle even when reading the response fails.
        handle.close()
    return parse_pubmed_abstracts_from_xml(xml_data)

In [8]:
import random
import time
import json
import os
import shutil
from datetime import datetime

#  Parameters
BATCH_SIZE = 5  # number of questions processed between two saves of results + checkpoint
RESULTS_FILE = "../datasets/test/batch_3/retrieved_articles_sampled_test_batch_3.json"
CHECKPOINT_FILE = "checkpoint.json"

#  Load checkpoint if it exists
# "last_index" is the index of the last question whose result was saved, so the
# run resumes at the following one. The fallback is -1 ("nothing saved yet"):
# a fallback of 0 would skip question 0 whenever the key is missing.
if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, "r") as f:
        checkpoint = json.load(f)
        start_index = checkpoint.get("last_index", -1) + 1
else:
    start_index = 0

# Initialize
questions = get_questions_text(questions=test_questions)
kw_model = KeyBERT(model='all-MiniLM-L6-v2')
results = []

#  Resume from saved results file if available
if os.path.exists(RESULTS_FILE):
    try:
        with open(RESULTS_FILE, "r") as f:
            saved = json.load(f)
        results = saved["results"]
        if not isinstance(results, list):
            raise TypeError("'results' is not a list")
    except (json.JSONDecodeError, KeyError, TypeError):
        print(" Warning: Results file is corrupted. Starting with empty results.")
        results = []

# Reconcile checkpoint and results. Each processed question contributes exactly
# one entry to `results`, so the two must agree. If they do not (stale or missing
# checkpoint, missing or corrupted results file), resume from the smaller of the
# two so that no question is skipped without a saved result and none is duplicated.
if len(results) != start_index:
    start_index = max(0, min(start_index, len(results)))
    results = results[:start_index]
    print(f" Warning: checkpoint and results file disagree. Resuming from question {start_index + 1}.")

# Settings for the retrieval loop below
SAMPLE_SIZE = 150           # maximum number of PMIDs whose details are fetched per question
SAMPLE_SEED = 42            # combined with the question id so sampling is reproducible
SEARCH_DELAY_SECONDS = 0.7  # pause after each PubMed search request
# NOTE(review): the E-utilities documentation describes a 10,000-record ceiling
# for PubMed ESearch, so this value is presumably capped by the server — confirm.
SEARCH_RETMAX = 20000

#  Process from start_index
for i in range(start_index, len(questions)):
    question = questions[i]
    qid = question['id']
    print(f"[{i+1}/{len(questions)}] Question: {question['body']}")

    queries = generate_queries_BERT(question['body'], keyphrase_ngram_range=(1, 10), top_n=20, model=kw_model)
    print(f" Generated Queries: {queries}")

    # Union of the PMIDs returned by all queries; a failing query is reported and skipped.
    all_ids = set()
    for q in queries:
        try:
            ids = search_pubmed(q, retmax=SEARCH_RETMAX)
            all_ids.update(ids)
            time.sleep(SEARCH_DELAY_SECONDS)
        except Exception as e:
            print(f" Failed to search PubMed with query '{q}': {e}")

    print(f" Found {len(all_ids)} unique PMIDs for this question.")

    # The test set has no ground-truth documents, so the candidates are a plain
    # random sample. The generator is seeded per question and the IDs are sorted
    # first (set order is not stable between runs), so the same question always
    # yields the same sample, including after resuming from a checkpoint.
    rng = random.Random(f"{SAMPLE_SEED}:{qid}")
    sampled_random = rng.sample(sorted(all_ids), min(SAMPLE_SIZE, len(all_ids)))

    print(f"Fetching details for {len(sampled_random)} PMIDs (Random Sample)")

    try:
        details = fetch_details(sampled_random)
    except Exception as e:
        # Best effort: the question is still recorded, with no articles.
        print(f"Error fetching details: {e}")
        details = []

    results.append({
        "qid": qid,
        "question": question['body'],
        "all_retreived_articles": details
    })

    print(f"Saved result for question \"{question['body']}\"\n")

    # Save batch and update checkpoint
    if (i + 1) % BATCH_SIZE == 0 or i == len(questions) - 1:
        print(f"Saving batch at question {i+1}...")

        # Save to a temporary file first
        temp_file = RESULTS_FILE + ".tmp"
        with open(temp_file, "w", encoding="utf-8") as f:
            json.dump({"results": results}, f, indent=2)

        # Swap the temp file into place. os.replace overwrites the destination
        # in a single step, so the results file is never left half-written.
        os.replace(temp_file, RESULTS_FILE)

        # Save checkpoint
        with open(CHECKPOINT_FILE, "w", encoding="utf-8") as f:
            json.dump({"last_index": i}, f)

        print(f"Batch saved and checkpoint updated at index {i}\n")


[51/85] Question: What is the mechanism of action of Xalnesiran?
 Generated Queries: ['mechanism action xalnesiran', 'action xalnesiran', 'xalnesiran', 'mechanism action', 'mechanism', 'action']
 Found 25218 unique PMIDs for this question.
Fetching details for 150 PMIDs (Random Sample)
Saved result for question "What is the mechanism of action of Xalnesiran?"

[52/85] Question: What is Tenecteplase?
 Generated Queries: ['tenecteplase']
 Found 1124 unique PMIDs for this question.
Fetching details for 150 PMIDs (Random Sample)
Saved result for question "What is Tenecteplase?"

[53/85] Question: Is there any DNA vaccine approved for use in humans?
 Generated Queries: ['dna vaccine approved use humans', 'dna vaccine approved use', 'dna vaccine approved', 'vaccine approved use humans', 'dna vaccine', 'vaccine approved use', 'vaccine approved', 'vaccine', 'approved use humans', 'dna', 'use humans', 'approved use', 'humans', 'approved', 'use']
 Found 65683 unique PMIDs for this question.
Fetc