In [None]:
import os
import shutil
import tempfile
from tqdm import tqdm
import json
import re
from collections import defaultdict
from rank_bm25 import BM25Okapi
from data_manipulation import DataManipulator


In [4]:
# load json file
def load_data(file_path):
    """
    Read a UTF-8 encoded JSON file and return the value stored under its
    top-level 'data' key.

    Raises KeyError if the parsed JSON object has no 'data' key.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.loads(handle.read())
    return payload['data']

# --- Run configuration ---
# Exactly one `input_file_data` / `output_file` pair should be active; the
# commented-out assignments below are alternatives kept from other runs.

# test data with golden answers
#input_file_data = 'datasets/test_golden_answers/batch_1/parsed_data_final_test_batch_1.json'
# input_file_data = 'datasets/test/test_golden_answers/batch_2/retrieved_articles_sampled_test_batch_2.json'
# input_file_data = 'datasets/test/test_golden_answers/batch_3/retrieved_articles_sampled_test_batch_3.json'

# training data
#input_file_data = 'datasets/training/parsed_data_final.json'

# test data without golden answers
#input_file_data = "datasets/test/batch_3/retrieved_articles_sampled_test_batch_3.json"

# test output files
#output_file = 'results/bm25_results_test_batch_1.json'
# output_file = 'results/bm25_results_test_batch_2.json'
# output_file = 'results/bm25_results_test_batch_3.json'

# output file for batch 3 without golden answers
# NOTE(review): previously labelled "training output files", but the file name refers to batch 3 — confirm
#output_file ='results/bm25_results_batch_3_not_golden.json'


# batch 4 (the active configuration)
input_file_data = 'datasets/test/batch_4/retrieved_articles_sampled_test_batch_4.json'
output_file = 'results/bm25_results_batch_4_no_golden.json'



# Load the entries stored under the 'data' key of the selected input file.
data = load_data(file_path=input_file_data)

# Display the first entry as a quick check of the loaded structure.
data[0]

{'qid': '67e6cf2618b1e36f2e0000d0',
 'question': 'Should Zotiraciclib be used for glioblastoma?',
 'all_retreived_articles': [{'pid': 'http://www.ncbi.nlm.nih.gov/pubmed/38137175',
   'title': 'A Systematic Review of Nanomedicine in Glioblastoma Treatment: Clinical Efficacy, Safety, and Future Directions.',
   'abstract': '(1) Background: Glioblastoma (GBM) is categorized as a grade IV astrocytoma by the World Health Organization (WHO), representing the most aggressive and prevalent form of glioma. It presents a significant clinical challenge, with limited treatment options and poor prognosis. This systematic review evaluates the efficacy and safety of various nanotherapy approaches for GBM and explores future directions in tumor management. Nanomedicine, which involves nanoparticles in the 1-100 nm range, shows promise in improving drug delivery and targeting tumor cells. (2) Methods: Following PRISMA guidelines, a systematic search of databases including Google Scholar, NCBI PubMed, 

In [5]:
def tokenize(text):
    """
    Split text into lowercase tokens.

    Every character other than a-z, 0-9 and whitespace is deleted (not
    replaced by a space), so "COVID-19" becomes the single token "covid19".
    The remaining text is split on runs of whitespace.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return cleaned.split()


In [6]:
def prepare_corpus(articles):
    """
    Prepares tokenized documents for BM25 from article title + abstract.

    Articles whose combined title/abstract is empty, or tokenizes to nothing,
    are skipped, so the two returned lists are index-aligned with each other
    but not necessarily with `articles`.

    Returns (tokenized_corpus, article_refs)
    """
    corpus = []
    article_refs = []

    for article in articles:
        # `dict.get(key, '')` only falls back when the key is absent; a key
        # holding None (JSON null) would be formatted as the literal string
        # "None" and add a bogus "none" token, so coerce falsy values to ''.
        title = article.get('title') or ''
        abstract = article.get('abstract') or ''
        text = f"{title} {abstract}".strip()

        if not text:
            continue

        tokens = tokenize(text)
        if tokens:
            corpus.append(tokens)
            article_refs.append(article)  # keep the original dict for result building

    return corpus, article_refs

In [7]:
def rank_articles_bm25(question, articles, top_k=10):
    """
    Rank `articles` against `question` with BM25 and return the best matches.

    Args:
        question: Question text; tokenized with `tokenize`.
        articles: Iterable of article dicts (the 'pid', 'title' and
            'abstract' keys are read).
        top_k: Maximum number of articles to return. Defaults to 10, the
            cutoff that was previously hard-coded.

    Returns:
        A list of at most `top_k` dicts with keys 'pid', 'title', 'abstract'
        and 'score', ordered by descending BM25 score. Empty when no article
        yields any tokens.

    Raises:
        ValueError: if `top_k` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    corpus, article_refs = prepare_corpus(articles)
    if not corpus:
        return []

    bm25 = BM25Okapi(corpus)
    query = tokenize(question)

    # scores[i] belongs to corpus[i], which is index-aligned with article_refs[i].
    scores = bm25.get_scores(query)
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    top_articles = []
    for i in ranked_indices[:top_k]:
        article = article_refs[i]
        top_articles.append({
            'pid': article.get('pid', ''),
            'title': article.get('title', ''),
            'abstract': article.get('abstract', ''),
            'score': float(scores[i])  # plain float so the result is JSON-serializable
        })

    return top_articles

In [8]:
def save_results(results, output_file):
    """
    Write `results` to `output_file` as indented UTF-8 JSON.

    The parent directory is created if it does not exist yet, so writing to
    e.g. 'results/<name>.json' works on a fresh checkout. Non-ASCII
    characters are written as-is (ensure_ascii=False).
    """
    parent_dir = os.path.dirname(output_file)
    # A bare file name has an empty dirname, and os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

In [9]:
def _normalize_with_offsets(text):
    """
    Lowercase `text` and drop every character other than a-z, 0-9 and
    whitespace, recording for each kept character its index in `text`.

    Returns (normalized_text, source_index), where source_index[i] is the
    position in `text` of the character that produced normalized_text[i].
    """
    is_kept = re.compile(r'[a-z0-9\s]').match
    kept_chars = []
    source_index = []
    for position, char in enumerate(text):
        # lower() can expand one character into several; every resulting
        # character maps back to the same original position.
        for lowered in char.lower():
            if is_kept(lowered):
                kept_chars.append(lowered)
                source_index.append(position)
    return ''.join(kept_chars), source_index


def extract_snippets(question, top_articles):
    """
    Extract BioASQ-style snippets from the top-ranked articles.

    For each article and each of its 'title' / 'abstract' fields, whole-word
    occurrences of the normalized query terms are searched in the normalized
    field text. If any term matches, one snippet is emitted that spans from
    the first matched character to the last matched character.

    Offsets are expressed in the ORIGINAL field text, so
    field[offsetInBeginSection:offsetInEndSection] == snippet['text'].

    Args:
        question: Question text; split into query terms on word characters.
        top_articles: Iterable of article dicts (the 'pid', 'title' and
            'abstract' keys are read; missing or None fields are skipped).

    Returns:
        A list of snippet dicts with keys 'beginSection', 'endSection',
        'text', 'document', 'offsetInBeginSection' and 'offsetInEndSection'.
    """
    query_terms = [re.sub(r'[^a-z0-9]', '', t.lower()) for t in re.findall(r'\w+', question)]
    # Drop terms that normalize to '' (e.g. "_" or non-ASCII words): the
    # pattern r'\b\b' would otherwise match at every word boundary. Duplicate
    # terms are collapsed; they cannot change the resulting span.
    term_patterns = [
        re.compile(r'\b' + re.escape(term) + r'\b')
        for term in dict.fromkeys(query_terms)
        if term
    ]
    snippets = []

    for article in top_articles:
        pid = article.get('pid', '')

        for section in ['title', 'abstract']:
            # A field may be absent or None; treat both as empty text.
            field_text = article.get(section) or ''
            text_norm, source_index = _normalize_with_offsets(field_text)

            # Matches are found in the normalized text, then translated back
            # to positions in the original text. Terms are non-empty, so
            # m.end() - 1 always indexes a matched character.
            match_offsets = [
                (source_index[m.start()], source_index[m.end() - 1] + 1)
                for pattern in term_patterns
                for m in pattern.finditer(text_norm)
            ]

            if not match_offsets:
                continue

            snippet_start = min(offset[0] for offset in match_offsets)
            snippet_end = max(offset[1] for offset in match_offsets)

            # Both ends sit on matched alphanumeric characters, so the slice
            # needs no stripping and agrees exactly with the reported offsets.
            snippet_text = field_text[snippet_start:snippet_end]

            snippets.append({
                "beginSection": section,
                "endSection": section,
                "text": snippet_text,
                "document": pid,
                "offsetInBeginSection": snippet_start,
                "offsetInEndSection": snippet_end
            })

    return snippets

In [10]:
def rank_all_questions_bm25(data):
    """
    Run BM25 ranking and snippet extraction for every question entry.

    Each entry must provide 'question' and 'qid'; the candidate articles are
    read from 'all_retreived_articles' (empty list if absent). Progress is
    reported through tqdm.

    Returns a dict {'data': [...]} with one record per entry holding 'id',
    'question', 'top_10_articles' and 'snippets'.
    """
    def _process(entry):
        # Build the output record for a single question entry.
        question_text = entry['question']
        candidates = entry.get('all_retreived_articles', [])
        ranked = rank_articles_bm25(question_text, candidates)
        found_snippets = extract_snippets(question_text, ranked)
        return {
            'id': entry['qid'],
            'question': question_text,
            'top_10_articles': ranked,
            'snippets': found_snippets
        }

    progress = tqdm(data, desc="Processing questions...")
    return {'data': [_process(entry) for entry in progress]}

In [11]:
# Rank the retrieved articles and extract snippets for every loaded question,
# then write the combined results as JSON to `output_file`.
results_bm25 = rank_all_questions_bm25(data=data)
save_results(results_bm25, output_file)

Processing questions...: 100%|██████████| 85/85 [00:02<00:00, 40.26it/s]


The code below converts the results file into the format required for submission to BioASQ.

In [12]:
def transform_to_questions_format(data_dict):
    """
    Convert the ranking results into a {"questions": [...]} structure.

    Each item under data_dict["data"] becomes one question record:
      - "id":        the item's "id"
      - "body":      the item's "question"
      - "documents": the "pid" of every article in "top_10_articles"
                     (empty list if the key is absent)
      - "snippets":  the item's "snippets" (empty list if the key is absent)
    """
    questions = [
        {
            "id": record["id"],
            "body": record["question"],
            "documents": [doc["pid"] for doc in record.get("top_10_articles", [])],
            "snippets": record.get("snippets", [])
        }
        for record in data_dict["data"]
    ]
    return {"questions": questions}

In [13]:


# Load the BM25 results file written earlier in the notebook.
# save_results writes UTF-8 with ensure_ascii=False, so the file can contain
# raw non-ASCII characters; read it back as UTF-8 explicitly instead of
# relying on the platform default encoding.
with open(output_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Transform the results into the {"questions": [...]} submission structure
converted = transform_to_questions_format(data)

# Save the converted result
with open("formated_batch_4_results.json", "w", encoding="utf-8") as f:
    json.dump(converted, f, indent=2)
