In [2]:
import os
import shutil
import tempfile
from tqdm import tqdm
import json
import re
from collections import defaultdict
from rank_bm25 import BM25Okapi
from data_manipulation import DataManipulator
# Shared helper used by the cells below to load ground truth, questions and articles.
data_manipulator = DataManipulator()

# JSON Lines cache of the tokenized corpus (passed to prepare_corpus_cached by get_bm25_corpus).
# NOTE(review): the saved output of the corpus-building cell reports
# "Traditional_IR_Corpus/tokenized_corpus.jsonl", so this value looks like it was
# edited after that run -- confirm which directory is intended.
corpus_path = 'Traditional_IR/tokenized_corpus.jsonl'
# Input file handed to data_manipulator.get_all_articles.
all_articles_file_path = "datasets/final_correct_datasets/all_retrieved_articles.json"
# Folders handed to data_manipulator.get_ground_truth_from_all_files.
training_ground_truth_folder_path = "datasets/final_correct_datasets/training"
test_ground_truth_folder_path = "datasets/final_correct_datasets/test"
# Forwarded as total_articles_target when the article dataset is built.
total_article_target = 70000

In [16]:
def tokenize(text):
    """Turn ``text`` into a list of lowercase alphanumeric tokens.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted (not replaced by a space, so ``"COVID-19"`` becomes
    the single token ``"covid19"``), and the remainder is split on whitespace.
    """
    lowered = text.lower()
    alnum_only = re.compile(r'[^a-z0-9\s]').sub('', lowered)
    return alnum_only.split()


def prepare_corpus_cached(articles, corpus_path='tokenized_corpus.jsonl'):
    """
    Tokenize ``articles`` and cache the result on disk as JSON Lines.

    If ``corpus_path`` already exists it is loaded and ``articles`` is ignored.
    Otherwise every article's title and abstract are joined, tokenized with
    ``tokenize`` and written as one ``{"tokens": [...], "meta": {...}}`` record
    per line. Articles whose text or token list is empty are skipped.

    The cache is written to ``corpus_path + '.tmp'`` and renamed into place only
    after every article has been processed, so an interrupted run never leaves
    a truncated cache that a later run would load as if it were complete.

    Args:
        articles: iterable of dicts with optional 'pid', 'title' and 'abstract'
            keys. A missing or ``None`` title/abstract is treated as ''.
        corpus_path: path of the JSON Lines cache file. Its parent directory is
            created when it does not exist.

    Returns:
        (corpus, article_refs): ``corpus`` is a list of token lists and
        ``article_refs`` the parallel list of ``{'pid', 'title', 'abstract'}``
        dicts; index ``i`` of one corresponds to index ``i`` of the other.
    """
    if os.path.exists(corpus_path):
        print(f"Loading cached corpus from {corpus_path}...")
        corpus = []
        article_refs = []
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline added by an editor).
                if not line.strip():
                    continue
                entry = json.loads(line)
                corpus.append(entry['tokens'])
                article_refs.append(entry['meta'])
        return corpus, article_refs

    print(f"Creating and caching corpus to {corpus_path}...")
    corpus = []
    article_refs = []

    parent_dir = os.path.dirname(corpus_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    tmp_path = f"{corpus_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            for article in tqdm(articles, desc="Tokenizing articles..."):
                # `or ''` also covers keys that are present but None, which would
                # otherwise be formatted into the text as the literal word "None".
                title = article.get('title') or ''
                abstract = article.get('abstract') or ''
                text = f"{title} {abstract}".strip()

                if not text:
                    continue

                tokens = tokenize(text)
                if tokens:
                    record = {
                        'tokens': tokens,
                        'meta': {
                            'pid': article.get('pid', ''),
                            'title': title,
                            'abstract': abstract
                        }
                    }
                    f.write(json.dumps(record, ensure_ascii=False) + "\n")
                    corpus.append(tokens)
                    article_refs.append(record['meta'])
        # Publish the finished file in one step; readers see all or nothing.
        os.replace(tmp_path, corpus_path)
    finally:
        # Only still present when the build failed or was interrupted.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return corpus, article_refs

In [17]:
def get_bm25_corpus():
    """Return the tokenized BM25 corpus and its parallel article metadata.

    When the cache file at ``corpus_path`` already exists it is loaded directly:
    ``prepare_corpus_cached`` ignores its ``articles`` argument in that case, so
    loading the ground truth and the full article file first would be wasted
    work. Otherwise the training and test ground truth are concatenated, the
    article dataset is built from them via ``data_manipulator`` (capped by
    ``total_article_target``), and the result is tokenized and cached.

    Returns:
        (corpus, article_refs) exactly as returned by ``prepare_corpus_cached``.
    """
    if os.path.exists(corpus_path):
        return prepare_corpus_cached([], corpus_path)

    training_ground_truth = data_manipulator.get_ground_truth_from_all_files(training_ground_truth_folder_path)
    test_ground_truth = data_manipulator.get_ground_truth_from_all_files(test_ground_truth_folder_path)
    ground_truth_data = training_ground_truth + test_ground_truth

    all_articles = data_manipulator.get_all_articles(all_articles_file_path)
    # NOTE(review): presumably keeps the ground-truth articles and fills up to the
    # target with other articles -- confirm against DataManipulator.
    sampled_articles = data_manipulator.build_article_dataset_with_ground_truth(
        ground_truth_data,
        all_articles,
        total_articles_target=total_article_target,
    )

    return prepare_corpus_cached(sampled_articles, corpus_path)

In [18]:
# --- BM25 Ranking ---

def rank_articles_bm25(question, bm25, article_refs, top_k=10):
    """Score every indexed article against ``question`` and return the best ones.

    Args:
        question: free-text question; tokenized with ``tokenize`` before scoring.
        bm25: index object exposing ``get_scores(query_tokens)``, which must
            return one score per indexed document.
        article_refs: metadata dicts parallel to the documents indexed by
            ``bm25`` (same order, same length).
        top_k: maximum number of articles to return. Defaults to 10, the value
            that used to be hard-coded; values <= 0 yield an empty list.

    Returns:
        A list of at most ``top_k`` dicts with 'pid', 'title', 'abstract' and a
        float 'score', ordered by descending score. Articles with equal scores
        keep their corpus order because the sort is stable.
    """
    query = tokenize(question)
    scores = bm25.get_scores(query)
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    # max() guards against a negative top_k, which would otherwise slice
    # "all but the last k" instead of returning nothing.
    limit = max(top_k, 0)

    top_articles = []
    for i in ranked_indices[:limit]:
        article = article_refs[i]
        top_articles.append({
            'pid': article.get('pid', ''),
            'title': article.get('title', ''),
            'abstract': article.get('abstract', ''),
            # float() converts library scalar types so the result is JSON-serializable.
            'score': float(scores[i])
        })

    return top_articles

In [19]:


# --- Snippet Extraction ---

def extract_snippets(question, top_articles):
    """Build one snippet per article section that mentions a query term.

    Query terms are the ``\\w+`` words of ``question``, lowercased and reduced to
    ``a-z0-9``; words that reduce to nothing are dropped. For each article and
    each of its 'title' and 'abstract' fields, the snippet spans from the start
    of the first whole-word, case-insensitive term match to the end of the last
    one.

    Matching is done on the original field text, so ``offsetInBeginSection`` and
    ``offsetInEndSection`` are valid indices into that text and ``text`` is
    exactly ``field[offsetInBeginSection:offsetInEndSection]``. (Matching used
    to run on a punctuation-stripped copy whose offsets were then applied to the
    original text, which shifted every snippet that followed punctuation.)

    Args:
        question: free-text question.
        top_articles: iterable of dicts with optional 'pid', 'title' and
            'abstract' keys; a missing or ``None`` field is treated as ''.

    Returns:
        A list of snippet dicts with keys 'beginSection', 'endSection', 'text',
        'document', 'offsetInBeginSection' and 'offsetInEndSection'. Sections
        without any match contribute nothing.
    """
    normalized = (re.sub(r'[^a-z0-9]', '', word.lower()) for word in re.findall(r'\w+', question))
    # Drop empty terms: an empty term would compile to r'\b\b', which matches at
    # every word boundary and stretches the snippet over the whole field.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    query_terms = list(dict.fromkeys(term for term in normalized if term))
    if not query_terms:
        return []

    # One alternation for all terms. Each term is purely alphanumeric and is
    # wrapped in \b...\b, so any match is a complete word and the alternation
    # finds the same spans as searching for each term separately.
    term_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in query_terms) + r')\b',
        re.IGNORECASE,
    )

    snippets = []

    for article in top_articles:
        pid = article.get('pid', '')
        for section in ['title', 'abstract']:
            field_text = article.get(section) or ''

            matches = list(term_pattern.finditer(field_text))
            if not matches:
                continue

            # finditer yields non-overlapping matches from left to right, so the
            # first match has the smallest start and the last the largest end.
            snippet_start = matches[0].start()
            snippet_end = matches[-1].end()

            snippets.append({
                "beginSection": section,
                "endSection": section,
                "text": field_text[snippet_start:snippet_end],
                "document": pid,
                "offsetInBeginSection": snippet_start,
                "offsetInEndSection": snippet_end
            })

    return snippets



In [20]:
def save_results(results, output_file):
    """Write ``results`` to ``output_file`` as indented UTF-8 JSON.

    Args:
        results: any JSON-serializable object.
        output_file: destination path. Its parent directory is created when it
            does not exist, so saving into a fresh output folder does not fail
            with FileNotFoundError. An existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(results, f, indent=2, ensure_ascii=False)


In [21]:
# --- Ranking All Questions ---

def rank_all_questions_bm25(questions, corpus, article_refs, bm25=None):
    """Rank articles and extract snippets for every question.

    Args:
        questions: iterable of dicts with 'question' and 'qid' keys.
        corpus: list of token lists used to build the BM25 index.
        article_refs: metadata dicts parallel to ``corpus``.
        bm25: optional index already built over ``corpus``. Pass it when
            calling this function several times on the same corpus so the index
            is not rebuilt each time. When omitted, a new ``BM25Okapi(corpus)``
            is built, which is the previous behaviour.

    Returns:
        ``{'data': [...]}`` with one entry per question, in input order,
        holding 'id', 'question', 'top_10_articles' and 'snippets'.
    """
    if bm25 is None:
        bm25 = BM25Okapi(corpus)

    results_by_question = []

    for entry in tqdm(questions, desc="Ranking questions with full article set..."):
        question = entry['question']
        qid = entry['qid']
        top_articles = rank_articles_bm25(question, bm25, article_refs)
        snippets = extract_snippets(question, top_articles)

        results_by_question.append({
            'id': qid,
            'question': question,
            'top_10_articles': top_articles,
            'snippets': snippets
        })

    return { 'data': results_by_question }

In [22]:
# Build (or load from cache) the tokenized corpus and its parallel article metadata.
corpus, article_refs = get_bm25_corpus()
# Load the training ground truth and derive the question list from it.
training_questions = data_manipulator.get_questions_from_data(data_manipulator.get_ground_truth_from_all_files(training_ground_truth_folder_path))
# Rank every training question against the corpus; the saved run took about 22 minutes.
results = rank_all_questions_bm25(questions=training_questions, corpus=corpus, article_refs=article_refs)

Creating and caching corpus to Traditional_IR_Corpus/tokenized_corpus.jsonl...


Tokenizing articles...: 100%|██████████| 70000/70000 [00:08<00:00, 8058.47it/s] 
Ranking questions with full article set...: 100%|██████████| 5390/5390 [21:59<00:00,  4.08it/s]


In [23]:
# Persist the training-set rankings computed in the previous cell.
save_results(results=results, output_file='Traditional_IR/bm25_training_results.json')

In [25]:
# Rank the four test batches against the same corpus and save one result file per batch.
# Note: `results` is reassigned on every iteration, so after this cell it holds the
# last test batch rather than the training results.
for i in range(4):
    # Batch files are numbered from 1, hence i+1.
    file_path = f'datasets/final_correct_datasets/test/parsed_data_final_test_batch_{i+1}.json'

    test_questions = data_manipulator.get_questions_from_data(data_manipulator.get_ground_truth_one_file(file_path=file_path))

    # Each call builds its own BM25 index over `corpus` before ranking.
    results = rank_all_questions_bm25(questions=test_questions, corpus=corpus, article_refs=article_refs)

    save_results(results, output_file=f'Traditional_IR/bm25_test_batch_{i+1}_results.json')

Ranking questions with full article set...: 100%|██████████| 85/85 [00:23<00:00,  3.63it/s]
Ranking questions with full article set...: 100%|██████████| 85/85 [00:23<00:00,  3.66it/s]
Ranking questions with full article set...: 100%|██████████| 85/85 [00:22<00:00,  3.79it/s]
Ranking questions with full article set...: 100%|██████████| 85/85 [00:21<00:00,  3.89it/s]


In [3]:
# Load the full article file and display the first four records as a sanity check.
# NOTE(review): this cell's execution count (3) is lower than that of the cells above
# it, so it was run out of order -- confirm the notebook survives Restart & Run All.
all_articles = data_manipulator.get_all_articles(all_articles_file_path)

all_articles[0:4]

[{'pid': 'http://www.ncbi.nlm.nih.gov/pubmed/40244873',
  'title': 'Cystic Breast Lesions: Diagnostic Approach and US Assessment.',
  'abstract': 'Various cystic breast lesions are encountered during screening and diagnostic breast imaging. According to the Breast Imaging Reporting and Data System (BI-RADS) from the American College of Radiology, cystic breast lesions can be classified into the following categories based on sonographic findings: simple cysts, complicated cysts, clustered microcysts, and complex cystic and solid masses. With appropriate technique, simple cysts can be diagnosed easily by satisfying the diagnostic criteria, which include anechoic round or oval lesions with circumscribed margins and posterior enhancement on US images. Simple cysts are categorized as BI-RADS category 2, benign. Complicated cysts contain debris and satisfy all other sonographic criteria for simple cysts, except they are not anechoic. Clustered microcysts are defined as lesions comprising a c