In [1]:
import os
import shutil
import tempfile
from tqdm import tqdm
import json
import re
from collections import defaultdict
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from data_manipulation import DataManipulator
from evaluation_metrices import Evaluator 

# Shared helper instance; the cells below use it to load ground truth,
# articles and questions.
data_manipulator = DataManipulator()

# JSONL cache of the tokenized corpus (read/written by prepare_corpus_cached).
corpus_path = 'Traditional_IR/tokenized_corpus.jsonl'
# JSON file handed to data_manipulator.get_all_articles().
all_articles_file_path = "datasets/final_correct_datasets/all_retrieved_articles.json"
# Folders handed to data_manipulator.get_ground_truth_from_all_files().
training_ground_truth_folder_path = "datasets/final_correct_datasets/training"
test_ground_truth_folder_path = "datasets/final_correct_datasets/test"
# Forwarded as `total_articles_target` when the article dataset is built.
total_article_target = 70000

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vidak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def tokenize(text):
    """Turn ``text`` into a list of lowercase alphanumeric tokens.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted (not replaced by a space, so "covid-19" becomes
    "covid19"), and what remains is split on whitespace.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return cleaned.split()


def prepare_corpus_cached(articles, corpus_path='tokenized_corpus.jsonl'):
    """
    Tokenize ``articles`` and cache the result on disk as JSON Lines.

    If ``corpus_path`` already exists, the cache is loaded and ``articles`` is
    ignored. Otherwise the title and abstract of every article are joined,
    tokenized with ``tokenize`` and stored as one
    ``{"tokens": [...], "meta": {"pid", "title", "abstract"}}`` record per line.
    Articles with no text, or whose text yields no tokens, are skipped.

    Args:
        articles: iterable of dicts; 'pid', 'title' and 'abstract' are read
            with ``.get``. A missing or ``None`` title/abstract is treated as
            an empty string.
        corpus_path: path of the JSONL cache file.

    Returns:
        ``(corpus, article_refs)``: two parallel lists, the token list of each
        kept article and its metadata dict.
    """
    if os.path.exists(corpus_path):
        print(f"Loading cached corpus from {corpus_path}...")
        corpus = []
        article_refs = []
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline added by hand).
                if not line.strip():
                    continue
                entry = json.loads(line)
                corpus.append(entry['tokens'])
                article_refs.append(entry['meta'])
        return corpus, article_refs

    print(f"Creating and caching corpus to {corpus_path}...")
    corpus = []
    article_refs = []

    # The cache usually lives in a sub-folder; make sure it exists.
    parent_dir = os.path.dirname(corpus_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write to a temporary file and move it into place only on success.
    # Otherwise an interrupted run would leave a truncated cache that the next
    # run silently loads as if it were complete.
    tmp_path = f"{corpus_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            for article in tqdm(articles, desc="Tokenizing articles..."):
                # `or ''` also covers keys that are present but None, which
                # would otherwise be formatted as the literal text "None".
                title = article.get('title') or ''
                abstract = article.get('abstract') or ''
                text = f"{title} {abstract}".strip()

                if not text:
                    continue

                tokens = tokenize(text)
                if tokens:
                    record = {
                        'tokens': tokens,
                        'meta': {
                            'pid': article.get('pid', ''),
                            'title': title,
                            'abstract': abstract
                        }
                    }
                    f.write(json.dumps(record, ensure_ascii=False) + "\n")
                    corpus.append(tokens)
                    article_refs.append(record['meta'])
        os.replace(tmp_path, corpus_path)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up after a failure or an interrupt.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return corpus, article_refs

In [3]:
def get_bm25_corpus():
    """Return ``(corpus, article_refs)`` for BM25, building the cache if needed.

    When the cache file at ``corpus_path`` already exists it is loaded
    directly. Otherwise the training and test ground truth are combined, an
    article dataset of ``total_article_target`` articles is built from
    ``all_articles_file_path``, and the result is tokenized and cached.
    """
    # prepare_corpus_cached ignores its `articles` argument when the cache
    # exists, so loading the ground truth and the full article dump first
    # would be wasted work.
    if os.path.exists(corpus_path):
        return prepare_corpus_cached([], corpus_path)

    training_ground_truth = data_manipulator.get_ground_truth_from_all_files(training_ground_truth_folder_path)
    test_ground_truth = data_manipulator.get_ground_truth_from_all_files(test_ground_truth_folder_path)
    ground_truth_data = training_ground_truth + test_ground_truth

    all_articles = data_manipulator.get_all_articles(all_articles_file_path)
    sampled_articles = data_manipulator.build_article_dataset_with_ground_truth(ground_truth_data,
                                                                                all_articles,
                                                                                total_articles_target=total_article_target)

    return prepare_corpus_cached(sampled_articles, corpus_path)

In [4]:
# --- BM25 Ranking ---

def rank_articles_bm25(question, bm25, article_refs, top_k=10):
    """Return the ``top_k`` best-scoring articles for ``question``.

    Args:
        question: question text; tokenized with ``tokenize``.
        bm25: object exposing ``get_scores(tokens)`` that returns one score
            per corpus document (the notebook passes a ``BM25Okapi``).
        article_refs: metadata dicts in the same order as the indexed corpus.
        top_k: number of articles to return (default 10, the original
            hard-coded cut-off).

    Returns:
        List of dicts with 'pid', 'title', 'abstract' and a float 'score',
        ordered from highest to lowest score.
    """
    import heapq  # local import: only this function needs it

    query = tokenize(question)
    scores = bm25.get_scores(query)

    # heapq.nlargest is documented as equivalent to
    # sorted(..., key=..., reverse=True)[:n], so the order (ties included) is
    # the same as a full sort, without sorting every document's score.
    best_indices = heapq.nlargest(top_k, range(len(scores)), key=lambda i: scores[i])

    top_articles = []
    for i in best_indices:
        article = article_refs[i]
        top_articles.append({
            'pid': article.get('pid', ''),
            'title': article.get('title', ''),
            'abstract': article.get('abstract', ''),
            'score': float(scores[i])
        })

    return top_articles

In [5]:


def _locate_sentences(field_text, sentences):
    """Return a ``(start, end)`` character span per sentence, or ``None`` if not found.

    Each sentence is searched from the end of the previous match, so a
    sentence that occurs several times in ``field_text`` is mapped to its own
    occurrence rather than always to the first one.
    """
    spans = []
    cursor = 0
    for sentence in sentences:
        start = field_text.find(sentence, cursor)
        if start == -1:
            spans.append(None)
            continue
        end = start + len(sentence)
        spans.append((start, end))
        cursor = end
    return spans


def extract_snippets(question, top_articles, max_snippets_per_article=1, context_window=1):
    """Extract the sentences of each article that share terms with the question.

    For the 'title' and 'abstract' of every article, each sentence that has at
    least one token in common with the question becomes a candidate snippet,
    extended by ``context_window`` sentences on both sides. Candidates are
    ranked by the number of shared tokens.

    Args:
        question: question text; tokenized with ``tokenize``.
        top_articles: dicts providing 'pid', 'title' and 'abstract'.
        max_snippets_per_article: number of candidates kept. The limit is
            applied to each section separately, so one article can contribute
            up to twice this many snippets.
        context_window: number of neighbouring sentences added on each side.

    Returns:
        List of dicts with 'beginSection', 'endSection', 'text', 'document',
        'offsetInBeginSection' and 'offsetInEndSection'. Offsets are character
        positions in the section text.
    """
    query_terms = set(tokenize(question))
    snippets = []

    for article in top_articles:
        pid = article.get('pid', '')
        for section in ['title', 'abstract']:
            field_text = article.get(section, '')
            if not field_text:
                continue

            sentences = sent_tokenize(field_text)
            # Locate every sentence once, left to right. A plain
            # field_text.index(sentence) always finds the first occurrence,
            # which gives wrong offsets for repeated sentences (the end offset
            # can even precede the start).
            spans = _locate_sentences(field_text, sentences)
            scored_snippets = []

            for i, sentence in enumerate(sentences):
                overlap = query_terms & set(tokenize(sentence))
                if not overlap:
                    continue

                # Create snippet with context
                start_idx = max(0, i - context_window)
                end_idx = min(len(sentences), i + context_window + 1)
                snippet_text = ' '.join(sentences[start_idx:end_idx])

                first_span = spans[start_idx]
                last_span = spans[end_idx - 1]
                if first_span is None or last_span is None:
                    # Sentence text not found verbatim in the section: fall
                    # back to the start of the section.
                    offset_start, offset_end = 0, min(len(field_text), 512)
                else:
                    offset_start, offset_end = first_span[0], last_span[1]

                scored_snippets.append((len(overlap), snippet_text, offset_start, offset_end))

            # Sort and keep top snippets (stable: ties keep document order)
            scored_snippets.sort(reverse=True, key=lambda x: x[0])
            for _, snippet_text, offset_start, offset_end in scored_snippets[:max_snippets_per_article]:
                snippets.append({
                    "beginSection": section,
                    "endSection": section,
                    "text": snippet_text,
                    "document": pid,
                    "offsetInBeginSection": offset_start,
                    "offsetInEndSection": offset_end
                })

    return snippets


In [6]:
def save_results(results, output_file):
    """Write ``results`` to ``output_file`` as indented UTF-8 JSON.

    The data is serialized before the file is opened, so a serialization
    error (e.g. a value that is not JSON-serializable) raises without
    truncating a results file that already exists.
    """
    serialized = json.dumps(results, indent=2, ensure_ascii=False)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(serialized)


In [7]:
# --- Ranking All Questions ---

def rank_all_questions_bm25(questions, corpus, article_refs):
    """Rank articles and extract snippets for every question.

    A ``BM25Okapi`` index is built over ``corpus`` once per call. Each entry
    of ``questions`` must provide 'question' and 'qid'.

    Returns:
        ``{'data': [...]}`` with one dict per question holding 'id',
        'question', 'top_10_articles' and 'snippets', in input order.
    """
    index = BM25Okapi(corpus)

    def answer(item):
        # Ranked articles first; the snippets are extracted from those articles.
        text = item['question']
        identifier = item['qid']
        hits = rank_articles_bm25(text, index, article_refs)
        found = extract_snippets(text, hits)
        return {
            'id': identifier,
            'question': text,
            'top_10_articles': hits,
            'snippets': found
        }

    progress = tqdm(questions, desc="Ranking questions with full article set...")
    return {'data': [answer(item) for item in progress]}

In [8]:
# Build (or load) the tokenized corpus, then rank every training question against it.
corpus, article_refs = get_bm25_corpus()

training_ground_truth_data = data_manipulator.get_ground_truth_from_all_files(training_ground_truth_folder_path)
training_questions = data_manipulator.get_questions_from_data(training_ground_truth_data)

results = rank_all_questions_bm25(
    questions=training_questions,
    corpus=corpus,
    article_refs=article_refs,
)

Loading cached corpus from Traditional_IR/tokenized_corpus.jsonl...


Ranking questions with full article set...: 100%|██████████| 5390/5390 [1:14:00<00:00,  1.21it/s]


In [9]:
# Store the training rankings in 'Traditional_IR/Results/', the folder the
# evaluation cells load their predictions from; create it on first run.
os.makedirs('Traditional_IR/Results', exist_ok=True)
save_results(results=results, output_file='Traditional_IR/Results/bm25_training_results.json')

In [10]:
# Test batches: rank the questions of each batch and write one result file per
# batch into 'Traditional_IR/Results/', the folder the evaluation cells load
# their predictions from.
os.makedirs('Traditional_IR/Results', exist_ok=True)

for i in range(4):
    file_path = f'datasets/final_correct_datasets/test/parsed_data_final_test_batch_{i+1}.json'

    test_questions = data_manipulator.get_questions_from_data(data_manipulator.get_ground_truth_one_file(file_path=file_path))

    # Dedicated name: assigning to `results` here would overwrite the training
    # rankings, and re-running the training save cell afterwards would then
    # store the last test batch as the training results.
    test_results = rank_all_questions_bm25(questions=test_questions, corpus=corpus, article_refs=article_refs)

    save_results(test_results, output_file=f'Traditional_IR/Results/bm25_test_batch_{i+1}_results.json')

Ranking questions with full article set...: 100%|██████████| 85/85 [01:11<00:00,  1.20it/s]
Ranking questions with full article set...: 100%|██████████| 85/85 [01:12<00:00,  1.18it/s]
Ranking questions with full article set...: 100%|██████████| 85/85 [01:11<00:00,  1.19it/s]
Ranking questions with full article set...: 100%|██████████| 85/85 [01:04<00:00,  1.32it/s]


### Evaluation 

In [2]:
def load_json(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return its parsed content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

Evaluation of training data


In [3]:
# Ground truth of the training set and the BM25 predictions produced for it.
training_dataset_ground_truth_file_path = "datasets/final_correct_datasets/training/parsed_data_final.json"
training_dataset_predicted_file_path = 'Traditional_IR/Results/bm25_training_results.json'

# Both files are plain JSON; load them fully into memory.
ground_truth_data = load_json(training_dataset_ground_truth_file_path)
predicted_data = load_json(training_dataset_predicted_file_path)

# Evaluator used by the following cells to compute and print the training metrics.
evaluator_training = Evaluator(ground_truth_data=ground_truth_data, predicted_data=predicted_data)



In [4]:
# Compute the article-level metrics (with k=10) and the snippet-level metrics
# for the training set.
results_articles = evaluator_training.evaluate_metrics_for_articles(k=10)
results_snippets = evaluator_training.evaluate_metrics_for_snippets()

Processing questions...: 5390it [00:00, 77040.48it/s]
Evaluating snippets...: 5390it [00:07, 682.29it/s]


In [5]:
# Print the article metrics first, then the snippet metrics, each under its heading.
training_report = (
    ("Training results for articles: ", results_articles),
    ("\nTraining results for snippets:", results_snippets),
)
for heading, metrics in training_report:
    print(heading)
    evaluator_training.print_results(metrics)

Training results for articles: 
MRR: 81.21
MAP: 51.39
nDCG@10: 69.58
P_article: 40.38
R_article: 59.54
F1_article: 37.96
GMAP: 15.13

Training results for snippets:
P_snip: 0.03
R_snip: 0.09
F1_snip: 0.04
MAP_snip: 0.07
GMAP_snip: 0.0


Evaluation of the test batches

In [6]:
# Evaluate each of the four test batches against its own ground-truth file and
# print the article and snippet metrics per batch.
for i in range(4):
    test_ground_truth_path = f"datasets/final_correct_datasets/test/parsed_data_final_test_batch_{i + 1}.json"
    test_predicted_results_path = f"Traditional_IR/Results/bm25_test_batch_{i + 1}_results.json"

    # (The unused `results = []` was removed: it was never read and only
    # clobbered the notebook-level `results` variable.)
    test_ground_truth = load_json(test_ground_truth_path)
    test_predicted = load_json(test_predicted_results_path)

    evaluator_test = Evaluator(test_ground_truth, test_predicted)

    results_articles = evaluator_test.evaluate_metrics_for_articles(k = 10)
    results_snippets = evaluator_test.evaluate_metrics_for_snippets()

    print(f"Test results articles for test batch {i + 1} ")
    evaluator_test.print_results(results_articles)

    print(f"\nTest results snippets for test batch {i + 1} ")
    evaluator_test.print_results(results_snippets)

    print()


Processing questions...: 85it [00:00, 81882.37it/s]
Evaluating snippets...: 85it [00:00, 728.78it/s]


Test results articles for test batch 1 
MRR: 68.9
MAP: 57.21
nDCG@10: 62.56
P_article: 14.73
R_article: 65.96
F1_article: 22.73
GMAP: 3.97

Test results snippets for test batch 1 
P_snip: 0.0
R_snip: 0.0
F1_snip: 0.0
MAP_snip: 0.0
GMAP_snip: 0.0



Processing questions...: 85it [00:00, ?it/s]
Evaluating snippets...: 85it [00:00, 851.31it/s]


Test results articles for test batch 2 
MRR: 69.98
MAP: 55.46
nDCG@10: 62.67
P_article: 16.24
R_article: 67.82
F1_article: 24.61
GMAP: 5.29

Test results snippets for test batch 2 
P_snip: 0.0
R_snip: 0.0
F1_snip: 0.0
MAP_snip: 0.0
GMAP_snip: 0.0



Processing questions...: 85it [00:00, ?it/s]
Evaluating snippets...: 85it [00:00, 724.32it/s]


Test results articles for test batch 3 
MRR: 70.29
MAP: 51.96
nDCG@10: 59.8
P_article: 16.55
R_article: 63.75
F1_article: 24.74
GMAP: 6.28

Test results snippets for test batch 3 
P_snip: 0.07
R_snip: 0.08
F1_snip: 0.07
MAP_snip: 0.12
GMAP_snip: 0.0



Processing questions...: 85it [00:00, ?it/s]
Evaluating snippets...: 85it [00:00, 727.94it/s]

Test results articles for test batch 4 
MRR: 64.37
MAP: 45.6
nDCG@10: 54.3
P_article: 18.35
R_article: 59.33
F1_article: 26.68
GMAP: 3.24

Test results snippets for test batch 4 
P_snip: 0.0
R_snip: 0.0
F1_snip: 0.0
MAP_snip: 0.0
GMAP_snip: 0.0






Training results for articles: 
MRR: 81.21
MAP: 51.39
nDCG@10: 69.58
P_article: 40.38
R_article: 59.54
F1_article: 37.96
GMAP: 15.13

Training results for snippets:
P_snip: 8.54
R_snip: 51.5
F1_snip: 13.0
MAP_snip: 38.27
GMAP_snip: 6.83

Test results articles for test batch 2 
MRR: 69.98
MAP: 55.46
nDCG@10: 62.67
P_article: 16.24
R_article: 67.82
F1_article: 24.61
GMAP: 5.29

Test results snippets for test batch 2 
P_snip: 3.48
R_snip: 55.89
F1_snip: 6.3
MAP_snip: 15.28
GMAP_snip: 1.22

Processing questions...: 85it [00:00, 84945.40it/s]
Evaluating snippets...: 85it [00:00, 161.00it/s]
Test results articles for test batch 3 
MRR: 70.29
MAP: 51.96
nDCG@10: 59.8
P_article: 16.55
R_article: 63.75
F1_article: 24.74
GMAP: 6.28

Test results snippets for test batch 3 
P_snip: 3.09
R_snip: 52.1
F1_snip: 5.72
MAP_snip: 15.39
GMAP_snip: 1.22

Processing questions...: 85it [00:00, 62306.16it/s]
Evaluating snippets...: 85it [00:00, 166.46it/s]
Test results articles for test batch 4 
MRR: 64.37
MAP: 45.6
nDCG@10: 54.3
P_article: 18.35
R_article: 59.33
F1_article: 26.68
GMAP: 3.24

Test results snippets for test batch 4 
P_snip: 3.02
R_snip: 50.58
F1_snip: 5.58
MAP_snip: 17.74
GMAP_snip: 1.25