### Dataset Preparation

In [5]:
from data_manipulation import DataManipulator

def build_sampled_articles_with_ground_truth(
    all_articles_file_path: str,
    training_ground_truth_folder_path: str,
    test_ground_truth_folder_path: str,
    total_article_target: int = 70000
):
    """Assemble the sampled article collection used by the rest of the notebook.

    A single ``DataManipulator`` loads the full article pool, collects the
    ground truth from the training folder and from the test folder, and passes
    the concatenation (training entries first) to
    ``DataManipulator.build_article_dataset_with_ground_truth``.

    Args:
        all_articles_file_path: Path handed to ``get_all_articles``.
        training_ground_truth_folder_path: Folder handed to
            ``get_ground_truth_from_all_files`` for the training split.
        test_ground_truth_folder_path: Folder handed to
            ``get_ground_truth_from_all_files`` for the test split.
        total_article_target: Forwarded as ``total_articles_target``.

    Returns:
        Whatever ``build_article_dataset_with_ground_truth`` returns.
    """
    manipulator = DataManipulator()

    # Load order is kept: article pool first, then training, then test ground truth.
    article_pool = manipulator.get_all_articles(all_articles_file_path)
    ground_truth_parts = [
        manipulator.get_ground_truth_from_all_files(folder_path)
        for folder_path in (
            training_ground_truth_folder_path,
            test_ground_truth_folder_path,
        )
    ]
    combined_ground_truth = ground_truth_parts[0] + ground_truth_parts[1]

    return manipulator.build_article_dataset_with_ground_truth(
        ground_truth_data=combined_ground_truth,
        all_articles=article_pool,
        total_articles_target=total_article_target,
    )

In [6]:
# Build the article collection from the project dataset folders; the target
# size is passed explicitly even though it equals the function default.
sampled_articles = build_sampled_articles_with_ground_truth(
    total_article_target=70000,
    all_articles_file_path="datasets/final_correct_datasets/all_retrieved_articles.json",
    training_ground_truth_folder_path="datasets/final_correct_datasets/training",
    test_ground_truth_folder_path="datasets/final_correct_datasets/test",
)

### Loading the pre-trained BioBERT model and embedding the articles


In [7]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding checkpoint, loaded by name through sentence-transformers.
model = SentenceTransformer(
    "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
)

# One text per sampled article: title, a single space, then the abstract.
article_texts = [
    f"{a['title']} {a['abstract']}"
    for a in sampled_articles
]

# Embed the whole collection once; later cells reuse this tensor for every question.
article_embeddings = model.encode(
    article_texts,
    show_progress_bar=True,
    convert_to_tensor=True,
)

Batches:   0%|          | 0/2188 [00:00<?, ?it/s]

In [24]:
import nltk
# Fetch the NLTK tokenizer resources by name. A later cell imports
# sent_tokenize from nltk.tokenize; presumably it relies on these packages.
nltk.download('punkt')
# Left as the final bare expression so the cell echoes the call's return value.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

### Generation of results

In [25]:
from sentence_transformers import util
import torch
from tqdm import tqdm
import json
from nltk.tokenize import sent_tokenize

def _locate_section_sentences(document_id, section_name, section_text):
    """Split one article section into sentence candidates with their character offsets."""
    if not section_text:
        # Missing or empty section: nothing to tokenize.
        return []

    located = []
    cursor = 0
    for sentence in sent_tokenize(section_text):
        # Search forward from the previous match so a repeated sentence maps to
        # its next occurrence rather than always to the first one.
        start = section_text.find(sentence, cursor)
        if start < 0:
            start = section_text.find(sentence)
        if start < 0:
            # The tokenizer returned text that is not a verbatim slice of the
            # section; keep the candidate but with placeholder offsets.
            start, end = 0, 0
        else:
            end = start + len(sentence)
            cursor = end
        located.append({
            "text": sentence,
            "document": document_id,
            "beginSection": section_name,
            "endSection": section_name,
            "offsetInBeginSection": start,
            "offsetInEndSection": end
        })
    return located


def generate_neural_results_with_snippets(questions, sampled_articles, article_embeddings, model, output_file_path):
    """Rank articles and sentence snippets for each question and write them to JSON.

    For every question the question text is embedded with ``model``, compared
    to ``article_embeddings`` by cosine similarity, and the best (at most 10)
    articles are kept. The title and abstract of those articles are split into
    sentences, the sentences are embedded and scored against the question, and
    the best (at most 10) sentences are kept as snippets.

    Each snippet records the section it came from ("title" or "abstract") and
    its character offsets inside that section, such that
    ``section_text[offsetInBeginSection:offsetInEndSection] == text``.
    NOTE(review): the end offset is exclusive here; confirm this matches the
    convention expected by the downstream evaluator.

    Args:
        questions: Iterable of dicts read via the keys ``"qid"`` and ``"question"``.
        sampled_articles: Sequence of dicts read via the keys ``"pid"``,
            ``"title"`` and ``"abstract"``; indexed by position, so it must be
            aligned with ``article_embeddings``.
        article_embeddings: Embeddings of ``sampled_articles`` as accepted by
            ``util.cos_sim``.
        model: Object exposing ``encode(..., convert_to_tensor=True)``.
        output_file_path: Destination of the JSON file, written as
            ``{"data": [...]}`` in UTF-8.

    Returns:
        None. The results are written to ``output_file_path`` and a
        confirmation line is printed.
    """
    results = []

    for q in tqdm(questions, desc="Generating predictions"):
        q_embedding = model.encode(q["question"], convert_to_tensor=True)
        scores = util.cos_sim(q_embedding, article_embeddings)[0]

        # Clamp k so a corpus with fewer than 10 articles does not make topk raise.
        ranked_articles = torch.topk(scores, k=min(10, scores.shape[0]))

        top_articles = []
        snippet_candidates = []

        # Plain ints/floats keep list indexing and JSON serialisation straightforward.
        for article_index, article_score in zip(
            ranked_articles.indices.tolist(), ranked_articles.values.tolist()
        ):
            a = sampled_articles[article_index]
            top_articles.append({
                "pid": a["pid"],
                "title": a["title"],
                "abstract": a["abstract"],
                "score": float(article_score)
            })

            # Title sentences first, then abstract sentences, each tagged with its own section.
            snippet_candidates.extend(_locate_section_sentences(a["pid"], "title", a["title"]))
            snippet_candidates.extend(_locate_section_sentences(a["pid"], "abstract", a["abstract"]))

        top_snippets = []
        if snippet_candidates:
            snippet_texts = [candidate["text"] for candidate in snippet_candidates]
            snippet_embeddings = model.encode(snippet_texts, convert_to_tensor=True)
            snippet_scores = util.cos_sim(q_embedding, snippet_embeddings)[0]
            best_snippet_indices = torch.topk(
                snippet_scores, k=min(10, len(snippet_candidates))
            ).indices.tolist()
            top_snippets = [dict(snippet_candidates[idx]) for idx in best_snippet_indices]

        results.append({
            "qid": q["qid"],
            "question": q["question"],
            "top_10_articles": top_articles,
            "snippets": top_snippets
        })

    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump({"data": results}, f, ensure_ascii=False, indent=2)

    print(f"\n Predictions with snippets saved to: {output_file_path}")


In [22]:
import json

# Read the parsed training file and keep only the list stored under "data".
with open(
    "datasets/final_correct_datasets/training/parsed_data_final.json",
    "r",
    encoding="utf-8",
) as training_file:
    training_data = json.load(training_file)["data"]

### Obtaining Predictions for Training Set

In [26]:
# Destination of the training-set predictions; the evaluation cell reads this file back.
output_file = "neural_results_with_snippets.json"

# Rank articles and snippets for every training question against the shared
# article collection and its precomputed embeddings.
generate_neural_results_with_snippets(
    training_data,
    sampled_articles,
    article_embeddings,
    model,
    output_file,
)

Generating predictions: 100%|██████████| 5390/5390 [8:19:38<00:00,  5.56s/it]   



 Predictions with snippets saved to: neural_results_with_snippets.json


### Evaluation of Training Set

In [27]:
from evaluation_metrices import Evaluator
import json

# Ground truth for the training split (the full JSON document, not just "data").
with open(
    "datasets/final_correct_datasets/training/parsed_data_final.json",
    "r",
    encoding="utf-8",
) as gt_file:
    gt_data = json.load(gt_file)

# Predictions written by the training prediction cell.
with open("neural_results_with_snippets.json", "r", encoding="utf-8") as pred_file:
    pred_data = json.load(pred_file)

evaluator = Evaluator(gt_data, pred_data)

# Article-level metrics at cutoff 10.
results_articles = evaluator.evaluate_metrics_for_articles(k=10)
print("Results for articles (training set):")
evaluator.print_results(results_articles)

# Snippet-level metrics.
results_snippets = evaluator.evaluate_metrics_for_snippets()
print("\nResults for snippets (training set):")
evaluator.print_results(results_snippets)


Processing questions...: 5390it [00:00, 92585.70it/s]


Results for articles (training set):
MRR: 62.8
MAP: 30.9
nDCG@10: 47.38
P_article: 28.12
R_article: 39.48
F1_article: 25.55
GMAP: 1.77


Evaluating snippets...: 5390it [00:01, 2822.72it/s]


Results for snippets (training set):
P_snip: 11.18
R_snip: 12.6
F1_snip: 9.51
MAP_snip: 9.78
GMAP_snip: 0.04





### Obtaining Predictions for Test Batches

In [30]:
from data_manipulation import DataManipulator

dm = DataManipulator()

# The four test batches are numbered 1-4 on disk, hence the i+1 in every path.
for i in range(4):
    file_path = f'datasets/final_correct_datasets/test/parsed_data_final_test_batch_{i+1}.json'

    # Read the batch's ground truth, then reduce it to the question records.
    batch_ground_truth = dm.get_ground_truth_one_file(file_path=file_path)
    batch_questions = dm.get_questions_from_data(batch_ground_truth)

    # One prediction file per batch.
    output_file = f"neural_results_with_snippets_test_batch_{i+1}.json"
    print(f"Generating predictions for test batch {i + 1}...")

    generate_neural_results_with_snippets(
        batch_questions,
        sampled_articles,
        article_embeddings,
        model,
        output_file,
    )

Generating predictions for test batch 1...


Generating predictions: 100%|██████████| 85/85 [08:37<00:00,  6.08s/it]



 Predictions with snippets saved to: neural_results_with_snippets_test_batch_1.json
Generating predictions for test batch 2...


Generating predictions: 100%|██████████| 85/85 [09:52<00:00,  6.97s/it]



 Predictions with snippets saved to: neural_results_with_snippets_test_batch_2.json
Generating predictions for test batch 3...


Generating predictions: 100%|██████████| 85/85 [10:06<00:00,  7.13s/it]



 Predictions with snippets saved to: neural_results_with_snippets_test_batch_3.json
Generating predictions for test batch 4...


Generating predictions: 100%|██████████| 85/85 [07:48<00:00,  5.51s/it]


 Predictions with snippets saved to: neural_results_with_snippets_test_batch_4.json





### Evaluation of Test Batches

In [31]:
from evaluation_metrices import Evaluator
import json

# Score each of the four test batches against its own ground-truth file.
for i in range(4):
    gt_path = f"datasets/final_correct_datasets/test/parsed_data_final_test_batch_{i + 1}.json"
    pred_path = f"neural_results_with_snippets_test_batch_{i + 1}.json"

    # Both files are opened before either is parsed, as in the original cell.
    with open(gt_path, "r", encoding="utf-8") as gt_file, \
            open(pred_path, "r", encoding="utf-8") as pred_file:
        gt_data = json.load(gt_file)
        pred_data = json.load(pred_file)

    evaluator = Evaluator(gt_data, pred_data)

    # Header first, then the metric computation, so progress output appears under it.
    print(f"\nTest results for articles (test batch {i + 1}):")
    results_articles = evaluator.evaluate_metrics_for_articles(k=10)
    evaluator.print_results(results_articles)

    print(f"\nTest results for snippets (test batch {i + 1}):")
    results_snippets = evaluator.evaluate_metrics_for_snippets()
    evaluator.print_results(results_snippets)



Test results for articles (test batch 1):


Processing questions...: 85it [00:00, ?it/s]


MRR: 58.38
MAP: 44.89
nDCG@10: 52.13
P_article: 14.17
R_article: 58.53
F1_article: 21.56
GMAP: 2.04

Test results for snippets (test batch 1):


Evaluating snippets...: 85it [00:00, 30411.66it/s]

P_snip: 4.6
R_snip: 13.99
F1_snip: 6.39
MAP_snip: 8.85
GMAP_snip: 0.0






Test results for articles (test batch 2):


Processing questions...: 85it [00:00, 81788.45it/s]


MRR: 49.81
MAP: 34.55
nDCG@10: 41.94
P_article: 10.8
R_article: 47.99
F1_article: 16.61
GMAP: 0.62

Test results for snippets (test batch 2):


Evaluating snippets...: 85it [00:00, 3500.99it/s]

P_snip: 2.95
R_snip: 9.98
F1_snip: 4.43
MAP_snip: 5.05
GMAP_snip: 0.0






Test results for articles (test batch 3):


Processing questions...: 85it [00:00, ?it/s]


MRR: 53.65
MAP: 37.9
nDCG@10: 46.05
P_article: 12.95
R_article: 52.87
F1_article: 19.62
GMAP: 1.55

Test results for snippets (test batch 3):


Evaluating snippets...: 85it [00:00, 5150.47it/s]

P_snip: 2.95
R_snip: 9.51
F1_snip: 4.31
MAP_snip: 3.34
GMAP_snip: 0.0






Test results for articles (test batch 4):


Processing questions...: 85it [00:00, ?it/s]


MRR: 57.74
MAP: 40.43
nDCG@10: 48.55
P_article: 15.52
R_article: 53.02
F1_article: 22.73
GMAP: 1.61

Test results for snippets (test batch 4):


Evaluating snippets...: 85it [00:00, 4506.13it/s]

P_snip: 3.41
R_snip: 9.97
F1_snip: 4.87
MAP_snip: 5.79
GMAP_snip: 0.0



