In [1]:
from data_manipulation import DataManipulator

dm = DataManipulator()

all_articles = dm.get_all_articles("datasets/final_correct_datasets/all_retrieved_articles.json")
training_data = dm.get_ground_truth_one_file("datasets/final_correct_datasets/training/parsed_data_final.json")

# Napravi lookup po PID-u
article_lookup = {article['pid']: article for article in all_articles}

# Ground-truth PID-ovi
gt_pids = set()
for entry in training_data:
    gt_pids.update(entry['ground_truth_documents_pid'])

# Dodaj sve GT članke
gt_articles = [article_lookup[pid] for pid in gt_pids if pid in article_lookup]

# Nasumično uzmi ostatak
import random
non_gt_articles = [a for a in all_articles if a['pid'] not in gt_pids]
random.seed(42)
sampled_non_gt = random.sample(non_gt_articles, k=70000 - len(gt_articles))

sampled_articles = gt_articles + sampled_non_gt


In [2]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
import torch

model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
article_texts = [f"{a['title']} {a['abstract']}" for a in sampled_articles]
article_embeddings = model.encode(article_texts, convert_to_tensor=True, show_progress_bar=True)


Batches:   0%|          | 0/2188 [00:00<?, ?it/s]

In [12]:
from sentence_transformers import util
import torch
from tqdm import tqdm
import json

# Generiši predikcije
results = []

for q in tqdm(training_data, desc="Generating predictions"):
    q_embedding = model.encode(q["question"], convert_to_tensor=True)
    scores = util.cos_sim(q_embedding, article_embeddings)[0]
    top_k_indices = torch.topk(scores, k=10).indices

    top_articles = []
    for i in top_k_indices:
        a = sampled_articles[i]
        top_articles.append({
            "pid": a["pid"],
            "title": a["title"],
            "abstract": a["abstract"],
            "score": float(scores[i])
        })

    results.append({
        "qid": q["qid"],
        "question": q["question"],
        "top_10_articles": top_articles,
        "snippets": []  # Ne radiš snippet retrieval
    })

# Sačuvaj rezultate
with open("neural_results.json", "w", encoding="utf-8") as f:
    json.dump({"data": results}, f, ensure_ascii=False, indent=2)


Generating predictions:   0%|          | 0/5390 [00:00<?, ?it/s]

Generating predictions: 100%|██████████| 5390/5390 [09:30<00:00,  9.44it/s]


In [13]:
from evaluation_metrices import Evaluator
import json

# Učitaj ground truth i predikcije
with open("datasets/final_correct_datasets/training/parsed_data_final.json", "r", encoding="utf-8") as f:
    gt_data = json.load(f)

with open("neural_results.json", "r", encoding="utf-8") as f:
    pred_data = json.load(f)

# Pokreni evaluaciju
evaluator = Evaluator(gt_data, pred_data)
results_articles = evaluator.evaluate_metrics_for_articles(k=10)

# Prikaži rezultate
print("Rezultati za članke (training set):")
evaluator.print_results(results_articles)


Processing questions...: 5390it [00:00, 115555.60it/s]

Rezultati za članke (training set):
MRR: 63.02
MAP: 30.94
nDCG@10: 47.42
P_article: 28.16
R_article: 39.33
F1_article: 25.55
GMAP: 1.76





In [17]:
from sentence_transformers import SentenceTransformer, util
import torch
import json
from tqdm import tqdm

def generate_bert_results_for_batch(batch_index, model, sampled_articles, article_embeddings, output_path):
    gt_path = f"datasets/final_correct_datasets/test/parsed_data_final_test_batch_{batch_index}.json"
    with open(gt_path, "r", encoding="utf-8") as f:
        test_questions = json.load(f)["data"]

    results = []

    for q in tqdm(test_questions, desc=f"Processing batch {batch_index}"):
        q_embedding = model.encode(q["question"], convert_to_tensor=True)
        scores = util.cos_sim(q_embedding, article_embeddings)[0]
        top_k_indices = torch.topk(scores, k=10).indices

        top_articles = []
        for i in top_k_indices:
            a = sampled_articles[i]
            top_articles.append({
                "pid": a["pid"],
                "title": a["title"],
                "abstract": a["abstract"],
                "score": float(scores[i])
            })

        results.append({
            "qid": q["qid"],
            "question": q["question"],
            "top_10_articles": top_articles,
            "snippets": []  # Not used here
        })

    # Save predictions
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({"data": results}, f, indent=2, ensure_ascii=False)


In [None]:
for i in range(1, 5):
    output_path = f"neural_test_batch_{i}_results.json"
    generate_bert_results_for_batch(
        batch_index=i,
        model=model,
        sampled_articles=sampled_articles,
        article_embeddings=article_embeddings,
        output_path=output_path
    )


Processing batch 1: 100%|██████████| 85/85 [00:13<00:00,  6.45it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'neural_results/neural_test_batch_1_results.json'

In [15]:
import json
from evaluation_metrices import Evaluator

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


In [16]:
for i in range(4):
    test_ground_truth_path = f"datasets/final_correct_datasets/test/parsed_data_final_test_batch_{i + 1}.json"
    test_predicted_results_path = f"neural_results/neural_test_batch_{i + 1}_results.json"

    test_ground_truth = load_json(test_ground_truth_path)
    test_predicted = load_json(test_predicted_results_path)

    evaluator_test = Evaluator(test_ground_truth, test_predicted)

    results_articles = evaluator_test.evaluate_metrics_for_articles(k=10)
    print(f"\n📘 Test results – Articles for test batch {i + 1}")
    evaluator_test.print_results(results_articles)

    # Ako budeš imala snippet predikcije, možeš uključiti i ovo:
    # results_snippets = evaluator_test.evaluate_metrics_for_snippets()
    # print(f"\n📝 Snippet results for test batch {i + 1}")
    # evaluator_test.print_results(results_snippets)


FileNotFoundError: [Errno 2] No such file or directory: 'neural_results/neural_test_batch_1_results.json'