In [20]:
# Retrieval Baseline Evaluation

## 1. Loading the Embeddings and the FAISS Index

from pathlib import Path
import sys
import numpy as np
import faiss

# Auto-detect project_root: walk up from the current working directory until
# a directory containing data/clean/embeddings.npy is found.
root = Path.cwd()
while not (root / "data" / "clean" / "embeddings.npy").exists():
    # Reached the filesystem root without a match - stop with a clear error.
    if root.parent == root:
        raise RuntimeError(
            "Could not find data/clean/embeddings.npy in "
            f"{Path.cwd()} or any of its parent directories"
        )
    root = root.parent

project_root = root

# Make `src.*` importable. This must run BEFORE the project import below;
# otherwise a fresh kernel started outside the repository root fails with
# ModuleNotFoundError. The membership test keeps cell re-runs from piling up
# duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.retriever.index import load_embeddings, load_index

# Paths
clean_dir = project_root / "data" / "clean"
index_dir = project_root / "data" / "index"

emb_path = clean_dir / "embeddings.npy"
ids_path = clean_dir / "ids.json"

flat_index_path   = index_dir / "faiss_flat_ip.index"
flat_ids_path     = index_dir / "ids_flat_ip.json"
ivfopq_index_path = index_dir / "faiss_ivfopq.index"
ivfopq_ids_path   = index_dir / "ids_ivfopq.json"

print("project_root:", project_root)
print("embeddings exists:", emb_path.exists(), emb_path)
print("ids exists:       ", ids_path.exists(), ids_path)

ids, vecs = load_embeddings(emb_path, ids_path)
# Turn each [paper_id, section, chunk_id] entry into a tuple so it is hashable
# and can be used as a key of the chunk_texts dict built in the next cell.
ids = [tuple(x) for x in ids]

# The id list stored next to the index is discarded; positions returned by
# index.search() are mapped through `ids` loaded above instead.
index, _ = load_index(flat_index_path, flat_ids_path)

# FAISS expects C-contiguous float32 input. Unlike a plain .astype('float32'),
# ascontiguousarray does not copy when the array already has that layout.
vecs = np.ascontiguousarray(vecs, dtype=np.float32)

# Sanity check: positions returned by the index are used to index `ids`, so
# both must describe the same number of vectors.
# NOTE(review): assumes load_index returns a FAISS index exposing `.ntotal`.
assert index.ntotal == len(ids), (
    f"index holds {index.ntotal} vectors but {len(ids)} ids were loaded"
)

project_root: D:\SciSumm-RAG
project_root: D:\SciSumm-RAG
embeddings exists: True D:\SciSumm-RAG\data\clean\embeddings.npy
ids exists:        True D:\SciSumm-RAG\data\clean\ids.json


In [21]:
import json

# Load the (paper_id, section, chunk_id) -> text mapping from the JSONL file.
# Each non-empty line is a JSON array of exactly four items:
# [paper_id, section, chunk_id, text].
chunks_file = clean_dir / "chunks.jsonl"
chunk_texts = {}
with open(chunks_file, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        # instead of failing with a JSONDecodeError.
        if not line.strip():
            continue
        pid, section, cid, chunk_text = json.loads(line)
        chunk_texts[(pid, section, cid)] = chunk_text

In [22]:
from src.generator.hf_summarizer import generate_summary_hf

# Retrieve the top-k chunks for one query and reassemble their text.
top_k = 5
# The 0:1 slice keeps the query 2-D (a single row), which index.search expects.
# The query is the first stored embedding, so its own chunk is presumably
# among the hits.
query_vec = vecs[0:1]
D, I = index.search(query_vec, top_k)

# FAISS pads the result row with -1 when it finds fewer than top_k neighbours.
# Without the filter, ids[-1] would silently return the LAST chunk instead of
# signalling "no result".
ids_for_query = [ids[idx] for idx in I[0] if idx >= 0]

txt = "\n\n".join(chunk_texts[c] for c in ids_for_query)

# Summarise the concatenated chunks with the project's HF summariser.
summary = generate_summary_hf(
    txt,
    max_length=200,
    min_length=50,
)
print(summary)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cuda:0


This work addresses the inverse identification of apparent elastic properties of random heterogeneous materials using machine learning based on artificial neural networks . The proposed neural network-based identification method requires the construction of a database from which an artificial neural network can be trained to learn the relationship between the hyperparameters of a prior stochastic model of the random compliance field and some relevant quantities of interest .


In [32]:
# measure recall@k on a subsample (test_faiss_search.py script)
!python ../test_faiss_search.py --embeddings ../data/clean/embeddings.npy --ids ../data/clean/ids.json --index ../data/index/faiss_flat_ip.index --mode flat --topk 5 --sample-size 500

Loading embeddings and IDs...
Loaded 2895000 vectors of dim 384
Normalizing embeddings...
Loading index from ..\data\index\faiss_flat_ip.index...
Evaluating recall@5 on sample of 500... 
Same-paper recall@5: 1.000
Running simple search on first query...
[([2011.11761, 'Unknown', 'Unknown__0'], 1.0), ([2011.11761, 'Unknown', 'Unknown__4'], 0.9239822626113892), ([2011.11761, 'Unknown', 'Unknown__3'], 0.9239822626113892), ([2011.11761, 'Unknown', 'Unknown__2'], 0.9239822626113892), ([2011.11761, 'Unknown', 'Unknown__1'], 0.9239822626113892)]
