# Retrieval Baseline Notebook

## Retrieval Baseline for SciSumm-RAG

In this notebook we:
- Load the FAISS index and the text chunks (`chunks.jsonl`)
- Demonstrate retrieval (direct and hybrid)
- Generate summaries with `HFSummarizer`
- Evaluate retrieval quality qualitatively and quantitatively (ROUGE)

In [52]:
import os
import pandas as pd
import numpy as np
import faiss
import json
from typing import Tuple, List
import sys
from pathlib import Path

In [53]:
# Auto-detect project_root: starting at the current working directory, walk up
# through its parents until a directory containing data/clean/embeddings.npy
# is found.
cwd = Path.cwd()
for root in (cwd, *cwd.parents):
    if (root / "data" / "clean" / "embeddings.npy").exists():
        break
else:
    # Every ancestor up to the filesystem root was checked without success.
    raise RuntimeError("Could not find the folder data/clean/embeddings.npy")

project_root = root

# Paths
clean_dir = project_root / "data" / "clean"
index_dir = project_root / "data" / "index" / "faiss"

emb_path = clean_dir / "embeddings.npy"
ids_path = clean_dir / "ids.json"

flat_index_path   = index_dir / "flat_index.index"
flat_ids_path     = index_dir / "flat_index_ids.json"
# NOTE(review): the variables are spelled "ivfopq" while the files are named
# "ivfpq" -- confirm which spelling is intended.
ivfopq_index_path = index_dir / "ivfpq_index.index"
ivfopq_ids_path   = index_dir / "ivfpq_index_ids.json"

# Make `src.*` importable. The membership check keeps the cell idempotent:
# re-running it does not append the same entry to sys.path again.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print("project_root:", project_root)
print("embeddings exists:", emb_path.exists(), emb_path)
print("ids exists:       ", ids_path.exists(), ids_path)

project_root: D:\SciSumm-RAG
project_root: D:\SciSumm-RAG
embeddings exists: True D:\SciSumm-RAG\data\clean\embeddings.npy
ids exists:        True D:\SciSumm-RAG\data\clean\ids.json


In [55]:
from src.retriever.embed import embed_texts
from src.retriever.index import (
    normalize_embeddings,
    search,
    hybrid_search,
    load_embeddings
)
from src.generator.hf_summarizer import HFSummarizer

ImportError: cannot import name 'HFSummarizer' from 'src.generator.hf_summarizer' (D:\SciSumm-RAG\src\generator\hf_summarizer.py)

In [10]:
# For ROUGE metrics
!pip install rouge-score
from rouge_score import rouge_scorer

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (pyproject.toml): started
  Building wheel for rouge-score (pyproject.toml): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=25027 sha256=35dd085fc922a119f3a5e4915359b325426eabd9875c16170ca1a741e63dabf1
  Stored in directory: c:\users\zelen\appdata\local\pip\cache\wheels\85\9d\af\01feefbe7d55ef54687


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [48]:
def load_index_and_ids(
    index_file: Path,
    ids_file: Path
) -> Tuple[faiss.Index, List[Tuple[str,str,str]]]:
    """Load a serialized FAISS index together with its id metadata.

    Args:
        index_file: Path to the FAISS index file (``.index``).
        ids_file: Path to the companion ``*_ids.json`` file with the id records.

    Returns:
        A ``(index, ids)`` pair: the object returned by ``faiss.read_index``
        and the decoded JSON records, each converted to a tuple.
    """
    # 1) Read the FAISS index first, so index problems surface before the
    #    metadata file is opened.
    faiss_index = faiss.read_index(str(index_file))

    # 2) Read the metadata from the *_ids.json file.
    with open(ids_file, 'r', encoding='utf-8') as handle:
        id_records = json.load(handle)

    # JSON only has lists; convert every record to a tuple.
    return faiss_index, list(map(tuple, id_records))

In [49]:
# Load the stored embeddings together with their id records.
ids, vecs = load_embeddings(emb_path, ids_path)

# Each id record becomes a tuple so it can be used as a dictionary key.
ids = list(map(tuple, ids))
# Cast the vectors to float32.
vecs = vecs.astype(np.float32)

# Load the flat FAISS index and the id metadata saved alongside it.
index_flat, ids_flat = load_index_and_ids(flat_index_path, flat_ids_path)

In [50]:
import json

# Load the chunk_texts mapping from the JSONL file:
#   (pid, section, cid) -> chunk text
# Each non-blank line must decode to exactly four items, unpacked in that order.
# NOTE(review): this assumes every line is a JSON array; if lines are JSON
# objects the unpacking would yield the keys instead -- confirm the file format.
chunks_file = clean_dir / "chunks.jsonl"
chunk_texts = {}
with open(chunks_file, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads fail on them.
        if not line.strip():
            continue
        pid, section, cid, txt = json.loads(line)
        chunk_texts[(pid, section, cid)] = txt

## 4. Preparing the summarizer

In [None]:
# Use the HF summarizer (HFSummarizer is imported from src.generator.hf_summarizer)
summ = HFSummarizer()

In [7]:
queries = [
    "What is a mechanism for generating notebook interfaces for DSLs?",  # CV/PL
    "How to stabilize corium during severe nuclear accident?",           # Nuclear
    "What methods exist for probabilistic verification of software?"   # ML/verification
]

# Retrieval depth: candidates fetched by the coarse search, and final hits kept.
TOP_K_COARSE = 50
TOP_K = 5

# Embed & normalize all queries
q_embs = embed_texts(queries)
q_embs = normalize_embeddings(q_embs.astype(np.float32))

# Choose the retrieval type: direct or hybrid
use_hybrid = True
results = []
for q, q_emb in zip(queries, q_embs):
    # FAISS expects a 2-D batch of queries, so add a leading axis.
    q_batch = q_emb[np.newaxis, :]
    if use_hybrid:
        # `index_flat` and `chunks_file` are the index and chunks path loaded
        # earlier in this notebook.
        # NOTE(review): `ids` comes from ids.json; this assumes it is ordered
        # like the vectors in `index_flat` (`ids_flat` is the metadata saved
        # with the index) -- confirm.
        res = hybrid_search(
            coarse_idx=index_flat,
            ids=ids,
            queries=q_batch,
            query_texts=[q],
            chunks_path=chunks_file,
            rerank_model=summ,  # or gpt
            top_k_coarse=TOP_K_COARSE,
            top_k=TOP_K
        )[0]
    else:
        dist, inds = index_flat.search(q_batch, TOP_K)
        # FAISS pads missing results with label -1; drop them so they do not
        # silently resolve to ids[-1].
        res = [
            (ids[i], float(dist_val))
            for i, dist_val in zip(inds[0], dist[0])
            if i >= 0
        ]
    results.append(res)

# Display the hits for the first query
for rank, (key, score) in enumerate(results[0], start=1):
    print(f"{rank}. {key} (score={score:.3f})")
    print(chunk_texts[key][:200], "...\n")

Device set to use cuda:0


We establish weak well - posedness for critical symmetric stable driven sdes in r d with additive noise z, d 1 . We study the case where the stable index of the driving process z is = 1 which exactly corresponds to the order of the drift term having the coefficient b which is continuous and bounded .


In [None]:
# Summarize the retrieved passages of every query and print the result.
for query_text, hits in zip(queries, results):
    print("\nQUERY:", query_text)
    # Join the texts of all retrieved chunks into one input for the summarizer.
    context = "\n\n".join(chunk_texts[chunk_key] for chunk_key, _ in hits)
    print("SUMMARY:", summ.summarize(context, max_length=150, min_length=30))

In [51]:
# measure recall@k on a subsample (test_faiss_search.py script)
!python test_faiss_search.py --embeddings data/clean/embeddings.npy --ids        data/clean/ids.json --index      data/index/faiss/flat_index.index --mode       flat --topk       5 --sample-size 1000      

C:\Users\zelen\AppData\Local\Programs\Python\Python312\python.exe: can't open file 'D:\\SciSumm-RAG\\notebooks\\test_faiss_search.py': [Errno 2] No such file or directory
