# Draft notebook for the application

## General imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import os

## Test book content loader

In [3]:
import loaders.book_content_loader as bcl

In [4]:
search_query = "Martin Eden"
res_search = bcl.search_books(search_query)

In [5]:
url_archive = bcl.get_book_archive_page(res_search)

In [6]:
book_text = bcl.fetch_book_text(url_archive)

Book found with length: 930190 characters
  aching  brow,  he  had  been  at 
work  all  day. 

“  Part  of  last  week’s  washin’  mounted  up,  me  bein’  away 
to  get  you,”  he  explained.  “  Your  box  arrived  all  right. 
It’s  in  y


In [7]:
path_book_content = bcl.write_book_to_file(book_text, search_query)

Book text written to documents/Martin_Eden.txt


In [8]:
path_book_content

'documents/Martin_Eden.txt'

## RAG with HuggingFace

In [3]:
import retrieval.rag_retriever as rr

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
if not os.path.exists("documents"):
    os.makedirs("documents")

with open("documents/sample.txt", "w") as f:
    f.write("""
Nicolas Cotoni is a french computer engineer. From Lyon he was graduated the engineering school named EPITA.
""")

# Initialize the RAG system
model_name = "Qwen/Qwen2.5-3B-Instruct" if torch.cuda.is_available() else "HuggingFaceTB/SmolLM2-135M"
print(1)
rag = rr.LocalRAGSystem("documents/Martin_Eden.txt", model_name=model_name, model_name_embeddings="sentence-transformers/all-MiniLM-L6-v2")

# rag.initialize()

print(2)
answer_tag = "**Answer:**"

# Example query
result = rag.query("Who is Ruth?")
print(3)
print("\nQuestion: Who is Ruth?")
print("\nAnswer:", result["result"][result["result"].find(answer_tag) + len(answer_tag):])


1
Loaded 1697 document chunks
1.1
Vector store created successfully
1.2
Loading model: HuggingFaceTB/SmolLM2-135M
Vector store created successfully
1.2
Loading model: HuggingFaceTB/SmolLM2-135M


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


1.3
RAG system initialized successfully!
2
3

Question: Who is Ruth?

Answer: 

The character is Ruth. She is a young woman who has grown up with her father, a successful businessman, and a wealthy family. Her mother died when she was ten years old, leaving her alone with her father. She grew up in a small town in the Midwest, surrounded by people who looked down upon her as a “different” person. She became a teacher at a local school, which taught her English and math classes. She also studied law and worked as a lawyer until she married a man named James. They lived together for many years before James passed away.

She then moved to New York City, where she met and fell in love with a man named James. They got engaged and started dating, but James never returned home. When James died, Ruth found herself alone again. She went back to her hometown, where she continued to study law and work as a lawyer. She eventually decided to marry James, but she didn’t know what to do next. She tri

In [70]:
import os
import glob
import pickle
from typing import List, Dict, Tuple
from pathlib import Path

import numpy as np
from tqdm import tqdm

# embeddings
from sentence_transformers import SentenceTransformer

# generator
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# faiss
import faiss

In [84]:
# ---------- Config ----------
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL = "HuggingFaceTB/SmolLM2-135M"

DATA_DIR = "documents"                 # directory with text files (or subdirs)
# DATA_DIR = "test_doc"                  # directory with text files (or subdirs)
CHUNK_SIZE = 800                       # characters per chunk (adjust to suit model context)
CHUNK_OVERLAP = 150                    # overlap between chunks
BATCH_SIZE = 64                        # embedding batch size
TOP_K = 5                              # documents to retrieve
INDEX_PATH = "faiss.index"
METADATA_PATH = "chunks.pkl"
EMB_NORM = True                        # whether to normalize embeddings (cosine)
GENERATE_MAX_LENGTH = 2048 # 512
# ----------------------------


In [85]:
def split_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Simple character-based chunking with overlap. Works well for long docs."""
    if len(text) <= chunk_size:
        return [text]
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        if end >= len(text):
            break
        start = end - overlap
    return chunks


def load_documents_from_dir(directory: str) -> List[Dict]:
    """
    Walks directory and reads .txt, .md, .pdf (pdf text extraction not included),
    returns list of dicts: {'id': idx, 'source': filepath, 'text': text}
    """
    docs = []
    files = []
    p = Path(directory)
    if not p.exists():
        raise FileNotFoundError(f"Data directory not found: {directory}")
    # Add common text-like file types; you can extend or add PDF parsing externally
    for ext in ("*.txt", "*.md", "*.csv"):
        files.extend(sorted(p.rglob(ext)))
    # If you want PDF support, parse with pdfminer or pypdf and add here.
    idx = 0
    for filepath in files:
        try:
            text = filepath.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            continue
        docs.append({"id": idx, "source": str(filepath), "text": text})
        idx += 1
    return docs


def build_chunks_from_documents(documents: List[Dict]) -> List[Dict]:
    """Turn each document into overlapping chunks, tracked with metadata."""
    chunks = []
    chunk_id = 0
    for doc in documents:
        doc_chunks = split_text(doc["text"])
        for i, c in enumerate(doc_chunks):
            chunks.append({
                "chunk_id": chunk_id,
                "source_id": doc["id"],
                "source": doc["source"],
                "text": c,
                "chunk_index": i,
            })
            chunk_id += 1
    return chunks

In [86]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM


class RAGLocal:
    def __init__(
        self,
        embed_model_name: str = EMBED_MODEL,
        gen_model_name: str = GEN_MODEL,
        device: str = "cpu",
    ):
        # performance tuning for CPU
        torch.set_num_threads(max(1, os.cpu_count() - 1))
        self.device = torch.device(device)
        # Embedding model (SentenceTransformers)
        self.embed_model = SentenceTransformer(embed_model_name, device="cpu")
        # Generation model (transformers)
        self.tokenizer = AutoTokenizer.from_pretrained(gen_model_name, use_fast=True)
        # Some causal LM tokenizers don't have a padding token by default.
        # Set pad_token to eos_token so tokenizer(..., padding=True) works.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # self.gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)
        self.gen_model = AutoModelForCausalLM.from_pretrained(gen_model_name)
        self.gen_model.to(self.device)
        # Will be filled by build/load
        self.index = None
        self.chunks = []  # metadata for each vector in the index
        self.emb_dim = None

    def embed_texts(self, texts: List[str], batch_size: int = BATCH_SIZE) -> np.ndarray:
        """Return numpy array of embeddings (float32). Uses batches to reduce memory."""
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            emb = self.embed_model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
            embeddings.append(emb)
        embeddings = np.vstack(embeddings).astype("float32")
        if EMB_NORM:
            # normalize for cosine similarity with inner product index
            faiss.normalize_L2(embeddings)
        return embeddings

    def build_faiss_index(self, embeddings: np.ndarray):
        """Create a FAISS IndexFlatIP (inner product) suitable for normalized embeddings."""
        self.emb_dim = embeddings.shape[1]
        # choose inner product on normalized vectors -> cosine similarity
        index = faiss.IndexFlatIP(self.emb_dim)
        index.add(embeddings)
        self.index = index

    def save_index(self, index_path: str = INDEX_PATH, metadata_path: str = METADATA_PATH):
        if self.index is None:
            raise RuntimeError("Index not built")
        faiss.write_index(self.index, index_path)
        with open(metadata_path, "wb") as f:
            pickle.dump(self.chunks, f)
        print(f"Saved index to {index_path} and metadata to {metadata_path}")

    def load_index(self, index_path: str = INDEX_PATH, metadata_path: str = METADATA_PATH):
        if not os.path.exists(index_path) or not os.path.exists(metadata_path):
            raise FileNotFoundError("Index or metadata not found; build the index first.")
        self.index = faiss.read_index(index_path)
        with open(metadata_path, "rb") as f:
            self.chunks = pickle.load(f)
        # infer embedding dim
        self.emb_dim = self.index.d
        print(f"Loaded index {index_path} with {len(self.chunks)} chunks")

    def index_documents(self, data_dir: str = DATA_DIR, rebuild: bool = False):
        """Read files, chunk, embed, and create index. Saves to disk."""
        if not rebuild and os.path.exists(INDEX_PATH) and os.path.exists(METADATA_PATH):
            print("Index already exists. Loading from disk.")
            self.load_index()
            return

        print("Loading documents...")
        docs = load_documents_from_dir(data_dir)
        if len(docs) == 0:
            raise RuntimeError(f"No documents found in {data_dir}")
        print(f"Loaded {len(docs)} documents. Chunking...")
        chunks = build_chunks_from_documents(docs)
        self.chunks = chunks
        texts = [c["text"] for c in chunks]
        print(f"Embedding {len(texts)} chunks in batches...")
        embeddings = self.embed_texts(texts)
        print("Building FAISS index...")
        self.build_faiss_index(embeddings)
        print("Saving index and metadata to disk...")
        self.save_index()

    def retrieve(self, query: str, top_k: int = TOP_K) -> List[Tuple[Dict, float]]:
        """Embed the query and retrieve top_k chunks with scores."""
        q_emb = self.embed_texts([query])  # shape (1, dim)
        if self.index is None:
            raise RuntimeError("Index not loaded")
        D, I = self.index.search(q_emb, top_k)
        scores = D[0]
        indices = I[0]
        results = []
        for idx, score in zip(indices, scores):
            if idx < 0:
                continue
            results.append((self.chunks[idx], float(score)))
        return results

    def generate(self, query: str, top_k: int = TOP_K) -> str:
        """Retrieve contexts, construct prompt, and generate an answer."""
        results = self.retrieve(query, top_k=top_k)
        contexts = []
        for r, score in results:
            contexts.append(f"Source: {r['source']}\n\n{r['text']}\n---\n")
        context_str = "\n".join(contexts)
        prompt = f"""Use the following extracted context to answer the question as precisely as possible.
            If the answer is not in the context, say you don't know.\n\n
            Context:\n{context_str}\nQuestion: {query}\nAnswer:"""
        # Tokenize and generate
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(self.device)

        with torch.no_grad():
            out = self.gen_model.generate(
                **inputs,
                max_length=GENERATE_MAX_LENGTH,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3
            )
        answer = self.tokenizer.decode(out[0], skip_special_tokens=True)
        # Combine with provenance for traceability
        provenance = "\n\nRetrieved chunks (top_k):\n"
        for r, score in results:
            provenance += f"- {r['source']} (chunk {r['chunk_index']}) score={score:.4f}\n"
        return answer + provenance

In [87]:
rag = RAGLocal()
# Build index if not present, otherwise load
try:
    rag.index_documents(DATA_DIR, rebuild=False)
except Exception as e:
    print("Error building/loading index:", e)

print("\nRAG local CPU ready. Ask questions.")


2025-09-22 15:29:33,884 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Index already exists. Loading from disk.
Loaded index faiss.index with 1431 chunks

RAG local CPU ready. Ask questions.


In [88]:
q = "Who is Ruth?"
answer = rag.generate(q, top_k=TOP_K)
print("\nAnswer:\n", answer)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



Answer:
 Use the following extracted context to answer the question as precisely as possible.
            If the answer is not in the context, say you don't know.


            Context:
Source: documents\Martin_Eden.txt


MARTIN  EDEN 


73 


He  was  too  young  to  know  better,  but  be  robbed  himself 
of  life  for  the  sake  of  thirty  thousand  a  year  that’s  clean 
wasted  upon  him.  Why,  thirty  thousand,  lump  sum, 
wouldn’t  buy  for  him  right  now  what  ten  cents  he  was 
layin’  up  would  have  bought  him,  when  he  was  a  kid,  in 
the  way  of  candy  an’  peanuts  or  a  seat  in  nigger  heaven.” 

It  was  just  such  uniqueness  of  points  of  view  that  startled 
Ruth.  Not  only  were  they  new  to  her,  and  contrary  to 
her  own  beliefs,  but  she  always  felt  in  them  germs  of  truth 
that  threatened  to  unseat  or  modify  her  own  convictions. 
Had  she  been  fourteen  instead  of  twenty-four,  she  might 
have  been  changed 