In [13]:
import os, re, time
from pathlib import Path
from typing import List, Dict, Any
from collections import defaultdict

from dotenv import load_dotenv
load_dotenv()

# Read the OpenAI key from the process environment (populated by load_dotenv above)
# and stop immediately if it is absent, before any later cell tries to use it.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
assert OPENAI_API_KEY, "Missing OPENAI_API_KEY in your .env"

In [None]:
import sys  # required for sys.path below; the first cell only imports os, re, time

# Resolve the project root: when the kernel starts inside a "notebooks" folder
# the root is its parent, otherwise the current directory is taken as the root.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent if notebook_dir.name == "notebooks" else notebook_dir

# Relative paths used below ("src/config/config.yaml") are resolved from the root.
os.chdir(project_root)

# Make the `src` package importable; the membership check keeps sys.path from
# accumulating duplicate entries when this cell is re-run.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"📂 Working directory: {os.getcwd()}")

from src.services.llm_services import load_config

# Project configuration; later cells read config["data_root"] from it.
config = load_config("src/config/config.yaml")

Connect to Weaviate (v4)

In [14]:
import weaviate
from weaviate.connect import ConnectionParams
from weaviate.classes.config import Property, DataType, Configure, Tokenization

# Endpoints of the Weaviate instance this notebook talks to.
WEAVIATE_HTTP = "http://localhost:8080"
WEAVIATE_GRPC_PORT = 50051

# Build the connection parameters first, then open the client explicitly.
conn_params = ConnectionParams.from_url(WEAVIATE_HTTP, grpc_port=WEAVIATE_GRPC_PORT)
client = weaviate.WeaviateClient(connection_params=conn_params)
client.connect()
print("Weaviate ready:", client.is_ready())

Weaviate ready: True


Create Schema (Weaviate Collection)

In [16]:
COL = "FinancialChunk"

# Create the collection only when it does not exist yet, so re-running this
# cell leaves an existing collection (and its data) untouched.
collection_missing = not client.collections.exists(COL)
if collection_missing:
    chunk_properties = [
        Property(name="text", data_type=DataType.TEXT, tokenization=Tokenization.WORD),
        Property(name="source", data_type=DataType.TEXT),
        Property(name="page", data_type=DataType.INT),
        Property(name="chunk_id", data_type=DataType.TEXT),
    ]
    # Weaviate will create embeddings via OpenAI module
    openai_vectorizer = Configure.Vectorizer.text2vec_openai(
        model="text-embedding-3-small"
    )
    client.collections.create(
        name=COL,
        properties=chunk_properties,
        vectorizer_config=openai_vectorizer,
    )

# Handle used by the indexing and retrieval cells below.
col = client.collections.get(COL)
print("Collection:", COL)

Collection: FinancialChunk


Load PDF (page-by-page)

In [26]:
from pypdf import PdfReader

# The report to ingest lives directly under the configured data root.
report_location = config["data_root"] + "/Uber_annual_report_2024.pdf"
PDF_PATH = Path(report_location)
assert PDF_PATH.exists(), f"PDF not found: {PDF_PATH}"

def clean_text(s: str) -> str:
    """Normalise whitespace in text extracted from a PDF page.

    NUL characters become spaces, runs of spaces/tabs collapse to a single
    space, runs of three or more newlines collapse to exactly two, and
    leading/trailing whitespace is stripped.
    """
    without_nuls = s.replace("\x00", " ")
    single_spaced = re.sub(r"[ \t]+", " ", without_nuls)
    return re.sub(r"\n{3,}", "\n\n", single_spaced).strip()

reader = PdfReader(str(PDF_PATH))

# Extract and clean every page (1-based page numbers), then keep only the
# pages that still contain text after cleaning.
cleaned_pages = (
    (page_no, clean_text(pdf_page.extract_text() or ""))
    for page_no, pdf_page in enumerate(reader.pages, start=1)
)
pages_text = [
    {"page": page_no, "text": page_body}
    for page_no, page_body in cleaned_pages
    if page_body
]

print("Pages with text:", len(pages_text), " / total pages:", len(reader.pages))

Pages with text: 142  / total pages: 142


Chunking

In [18]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def create_chunks(text: str, chunk_size: int = 1500, chunk_overlap: int = 200) -> List[str]:
    """Split ``text`` into chunks and print size statistics for them.

    Args:
        text: The text to split.
        chunk_size: Passed to the splitter as its ``chunk_size``.
        chunk_overlap: Passed to the splitter as its ``chunk_overlap``.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
        When it is non-empty, one line with the chunk count and the
        average/min/max chunk length in characters is printed.
    """
    chunks = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    ).split_text(text)

    # Nothing to report for an empty result.
    if not chunks:
        return chunks

    # optional stats
    sizes = [len(piece) for piece in chunks]
    print(f"chunks={len(chunks)}, avg={sum(sizes)//len(chunks)}, "
          f"min={min(sizes)}, max={max(sizes)}")
    return chunks

# Chunk every page separately so each chunk keeps its page number; chunk_id is
# "<page>-<index within page>".
all_chunks = []
for p in pages_text:
    chunks = create_chunks(p["text"], 1500, 200)
    for j, ch in enumerate(chunks):
        all_chunks.append({
            "text": ch,
            "source": PDF_PATH.name,
            "page": p["page"],
            "chunk_id": f"{p['page']}-{j}"
        })

print("Total chunks:", len(all_chunks))
# Fail with a clear message instead of an IndexError on all_chunks[0] when the
# PDF produced no text (e.g. every page came back empty from extraction).
assert all_chunks, f"No chunks were produced from {PDF_PATH.name}"
print("Sample:", all_chunks[0]["chunk_id"], all_chunks[0]["text"][:200])

chunks=1, avg=29, min=29, max=29
chunks=1, avg=585, min=585, max=585
chunks=2, avg=1381, min=1317, max=1446
chunks=2, avg=1182, min=870, max=1495
chunks=1, avg=1453, min=1453, max=1453
chunks=4, avg=1206, min=491, max=1483
chunks=2, avg=1351, min=1284, max=1418
chunks=4, avg=1367, min=1246, max=1444
chunks=4, avg=1441, min=1373, max=1486
chunks=5, avg=1427, min=1328, max=1494
chunks=4, avg=1331, min=975, max=1490
chunks=5, avg=1272, min=632, max=1493
chunks=4, avg=1465, min=1450, max=1497
chunks=4, avg=1328, min=1032, max=1472
chunks=5, avg=1439, min=1347, max=1488
chunks=5, avg=1399, min=1308, max=1490
chunks=6, avg=1271, min=393, max=1480
chunks=6, avg=1330, min=652, max=1498
chunks=6, avg=1291, min=534, max=1481
chunks=6, avg=1286, min=669, max=1440
chunks=6, avg=1312, min=804, max=1432
chunks=6, avg=1226, min=250, max=1452
chunks=4, avg=1310, min=980, max=1443
chunks=5, avg=1339, min=895, max=1490
chunks=6, avg=1213, min=190, max=1446
chunks=6, avg=1244, min=390, max=1498
chunks=6,

Index into Weaviate

In [19]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

# NOTE(review): BATCH is not used below; dynamic batching picks its own batch
# size. Kept in case other cells reference it.
BATCH = 64

with col.batch.dynamic() as b:
    for obj in tqdm(all_chunks, desc="Indexing"):
        # Derive the UUID from source + chunk_id so re-running this cell
        # overwrites the same objects instead of inserting duplicates.
        b.add_object(
            properties=obj,
            uuid=generate_uuid5(f"{obj['source']}:{obj['chunk_id']}"),
        )

# Batch errors are collected rather than raised, so inspect them explicitly
# before claiming success.
failed = col.batch.failed_objects
if failed:
    print(f"WARNING: {len(failed)} of {len(all_chunks)} objects failed to index; "
          f"first error: {failed[0].message}")
else:
    print("Indexed.")

Indexing: 100%|██████████| 537/537 [00:14<00:00, 38.25it/s]


Indexed.


Retrieval: BM25 + Dense Vector

In [20]:
def retrieve_bm25(q: str, k: int = 25):
    """Run a BM25 query for ``q`` on the collection and return at most ``k`` result objects, in ranked order."""
    return col.query.bm25(query=q, limit=k).objects

def retrieve_vector(q: str, k: int = 25):
    """Run a ``near_text`` (vector) query for ``q`` on the collection and return at most ``k`` result objects, in ranked order."""
    return col.query.near_text(query=q, limit=k).objects

RRF fusion

In [21]:
def rrf_fuse(lists: List[List[Any]], rrf_k: int = 60, top_n: int = 30) -> List[Any]:
    """Merge several ranked result lists with Reciprocal Rank Fusion.

    Each object contributes ``1 / (rrf_k + rank)`` (rank starting at 1) for
    every list it appears in; contributions are summed per ``str(obj.uuid)``.

    Args:
        lists: Ranked lists of objects exposing a ``uuid`` attribute.
        rrf_k: Constant added to the rank in the denominator.
        top_n: Maximum number of fused results to return.

    Returns:
        Up to ``top_n`` objects ordered by descending fused score. When the
        same uuid occurs more than once, the first object encountered
        (iterating ``lists`` in order) is the one returned.
    """
    totals = defaultdict(float)
    first_seen = {}

    # One pass: accumulate the score and remember the first object per uuid.
    for ranking in lists:
        for position, hit in enumerate(ranking, start=1):
            key = str(hit.uuid)
            totals[key] += 1.0 / (rrf_k + position)
            first_seen.setdefault(key, hit)

    best = sorted(totals.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
    return [first_seen[key] for key, _ in best]

Cross-Encoder rerank

In [22]:
from sentence_transformers import CrossEncoder

# Cross-encoder model used by cross_encoder_rerank to score (question, passage) pairs.
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L6-v2"
reranker = CrossEncoder(RERANKER_MODEL)

def cross_encoder_rerank(question: str, candidates: List[Any], top_n: int = 8) -> List[Any]:
    """Re-order ``candidates`` by cross-encoder relevance to ``question``.

    Args:
        question: The user query.
        candidates: Objects whose ``properties["text"]`` holds the passage text.
        top_n: Maximum number of candidates to return.

    Returns:
        Up to ``top_n`` candidates sorted by descending reranker score, or an
        empty list when ``candidates`` is empty.
    """
    # Nothing to score: return early rather than hand the model an empty
    # batch, which predict() is not guaranteed to accept.
    if not candidates:
        return []

    pairs = [(question, c.properties["text"]) for c in candidates]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    return [c for c, _ in ranked[:top_n]]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

query_librarian(question)

In [25]:
from openai import OpenAI
# OpenAI client used by query_librarian for the chat completion call.
oa = OpenAI(api_key=OPENAI_API_KEY)

def build_context(objs: List[Any]) -> str:
    """Render retrieved objects as a numbered context block for the prompt.

    Each object becomes ``[i] (source, p.page)`` followed on the next line by
    its text; entries are numbered from 1 and separated by a blank line.
    Missing properties fall back to ``"unknown"`` (source), ``None`` (page)
    and ``""`` (text).
    """
    def _render(idx: int, props) -> str:
        src = props.get("source", "unknown")
        page = props.get("page", None)
        body = props.get("text", "")
        return f"[{idx}] ({src}, p.{page})\n{body}"

    return "\n\n".join(
        _render(idx, item.properties) for idx, item in enumerate(objs, start=1)
    )

def query_librarian(question: str,
                    k_retrieve: int = 25,
                    fused_n: int = 30,
                    final_n: int = 8,
                    rrf_k: int = 60,
                    model: str = "gpt-4o-mini") -> Dict[str, Any]:
    """Answer ``question`` from the indexed PDF chunks.

    Pipeline: BM25 and vector retrieval -> RRF fusion -> cross-encoder
    rerank -> chat completion restricted to the retrieved context.

    Args:
        question: The user question.
        k_retrieve: Number of hits requested from each retriever.
        fused_n: Number of results kept after RRF fusion.
        final_n: Number of results kept after reranking and sent to the model.
        rrf_k: RRF constant forwarded to ``rrf_fuse``.
        model: Chat model name passed to the OpenAI client.

    Returns:
        A dict with ``answer`` (the model's message content), ``latency_ms``
        (elapsed time of the whole call as an int) and ``evidence`` (one dict
        of source/page/chunk_id per context chunk, in prompt order).
    """
    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way time.time() can.
    t0 = time.perf_counter()

    bm25 = retrieve_bm25(question, k=k_retrieve)
    vec = retrieve_vector(question, k=k_retrieve)

    fused = rrf_fuse([bm25, vec], rrf_k=rrf_k, top_n=fused_n)
    # Skip the reranker when retrieval found nothing; the model then gets an
    # empty context and the system prompt tells it to report that.
    top = cross_encoder_rerank(question, fused, top_n=final_n) if fused else []

    context = build_context(top)

    system = (
        "You are The Librarian for financial PDFs. "
        "Answer ONLY using the provided context. "
        "If insufficient, say you don't have enough information. "
        "Include page citations like (p. X) for claims."
    )

    user = f"""Context:
{context}

Question: {question}
Answer:"""

    resp = oa.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": system},
                  {"role": "user", "content": user}],
        temperature=0.2
    )

    return {
        "answer": resp.choices[0].message.content,
        "latency_ms": int((time.perf_counter() - t0) * 1000),
        "evidence": [
            {"source": o.properties.get("source"),
             "page": o.properties.get("page"),
             "chunk_id": o.properties.get("chunk_id")}
            for o in top
        ]
    }


Test

In [24]:
# Smoke test: run one question end to end and show the answer plus diagnostics.
out = query_librarian("Where is Form 10-K mentioned? Quote the relevant part.")
print(out["answer"])
for field in ("latency_ms", "evidence"):
    print(f"{field}:", out[field])

Form 10-K is mentioned multiple times in the document. One relevant part states: "We undertake no obligation to update any forward-looking statements made in this Annual Report on Form 10-K to reflect events or circumstances after the date of this Annual Report on Form 10-K or to reflect new information, actual results, revised expectations, or the occurrence of unanticipated events, except as required by law" (p. 7).
latency_ms: 8984
evidence: [{'source': 'Uber_annual_report_2024.pdf', 'page': 7, 'chunk_id': '7-1'}, {'source': 'Uber_annual_report_2024.pdf', 'page': 52, 'chunk_id': '52-2'}, {'source': 'Uber_annual_report_2024.pdf', 'page': 134, 'chunk_id': '134-0'}, {'source': 'Uber_annual_report_2024.pdf', 'page': 13, 'chunk_id': '13-2'}, {'source': 'Uber_annual_report_2024.pdf', 'page': 52, 'chunk_id': '52-1'}, {'source': 'Uber_annual_report_2024.pdf', 'page': 7, 'chunk_id': '7-0'}, {'source': 'Uber_annual_report_2024.pdf', 'page': 68, 'chunk_id': '68-3'}, {'source': 'Uber_annual_rep

In [27]:
# Close the Weaviate client opened with client.connect() earlier in the notebook.
client.close()