# Part 3: "The Librarian" (Advanced RAG System)

This notebook implements an Advanced Hybrid RAG Pipeline using Weaviate.

In [None]:
import weaviate
from langchain_weaviate import WeaviateVectorStore
from sentence_transformers import CrossEncoder
import os
import sys

# Add project src to path
sys.path.append(os.path.abspath("../"))

from src.services.llm_services import load_config, get_embeddings, get_llm
from src.utils.data_processing import load_and_clean_pdf, chunk_text

## 1. Vector Database Setup (Weaviate)

Initializing Weaviate and creating the schema for Uber's Annual Report chunks.

In [None]:
# Load project settings; the "weaviate_url" key is read below.
config = load_config("../src/config/config.yaml")

# NOTE(review): weaviate.Client and client.schema belong to the v3 Python client API,
# while langchain_weaviate is believed to require the v4 client — confirm which
# weaviate-client version is installed before relying on this cell.
client = weaviate.Client(url=config["weaviate_url"])

# Class definition for the report chunks: raw text plus the page it came from.
schema = {
    "class": "UberReport",
    "vectorizer": "none",  # We'll provide our own vectors
    "properties": [
        {"name": "content", "dataType": ["text"]},
        {"name": "page_number", "dataType": ["int"]},
    ],
}

# Create the class only when it is missing, so the cell can be re-run
# (Restart Kernel -> Run All) without failing on an already-existing class.
if not client.schema.exists(schema["class"]):
    client.schema.create_class(schema)

## 2. Hybrid Search & Refinement

Dense + BM25 hybrid search with Reciprocal Rank Fusion (RRF), followed by Cross-Encoder reranking. The `query_librarian` function below is currently a placeholder that outlines these stages.

In [None]:
# Cross-encoder model intended for the reranking stage of the pipeline.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def query_librarian(question):
    """Answer ``question`` through the planned hybrid RAG pipeline.

    Placeholder: none of the stages is implemented yet, so calling this
    function performs no work and returns ``None``.

    Planned stages:
        1. Hybrid Search (Dense + BM25)
        2. RRF Fusion
        3. Cross-Encoder Reranking
        4. LLM Generation
    """
    # TODO: implement the four stages listed in the docstring.
    return None