# 03 – Embedding and Index Experiments

This notebook is for experimenting with different embeddings and vector stores.

- Start with `OpenAIEmbedder` or `LocalSentenceTransformerEmbedder`.
- Use `SimpleRAGPipeline.index_directory` to build an index.
- Inspect and log basic stats with MLflow helpers.



In [None]:
import sys
from pathlib import Path

# Add src directory to path so we can import modules
sys.path.insert(0, str(Path("../src").resolve()))

from rag_pipeline.lab_pipeline import RAGLabConfig, SimpleRAGPipeline

# Location of the raw documents, relative to the notebook's working directory.
DATA_DIR = Path("../data/raw")

# Chunking and retrieval settings gathered in one place so they are easy to
# tweak between experiments; they are passed to RAGLabConfig as keyword args.
index_settings = {
    "data_root": DATA_DIR,
    "chunk_size": 500,
    "chunk_overlap": 100,
    "top_k": 5,
}
config = RAGLabConfig(**index_settings)

pipeline = SimpleRAGPipeline(config=config)

# File extensions handed to index_directory via its `exts` argument.
INDEXED_EXTENSIONS = [".txt", ".md", ".pdf", ".json", ".csv"]

print("Indexing directory...\n")
stats = pipeline.index_directory(exts=INDEXED_EXTENSIONS)
print("Index stats:", stats)



In [None]:
# Smoke test: send one example question through the pipeline built above and
# print a truncated view of the prompt and of each retrieved chunk.
query = "What is this data about?"
result = pipeline.run_query(query, top_k=3)

# Only the first 800 characters of the prompt are shown.
print("Prompt preview:\n")
print(result["prompt"][:800])

print("\nRetrieved chunks (truncated):\n")
separator = "-" * 60
for rank, hit in enumerate(result["retrieved"], start=1):
    print(separator)
    score = hit["score"]
    print(f"Rank {rank} | Score: {score:.4f}")
    # Each chunk's text is cut to 400 characters to keep the output readable.
    snippet = hit["text"][:400]
    print(snippet)

