In [1]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma  # Updated import
from langchain.retrievers import BM25Retriever
from langchain.schema import Document
from typing import List, Optional
import pickle
from pathlib import Path
from tqdm import tqdm
import hashlib
import json
import logging

# Setup logging
# Configure the root logger at INFO level and create this notebook's logger,
# named after the current module (log lines are prefixed with that name).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ============================================================================
# CONFIGURATION
# ============================================================================
# Relative path to the source PDF; it is resolved against the kernel's
# current working directory, not against this notebook's location.
PATH = "../data/ISLP_website.pdf"
# The settings below are currently disabled (commented out); nothing in the
# cells visible here reads them.
# CHROMA_DIR = '../stats/chroma_semantic'
# BM25_DIR = "../stats/bm25_retriever.pkl"
# METADATA_DIR = "../stats/doc_metadata.json"
# COLLECTION_NAME = "agentic_rag_collection"  # Explicit collection name

# ============================================================================
# EMBEDDINGS
# ============================================================================
# Embedding client backed by the Ollama model 'nomic-embed-text'.
# NOTE(review): presumably needs a reachable Ollama server with this model
# already pulled -- confirm before running on a fresh machine.
embeddings = OllamaEmbeddings(model='nomic-embed-text')

# ============================================================================
# OPTIMIZED DOCUMENT LOADING
# ============================================================================
def load_documents(file_path: str) -> List[Document]:
    """
    Load a PDF with PyMuPDFLoader and tag every returned document with metadata.

    The loader is created with image extraction enabled and tables rendered
    as markdown.

    Args:
        file_path: Path to the PDF; passed unchanged to ``PyMuPDFLoader``.

    Returns:
        The documents returned by ``loader.load()``, in their original order.
        Each document's metadata is updated in place with the keys below,
        overwriting any value the loader already stored under the same key:

        - ``page``: 1-based position of the document in the returned list
          (assumes the loader yields one document per page, in page order).
        - ``source``: ``file_path`` exactly as given.
        - ``doc_id``: first 8 hex characters of the MD5 of the page text.
        - ``file_name``: final path component of ``file_path``.
        - ``total_pages``: number of documents returned by the loader.
    """
    # Lazy %-style arguments: the message is only formatted if INFO is enabled.
    logger.info("Loading documents from %s", file_path)

    loader = PyMuPDFLoader(
        file_path=file_path,
        extract_images=True,
        extract_tables='markdown'
    )

    docs = loader.load()

    # These values are identical for every page, so compute them once.
    file_name = Path(file_path).name
    total_pages = len(docs)

    # Add rich metadata for better filtering
    for page_number, doc in enumerate(docs, start=1):
        # MD5 serves only as a short content fingerprint here, not for security;
        # usedforsecurity=False keeps the call working on FIPS-restricted builds.
        # NOTE(review): the hash covers the page text only, so pages with
        # identical text (e.g. blank pages) get the same doc_id -- confirm that
        # is acceptable before using doc_id as a unique key.
        content_digest = hashlib.md5(
            doc.page_content.encode(), usedforsecurity=False
        ).hexdigest()

        doc.metadata.update({
            'page': page_number,
            'source': file_path,
            'doc_id': content_digest[:8],
            'file_name': file_name,
            'total_pages': total_pages
        })

    logger.info("✅ Loaded %d pages", total_pages)
    return docs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the configured PDF and keep the resulting documents in `docs`,
# which the page-preview cell further down indexes into.
docs = load_documents(PATH)

INFO:__main__:Loading documents from ../data/ISLP_website.pdf


Consider using the pymupdf_layout package for a greatly improved page layout analysis.


INFO:__main__:✅ Loaded 613 pages


In [11]:
# Preview the raw text of a single page.
# NOTE: this import rebinds `print` to rich's version for every cell that runs
# after this one in the same kernel session.
from rich import print
from rich.markup import escape

# rich's print treats "[...]" inside a string as console markup, so raw page
# text containing brackets (index expressions, citations) could be restyled or
# lose text. escape() makes the page print literally.
# `docs` is 0-indexed: index 450 is the document load_documents tagged as page 451.
print(escape(docs[450].page_content))