In [None]:
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
from langchain_community.document_loaders.parsers import LLMImageBlobParser
from langchain_experimental.text_splitter import SemanticChunker
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os 
# Load settings from a local .env file into the process environment.
load_dotenv()

# Endpoint and API key for the hosted chat model, read from the environment.
# Both are None when the variables are unset; only the commented-out
# ChatOpenAI configuration below consumes them.
URL = os.getenv('URL')
API_KEY = os.getenv('OPENROUTER_API_KEY')

# Alternative backend: a hosted model behind an OpenAI-compatible endpoint.
# Uncomment to use it instead of the local Ollama model.
# model = ChatOpenAI(
#                 model="amazon/nova-2-lite-v1:free",
#                 temperature=0.0,
#                 api_key=API_KEY, # type: ignore
#                 base_url=URL,
#                 # max_tokens=max_tokens
#             )

# Chat model served by a local Ollama instance; the PDF loading cell hands it
# to the image parser.
model = ChatOllama(model="gemma3:4b")

# Source PDF, relative to the notebook's working directory. Built with
# os.path.join so the separator is correct on every platform (the previous
# literal used doubled backslashes, which only resolves on Windows).
PATH = os.path.join(
    "..",
    "data",
    "ai-agents-with-python-build-autonomous-systems-that-think-learn-and-act.pdf",
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Parser applied to images found in the PDF; it is backed by the chat model
# configured in the first cell.
image_parser = LLMImageBlobParser(model=model)

# mode="page" presumably yields one Document per PDF page — TODO confirm
# against the loader documentation.
loader = PyMuPDF4LLMLoader(
    PATH,
    mode="page",
    extract_images=True,
    images_parser=image_parser,
)

# Run the loader; `docs` feeds the chunking and preview cells further down.
docs = loader.load()

Consider using the pymupdf_layout package for a greatly improved page layout analysis.


In [3]:
# Embedding model served by Ollama; reused later when building the vector store.
embeddings = OllamaEmbeddings(model='nomic-embed-text')

# Breakpoint settings handed to the semantic chunker.
breakpoint_threshold_type = 'percentile'
breakpoint_threshold_amount = 95

splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_amount=breakpoint_threshold_amount,
    breakpoint_threshold_type=breakpoint_threshold_type,
)

# Split the loaded documents into chunks for the dense and sparse indexes.
chunks = splitter.split_documents(docs)

In [4]:
# Embed the chunks and store them in the "semantic" Chroma collection under
# ../vector/semantic.
# NOTE(review): re-running this cell against an existing persist directory
# presumably adds the same chunks again — confirm, and clear the directory
# before a full re-run if so.
vector_db = Chroma.from_documents(
    collection_name="semantic",
    persist_directory='../vector/semantic',
    collection_metadata={"hnsw:space": "cosine"},  # Use cosine similarity
    embedding=embeddings,
    documents=chunks,
)

In [5]:
from langchain_classic.retrievers import BM25Retriever
import pickle

# Destination of the pickled sparse (BM25) retriever.
save_path = '../vector/sparse.pkl'

# Build the BM25 retriever over the same chunks used for the dense index.
bm25_retriever = BM25Retriever.from_documents(
    chunks,
    k=15,  # Return top 15 results
)

# OPTIONAL (advanced): BM25 parameters such as k1 (term frequency saturation)
# and b (length normalization) appear to be supplied at build time, e.g.
# BM25Retriever.from_documents(chunks, k=15, bm25_params={"k1": 1.5, "b": 0.75}),
# rather than assigned as attributes afterwards — TODO confirm against the
# installed LangChain version.

# Make sure the target directory exists; open(..., "wb") raises
# FileNotFoundError otherwise (e.g. when the Chroma cell has not been run).
# `os` is imported in the first cell.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Save to disk. Only unpickle this file from a location you trust:
# pickle.load can execute arbitrary code.
with open(save_path, "wb") as f:
    pickle.dump(bm25_retriever, f)

In [6]:
from rich import print
from rich.markup import escape

# Dump the extracted text of every loaded document for visual inspection.
for doc in docs:
    # rich's print interprets [square-bracket] sequences as console markup.
    # Raw PDF text can contain such sequences, which would either be swallowed
    # as style tags or raise MarkupError on an unmatched closing tag, so the
    # text is escaped to print literally.
    print(escape(doc.page_content))