In [1]:
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain_pymupdf4llm import PyMuPDF4LLMLoader

# Load PDF
# Every loader family reads the same six PDFs, so the paths are listed once
# and shared; this keeps the three loader lists from drifting apart.
pdf_paths = [
    "../Data/botanical.pdf",
    "../Data/astronomical.pdf",
    "../Data/biological.pdf",
    "../Data/cosmological.pdf",
    "../Data/culinary.pdf",
    "../Data/pharmaceutical.pdf",
]

# One loader per PDF for each backend, always in the order of pdf_paths.
loaders1 = [PyPDFLoader(path) for path in pdf_paths]
loaders2 = [PyMuPDFLoader(path) for path in pdf_paths]
loaders3 = [PyMuPDF4LLMLoader(path) for path in pdf_paths]

# Containers for the documents produced by each backend.
docs_py = []
docs_pymu = []
docs_py4llm = []

PyPDFLoader, PyMuPDFLoader, and PyMuPDF4LLMLoader are all LangChain-compatible PDF loaders, but they differ in backend library, document layout handling, and output richness.

- PyPDFLoader
    - Backend: Uses pypdf (the maintained successor to PyPDF2).
    - Strengths: Quick, efficient plain-text extraction from simple PDFs.
    - Limitations: Struggles with multi-column layouts, tables, images, and complex formatting; may emit warnings for malformed PDFs.
    - Use case: Best for basic, well-structured text PDFs.

- PyMuPDFLoader
    - Backend: Uses PyMuPDF (pymupdf).
    - Strengths: Robust parsing from complex layouts—handles multi-column text, some tables, images, and non-standard page formats better.
    - Limitations: Output is generally plain text; advanced formatting (markdown, table extraction) is limited.
    - Use case: Preferred for scientific papers, forms, and rich-layout documents where PyPDFLoader fails.

- PyMuPDF4LLMLoader
    - Backend: Uses PyMuPDF4LLM (`pymupdf4llm`), which builds on PyMuPDF to produce Markdown output formatted for LLM consumption.
    - Strengths:
        - Extracts well-structured markdown: headings, lists, tables, code blocks.
        - Tables converted to markdown, images referenced in output.
        - Customizable splitting (by page, flowing text), better for LLM input.
        - Superior for RAG or QA pipelines due to markdown fidelity.
    - Limitations: Slightly heavier dependency; loader settings may need tuning for best results.
    - Use case: When highest-quality, markdown-structured output is needed for downstream language model or retrieval tasks.

In [2]:
# Start from an empty list so that re-running this cell does not append the
# same documents a second time.
docs_py = []
for loader in loaders1:
    docs = loader.load()
    docs_py.extend(docs)

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)


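- The "Ignoring wrong pointing object" messages above are non-fatal pypdf warnings about malformed object references in the PDFs. PyPDFLoader also struggles with multi-column PDF documents.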

In [3]:
# Start from an empty list so that re-running this cell does not append the
# same documents a second time.
docs_pymu = []
for loader in loaders2:
    docs = loader.load()
    docs_pymu.extend(docs)

In [4]:
# Start from an empty list so that re-running this cell does not append the
# same documents a second time.
docs_py4llm = []
for loader in loaders3:
    docs = loader.load()
    docs_py4llm.extend(docs)

In [5]:
# Show the text of the first document loaded via PyMuPDFLoader.
docs_pymu[0].page_content

"Botanical Section \nBotanixum Sectiorum Arcanix \n \nA peculiar plant with spiraling leaves and \nvibrant blue flowers that seem to emit a \nfaint glow in moonlight. \n• \nHoloris spiralis: In lumine lunae, \nflores azuri magni brillant. \n• \nRadices mysticae: Radices intortae \nterram quaerunt, lumina nocturna \nsequentes. \n• \nUsus: Extractum florae noctem \nilluminat, mentem serenat. \n \nLuminaflora Spiralis \nLuminaflora Spiralis thrives under the \nmoon's tender gaze, where its spiraling \nleaves and vibrant blue petals unfold in a \nmesmerizing dance of light. Believed by \nancient scholars to bridge the earthly \nrealm with the ethereal, these plants \nradiate a soft luminescence, guiding lost \ntravelers through the darkest nights. \nMystics and poets claim that merely \nbeing in the presence of Luminaflora can \nsoothe troubled thoughts and illuminate \nthe path to inner peace.  \n \nThe roots of Luminaflora Spiralis are as \nintriguing as its blooms. Entwining \ndeeply wi

In [6]:
# Show the text of the first document loaded via PyMuPDF4LLMLoader, for
# comparison with the PyMuPDFLoader text.
docs_py4llm[0].page_content

"# **Botanical Section**\n\n### **Botanixum Sectiorum Arcanix**\n\n## **Luminaflora Spiralis**\n\n\n\nA peculiar plant with spiraling leaves and\n\nvibrant blue flowers that seem to emit a\n\nfaint glow in moonlight.\n\n\n  - **Holoris spiralis:** In lumine lunae,\nflores azuri magni brillant.\n\n\n  - **Radices mysticae:** Radices intortae\nterram quaerunt, lumina nocturna\n\nsequentes.\n\n\n  - **Usus:** Extractum florae noctem\n\nilluminat, mentem serenat.\n\n\n\n**Luminaflora Spiralis** thrives under the\nmoon's tender gaze, where its spiraling\nleaves and vibrant blue petals unfold in a\nmesmerizing dance of light. Believed by\nancient scholars to bridge the earthly\nrealm with the ethereal, these plants\nradiate a soft luminescence, guiding lost\ntravelers through the darkest nights.\nMystics and poets claim that merely\nbeing in the presence of Luminaflora can\nsoothe troubled thoughts and illuminate\nthe path to inner peace.\n\nThe roots of Luminaflora Spiralis are as\nintrigui

In [7]:
# Length of the first PyMuPDFLoader document's text.
len(docs_pymu[0].page_content)

1621

In [8]:
# Length of the first PyMuPDF4LLMLoader document's text.
len(docs_py4llm[0].page_content)

1632

In [9]:
# Number of documents produced by each loader: (PyPDF, PyMuPDF, PyMuPDF4LLM).
tuple(len(doc_list) for doc_list in (docs_py, docs_pymu, docs_py4llm))

(24, 24, 24)

## Split Documents into Chunks

The main difference between `CharacterTextSplitter` and `RecursiveCharacterTextSplitter` lies in how they split text into chunks, especially when dealing with preserving logical boundaries in text.
- `CharacterTextSplitter`
    - Splits text simply by a single specified separator (e.g. newline \n).
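    - Merges the resulting pieces into chunks of up to `chunk_size` characters, with `chunk_overlap` characters of overlap; a single piece longer than `chunk_size` is kept as one oversized chunk.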
    - It does not attempt to preserve sentence, paragraph, or semantic structure; splits can be arbitrary.
    - Example: split at every newline, group next 1000 characters, overlap 150 characters. It operates in a straightforward, linear way.
    - `CharacterTextSplitter` splits text based on a fixed character separator into chunks defined by number of characters, with overlap for context continuity. It’s a simple, character-based chunking method suitable for basic text splitting.

- `RecursiveCharacterTextSplitter`
    - More sophisticated splitter that tries to preserve semantic boundaries like paragraphs, sentences, or words.
    - Splits text recursively using a priority list of separators (default: ["\n\n", "\n", " ", ""]).
    - Starts splitting by the largest separator (double newline), if resulting chunks are still too large, splits those chunks by the next separator (single newline), and so forth, until chunks are under the size limit. This avoids breaking natural text units abruptly by chunking on logical boundaries like paragraphs or sentences. Designed especially for preparing texts for LLMs to keep context coherent. Can keep or discard separators and supports regex-based separators.
    - `RecursiveCharacterTextSplitter` recursively splits text on progressively smaller separators (paragraphs, lines, words) to create coherent chunks fitting size constraints, making it ideal for LLM applications requiring semantically meaningful chunks.

- `PythonCodeTextSplitter` in LangChain is a specialized text splitter designed to split source code, specifically Python code, into logical chunks based on Python syntax structures.
    - Key Characteristics:
        - Splitting Logic:
            - It attempts to split text along Python-specific syntax boundaries, such as class definitions (class), function/method definitions (def), and other Python code blocks.
    - Implementation:
        - It is implemented as a subclass of RecursiveCharacterTextSplitter with Python-specific separators tailored for Python code.
    - Chunk Size Measurement:
        - By default, chunk size is measured by the number of characters, but this can be controlled with custom length functions.
    - Use Case:
        - Useful when processing Python code for tasks such as code analysis, code search, or feeding code into LLMs, ensuring splits don't cut through syntax elements but align on logical code blocks.
    - Example Usage:
        ```python
        from langchain.text_splitter import PythonCodeTextSplitter
        python_text = """
        class Foo:
             def bar():
                 pass
             def foo():
                 pass
        """

        splitter = PythonCodeTextSplitter(chunk_size=100, chunk_overlap=0)
        docs = splitter.create_documents([python_text])
        ```
    - Result:
        - The text is split into chunks aligned with Python class/method blocks rather than arbitrary character count splits.

    - Summary:
        - `PythonCodeTextSplitter` is designed to split Python source code into meaningful chunks based on syntax like classes and functions. It preserves code structure during splitting and is ideal for code-related NLP or LLM tasks.


In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Chunking limits shared by both splitters so their results are comparable.
_shared_chunking = dict(
    chunk_size=1_000,     # upper bound on chunk length, as measured by length_function
    chunk_overlap=150,    # amount carried over between neighbouring chunks for context
    length_function=len,  # measure length with Python's len
)

# Recursive splitter: tries the separators in the listed order of preference.
text_splitter_recur = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    is_separator_regex=False,  # separators are literal strings, not regex patterns
    **_shared_chunking,
)

# Plain splitter: a single newline is the only separator.
text_splitter_char = CharacterTextSplitter(
    separator="\n",
    **_shared_chunking,
)

In [11]:
# Chunk each loader's documents twice: first with the recursive splitter,
# then with the single-separator character splitter.
_splitters = (text_splitter_recur, text_splitter_char)

docs_pymu_recursive, docs_pymu_character = [
    splitter.split_documents(docs_pymu) for splitter in _splitters
]

docs_py4llm_recursive, docs_py4llm_character = [
    splitter.split_documents(docs_py4llm) for splitter in _splitters
]

In [12]:

# Chunk counts per loader, printed as "<recursive> <character>".
print(*map(len, (docs_pymu_recursive, docs_pymu_character)))
print(*map(len, (docs_py4llm_recursive, docs_py4llm_character)))

43 43
50 43


## Convert Chunks to Embeddings and Store in FAISS Vector Store

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model shared by every index built in this cell.
# Other model names kept as commented-out options:
#   "BAAI/bge-base-en"
#   "sentence-transformers/all-mpnet-base-v2"
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)


def _build_index(chunks):
    """Build a FAISS vector store from `chunks` using `embedding_model`."""
    return FAISS.from_documents(documents=chunks, embedding=embedding_model)


# One vector store per (loader, splitter) combination.
pymu_recur_vectorDB = _build_index(docs_pymu_recursive)
pymu_char_vectorDB = _build_index(docs_pymu_character)

py4llm_recur_vectorDB = _build_index(docs_py4llm_recursive)
py4llm_char_vectorDB = _build_index(docs_py4llm_character)

In [14]:
# Print each index's ntotal, one value per line.
_vector_stores = (
    pymu_recur_vectorDB,
    pymu_char_vectorDB,
    py4llm_recur_vectorDB,
    py4llm_char_vectorDB,
)
print(*(store.index.ntotal for store in _vector_stores), sep="\n")

43
43
50
43


In [15]:
import numpy as np

# Determine if embeddings are normalized.
# embed_documents() encodes all chunk texts in a single call instead of one
# embed_query() call per chunk, and it is the method FAISS.from_documents()
# uses when it builds an index, so these norms describe the stored vectors.
chunk_texts = [doc.page_content for doc in docs_pymu_recursive]
embeddings = embedding_model.embed_documents(chunk_texts)

embeddings_array = np.array(embeddings)           # one row per chunk
norms = np.linalg.norm(embeddings_array, axis=1)  # L2 norm of each row

print("First 10 embedding norms:", norms[:10])

# Floating-point round-off keeps the norms from being exactly 1, so compare
# against 1.0 with an absolute tolerance.
if np.allclose(norms, 1.0, atol=1e-6):
    print("All embeddings are normalized within tolerance.")
else:
    print("Some embeddings are not normalized within tolerance.")


First 10 embedding norms: [1.00000001 0.99999995 1.00000003 1.00000006 0.99999997 1.00000004
 1.00000005 1.00000001 1.00000003 1.00000001]
All embeddings are normalized within tolerance.


## Persist Data in your Vector Store

In [16]:
# Persist each vector store under ../VectorDB, keyed by its target path.
_index_dirs = {
    "../VectorDB/faiss_recur_index": pymu_recur_vectorDB,
    "../VectorDB/faiss_char_index": pymu_char_vectorDB,
    "../VectorDB/faiss4llm_char_index": py4llm_char_vectorDB,
    "../VectorDB/faiss4llm_recur_index": py4llm_recur_vectorDB,
}
for _index_dir, _store in _index_dirs.items():
    _store.save_local(_index_dir)