In [1]:
import sys
sys.path.append('../Src')

from platform import python_version
import numpy as np
import pandas as pd


import glob
#  The glob library (or more accurately, the glob module in Python) is a built-in Python module that provides a way to find all 
# the pathnames matching a specified pattern according to the rules used by the Unix shell.
# It's extremely useful when you need to discover files or directories based on wildcard patterns, without having to manually 
# iterate through directories and check filenames.
import langchain
from langchain.document_loaders import DirectoryLoader

# initializing variables
RANDOM_STATE = 1776

# print versions
print("LangChain Version: " + langchain.__version__)
print("Numpy Version: " + np.__version__)
print("Pandas Version: " + pd.__version__)
# print("Seaborn Version: " + sns.__version__)
# print("Matplotlib Version: " + plt.matplotlib.__version__)
print("Python Version: " + python_version())

LangChain Version: 0.3.26
Numpy Version: 2.3.1
Pandas Version: 2.3.0
Python Version: 3.13.2


#### This line of code initializes a DirectoryLoader object from LangChain, configured to find and prepare specific files for loading. Let's break down its components:
- from langchain.document_loaders import DirectoryLoader:
    - This line imports the DirectoryLoader class from LangChain's document_loaders module. This class is designed to load multiple documents from a specified directory.
        - loader = DirectoryLoader(...):
    - This creates an instance of the DirectoryLoader class and assigns it to the variable loader. This loader object is now configured but hasn't actually loaded any documents yet.
        - './data':
    - This is the first argument to DirectoryLoader and specifies the path to the directory you want to load documents from.
        - ./data means a directory named data located in the current working directory where your Python script is being executed.
        - glob="**/*.pdf":
        - This is a crucial argument that defines a glob pattern to filter which files within the specified directory should be loaded.
        - **: This is a wildcard that matches any directory (including subdirectories) zero or more levels deep. So, it tells the loader to search recursively within ./data.
        - *.pdf: This wildcard matches any file ending with .pdf.
        - Combined (**/*.pdf): This pattern instructs the DirectoryLoader to find all files ending with .pdf within the ./data directory and any of its subdirectories.



In [2]:
# from docx import Document

# doc = Document("your_file.docx")
# for para in doc.paragraphs:
#     print(para.text)



# from langchain_community.document_loaders.excel import UnstructuredExcelLoader

# # Load the Excel file (choose mode: "single" or "elements")
# loader = UnstructuredExcelLoader("your_file.xlsx", mode="elements")
# docs = loader.load()
# # List All Sheet Names
# for doc in docs:
#     print(doc.metadata.get("sheet_name"), doc.page_content[:100])

# Use UnstructuredExcelLoader to bring Excel data into LangChain. You can load entire files or 
# individual sheets, and each document will include both text and metadata for downstream processing

### Modes Explained
- "single" mode (default):
- Loads the entire Excel file as one document.
- The document's metadata includes an HTML representation of the table under the text_as_html key.

### "elements" mode:
- Loads each sheet as a separate table element.
- Each element is a separate document, and its metadata contains details about the sheet and an HTML version of the table.
- What You Get
    - Each loaded document has:
    - page_content: The raw text extracted from the Excel file or sheet.
    - metadata: Information about the sheet, file, and (in "single" mode) an HTML version of the table.

### Typical Workflow
- Use UnstructuredExcelLoader to load Excel data into LangChain Document objects.
- Process, chunk, embed, or retrieve from these documents as needed for your LLM or RAG pipeline.
- Example: List All Sheet Names
    - python
        - for doc in docs:
        - print(doc.metadata.get("sheet_name"), doc.page_content[:100])
### Summary Table
- Loader	File Types Supported	Modes	Special Features
- UnstructuredExcelLoader	.xlsx, .xls	"single", "elements"	HTML table in metadata, sheet-level docs
### In summary:
- Use UnstructuredExcelLoader to bring Excel data into LangChain. You can load entire files or individual sheets, and each document will include both text and metadata for downstream processing

In [3]:
# Loading all PDFs from a 'data' directory (including subdirectories) -  Configure the loader
loader = DirectoryLoader('../Data', glob="**/*.pdf")

# Actually load the documents
documents = loader.load()

print(f"Total documents loaded: {len(documents)}\n")
# print(documents[0].metadata['source'])  # Display the source of the first document

# print(documents[0])  # Display the content of the first document

direct_loader_docs = documents.copy()
 

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is a

Total documents loaded: 153



This error comes from an invalid color value—usually a resource name like /P0—when a numeric value is needed for gray color in a PDF. The cause is almost always malformed or nonstandard PDFs, and the solution is either updating your tools, regenerating the PDF, or bypassing/repairing problematic resources, depending on your workflow and needs. 

## Unstructured PDF Loader

In [4]:
from langchain_community.document_loaders import UnstructuredFileLoader # Make sure this is imported

# Configure the loader (your line)
loader = DirectoryLoader('../Data', glob="**/*.pdf", loader_cls=UnstructuredFileLoader)

# Now, execute the loading process
documents = loader.load()

# You can then check how many documents were loaded and inspect a sample:
print(f"Successfully loaded {len(documents)} documents.")
# if documents:
#     print("\nFirst document snippet:")
#     print(documents[0].page_content[:500]) # Print first 500 characters of the first document
#     print("\nFirst document metadata:")
#     print(documents[0].metadata)

# Configure the loader to use UnstructuredFileLoader for PDFs
# UnstructuredFileLoader is more robust but might have more dependencies and be slower
loader = DirectoryLoader('../Data', glob="**/*.pdf", loader_cls=UnstructuredFileLoader)


direct_loader_unstructured_docs = documents.copy()

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is a

Successfully loaded 153 documents.


In [None]:
# documents[1].metadata['source']
# documents[1].metadata.get('page')

In [None]:
# documents[1].metadata.keys()  # List all metadata keys for the second document

In [5]:
all_keys = set()
for doc in documents:
    all_keys.update(doc.metadata.keys())
print(f"All unique metadata keys: {all_keys}")

All unique metadata keys: {'source'}


## Different ways to load documents

In [6]:
from langchain.document_loaders import UnstructuredPDFLoader  # or langchain_community if using that

pdf_files = glob.glob("../Data/*.pdf")

documents = []

for file_path in pdf_files:
    # loader = UnstructuredPDFLoader(file_path=file_path, strategy="hi_res")
    loader = UnstructuredPDFLoader(file_path=file_path, strategy="fast")
    docs = loader.load()
    documents.extend(docs)


print(f"Loaded {len(documents)} documents from {len(pdf_files)} files.")
# print(documents[0].metadata['source'])  # Display the source of the first document


direct_loader_unstructured_pdf_docs = documents.copy()

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is a

Loaded 153 documents from 153 files.


In [7]:
all_keys = set()
for doc in documents:
    all_keys.update(doc.metadata.keys())
print(f"All unique metadata keys: {all_keys}")

All unique metadata keys: {'source'}


#### UnstructuredPDFLoader (153 documents from 153 files)
- Default Behavior: UnstructuredPDFLoader (using the unstructured library) typically aims to process an entire PDF file as one logical document. Even if a PDF has multiple pages, UnstructuredPDFLoader will often concatenate the content of all pages into the page_content of a single LangChain Document object for that PDF file.
- Granularity: It focuses on extracting the overall text and structure of the entire document, rather than splitting it by page. While it might include page_number in the metadata or even in the page_content itself, the primary output for one PDF file is usually one Document object.
- Result: This is why you get 1 document per PDF file. If you have 153 PDF files, you get 153 Document objects.

#### PyMuPDFLoader (966 documents from 153 files)

- Default Behavior: PyMuPDFLoader (using the PyMuPDF library, also known as fitz) typically loads each page of a PDF file as a separate LangChain Document object.
- Granularity: It's designed to provide more granular control at the page level. For a 10-page PDF, PyMuPDFLoader would yield 10 separate Document objects, each representing one page.
- Result: This explains the significant difference. If you divide the total documents by the number of files (966 / 153), you get approximately 6.3 pages per PDF file on average. This is a very common scenario. Each of those 966 documents represents one page from one of your 153 PDF files.

#### Implications for your RAG Pipeline:
- The choice between these loaders (and their output granularity) has significant implications for your RAG pipeline:
    - UnstructuredPDFLoader (Document-level chunks):
        - Pros: Good for maintaining broad context if you want to retrieve large sections or entire small documents at once. Often better at handling complex layouts or scanned documents (especially with hi_res).
        - Cons: If your PDFs are very long, a single Document object might exceed the LLM's context window. You'll definitely need a RecursiveCharacterTextSplitter afterwards to break these large documents into smaller, manageable chunks for your vector store.

#### PyMuPDFLoader (Page-level chunks):
- Pros: Provides immediate, smaller, page-level chunks, which are often a good starting point for RAG. Each chunk comes with clear page metadata. Generally faster for basic text extraction from well-formed PDFs.
- Cons: Might not handle complex layouts, tables, or scanned PDFs as robustly as unstructured. You might still need a RecursiveCharacterTextSplitter if individual pages are too long for your LLM's context window, or if you want to split pages further semantically.

In [8]:
# How to load multiple PDFs with PyMuPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader

pdf_paths = glob.glob("../Data/*.pdf")  # list all pdf files

pymu_documents = []
for path in pdf_paths:
    loader = PyMuPDFLoader(path)
    docs = loader.load()
    pymu_documents.extend(docs)

print(f"Loaded {len(pymu_documents)} documents from {len(pdf_paths)} files.")
# print(documents[0].metadata['source'])  # Display the source of the first document

pymupdf_loader_docs = documents.copy()

Loaded 966 documents from 153 files.


In [9]:
all_keys = set()
for doc in documents:
    all_keys.update(doc.metadata.keys())
print(f"All unique metadata keys: {all_keys}")

All unique metadata keys: {'source'}


In [10]:
from langchain_community.document_loaders import PyPDFLoader # Use PyPDFLoader

pdf_files = glob.glob("../Data/*.pdf")
documents = []

for file_path in pdf_files:
    loader = PyPDFLoader(file_path) # No strategy needed for PyPDFLoader
    docs = loader.load()
    documents.extend(docs)

print(f"Loaded {len(documents)} documents from {len(pdf_files)} files using PyPDFLoader.")

pypdf_loader_docs = documents.copy()

Loaded 966 documents from 153 files using PyPDFLoader.


In [11]:
all_keys = set()
for doc in documents:
    all_keys.update(doc.metadata.keys())
print(f"All unique metadata keys: {all_keys}")

# doc.page_content

# from langchain_core.documents.base import Document

# # Create a Document manually
# doc = Documents(
#     page_content="This is a sample page from a PDF.",
#     metadata={"source": "file.pdf", "page": 1}
# )

# print(doc.page_content)  # Output: This is a sample page from a PDF.
# print(doc.metadata)      # Output: {'source': 'file.pdf', 'page': 1}

All unique metadata keys: {'author', 'creationdate--text', 'ptex.fullbanner', 'source', 'ieee article id', 'appligent', 'total_pages', 'icv', 'subject', 'ieee publication id', 'bidi.fullbanner', 'ieee issue id', 'meeting starting date', 'meeting ending date', 'trapped', 'pdfversion', 'company', 'page', 'page_label', 'moddate', 'ksoproductbuildver', 'keywords', 'sourcemodified', 'application', 'producer', 'creator', 'creationdate', 'ksotemplatedocersaverecord', 'title'}


## Multiple Loader Used

In [12]:
print(type(direct_loader_docs))
print(type(direct_loader_unstructured_docs))
print(type(direct_loader_unstructured_pdf_docs))
print(type(pymupdf_loader_docs))
print(type(pypdf_loader_docs))

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>


In [13]:
print(len(direct_loader_docs))
print(len(direct_loader_unstructured_docs))
print(len(direct_loader_unstructured_pdf_docs))
print(len(pymupdf_loader_docs))
print(len(pypdf_loader_docs))

153
153
153
153
966


In [16]:
direct_loader_docs[1].page_content

'A Comparison ofPipelined RAG-n and DA FPGA- based Multiplierless Filters\n\nUwe Meyer-Baese The Department ofECE FAMU-FSU College ofEng. Tallahassee, FL USA Email: umb -eng.fsu.edu\n\nJiajia Chen, Chip Hong Chang Nanyang Technological University School ofEE Engineering Blk S2,Nanyang Avenue, Singapore Email: {chenO183,echchang}gntu.edu.sg\n\nAndrew G. Dempster The School ofSurveying and Spatial Information Systems University ofNew South Wales Sydney 2052, Australia Email:a.dempstergunsw.edu.au\n\nAbstract The paper starts with an overview of distributed arithmetic (DA) and n-dimensional reduced adder graph (RAG-n) multiplierless filter design methods. Since DA designs are table-based and RAG-n designs are adder-based, FPGA synthesis design data are used for a realistic comparison. Benchmark FIR filters [1-4] of length 11 to 63 are compiled. For a wide set ofrealistic design examples, it will be shown that pipelined RAG-n designs achieve on average a gain of 71% in performance a 56% ar

In [17]:
direct_loader_unstructured_docs[1].page_content

'A Comparison ofPipelined RAG-n and DA FPGA- based Multiplierless Filters\n\nUwe Meyer-Baese The Department ofECE FAMU-FSU College ofEng. Tallahassee, FL USA Email: umb -eng.fsu.edu\n\nJiajia Chen, Chip Hong Chang Nanyang Technological University School ofEE Engineering Blk S2,Nanyang Avenue, Singapore Email: {chenO183,echchang}gntu.edu.sg\n\nAndrew G. Dempster The School ofSurveying and Spatial Information Systems University ofNew South Wales Sydney 2052, Australia Email:a.dempstergunsw.edu.au\n\nAbstract The paper starts with an overview of distributed arithmetic (DA) and n-dimensional reduced adder graph (RAG-n) multiplierless filter design methods. Since DA designs are table-based and RAG-n designs are adder-based, FPGA synthesis design data are used for a realistic comparison. Benchmark FIR filters [1-4] of length 11 to 63 are compiled. For a wide set ofrealistic design examples, it will be shown that pipelined RAG-n designs achieve on average a gain of 71% in performance a 56% ar

In [18]:
direct_loader_unstructured_pdf_docs[1].page_content

'A Comparison ofPipelined RAG-n and DA FPGA- based Multiplierless Filters\n\nUwe Meyer-Baese The Department ofECE FAMU-FSU College ofEng. Tallahassee, FL USA Email: umb -eng.fsu.edu\n\nJiajia Chen, Chip Hong Chang Nanyang Technological University School ofEE Engineering Blk S2,Nanyang Avenue, Singapore Email: {chenO183,echchang}gntu.edu.sg\n\nAndrew G. Dempster The School ofSurveying and Spatial Information Systems University ofNew South Wales Sydney 2052, Australia Email:a.dempstergunsw.edu.au\n\nAbstract The paper starts with an overview of distributed arithmetic (DA) and n-dimensional reduced adder graph (RAG-n) multiplierless filter design methods. Since DA designs are table-based and RAG-n designs are adder-based, FPGA synthesis design data are used for a realistic comparison. Benchmark FIR filters [1-4] of length 11 to 63 are compiled. For a wide set ofrealistic design examples, it will be shown that pipelined RAG-n designs achieve on average a gain of 71% in performance a 56% ar

In [19]:
pymupdf_loader_docs[1].page_content

'A Comparison ofPipelined RAG-n and DA FPGA- based Multiplierless Filters\n\nUwe Meyer-Baese The Department ofECE FAMU-FSU College ofEng. Tallahassee, FL USA Email: umb -eng.fsu.edu\n\nJiajia Chen, Chip Hong Chang Nanyang Technological University School ofEE Engineering Blk S2,Nanyang Avenue, Singapore Email: {chenO183,echchang}gntu.edu.sg\n\nAndrew G. Dempster The School ofSurveying and Spatial Information Systems University ofNew South Wales Sydney 2052, Australia Email:a.dempstergunsw.edu.au\n\nAbstract The paper starts with an overview of distributed arithmetic (DA) and n-dimensional reduced adder graph (RAG-n) multiplierless filter design methods. Since DA designs are table-based and RAG-n designs are adder-based, FPGA synthesis design data are used for a realistic comparison. Benchmark FIR filters [1-4] of length 11 to 63 are compiled. For a wide set ofrealistic design examples, it will be shown that pipelined RAG-n designs achieve on average a gain of 71% in performance a 56% ar

In [20]:
pypdf_loader_docs[1].page_content

'• Postgres(pgvector): A vector database that supports sim-\nilarity search based on vector embeddings.\n• Azure AI Search : A cloud-based search service with\nbuilt-in AI capabilities for full-text search and indexing.\n• Elasticsearch: A distributed search and analytics engine\nthat provides real-time search capabilities.\nThe retrieval module performs a hybrid search, combining\ntraditional keyword-based search with vector-based similarity\nsearch, followed by a reranker that refines the results. The\ngeneration module utilizes GPT-4 models to generate contex-\ntually relevant responses. The orchestration layer, developed in\nPython, integrates these components and runs in a Kubernetes\nenvironment, ensuring scalability and resilience.\nIV. P LATFORM AUTOMATION\nPlatform automation is a critical aspect of MLOps, ensuring\nconsistent, reproducible, and scalable deployments. Platform\nautomation is achieved through the use of various tools and\npractices that streamline the deployment

In [22]:
# pymu_documents is your list from PyMuPDFLoader
if pymupdf_loader_docs:
    print("\n--- Inspecting PyMuPDFLoader Documents ---")
    print(f"First document metadata: {pymupdf_loader_docs[0].metadata}")
    print(f"Second document metadata: {pymupdf_loader_docs[1].metadata}")
    # You should see 'page' or 'page_number' in the metadata, likely starting from 0 or 1.
    # The 'source' will be the same for consecutive pages of the same PDF.


--- Inspecting PyMuPDFLoader Documents ---
First document metadata: {'source': '../Data/Streamlining_AI_Application_MLOps_Best_Practices_and_Platform_Automation_Illustrated_through_an_Advanced_RAG_based_Chatbot.pdf'}
Second document metadata: {'source': '../Data/A_Comparison_of_Pipelined_RAG-n_and_DA_FPGA-based_Multiplierless_Filters.pdf'}


In [21]:
#  pypdf_loader_docs is your list from PyMuPDFLoader
if pypdf_loader_docs:
    print("\n--- Inspecting PyPDFLoader Documents ---")
    print(f"First document metadata: {pypdf_loader_docs[0].metadata}")
    print(f"Second document metadata: {pypdf_loader_docs[1].metadata}")
    # You should see 'page' or 'page_number' in the metadata, likely starting from 0 or 1.
    # The 'source' will be the same for consecutive pages of the same PDF.


--- Inspecting PyPDFLoader Documents ---
First document metadata: {'producer': 'PDF Editor 2.2 - Foxit Corporation; modified using iTextSharp 5.4.1 ©2000-2012 1T3XT BVBA (AGPL-version); modified using iText® Core 7.2.4 (AGPL version) ©2000-2022 iText Group NV', 'creator': 'PyPDF', 'creationdate': '2024-07-19T15:20:37+00:00', 'meeting starting date': '10 July 2024', 'moddate': '2024-08-13T18:37:04-04:00', 'ieee article id': '10625230', 'ieee issue id': '10624738', 'subject': '2024 2nd International Conference on Sustainable Computing and Smart Systems (ICSCSS);2024; ; ;10.1109/ICSCSS60660.2024.10625230', 'ieee publication id': '10624685', 'title': 'Streamlining AI Application: MLOps Best Practices and Platform Automation Illustrated through an Advanced RAG based Chatbot', 'meeting ending date': '12 July 2024', 'source': '../Data/Streamlining_AI_Application_MLOps_Best_Practices_and_Platform_Automation_Illustrated_through_an_Advanced_RAG_based_Chatbot.pdf', 'total_pages': 10, 'page': 0, 

The phrase "all doc loaded in langchain" means you've successfully completed the Document Loading phase of your RAG pipeline. You now have a list of LangChain Document objects (either 153 or 966, depending on whether you used UnstructuredPDFLoader or PyMuPDFLoader for your final load), with their page_content and metadata correctly extracted.

#### What's Next: Text Splitting (Chunking)
- Even if PyMuPDFLoader already split your PDFs into pages, individual pages can still be too large for an LLM's context window. The next crucial step in building your RAG system is Text Splitting (Chunking).

#### Why is Text Splitting Important?
- LLM Context Window Limits: Large language models have a maximum amount of text they can process at once (their "context window"). Your full documents or even single long pages will often exceed this limit.
- Retrieval Relevance: When you perform a retrieval, you want to find the most relevant snippets of information, not necessarily entire documents or pages. Smaller, semantically coherent chunks lead to more precise retrieval.
- Cost and Speed (for API-based LLMs): Smaller inputs mean lower token counts, which can reduce costs and inference time if you were using external LLM APIs (less relevant for purely local LLMs, but still good practice).

#### How to do it:
- You'll typically use a RecursiveCharacterTextSplitter from LangChain. It attempts to split text in a smart way, trying to keep paragraphs and sentences together, and only splitting into smaller units if larger ones exceed your defined chunk_size.

In [24]:
# print(len(direct_loader_docs))
# print(len(direct_loader_unstructured_docs))
# print(len(direct_loader_unstructured_pdf_docs))
print(len(pymupdf_loader_docs))
print(len(pypdf_loader_docs))

153
966


In [28]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Assuming 'documents' is your list of loaded LangChain Document objects

# Initialize the text splitter
# chunk_size: The maximum number of characters for each text chunk.
# chunk_overlap: The number of characters to overlap between adjacent chunks.
#                This helps retain context across splits.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # Experiment with this value (e.g., 500, 1500, 2000)
    chunk_overlap=200  # Experiment with this value (e.g., 50, 100, 200)
)

# Split the documents into chunks
chunks_direct_loader_pymupdf = text_splitter.split_documents(pymupdf_loader_docs)

print(f"\nOriginal documents: {len(pymupdf_loader_docs)}")
print(f"Split into {len(chunks_direct_loader_pymupdf)} chunks.")

# You can view a sample chunk as well:
if chunks_direct_loader_pymupdf:
    print("\n--- Sample of the first chunk ---")
    print(f"Content (first 500 characters):\n{chunks_direct_loader_pymupdf[0].page_content[:500]}...")
    print(f"Metadata:\n{chunks_direct_loader_pymupdf[0].metadata}")


Original documents: 153
Split into 8498 chunks.

--- Sample of the first chunk ---
Content (first 500 characters):
0 3 2 5 2 6 0 1 4 2 0 2 0 6 6 0 6 S S C S C I / 9 0 1 1 0 1 : I

.

.

.

O D | E E E I

4 2 0 2 © 0 0 1 3 $ / 4 2 / 0 - 9 9 9 7 - 3 0 5 3 - 8 - 9 7 9 | ) S S C S C I (

.

s

m e t s y S t r a m S d n a g n i t u p m o C e b a n a t s u S n o e c n e r e f n o C

l

i

l

a n o i t a n r e t n

I

d n 2 4 2 0 2

Proceedings of 2nd International Conference on Sustainable Computing and Smart Systems (ICSCSS 2024) IEEE Xplore Part Number: CFP24DJ3-ART; ISBN: 979-8-3503-7999-0

Streamlining AI Appl...
Metadata:
{'source': '../Data/Streamlining_AI_Application_MLOps_Best_Practices_and_Platform_Automation_Illustrated_through_an_Advanced_RAG_based_Chatbot.pdf'}


The chunks list will now contain many more smaller Document objects, each suitable for embedding and storing in your vector database.

In [30]:
# Assuming 'documents' is your list of loaded LangChain Document objects

# Initialize the text splitter
# chunk_size: The maximum number of characters for each text chunk.
# chunk_overlap: The number of characters to overlap between adjacent chunks.
#                This helps retain context across splits.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # Experiment with this value (e.g., 500, 1500, 2000)
    chunk_overlap=200  # Experiment with this value (e.g., 50, 100, 200)
)

# Split the documents into chunks
chunks_direct_loader_pypdf = text_splitter.split_documents(pypdf_loader_docs)

print(f"\nOriginal documents: {len(pypdf_loader_docs)}")
print(f"Split into {len(chunks_direct_loader_pypdf)} chunks.")

# You can view a sample chunk as well:
if chunks_direct_loader_pypdf:
    print("\n--- Sample of the first chunk ---")
    print(f"Content (first 500 characters):\n{chunks_direct_loader_pypdf[0].page_content[:500]}...")
    print(f"Metadata:\n{chunks_direct_loader_pypdf[0].metadata}")


Original documents: 966
Split into 6510 chunks.

--- Sample of the first chunk ---
Content (first 500 characters):
Streamlining AI Application: MLOps Best Practices
and Platform Automation Illustrated through an
Advanced RAG based Chatbot
Harcharan Singh Kabbay
Sr. Machine Learning Engineer
World Wide Technology
St. Louis, MO, USA
harcharan.kabbay@gmail.com
Abstract—This research study presents a comprehensive guide
to MLOps best practices tailored for developing and deploying
Artificial Intelligence (AI) based applications, focusing on the
challenges and recent techniques in the field. Current systems
face ...
Metadata:
{'producer': 'PDF Editor 2.2 - Foxit Corporation; modified using iTextSharp 5.4.1 ©2000-2012 1T3XT BVBA (AGPL-version); modified using iText® Core 7.2.4 (AGPL version) ©2000-2022 iText Group NV', 'creator': 'PyPDF', 'creationdate': '2024-07-19T15:20:37+00:00', 'meeting starting date': '10 July 2024', 'moddate': '2024-08-13T18:37:04-04:00', 'ieee article id': '10625230

#### The next crucial steps in building your local RAG (Retrieval-Augmented Generation) pipeline are:
- Text Embedding: Convert your text chunks into numerical representations (vectors).
- Vector Store/Database: Store these embeddings in a specialized database that allows for efficient similarity search.

#### Text Embedding (Creating Embeddings)
- What it is: Text embedding is the process of converting text (your chunks) into dense numerical vectors. These vectors capture the semantic meaning of the text, such that chunks with similar meanings will have vectors that are "close" to each other in a multi-dimensional space.
- Why it's needed: When a user asks a query, you'll convert that query into an embedding as well. To find relevant documents, you'll then search for document chunks whose embeddings are most similar to the query's embedding.
- Tool: For a local RAG setup, you'll use a local embedding model. HuggingFaceEmbeddings is a common choice as it allows you to load models directly from Hugging Face Hub that can run on your CPU/GPU without external API calls.
- What it is and Why it's needed: Your explanation is spot on. Embeddings are the bridge between human language and machine understanding, enabling semantic search. The "closeness" of vectors in multi-dimensional space directly correlates to the semantic similarity of the original text chunks.
#### Key considerations when using HuggingFaceEmbeddings:
- Model Selection:
    - Performance vs. Size: The Hugging Face MTEB (Massive Text Embedding Benchmark) Leaderboard is your go-to resource for comparing models. Look at "Retrieval Average" scores, but also consider "Model Size" and "Max Tokens." Larger models often offer better performance but require more computational resources (RAM and potentially GPU).
    - Domain Specificity: For highly specialized domains (e.g., medical, legal), fine-tuned models might outperform general-purpose ones.
    - Multilingual Support: If your data isn't exclusively English, ensure the model supports the languages you need.
    - Examples of good local models: Models like BAAI/bge-small-en-v1.5, intfloat/e5-base-v2, and variants of all-MiniLM-L6-v2 are popular choices for their balance of performance and efficiency for local deployment.

#### Hardware Usage (CPU/GPU):
- HuggingFaceEmbeddings typically uses the sentence-transformers library under the hood. This library automatically tries to leverage a GPU if one is available and configured correctly (e.g., PyTorch with CUDA support).
- Explicit Device Setting: You can explicitly set the device to "cpu" or "cuda" (for GPU) when initializing HuggingFaceEmbeddings using model_kwargs={'device': 'cpu'} or model_kwargs={'device': 'cuda'}. This is particularly useful if you have multiple GPUs or want to force CPU usage for testing/resource management.
- Batching: For faster embedding generation, especially on GPUs, process your text chunks in batches. HuggingFaceEmbeddings often has a batch_size parameter.
- Performance Monitoring: If you have a GPU, monitor its utilization (e.g., with nvidia-smi on Linux) to ensure it's being used effectively. Sometimes, CPU bottlenecks (e.g., slow data loading) can lead to low GPU utilization.
- Installation: You'll need to install the sentence-transformers package: pip install sentence-transformers. If you plan to use a GPU, ensure your PyTorch installation supports CUDA.
- Normalization: For cosine similarity (a common metric for vector similarity search), it's often recommended to normalize the embeddings. Some models produce normalized embeddings by default, while others might require you to set normalize_embeddings=True in encode_kwargs when initializing HuggingFaceEmbeddings.

#### Vector Store/Database
- What it is and Why it's needed: Once you have your embeddings, you need a place to store them that allows for fast "nearest neighbor" searches. This is where vector databases shine. They are optimized for efficiently finding vectors that are semantically close to a given query vector.

#### Next steps for the Vector Store:
- Choice of Vector Store: For local RAG, popular choices include:
    - ChromaDB: A lightweight, easy-to-use vector database that can run entirely locally without complex setup. Excellent for getting started.
    - FAISS (Facebook AI Similarity Search): A library for efficient similarity search and clustering of dense vectors. It's an in-memory index, meaning it's very fast but doesn't persist data between runs unless you explicitly save and load the index. Good for smaller datasets or if you manage persistence yourself.
    - Milvus Lite/Qdrant Lite: Lighter versions of their full-fledged vector database counterparts, offering more features than FAISS while still being suitable for local deployments.
- Indexing: Once you choose a vector store, you'll need to "index" your document embeddings into it. This process organizes the vectors for efficient search.
- Similarity Search: When a user query comes in, you'll convert it to an embedding using the same HuggingFaceEmbeddings model. Then, you'll perform a similarity search in your vector store to retrieve the most relevant document chunks. Common similarity metrics include cosine similarity, dot product, or Euclidean distance.
- By focusing on these two steps – robust text embedding and efficient vector storage – you'll build the core retrieval component of your local RAG pipeline.


In [31]:
len(chunks_direct_loader_pymupdf), len(chunks_direct_loader_pypdf)

(8498, 6510)

In [None]:
def two_sum(nums, target):
    num_to_index = {}
    for i, num in enumerate(nums):
        complement = target - num
        if complement in num_to_index:
            return [num_to_index[complement], i]
        num_to_index[num] = i

In [None]:
nums = [3, 7, 11, 15, 2]
target = 17
result = two_sum(nums, target)
print(result)  # Output: [0, 1]

In [None]:
num_to_index = {}
for i, num in enumerate(nums):
    print(target - num)
    complement = target - num
    print(f":::::Current num_to_index: {num_to_index}")
    if complement in num_to_index:
        print(f"Found: {num_to_index[complement]}, {i}")

    # print(f"Current num_to_index: {num_to_index}")
    num_to_index[num] = i

In [None]:
num_to_index

In [None]:
num_to_index[3]