Indexing Pipeline - Data Loading to Data Store in Vector DB

In [1]:
from langchain_core.documents import Document

In [None]:
# The example of the document structure of document loaders

doc = Document(
    page_content="This is book of rag development",
    metadata={
        "source":"HHPP",
        "Author":"Saan",
        "DOB":"2006-01-01"
    }
    )
doc

#docuemnet loader provides the page content and the metadata. The metadata is necessary for vector store at the retrieval stage.

Document(metadata={'source': 'HHPP', 'Author': 'Saan', 'DOB': '2006-01-01'}, page_content='This is book of rag development')

Document Loading from the Directory

In [None]:
#Load files from directory loader
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader

loader = DirectoryLoader("../data/pdf_files",
                         glob="**/*.pdf",
                         loader_cls=PyMuPDFLoader,
                         show_progress=True,
                         use_multithreading=True,
                         loader_kwargs={'mode':'page','extract_images':True})
documents = loader.load()

100%|██████████| 3/3 [00:05<00:00,  1.83s/it]


In [52]:
#Document loading function

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader

def document_loader(dir_path):
    doc_loader = DirectoryLoader(
        path=dir_path,
        glob="**/*.pdf",
        loader_cls=PyMuPDFLoader,
        show_progress=True,
        use_multithreading=True,
        loader_kwargs={'mode':'page','extract_images':True}
    )
    documents = doc_loader.load()
    return documents

In [53]:
dir_path = "../data/pdf_files"
loaded_pdf = document_loader(dir_path=dir_path)
loaded_pdf

100%|██████████| 3/3 [00:03<00:00,  1.11s/it]


[Document(metadata={'producer': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creator': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creationdate': '2025-04-07T14:20:51+00:00', 'source': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'file_path': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-07T14:20:54+00:00', 'trapped': '', 'modDate': 'D:20250407142054Z', 'creationDate': 'D:20250407142051Z', 'page': 0}, page_content='A practical \u2028\nguide to \u2028\nbuilding agents'),
 Document(metadata={'producer': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creator': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creationdate': '2025-04-07T14:20:51+00:00', 'source': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'file_path': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': '', 'su

Document Chunking/ Splitting

In [42]:
#The data chunking step - use the langchain recusrsive character text splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, separators=["/n","/n/n"])
texts = text_splitter.split_documents(documents)
texts[0]


Document(metadata={'producer': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creator': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creationdate': '2025-04-07T14:20:51+00:00', 'source': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'file_path': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-07T14:20:54+00:00', 'trapped': '', 'modDate': 'D:20250407142054Z', 'creationDate': 'D:20250407142051Z', 'page': 0}, page_content='A practical \u2028\nguide to \u2028\nbuilding agents')

In [58]:
#Document chunking function
from langchain_text_splitters import RecursiveCharacterTextSplitter

def document_spiltter(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=['/n','/n/n',' ',''],
        length_function=len
    )
    split_docs = text_splitter.split_documents(documents)
    if split_docs:
        print(f"Page content of: {split_docs[0].page_content[:200]}")
        print(f"Metadata of: {split_docs[0].metadata}")
    
    return split_docs

In [59]:
chunk_pdf = document_spiltter(documents=loaded_pdf)

Page content of: A practical  
guide to  
building agents
Metadata of: {'producer': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creator': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creationdate': '2025-04-07T14:20:51+00:00', 'source': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'file_path': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-07T14:20:54+00:00', 'trapped': '', 'modDate': 'D:20250407142054Z', 'creationDate': 'D:20250407142051Z', 'page': 0}


In [61]:
#Check the number of chunks
print(f"The number of chunks created: {len(chunk_pdf)}")

The number of chunks created: 1561


Chunk Embedding

In [72]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
import os

In [64]:
class EmbedingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        try:
            print("Loading embedding model")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model is loaded. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error model loading: {e}")
        
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model is not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Embeddings are generated with shape: {embeddings.shape}")
        
        return embeddings
        
embedding_manager = EmbedingManager()
embedding_manager
        

Loading embedding model


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 331.20it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model is loaded. Embedding dimension: 384


<__main__.EmbedingManager at 0x24d8909eb40>

In [68]:
encoded_sample = embedding_manager.generate_embeddings(chunk_pdf[0].page_content)

Generating embeddings for 40 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 58.89it/s]

Embeddings are generated with shape: (384,)





VectorStore

In [None]:
class VectorStore:
    def __init__(self, collection_name:str = "pdf_documents", persist_directory:str = "../ data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
        
    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"PDF embedding for RAG"}
            )
            print(f"Vector store is initialized. Collection: {self.collection_name}")
            print(f"Existing document in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing the vector stor is: {e}")
            raise
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match the number of embeddings")
        print(f"Adding {len(documents)} documents to vector store...")
        
        ids = []
        metadatas = [] 
        documents_text = []
        embedding_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            #prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            documents_text.append(doc.page_content)
            
            embedding_list.append(embeddings.tolist())
            
        try:
            self.collection.add(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding the documents to vector store: {e}")
            raise

vectorstore = VectorStore()
vectorstore
                
    

Vector store is initialized. Collection: pdf_documents
Existing document in collection: 0


<__main__.VectorStore at 0x24d8adf9be0>

In [77]:
texts = [doc.page_content for doc in chunk_pdf]
texts

['A practical \u2028\nguide to \u2028\nbuilding agents',
 'Contents\nWhat is an agent?\n4\nWhen should you build an agent?\n5\nAgent design foundations\n7\nGuardrails\n24\nConclusion\n32\n2\nPractical guide to building agents',
 'Introduction\nLarge language models are becoming increasingly capable of handling complex, multi-step tasks. \nAdvances in reasoning, multimodality, and tool use have unlocked a new category of LLM-powered \nsystems known as agents.\nThis guide is designed for product and engineering teams exploring how to build their first agents, \ndistilling insights from numerous customer deployments into practical and actionable best \npractices. It includes frameworks for identifying promising use cases, clear patterns for designing \nagent logic and orchestration, and best practices to ensure your agents run safely, predictably, \u2028\nand effectively.\xa0\nAfter reading this guide, you’ll have the foundational knowledge you need to confidently start \nbuilding your fi

In [None]:
### Convert the text to embeddings

texts = [doc.page_content for doc in chunk_pdf]

### Generate embeddings

embeddings = embedding_manager.generate_embeddings(texts=texts)

### Store in the vectorstore

vectorstore.add_documents(documents=chunk_pdf, embeddings=embeddings)

Generating embeddings for 1561 texts


Batches:   0%|          | 0/49 [00:00<?, ?it/s]

Batches: 100%|██████████| 49/49 [01:01<00:00,  1.25s/it]


Embeddings are generated with shape: (1561, 384)
Adding 1561 documents to vector store...
