Indexing Pipeline - Data Loading to Data Store in Vector DB

In [1]:
from langchain_core.documents import Document

In [2]:
# The example of the document structure of document loaders

doc = Document(
    page_content="This is book of rag development",
    metadata={
        "source":"HHPP",
        "Author":"Saan",
        "DOB":"2006-01-01"
    }
    )
doc

#docuemnet loader provides the page content and the metadata. The metadata is necessary for vector store at the retrieval stage.

Document(metadata={'source': 'HHPP', 'Author': 'Saan', 'DOB': '2006-01-01'}, page_content='This is book of rag development')

Document Loading from the Directory

In [3]:
#Load files from directory loader
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader

loader = DirectoryLoader("../data/pdf_files",
                         glob="**/*.pdf",
                         loader_cls=PyMuPDFLoader,
                         show_progress=True,
                         use_multithreading=True,
                         loader_kwargs={'mode':'page','extract_images':True})
documents = loader.load()

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1/1 [00:01<00:00,  1.22s/it]


In [4]:
#Document loading function

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader

def document_loader(dir_path):
    doc_loader = DirectoryLoader(
        path=dir_path,
        glob="**/*.pdf",
        loader_cls=PyMuPDFLoader,
        show_progress=True,
        use_multithreading=True,
        loader_kwargs={'mode':'page','extract_images':False}
    )
    documents = doc_loader.load()
    return documents

In [5]:
dir_path = "../data/pdf_files"
loaded_pdf = document_loader(dir_path=dir_path)
loaded_pdf

100%|██████████| 1/1 [00:00<00:00,  2.00it/s]


[Document(metadata={'producer': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creator': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creationdate': '2025-04-07T14:20:51+00:00', 'source': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'file_path': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-07T14:20:54+00:00', 'trapped': '', 'modDate': 'D:20250407142054Z', 'creationDate': 'D:20250407142051Z', 'page': 0}, page_content='A practical \u2028\nguide to \u2028\nbuilding agents'),
 Document(metadata={'producer': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creator': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creationdate': '2025-04-07T14:20:51+00:00', 'source': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'file_path': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': '', 'su

Document Chunking/ Splitting

In [6]:
#The data chunking step - use the langchain recusrsive character text splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20, separators=["/n","/n/n"])
texts = text_splitter.split_documents(documents)
texts[0]


Document(metadata={'producer': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creator': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creationdate': '2025-04-07T14:20:51+00:00', 'source': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'file_path': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-07T14:20:54+00:00', 'trapped': '', 'modDate': 'D:20250407142054Z', 'creationDate': 'D:20250407142051Z', 'page': 0}, page_content='A practical \u2028\nguide to \u2028\nbuilding agents')

In [7]:
#Document chunking function
from langchain_text_splitters import RecursiveCharacterTextSplitter

def document_spiltter(documents, chunk_size=100, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=['/n','/n/n',' ',''],
        length_function=len
    )
    split_docs = text_splitter.split_documents(documents)
    if split_docs:
        print(f"Page content of: {split_docs[0].page_content[:100]}")
        print(f"Metadata of: {split_docs[0].metadata}")
    
    return split_docs

In [8]:
chunk_pdf = document_spiltter(documents=loaded_pdf)

Page content of: A practical  
guide to  
building agents
Metadata of: {'producer': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creator': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'creationdate': '2025-04-07T14:20:51+00:00', 'source': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'file_path': '..\\data\\pdf_files\\Agent development OpenAI.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-07T14:20:54+00:00', 'trapped': '', 'modDate': 'D:20250407142054Z', 'creationDate': 'D:20250407142051Z', 'page': 0}


In [9]:
#Check the number of chunks
print(f"The number of chunks created: {len(chunk_pdf)}")

The number of chunks created: 395


Chunk Embedding

In [10]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
import os

In [11]:
class EmbedingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        try:
            print("Loading embedding model")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model is loaded. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error model loading: {e}")
        
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model is not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Embeddings are generated with shape: {embeddings.shape}")
        
        return embeddings
        
embedding_manager = EmbedingManager()
embedding_manager
        

Loading embedding model


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 582.81it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model is loaded. Embedding dimension: 384


<__main__.EmbedingManager at 0x24ab31b16d0>

VectorStore

In [13]:
class VectorStore:
    def __init__(self, collection_name:str = "pdf_documents", persist_directory:str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
        
    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"PDF embedding for RAG"}
            )
            print(f"Vector store is initialized. Collection: {self.collection_name}")
            print(f"Existing document in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing the vector stor is: {e}")
            raise
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match the number of embeddings")
        print(f"Adding {len(documents)} documents to vector store...")
        
        ids = []
        metadatas = [] 
        documents_text = []
        embedding_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            #prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            documents_text.append(doc.page_content)
            
            embedding_list.append(embedding.tolist())
            
        try:
            self.collection.add(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding the documents to vector store: {e}")
            raise

vectorstore = VectorStore()
vectorstore
                
    

Vector store is initialized. Collection: pdf_documents
Existing document in collection: 0


<__main__.VectorStore at 0x24ab35a8f20>

In [15]:
### Convert the text to embeddings

texts = [doc.page_content for doc in chunk_pdf]

### Generate embeddings

embeddings = embedding_manager.generate_embeddings(texts=texts)

Generating embeddings for 395 texts


Batches: 100%|██████████| 13/13 [00:01<00:00,  7.53it/s]

Embeddings are generated with shape: (395, 384)





In [17]:
### Store in the vectorstore
vectorstore.add_documents(documents=chunk_pdf, embeddings=embeddings)

Adding 395 documents to vector store...
Successfully added 395 documents to vector store
Total documents in collection: 395


In [19]:
sample_text = ["Hello nipuna", "What is your name", "Can we meet today"]
encoded_text = embedding_manager.generate_embeddings(sample_text)
encoded_text

Generating embeddings for 3 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 41.03it/s]

Embeddings are generated with shape: (3, 384)





array([[-0.06120766,  0.02402948, -0.00271914, ...,  0.00687961,
         0.01836214,  0.04964016],
       [-0.0718504 ,  0.03329802,  0.02547467, ..., -0.00569125,
        -0.06389017, -0.03960548],
       [-0.08560155,  0.04661411,  0.02221717, ..., -0.06646735,
        -0.01085835, -0.04476922]], shape=(3, 384), dtype=float32)

In [20]:
client = chromadb.PersistentClient(path="../data/vector_store/")

In [21]:
client.heartbeat()

1769964308498048000

In [22]:
collection = client.get_collection(name="pdf_documents")

In [23]:
results = collection.query(
    query_texts=["building agents"],
    n_results=2
)

C:\Users\asus\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:47<00:00, 1.76MiB/s]


In [28]:
results=results["documents"]

In [27]:
query_texts=["building agents"]
query_embedding = embedding_manager.generate_embeddings(query_texts)
query_embedding

Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 34.52it/s]

Embeddings are generated with shape: (1, 384)





array([[-4.26194891e-02, -2.33312473e-02, -6.21018335e-02,
        -2.12672390e-02, -2.51870416e-02, -1.73256062e-02,
         8.34357832e-03, -3.70369107e-02, -3.87254171e-02,
         1.36401420e-02, -9.22526640e-04, -6.10618852e-02,
         2.61851959e-02,  6.62367791e-02,  4.04345356e-02,
         5.29701561e-02,  6.17718734e-02,  3.65550704e-02,
        -2.97364183e-02, -4.87483069e-02, -3.92001756e-02,
        -2.31265202e-02, -4.74164151e-02, -2.59839036e-02,
        -1.03739444e-02,  3.11208218e-02,  1.19358953e-02,
         1.58576351e-02,  7.09835961e-02, -5.20781800e-02,
         2.09456999e-02,  8.39686673e-03,  7.65768141e-02,
         4.09917422e-02,  1.25064746e-01,  1.48321420e-01,
        -1.21474415e-02,  2.71551590e-02,  4.03746888e-02,
         3.67605798e-02, -1.29965907e-02, -1.74915269e-02,
        -3.27113047e-02, -4.04816829e-02, -8.89015570e-03,
         1.03026710e-03, -3.24919298e-02, -2.24975608e-02,
         1.17589114e-02, -2.25873813e-02, -3.84909250e-0

In [29]:
results_embedding = embedding_manager.generate_embeddings(results[0])
results_embedding

Generating embeddings for 2 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 43.37it/s]

Embeddings are generated with shape: (2, 384)





array([[-2.06803493e-02, -1.94551013e-02, -7.30885863e-02,
        -5.58968335e-02, -3.00675426e-02, -1.63870938e-02,
        -1.73912644e-02,  1.42581891e-02, -7.55652040e-02,
         3.21233422e-02, -2.61475854e-02, -5.20354062e-02,
         3.60759608e-02,  4.85152826e-02,  7.04937428e-02,
         4.17150594e-02,  5.50948307e-02,  2.90822964e-02,
        -9.56715364e-03, -7.32099116e-02,  1.68426018e-02,
        -4.13927287e-02, -2.17893701e-02, -6.68296292e-02,
        -8.57930928e-02,  2.12403666e-03,  4.47945967e-02,
        -1.82665773e-02,  6.66119531e-02, -3.20675746e-02,
         4.08207141e-02, -2.41474994e-02,  5.66361733e-02,
         1.24129804e-03,  9.52778980e-02,  1.41561329e-01,
         2.74930131e-02,  1.08989626e-02,  3.39778438e-02,
         5.84158935e-02, -2.11970787e-02, -5.77526772e-03,
        -2.79398020e-02, -3.59896347e-02, -3.51747544e-03,
         7.41445832e-03, -2.56845299e-02, -2.60551311e-02,
         4.96548309e-04, -5.67874759e-02, -8.94224644e-0

In [30]:
cosine_similarity(query_embedding,results_embedding)

array([[0.8665158, 0.8539872]], dtype=float32)