### RAG pipeline - Data ingestion to vector DB pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Read all the PDF documents
def process_all_pdfs(pdf_dir="."):
    """Load every PDF found under a directory into one flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    read with ``PyPDFLoader``; every document it yields is tagged with
    ``source_file`` (the file name) and ``file_type`` (``'pdf'``) in its
    metadata. A file that fails to load is reported and skipped, so one bad
    PDF does not abort the whole run.

    Args:
        pdf_dir: Directory to search (string or path-like). Defaults to ".".

    Returns:
        List of all loaded documents, in the order the files were processed.
    """
    root = Path(pdf_dir)
    pdf_paths = list(root.glob("**/*.pdf"))
    print(f"Found {len(pdf_paths)} PDF files to process")

    loaded = []
    for path in pdf_paths:
        print(f"\nProcessing: {path.name}")
        try:
            pages = PyPDFLoader(str(path)).load()

            # Tag each loaded document so its origin is visible downstream.
            for page in pages:
                page.metadata.update({'source_file': path.name, 'file_type': 'pdf'})

            loaded.extend(pages)
            print(f"Loaded {len(pages)} pages")
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error: {e}")

    print(f"\nTotal documents loaded: {len(loaded)}")
    return loaded

all_pdf_documents = process_all_pdfs(pdf_dir="../data")
       

Found 2 PDF files to process

Processing: 1-04ee286a-ad42-4d86-ab8d-14d18b4fbd1a.pdf
Loaded 1 pages

Processing: Self-Healing_Software_Systems_Lessons_from_Nature_[1].pdf
Loaded 16 pages

Total documents loaded: 17


In [3]:
all_pdf_documents[:2]  # preview only: the full list dumps every page's text into the notebook output

[Document(metadata={'producer': 'Skia/PDF m140', 'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/140.0.0.0 Safari/537.36', 'creationdate': '2025-10-09T09:55:27+00:00', 'title': 'Certificate', 'moddate': '2025-10-09T15:54:16+05:30', 'source': '..\\data\\pdf\\1-04ee286a-ad42-4d86-ab8d-14d18b4fbd1a.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': '1-04ee286a-ad42-4d86-ab8d-14d18b4fbd1a.pdf', 'file_type': 'pdf'}, page_content='Riya Chandra\nIntroduction to Artificial Intelligence\nThe certificate is awarded to\nfor successfully completing the course\non October 9, 2025\nIssued on: Thursday, October 9, 2025\nTo verify, scan the QR code at https://verify.onwingspan.com'),
 Document(metadata={'producer': 'Skia/PDF m137 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Self-Healing Software', 'source': '..\\data\\pdf\\Self-Healing_Software_Systems_Lessons_from_Nature_[1].pdf', 'total_pages': 16, 'page'

### Chunking

In [4]:
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller, overlapping chunks for RAG.

    Uses ``RecursiveCharacterTextSplitter`` with character-count lengths and
    the separators ``"\\n\\n"``, ``"\\n"``, ``" "`` and ``""``. Prints a
    summary, plus the first chunk's leading 200 characters and metadata when
    at least one chunk was produced.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    split_docs = splitter.split_documents(documents)
    print(f"split {len(documents)} document into {len(split_docs)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not split_docs:
        return split_docs

    first_chunk = split_docs[0]
    print("\nexample chunks:")
    print(f"content:{first_chunk.page_content[:200]}...")
    print(f"metadata:{first_chunk.metadata}")
    return split_docs

In [5]:
chunks = split_documents(all_pdf_documents)
chunks[:2]  # preview only: displaying every chunk floods the notebook output

split 17 document into 102 chunks

example chunks:
content:Riya Chandra
Introduction to Artificial Intelligence
The certificate is awarded to
for successfully completing the course
on October 9, 2025
Issued on: Thursday, October 9, 2025
To verify, scan the QR...
metadata:{'producer': 'Skia/PDF m140', 'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/140.0.0.0 Safari/537.36', 'creationdate': '2025-10-09T09:55:27+00:00', 'title': 'Certificate', 'moddate': '2025-10-09T15:54:16+05:30', 'source': '..\\data\\pdf\\1-04ee286a-ad42-4d86-ab8d-14d18b4fbd1a.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': '1-04ee286a-ad42-4d86-ab8d-14d18b4fbd1a.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Skia/PDF m140', 'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/140.0.0.0 Safari/537.36', 'creationdate': '2025-10-09T09:55:27+00:00', 'title': 'Certificate', 'moddate': '2025-10-09T15:54:16+05:30', 'source': '..\\data\\pdf\\1-04ee286a-ad42-4d86-ab8d-14d18b4fbd1a.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': '1-04ee286a-ad42-4d86-ab8d-14d18b4fbd1a.pdf', 'file_type': 'pdf'}, page_content='Riya Chandra\nIntroduction to Artificial Intelligence\nThe certificate is awarded to\nfor successfully completing the course\non October 9, 2025\nIssued on: Thursday, October 9, 2025\nTo verify, scan the QR code at https://verify.onwingspan.com'),
 Document(metadata={'producer': 'Skia/PDF m137 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Self-Healing Software', 'source': '..\\data\\pdf\\Self-Healing_Software_Systems_Lessons_from_Nature_[1].pdf', 'total_pages': 16, 'page'

### Embedding and vector store DB

In [11]:
import numpy as np
from sentence_transformers import SentenceTransformer #embedding model available inside this
import chromadb
from chromadb.config import Settings
import uuid  # every record should have some kind of unique id 
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
class EmbeddingManager:
    """Handles document embedding generation using a SentenceTransformer model.

    Workflow: load one model (e.g. "all-MiniLM-L6-v2"), generate embeddings
    with it, then store those embeddings in a vector store / search them.
    """

    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        """Initialize the embedding manager and load the model immediately.

        Args:
            model_name: Name handed to ``SentenceTransformer`` to select the
                model that converts text into vectors.

        Raises:
            Exception: whatever ``SentenceTransformer`` raises while loading
                (re-raised by ``_load_model``).
        """
        self.model_name=model_name
        self.model=None
        self._load_model()  # load eagerly: the model is ready as soon as the object exists

    def _load_model(self):
        """Load the SentenceTransformer model named by ``self.model_name``.

        Raises:
            Exception: any loading error is printed and then re-raised.
        """
        try:
            print(f"Loading embedding model:{self.model_name}")
            self.model=SentenceTransformer(self.model_name)
            print(f"model loaded successfully.Embedding dimensions :{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"error loading model {self.model_name}:{e}")
            raise

    def generate_embeddings(self,texts:List[str])->np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed.

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim).

        Raises:
            ValueError: if no model has been loaded.
        """
        # Compare against None explicitly: truthiness of a model object can
        # depend on its __len__, which says nothing about "loaded or not".
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"generaing embedding for {len(texts)} texts...")
        embeddings=self.model.encode(texts, show_progress_bar=True)
        print(f"generated embedding with shape :{embeddings.shape}")
        return embeddings

    def get_sentence_embedding_dimension(self)-> int:
        """Return the embedding dimension reported by the loaded model.

        Raises:
            ValueError: if no model has been loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()
   
   
## Initialize the embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model:all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model loaded successfully.Embedding dimensions :384


<__main__.EmbeddingManager at 0x1df21e142f0>

### Vector store

In [None]:
## The embedding model is ready; next, build the vector DB

class vectorStore:
    """manages doc embeddings in chromaDB vector store"""
    
    def __init__(self, collection_name: str="all_pdf_documents",persist_directory: str="../data/vector_store"):
    
        """Intialize the vector store
        Args:
        collection_name:name of the chromaDB collection 
        persist_directory: Directory to persist the vector store
        """

        self.collection_name= collection_name
        self.persist_directory= persist_directory
        self.client=None
        self.collection=None
        self._initialize_store()
        
    def _initialize_store(self):
        """initializinf chromDB client and collection"""
        try:
            #create persistent chromaDB client
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(path=self.persist_directory) 
            
            #get or create collection
            
            self.collection=self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"pdf document embeddingd for RAG"}
            )   
            print(f"vector store intialized collection :{self.collection_name}")
            print(f"existing documents in collection :{self.collection.count()}")
            
        except Exception as e:
            print(f"error intializing vector store:{e}")    
            raise
        
    def add_document(self,documents:List[Any],embeddings:np.ndarray):
        """
        add document and their embeddings to the vector store 
        Args: 
        documents:List of Langchain documents
        embeddingS: Corresponding embeddings for the doc
        """   
        if len(documents)!=len(embeddings):
            raise ValueError("number of soc mnust match number if embeddings")
        
        print(f"adding {len(documents)} documnets to vector store...")
        
        #preparing the doc for chromaDB
        ids=[]
        metadata=[] 
        documents_text=[]
        embeddings_list=[]
        
        for i,(doc,embedding) in enumerate(zip(documents,embeddings)):
            #generating unique ids beacuse id for a specific record
            doc_id=f"doc){uuid.uuidv4().hex[:8]}_{i}"
            ids.append(doc_id)
        
            #preparing metadata
            metadata=dict(doc.metadata)
            metadata['doc_index']=i
            metadata['content_length']=len(doc.page_content)
            metadata.append(metadata)
            
            #document content
            documents_text.append(doc.page_content)
            
            #embedding
            embeddings_list.append(embedding.tolist())
        
        
        