### RAG Pipelines - Data Ingestion to Vector DB Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [2]:

### Read all the PDFs inside the directory
def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into one flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    read with ``PyPDFLoader`` and every document it yields gets two extra
    metadata keys: ``source_file`` (the file name) and ``file_type`` (``'pdf'``).

    Args:
        pdf_directory: Path (string or path-like) of the directory to search.

    Returns:
        list: Documents from every PDF that loaded successfully, grouped by
        file in sorted path order. Files that fail to load are reported on
        stdout and skipped, so the list may be empty.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Without this check a mistyped path is indistinguishable from an empty folder
    if not pdf_dir.is_dir():
        print(f"Directory not found: {pdf_dir}")
        return all_documents

    # Find all the PDF files recursively; sorted because glob guarantees no
    # particular order and the result should be reproducible across runs
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source info to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  Loaded {len(documents)} pages")

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch
            print(f"  Error loading {pdf_file.name}: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Load every PDF found under the ../data directory into one list
all_pdf_documents = process_all_pdfs(pdf_directory="../data")


Found3 PDF files to process

Processing: TCS1.pdf


 Loaded3 pages

Processing: TCS2.pdf
 Loaded4 pages

Processing: TCS3.pdf
 Loaded37 pages

Total documents loaded: 44


In [3]:
# Preview only the first few documents; echoing the whole list floods the cell output
all_pdf_documents[:2]

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-11-09T01:52:04+05:30', 'author': 'Pranjal Singh', 'moddate': '2025-11-09T01:52:04+05:30', 'source': '..\\data\\TCS1.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'source_file': 'TCS1.pdf', 'file_type': 'pdf'}, page_content='Tata Consultancy Services, commonly known as TCS, stands as one of the most prominent global \ntechnology and consulting organizations originating from India. It began its journey in 1968 \nunder the Tata Group and steadily expanded into a worldwide leader in digital and IT services. \nFrom its early days of handling basic computing tasks for Indian enterprises, TCS has \ntransformed into a powerhouse that supports the digital operations of major institutions across \nthe world. Its growth has been deeply rooted in a long-term vision, disciplined execution, value-\ndriven leadership, and a commitment to excellence in technology and business transf

In [4]:
## Text splitting: break the documents into chunks

def split_documents(documents,chunk_size=800,chunk_overlap=200):
    """Break documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Sequence of documents handed to the splitter.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_documents`` returns for ``documents``.
        A summary line is printed, followed by a preview of the first chunk
        when at least one chunk was produced.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Tried in order: paragraph, line, word, then single characters
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunked = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunked)} chunks")

    # Nothing to preview when the splitter produced no chunks
    if not chunked:
        return chunked

    first_chunk = chunked[0]
    print("\nExample chunk:")
    print(f"Content: {first_chunk.page_content[:200]}...")
    print(f"Metadata: {first_chunk.metadata}")

    return chunked


In [5]:
chunks = split_documents(all_pdf_documents)
# Preview a few chunks instead of echoing the entire list into the cell output
chunks[:3]

Split 44 documents into 111 chunks

Example chunk:
Content: Tata Consultancy Services, commonly known as TCS, stands as one of the most prominent global 
technology and consulting organizations originating from India. It began its journey in 1968 
under the Ta...
Metadata: {'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-11-09T01:52:04+05:30', 'author': 'Pranjal Singh', 'moddate': '2025-11-09T01:52:04+05:30', 'source': '..\\data\\TCS1.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'source_file': 'TCS1.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-11-09T01:52:04+05:30', 'author': 'Pranjal Singh', 'moddate': '2025-11-09T01:52:04+05:30', 'source': '..\\data\\TCS1.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'source_file': 'TCS1.pdf', 'file_type': 'pdf'}, page_content='Tata Consultancy Services, commonly known as TCS, stands as one of the most prominent global \ntechnology and consulting organizations originating from India. It began its journey in 1968 \nunder the Tata Group and steadily expanded into a worldwide leader in digital and IT services. \nFrom its early days of handling basic computing tasks for Indian enterprises, TCS has \ntransformed into a powerhouse that supports the digital operations of major institutions across \nthe world. Its growth has been deeply rooted in a long-term vision, disciplined execution, value-\ndriven leadership, and a commitment to excellence in technology and business transf

In [6]:
## Embedding and VectorStore DB
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager and load the model immediately.

        Args:
            model_name: Huggingface model name for sentence embeddings

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading
                is re-raised after being reported on stdout.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self) -> None:
        """Load the SentenceTransformer model named by ``self.model_name``.

        Stores the model on ``self.model``. Any loading error is printed and
        then re-raised so the caller never ends up with a half-initialized
        manager.
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded successfully. Embedding dimension:{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model{self.model_name}: {e}")
            raise

    # The loader was originally defined under this public name while __init__
    # called the underscored one (an AttributeError at construction time);
    # both names are kept so either spelling works.
    load_model = _load_model

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            The result of ``self.model.encode(texts, show_progress_bar=True)``;
            presumably one embedding row per input text -- confirm against
            the sentence-transformers documentation.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Explicit None check: don't depend on the model object's truthiness
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generate embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape:{embeddings.shape}")
        return embeddings

    def get_embedding_dimension(self) -> int:
        """Get the embedding dimension reported by the loaded model.

        Raises:
            ValueError: If no model has been loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()
