In [None]:
#RAG Pipelined -Data Ingestion to vector DB Pipeline

In [1]:
!pip install langchain langchain-core langchain-community langchain-openai faiss-cpu pypdf pymupdf tiktoken


Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.0.3-py3-none-any.whl.metadata (2.6 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting pypdf
  Downloading pypdf-6.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7

In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [4]:
## READ all the pdfs inside the directory
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("data")

Found 2 PDF files to process

Processing: Rag.pdf
  ✓ Loaded 2 pages

Processing: embeding.pdf
  ✓ Loaded 1 pages

Total documents loaded: 3


In [5]:
all_pdf_documents

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-11-19T07:41:40+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-11-19T07:41:40+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'data/pdf/Rag.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'Rag.pdf', 'file_type': 'pdf'}, page_content='Page 1 — What is RAG?\nRetrieval-Augmented Generation (RAG) is a technique that enhances Large Language Models by\nallowing them to retrieve relevant external knowledge before generating answers.\nKey Benefits:\n- Reduces hallucinations\n- Allows up■to■date information retrieval\n- Enables domain■specific knowledge grounding\nRAG Components:\n1. Document Loader\n2. Text Splitter\n3. Embedding Model\n4. Vector Database\n5. Retriever\n6. Generator (LLM)\nPage 2 — RAG Architecture Workflow\n1. Ingestion Phase:\n- Load documents (PDF, Web pages, text file

In [8]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

# Text Spliting get into chunks

def split_docs(documents,chunk_size=1000,chunk_overlap=200):
  """Split documents into chunks"""
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                 chunk_overlap=chunk_overlap,
                                                 length_function=len,
                                                 separators=["\n\n","\n"," ",""])
  documents = text_splitter.split_documents(documents)
  print(f"Split {len(documents)} documents into {len(documents)} chunks")
  if documents:
    print(f"\nExample chunk:")
    print(f"Content: {documents[0].page_content[:200]}...")
    print(f"Metadata: {documents[0].metadata}")
  return documents


In [11]:
chunks=split_docs(all_pdf_documents)
chunks

Split 5 documents into 5 chunks

Example chunk:
Content: Page 1 — What is RAG?
Retrieval-Augmented Generation (RAG) is a technique that enhances Large Language Models by
allowing them to retrieve relevant external knowledge before generating answers.
Key Be...
Metadata: {'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-11-19T07:41:40+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-11-19T07:41:40+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'data/pdf/Rag.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'Rag.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-11-19T07:41:40+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-11-19T07:41:40+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'data/pdf/Rag.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'Rag.pdf', 'file_type': 'pdf'}, page_content='Page 1 — What is RAG?\nRetrieval-Augmented Generation (RAG) is a technique that enhances Large Language Models by\nallowing them to retrieve relevant external knowledge before generating answers.\nKey Benefits:\n- Reduces hallucinations\n- Allows up■to■date information retrieval\n- Enables domain■specific knowledge grounding\nRAG Components:\n1. Document Loader\n2. Text Splitter\n3. Embedding Model\n4. Vector Database\n5. Retriever\n6. Generator (LLM)\nPage 2 — RAG Architecture Workflow\n1. Ingestion Phase:\n- Load documents (PDF, Web pages, text file

In [12]:
#embedding
# Install Sentence Transformers for creating embeddings
!pip install -q sentence-transformers

# Install FAISS-CPU for high-performance vector search (CPU version)
# !pip install -q faiss-cpu

# Install ChromaDB as your open-source vector store
!pip install -q chromadb

# # Optionally, install LangChain for easier orchestration (highly recommended for RAG)
# !pip install -q langchain


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.4/21.4 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m37.8 MB/s[0m eta [36m0:00:

In [13]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


## initialize the embedding manager

embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x7b6ce73bdfd0>

In [20]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "data/vector_store"):
        """
        Initialize the vector store

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x7b6ce4509100>

In [17]:
chunks

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-11-19T07:41:40+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-11-19T07:41:40+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'data/pdf/Rag.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'Rag.pdf', 'file_type': 'pdf'}, page_content='Page 1 — What is RAG?\nRetrieval-Augmented Generation (RAG) is a technique that enhances Large Language Models by\nallowing them to retrieve relevant external knowledge before generating answers.\nKey Benefits:\n- Reduces hallucinations\n- Allows up■to■date information retrieval\n- Enables domain■specific knowledge grounding\nRAG Components:\n1. Document Loader\n2. Text Splitter\n3. Embedding Model\n4. Vector Database\n5. Retriever\n6. Generator (LLM)\nPage 2 — RAG Architecture Workflow\n1. Ingestion Phase:\n- Load documents (PDF, Web pages, text file

In [21]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]

## Generate the Embeddings

embeddings=embedding_manager.generate_embeddings(texts)

##store int he vector dtaabase
vectorstore.add_documents(chunks,embeddings)


Generating embeddings for 5 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (5, 384)
Adding 5 documents to vector store...
Successfully added 5 documents to vector store
Total documents in collection: 5
