# RAG Pipelines- Data Ingestion to Vector Database

In [5]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

## 1.1 Data Ingestion (Read Data)

In [7]:
# Read all the pdf's

def process_all_pdfs(pdf_path):
    """
    Process all PDF files in the given directory
    """
    all_documents = []
    pdf_dir = Path(pdf_path)
    
    # Find all pdf file recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"[INFO] Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing : {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            # Append to all_documents list
            all_documents.extend(documents)
            print(f"[SUCCESS] Loaded {len(documents)} pages")
        except Exception as e:
            print(f"[ERROR] Failed to load document {pdf_file.name} with Error: {e}")
    
    print(f"[INFO] \nTotal documents Loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")
    
    

[INFO] Found 4 PDF files to process

Processing : nerf.pdf


[SUCCESS] Loaded 25 pages

Processing : attention.pdf
[SUCCESS] Loaded 15 pages

Processing : resnet.pdf
[SUCCESS] Loaded 12 pages

Processing : openGPT.pdf
[SUCCESS] Loaded 30 pages
[INFO] 
Total documents Loaded: 82


In [8]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with hyperref package', 'creationdate': '2020-08-05T00:07:51+00:00', 'author': '', 'keywords': '', 'moddate': '2021-12-07T17:21:35+05:30', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf/nerf.pdf', 'total_pages': 25, 'page': 0, 'page_label': '1', 'source_file': 'nerf.pdf', 'file_type': 'pdf'}, page_content='NeRF: Representing Scenes as\nNeural Radiance Fields for View Synthesis\nBen Mildenhall1⋆ Pratul P. Srinivasan1⋆ Matthew Tancik1⋆\nJonathan T. Barron2 Ravi Ramamoorthi3 Ren Ng1\n1UC Berkeley 2Google Research 3UC San Diego\nAbstract. We present a method that achieves state-of-the-art results\nfor synthesizing novel views of complex scenes by optimizing an under-\nlying continuous volumetric scene function using a sparse set of input\nviews. Our algorithm represents a scene using a fully-

## 1.2 Chunk the Loaded Data

In [17]:
## Text splitting into Chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into smaller chunk for better RAG pipeline (Vector DB).
    """
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs=text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # show example of  chunk
    if split_docs:
        print(f"\nExample Chunk")
        print(f"Content: {split_docs[0].page_content[:200]}.....")
        print(f"Metadata: {split_docs[0].metadata}")
    return split_docs

In [18]:
chunks = split_documents(all_pdf_documents)
chunks

Split 82 documents into 365 chunks

Example Chunk
Content: NeRF: Representing Scenes as
Neural Radiance Fields for View Synthesis
Ben Mildenhall1⋆ Pratul P. Srinivasan1⋆ Matthew Tancik1⋆
Jonathan T. Barron2 Ravi Ramamoorthi3 Ren Ng1
1UC Berkeley 2Google Resea.....
Metadata: {'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with hyperref package', 'creationdate': '2020-08-05T00:07:51+00:00', 'author': '', 'keywords': '', 'moddate': '2021-12-07T17:21:35+05:30', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf/nerf.pdf', 'total_pages': 25, 'page': 0, 'page_label': '1', 'source_file': 'nerf.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with hyperref package', 'creationdate': '2020-08-05T00:07:51+00:00', 'author': '', 'keywords': '', 'moddate': '2021-12-07T17:21:35+05:30', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf/nerf.pdf', 'total_pages': 25, 'page': 0, 'page_label': '1', 'source_file': 'nerf.pdf', 'file_type': 'pdf'}, page_content='NeRF: Representing Scenes as\nNeural Radiance Fields for View Synthesis\nBen Mildenhall1⋆ Pratul P. Srinivasan1⋆ Matthew Tancik1⋆\nJonathan T. Barron2 Ravi Ramamoorthi3 Ren Ng1\n1UC Berkeley 2Google Research 3UC San Diego\nAbstract. We present a method that achieves state-of-the-art results\nfor synthesizing novel views of complex scenes by optimizing an under-\nlying continuous volumetric scene function using a sparse set of input\nviews. Our algorithm represents a scene using a fully-

### 1.3 Embedding and VectorStoreDB

In [24]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity


Caching the list of root modules, please wait!
(This will only be done once - type '%rehashx' to reset cache!)



In [26]:
class EmbeddingsManager:
    """
    Handles document embedding generation using SentenceTransformer.
    """
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the EmbeddingsManager
        
        Args:
            model_name: HuggingFace model for sentence Embeddings
        """
        
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        """
        Load the SentenceTransformer model.
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimesions: {self.model.get_sentence_embedding_dimension()}")
        except Ecxeption as e:
            print(f"Error loading model {self.model_name} : {e}")
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text-strings to embedd (strings from chunks)
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dimension).
        """
        
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts....")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
# Initialize the EmbeddingsManager
embedding_manager = EmbeddingsManager()
embedding_manager
    

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimesions: 384


<__main__.EmbeddingsManager at 0x70b78a572f30>

### 1.4 Vectorstore

In [31]:
class VectorStore:
    """
    Manage document embeddings in a ChromaDB vector store.
    """
    
    def __init__(self, collection_name: str ="pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initalize the vector store.
        
        Args:
            collection_name: Name of the ChromaDB collection.
            persist_directory: Directory to store the vector embeddings which is a persistance volume.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
        
    
    def _initialize_store(self):
        """
        Initialize the  ChormaDB client and collections.
        """
        try:
            # Create persistent ChromaDB client initially
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise
    
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore
            
        
        
        

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x70b78a570650>

In [33]:
# get texts from chunks and convert text to embeddings)
texts = [document.page_content for  document in chunks]

# convert this texts to embedding

embeddings = embedding_manager.generate_embeddings(texts)

# store the embeddings to vectorstore
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 365 texts....


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches: 100%|██████████| 12/12 [01:19<00:00,  6.65s/it]


Generated embeddings with shape: (365, 384)
Adding 365 documents to vector store...
Successfully added 365 documents to vector store
Total documents in collection: 365


## Retriever From VectorStore

In [53]:
class RAGRetriever:
    """
    Handels query-based retrival from the vector store.
    """
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingsManager):
        """
        Initialize the retriever.
        
        Args:
            vector_store: Vector Database contain document embeddings.
            embedding_manager: Manager to generate embeddings of user input queries.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant document based on the input query.
        
        Args:
            query: the user input query.
            top_k: Number of top results to return based on similarity score.
            score_threshold: Minimum similarity score thereshold.
        
        Returns:
            List of dictionaries containing retrieved documents and metadata.
        """
        
        print(f"[INFO] Retrieving document for query: '{query}")
        print(f"[INFO] Top K : {top_k}, score_threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store based on the user_query embeddings
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            print(results)
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id' : doc_id,
                            'content': document,
                            'metadata' : metadata,
                            'similarity_score' : similarity_score,
                            'distance' : distance,
                            'rank': i + 1
                        })
                print(f"[SUCCESS] Retrieved {len(retrieved_docs)} documents after filtering. ")
            
            else:
                print("[INFO] No document found.")
                
            return retrieved_docs
        except Exception as e:
            print(f"[ERROR] Error during retrieval: {e}")
            return []


rag_retriever = RAGRetriever(vectorstore, embedding_manager)
rag_retriever        
    

<__main__.RAGRetriever at 0x70b78a70d5e0>

In [67]:
rag_retriever.retrieve("What is Attention All You Need", score_threshold=0.0)

[INFO] Retrieving document for query: 'What is Attention All You Need
[INFO] Top K : 5, score_threshold: 0.0
Generating embeddings for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 42.87it/s]

Generated embeddings with shape: (1, 384)
{'ids': [['doc_b1ec708b_159', 'doc_3f89091f_132', 'doc_4ac29118_136', 'doc_a1dc9490_117', 'doc_7e1c577b_121']], 'embeddings': None, 'documents': [['Attention Visualizations\nInput-Input Layer5\nIt\nis\nin\nthis\nspirit\nthat\na\nmajority\nof\nAmerican\ngovernments\nhave\npassed\nnew\nlaws\nsince\n2009\nmaking\nthe\nregistration\nor\nvoting\nprocess\nmore\ndifficult\n.\n<EOS>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\nIt\nis\nin\nthis\nspirit\nthat\na\nmajority\nof\nAmerican\ngovernments\nhave\npassed\nnew\nlaws\nsince\n2009\nmaking\nthe\nregistration\nor\nvoting\nprocess\nmore\ndifficult\n.\n<EOS>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\nFigure 3: An example of the attention mechanism following long-distance dependencies in the\nencoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of\nthe verb ‘making’, completing the phrase ‘making...more difﬁcult’. Attentions here shown only for\nthe word ‘makin




[{'id': 'doc_b1ec708b_159',
  'content': 'Attention Visualizations\nInput-Input Layer5\nIt\nis\nin\nthis\nspirit\nthat\na\nmajority\nof\nAmerican\ngovernments\nhave\npassed\nnew\nlaws\nsince\n2009\nmaking\nthe\nregistration\nor\nvoting\nprocess\nmore\ndifficult\n.\n<EOS>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\nIt\nis\nin\nthis\nspirit\nthat\na\nmajority\nof\nAmerican\ngovernments\nhave\npassed\nnew\nlaws\nsince\n2009\nmaking\nthe\nregistration\nor\nvoting\nprocess\nmore\ndifficult\n.\n<EOS>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\nFigure 3: An example of the attention mechanism following long-distance dependencies in the\nencoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of\nthe verb ‘making’, completing the phrase ‘making...more difﬁcult’. Attentions here shown only for\nthe word ‘making’. Different colors represent different heads. Best viewed in color.\n13',
  'metadata': {'source_file': 'attention.pdf',
   'title': '',
   'trapp

In [61]:
rag_retriever.retrieve("residual neural network")

[INFO] Retrieving document for query: 'residual neural network
[INFO] Top K : 5, score_threshold: 0.0
Generating embeddings for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 43.18it/s]

Generated embeddings with shape: (1, 384)
{'ids': [['doc_a7cb0684_162', 'doc_2279509d_170', 'doc_5f896197_208', 'doc_aa9c0405_171', 'doc_f97f5daf_169']], 'embeddings': None, 'documents': [['Deep Residual Learning for Image Recognition\nKaiming He Xiangyu Zhang Shaoqing Ren Jian Sun\nMicrosoft Research\n{kahe, v-xiangz, v-shren, jiansun}@microsoft.com\nAbstract\nDeeper neural networks are more difﬁcult to train. We\npresent a residual learning framework to ease the training\nof networks that are substantially deeper than those used\npreviously. We explicitly reformulate the layers as learn-\ning residual functions with reference to the layer inputs, in-\nstead of learning unreferenced functions. We provide com-\nprehensive empirical evidence showing that these residual\nnetworks are easier to optimize, and can gain accuracy from\nconsiderably increased depth. On the ImageNet dataset we\nevaluate residual nets with a depth of up to 152 layers—8 ×\ndeeper than VGG nets [41] but still havi




[{'id': 'doc_a7cb0684_162',
  'content': 'Deep Residual Learning for Image Recognition\nKaiming He Xiangyu Zhang Shaoqing Ren Jian Sun\nMicrosoft Research\n{kahe, v-xiangz, v-shren, jiansun}@microsoft.com\nAbstract\nDeeper neural networks are more difﬁcult to train. We\npresent a residual learning framework to ease the training\nof networks that are substantially deeper than those used\npreviously. We explicitly reformulate the layers as learn-\ning residual functions with reference to the layer inputs, in-\nstead of learning unreferenced functions. We provide com-\nprehensive empirical evidence showing that these residual\nnetworks are easier to optimize, and can gain accuracy from\nconsiderably increased depth. On the ImageNet dataset we\nevaluate residual nets with a depth of up to 152 layers—8 ×\ndeeper than VGG nets [41] but still having lower complex-\nity. An ensemble of these residual nets achieves 3.57% error\non the ImageNettest set. This result won the 1st place on the\nILSV

# RAG Pipeline - VectorDB to LLM Output Generation

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

# Load the GROQ_API_KEY
groq_api_key =os.getenv("GROQ_API_KEY")

llm = ChatGroq(groq_api_key=groq_api_key, model_name="gemma2-7b-instruct", temperature=0.1, max_tokens=1024)

## Simple RAG function to retrieve context + generate response

def rag_simple(query, retriever, llm, top_k=3):
    # Retrieve the context
    results = rag_retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    
    # Generate the answer using GROQ LLM
    prompt =f""" Use the following context to answer the question concisely.
        Context: {context}
        
        Question: {query}
        
        Answer: 
    """
    response = llm.invoke([prompt.format(context=context, query=query)])
    return response.content  


In [81]:
answer=rag_simple("What is attention mechanism?",rag_retriever,llm)
print(answer)

[INFO] Retrieving document for query: 'What is attention mechanism?
[INFO] Top K : 3, score_threshold: 0.0
Generating embeddings for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 44.72it/s]

Generated embeddings with shape: (1, 384)
{'ids': [['doc_b1ec708b_159', 'doc_73de9e4e_161', 'doc_4ac29118_136']], 'embeddings': None, 'documents': [['Attention Visualizations\nInput-Input Layer5\nIt\nis\nin\nthis\nspirit\nthat\na\nmajority\nof\nAmerican\ngovernments\nhave\npassed\nnew\nlaws\nsince\n2009\nmaking\nthe\nregistration\nor\nvoting\nprocess\nmore\ndifficult\n.\n<EOS>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\nIt\nis\nin\nthis\nspirit\nthat\na\nmajority\nof\nAmerican\ngovernments\nhave\npassed\nnew\nlaws\nsince\n2009\nmaking\nthe\nregistration\nor\nvoting\nprocess\nmore\ndifficult\n.\n<EOS>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\nFigure 3: An example of the attention mechanism following long-distance dependencies in the\nencoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of\nthe verb ‘making’, completing the phrase ‘making...more difﬁcult’. Attentions here shown only for\nthe word ‘making’. Different colors represent different




NotFoundError: Error code: 404 - {'error': {'message': 'The model `gemma2-7b` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'code': 'model_not_found'}}

In [73]:
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage

In [None]:
llm = ChatGroq(groq_api_key= )