In [1]:
import os
os.makedirs("../data/pdf_files", exist_ok=True)

### Data Ingestion

In [2]:
from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from typing import List, Any, Dict
import numpy as np
import chromadb
from chromadb.config import Settings
import uuid

In [4]:
class DataLoader:
    def __init__(self, directory_path):
        self.directory_path = directory_path
        # self.all_pdf_document = self.process_all_pdfs()

    def process_all_pdfs(self):
        all_pdf = []
        pdf_dir = Path(self.directory_path)

        pdf_files = list(pdf_dir.glob("*.pdf") )
    
        print(f"Found {len(pdf_files)} PDF files in {self.directory_path}")

        for pdf_file in pdf_files:
            print(f"Processing file: {pdf_file.name}")
            try:
                loader = PyMuPDFLoader(str(pdf_file))
                documents = loader.load()

                # Add source information to metadata
                for doc in documents:
                    doc.metadata['source_file'] = pdf_file.name
                    doc.metadata['file_type'] = 'pdf'

                all_pdf.extend(documents)
                print(f"Loaded {len(documents)} pages from {pdf_file.name}")
            except Exception as e:
                print(f"Error processing {pdf_file.name}: {e}")

        print(f"Total documents loaded: {len(all_pdf)}")
        return all_pdf

In [5]:
data_loader = DataLoader("../data/pdf_files")

### Document Splitting

In [6]:
class DocumentSplitter:
    def __init__(self, documents, chunk_size=1000, chunk_overlap=200):
        self.documents = documents
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # self.split_documents()

    def split_documents(self):
        """Split documents into smaller chunks."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        chunks = text_splitter.split_documents(self.documents)
        print(f"Split {len(self.documents)} documents into {len(chunks)} chunks")
        # Show example of a chunk
        if chunks:
            print(f"\nExample chunk:")
            print(f"Content: {chunks[0].page_content[:200]}...")
            print(f"Metadata: {chunks[0].metadata}")

        return chunks

In [7]:
doc_split = DocumentSplitter(data_loader.process_all_pdfs())
chunks = doc_split.split_documents()

Found 5 PDF files in ../data/pdf_files
Processing file: ALL-IN-ONE RAG FRAMEWORK.pdf
Loaded 18 pages from ALL-IN-ONE RAG FRAMEWORK.pdf
Processing file: HyperbolicRAG.pdf
Loaded 12 pages from HyperbolicRAG.pdf
Processing file: RETRIEVAL-AUGMENTED CODE GENERATION.pdf
Loaded 38 pages from RETRIEVAL-AUGMENTED CODE GENERATION.pdf
Processing file: Retrieval-Augmented Generation with Implicit Queries.pdf
Loaded 13 pages from Retrieval-Augmented Generation with Implicit Queries.pdf
Processing file: When Retrieval Succeeds and Fails.pdf
Loaded 11 pages from When Retrieval Succeeds and Fails.pdf
Total documents loaded: 92
Split 92 documents into 520 chunks

Example chunk:
Content: RAG-ANYTHING: ALL-IN-ONE RAG FRAMEWORK
RAG-ANYTHING: ALL-IN-ONE RAG FRAMEWORK
Zirui Guo, Xubin Ren, Lingrui Xu, Jiahao Zhang, Chao Huang∗
The University of Hong Kong
zrguo101@hku.hk
xubinrencs@gmail.c...
Metadata: {'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:e76afa9)', 'creationdate': '', 'source': 

### Embedding and Vector Store DataBase

In [8]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x22fc547fd40>

In [9]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore
    

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x22fc5759640>

In [10]:
texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_manager.generate_embeddings(texts)
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 520 texts...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Batches: 100%|██████████| 17/17 [00:22<00:00,  1.31s/it]


Generated embeddings with shape: (520, 384)
Adding 520 documents to vector store...
Successfully added 520 documents to vector store
Total documents in collection: 520


### Retriever Pipeline From VectroStore

In [11]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)



In [12]:
rag_retriever.retrieve("Can you tell me about recent publication of RAG?")

Retrieving documents for query: 'Can you tell me about recent publication of RAG?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 31.24it/s]

Generated embeddings with shape: (1, 384)
Retrieved 2 documents (after filtering)





[{'id': 'doc_6de717ba_466',
  'content': 'Gutiérrez et al. [2024, 2025], Leung et al. [2025]. The first line of RAG research focuses on\ntransforming the original queries to facilitate more effective subsequent retrieval Gao et al. [2023],\nMa et al. [2023b]; Some other studies aim to fine-tune the embedding model to accurately retrieve\nthe most relevant content Li and Li [2024], Zhang et al. [2025b]; Recently, much of the research has\nPreprint.\narXiv:2510.09106v1  [cs.CL]  10 Oct 2025',
  'metadata': {'author': 'Yongjie Wang; Yue Yu; Kaisong Song; Jun Lin; Zhiqi Shen',
   'creator': 'arXiv GenPDF (tex2pdf:e76afa9)',
   'modDate': '',
   'source_file': 'When Retrieval Succeeds and Fails.pdf',
   'creationDate': '',
   'page': 0,
   'producer': 'pikepdf 8.15.1',
   'keywords': '',
   'subject': '',
   'title': 'When Retrieval Succeeds and Fails: Rethinking Retrieval-Augmented Generation for LLMs',
   'source': '..\\data\\pdf_files\\When Retrieval Succeeds and Fails.pdf',
   'creation

### VectorDB To LLM Output Generation

In [17]:
# LLM

# import sys
# print(sys.executable)

# LLM
import vertexai
from vertexai.generative_models import GenerativeModel
from google.oauth2 import service_account
from dotenv import load_dotenv

load_dotenv()

# print(load_dotenv())
# print(os.getenv("GEMINI_API_KEY"))

True

In [18]:
class GeminiRAG:
    """Complete RAG pipeline with Gemini LLM via Vertex AI"""
    
    def __init__(
        self, 
        service_account_key_path: str = None,
        project_id: str = None,
        location: str = "us-central1",
        model_name: str = "gemini-2.5-flash"
    ):
        """
        Initialize Gemini via Vertex AI
        
        Args:
            service_account_key_path: Path to service account JSON key file
            project_id: Google Cloud project ID
            location: Google Cloud location (default: us-central1)
            model_name: Gemini model name (default: gemini-2.5-flash)
        """
        # Get configuration from parameters or environment
        self.key_path = service_account_key_path or os.getenv("VERTEX_AI_KEY_PATH")
        self.project_id = project_id or os.getenv("VERTEX_AI_PROJECT_ID")
        self.location = location or os.getenv("VERTEX_AI_LOCATION", "us-central1")
        self.model_name = model_name
        
        if not self.key_path or not self.project_id:
            raise ValueError(
                "Service account key path and project ID required. "
                "Set VERTEX_AI_KEY_PATH and VERTEX_AI_PROJECT_ID in .env or pass as parameters"
            )
        
        # Initialize Vertex AI with service account
        try:
            creds = service_account.Credentials.from_service_account_file(self.key_path)
            vertexai.init(
                project=self.project_id,
                location=self.location,
                credentials=creds
            )
            
            self.model = GenerativeModel(self.model_name)
            print(f"Vertex AI Gemini initialized: {self.model_name}")
            print(f"Project: {self.project_id}, Location: {self.location}")
            
        except Exception as e:
            print(f"Error initializing Vertex AI: {e}")
            raise
    
    def generate_response(self, query: str, retrieved_docs: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate response using Gemini with retrieved context"""
        
        # Prepare context from retrieved documents
        context = "\n\n".join([
            f"Document {i+1} (Score: {doc['similarity_score']:.3f}):\n{doc['content']}"
            for i, doc in enumerate(retrieved_docs)
        ])
        
        # Create prompt
        prompt = f"""Use the following context to answer the question. If you don't know the answer based on the context, say so.

Context:
{context}

Question: {query}

Answer:"""
        
        try:
            print("Generating response with Gemini...")
            # Use native SDK's generate_content method
            response = self.model.generate_content(prompt)
            
            return {
                'answer': response.text,
                'query': query,
                'num_sources': len(retrieved_docs),
                'sources': [
                    {
                        'rank': doc['rank'],
                        'score': doc['similarity_score'],
                        'source': doc['metadata'].get('source_file', 'Unknown'),
                        'page': doc['metadata'].get('page', 'N/A')
                    }
                    for doc in retrieved_docs
                ]
            }
        except Exception as e:
            print(f"Error generating response: {e}")
            return {
                'answer': f"Error generating response: {str(e)}",
                'query': query,
                'num_sources': 0,
                'sources': []
            }

In [15]:
# Complete Pipeline
class RAGPipeline:
    """Complete RAG pipeline orchestrator"""
    
    def __init__(self, pdf_directory: str, persist_directory: str = "../data/vector_store"):
        self.pdf_directory = pdf_directory
        self.persist_directory = persist_directory
        
        # Initialize components
        self.embedding_manager = EmbeddingManager()
        self.vector_store = VectorStore(persist_directory=persist_directory)
        self.retriever = RAGRetriever(self.vector_store, self.embedding_manager)
        self.gemini_rag = GeminiRAG()
    
    def index_documents(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Load, split, embed, and index documents"""
        print("="*50)
        print("Starting document indexing...")
        print("="*50)
        
        # Load documents
        data_loader = DataLoader(self.pdf_directory)
        documents = data_loader.process_all_pdfs()
        
        # Split documents
        doc_splitter = DocumentSplitter(documents, chunk_size, chunk_overlap)
        chunks = doc_splitter.split_documents()
        
        # Generate embeddings
        texts = [chunk.page_content for chunk in chunks]
        embeddings = self.embedding_manager.generate_embeddings(texts)
        
        # Add to vector store
        self.vector_store.add_documents(chunks, embeddings)
        
        print("="*50)
        print("Document indexing complete!")
        print("="*50)
    
    def query(self, question: str, top_k: int = 5) -> Dict[str, Any]:
        """Query the RAG pipeline"""
        print("\n" + "="*50)
        print(f"Processing query: {question}")
        print("="*50)
        
        # Retrieve relevant documents
        retrieved_docs = self.retriever.retrieve(question, top_k=top_k)
        
        if not retrieved_docs:
            return {
                'answer': "No relevant documents found to answer your question.",
                'query': question,
                'num_sources': 0,
                'sources': []
            }
        
        # Generate response
        response = self.gemini_rag.generate_response(question, retrieved_docs)
        
        return response

In [19]:
pipeline = RAGPipeline(pdf_directory="../data")

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384
Vector store initialized. Collection: pdf_documents
Existing documents in collection: 520
Vertex AI Gemini initialized: gemini-2.5-flash
Project: vertex-ai-learning-480916, Location: us-central1


In [20]:
response = pipeline.query("Can you give some modern knowlodge about RAG", top_k=3)


Processing query: Can you give some modern knowlodge about RAG
Retrieving documents for query: 'Can you give some modern knowlodge about RAG'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 27.97it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Generating response with Gemini...





In [21]:
print(f"\nAnswer: {response['answer']}")


Answer: Based on the context, here is some modern knowledge about RAG:

Modern RAG frameworks are increasingly focusing on multimodal capabilities. Traditional RAG systems, which are often text-only, systematically exclude vital knowledge sources such as plots, diagrams, statistical visualizations, market charts, correlation matrices, performance tables, radiological images, diagnostic charts, and clinical data tables. This exclusion creates fundamental gaps, making them inadequate for real-world applications that require comprehensive information understanding in fields like scientific research, financial analysis, and medical literature analysis.

While current multimodal RAG systems show promising capabilities, they still have limitations and challenges. Researchers are actively investigating failure patterns to understand where and why these systems break down, which is crucial for advancing the field beyond current performance plateaus.
