## RAG Pipeline: Data Ingestion into the Vector DB

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Read all the PDFs inside the directory
from pathlib import Path

def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into a single list of documents.

    The directory is searched recursively for ``*.pdf`` files. Files are
    processed in sorted path order so repeated runs produce the documents in
    the same order, independent of how the filesystem enumerates entries.

    Args:
        pdf_directory: Directory to scan (anything accepted by ``Path``).

    Returns:
        A list with the documents returned by ``PyPDFLoader(...).load()`` for
        each file, in processing order. Each document's metadata gains
        ``source_file`` (the PDF file name) and ``file_type`` (``'pdf'``).
        A file that raises while being processed is reported on stdout and
        skipped.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively; sort them because glob order is not
    # guaranteed to be stable across filesystems.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            # Best-effort ingestion: one unreadable file must not abort the batch.
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents
# Run the loader over everything found under ../data
all_pdf_documents = process_all_pdfs(pdf_directory="../data")
                

Found 2 PDF files to process

Processing: qb.pdf
  ✓ Loaded 310 pages

Processing: thebook.pdf
  ✓ Loaded 234 pages

Total documents loaded: 544


In [3]:
# Preview only the first documents; echoing every loaded page floods the cell output.
all_pdf_documents[:2]

[Document(metadata={'producer': 'MiKTeX GPL Ghostscript  9.0', 'creator': 'dvips(k) 5.991 Copyright 2011 Radical Eye Software', 'creationdate': '2013-05-05T13:32:37+01:00', 'moddate': '2013-05-05T13:32:37+01:00', 'title': 'all.dvi', 'source': '../data/pdf/qb.pdf', 'total_pages': 310, 'page': 0, 'page_label': '1', 'source_file': 'qb.pdf', 'file_type': 'pdf'}, page_content='The Physics of Quantum Mechanics\nJames Binney\nand\nDavid Skinner'),
 Document(metadata={'producer': 'MiKTeX GPL Ghostscript  9.0', 'creator': 'dvips(k) 5.991 Copyright 2011 Radical Eye Software', 'creationdate': '2013-05-05T13:32:37+01:00', 'moddate': '2013-05-05T13:32:37+01:00', 'title': 'all.dvi', 'source': '../data/pdf/qb.pdf', 'total_pages': 310, 'page': 1, 'page_label': '2', 'source_file': 'qb.pdf', 'file_type': 'pdf'}, page_content='iv\nThis book is a consequence of the vision and muniﬁcence of\nWalter of Merton, who in 1264 launched something good\nCopyright c⃝ 2008–2013 James Binney and David Skinner\nPublis

In [4]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Chunk documents with a recursive character splitter and report the result.

    Args:
        documents: Documents to split.
        chunk_size: ``chunk_size`` handed to the splitter (length measured with ``len``).
        chunk_overlap: ``chunk_overlap`` handed to the splitter.

    Returns:
        The list of chunks returned by the splitter's ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to preview when the splitter produced no chunks
    if not pieces:
        return pieces

    # Print the first chunk as a sample of the output
    first = pieces[0]
    print(f"\nExample chunk:")
    print(f"Content: {first.page_content[:200]}...")
    print(f"Metadata: {first.metadata}")
    return pieces

In [5]:
chunks = split_documents(all_pdf_documents)
# Show a short preview instead of echoing every chunk into the cell output.
chunks[:2]

Split 544 documents into 1662 chunks

Example chunk:
Content: The Physics of Quantum Mechanics
James Binney
and
David Skinner...
Metadata: {'producer': 'MiKTeX GPL Ghostscript  9.0', 'creator': 'dvips(k) 5.991 Copyright 2011 Radical Eye Software', 'creationdate': '2013-05-05T13:32:37+01:00', 'moddate': '2013-05-05T13:32:37+01:00', 'title': 'all.dvi', 'source': '../data/pdf/qb.pdf', 'total_pages': 310, 'page': 0, 'page_label': '1', 'source_file': 'qb.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'MiKTeX GPL Ghostscript  9.0', 'creator': 'dvips(k) 5.991 Copyright 2011 Radical Eye Software', 'creationdate': '2013-05-05T13:32:37+01:00', 'moddate': '2013-05-05T13:32:37+01:00', 'title': 'all.dvi', 'source': '../data/pdf/qb.pdf', 'total_pages': 310, 'page': 0, 'page_label': '1', 'source_file': 'qb.pdf', 'file_type': 'pdf'}, page_content='The Physics of Quantum Mechanics\nJames Binney\nand\nDavid Skinner'),
 Document(metadata={'producer': 'MiKTeX GPL Ghostscript  9.0', 'creator': 'dvips(k) 5.991 Copyright 2011 Radical Eye Software', 'creationdate': '2013-05-05T13:32:37+01:00', 'moddate': '2013-05-05T13:32:37+01:00', 'title': 'all.dvi', 'source': '../data/pdf/qb.pdf', 'total_pages': 310, 'page': 1, 'page_label': '2', 'source_file': 'qb.pdf', 'file_type': 'pdf'}, page_content='iv\nThis book is a consequence of the vision and muniﬁcence of\nWalter of Merton, who in 1264 launched something good\nCopyright c⃝ 2008–2013 James Binney and David Skinner\nPublis

## Embedding and Vector Store DB

In [6]:
import os
from langchain_openai import OpenAIEmbeddings
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


In [18]:
# Read the key from the environment instead of writing it into the notebook.
# OpenAIEmbeddings is created below without an explicit key, so the
# OPENAI_API_KEY environment variable has to be set before this cell runs.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY environment variable is not set")

# Create the embedding client
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Smoke test: embed one short string
vector = embeddings.embed_query("Test text")


## VectorStore

In [19]:

from langchain_community.vectorstores import Chroma
# Take the API key from the environment; never paste a real key into the notebook
open_api_key = os.environ.get("OPENAI_API_KEY")
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""

    def __init__(
        self,
        collection_name: str = "pdf_documents",
        persist_directory: str = "./data/vector_store",
        embedding_model: str = "text-embedding-3-small",
    ):
        """
        Initialize the vector store

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
            embedding_model: Model name passed to ``OpenAIEmbeddings``
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.embedding_model = embedding_model
        self.client = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        self.client = Chroma(
            collection_name=self.collection_name,
            persist_directory=self.persist_directory,
            # This model is only for analysis and computation (embeddings)
            embedding_function=OpenAIEmbeddings(model=self.embedding_model),
        )
        print(f"✅ Vector store initialized at {self.persist_directory}")

    def add_documents(self, documents):
        """Add documents to the vector store.

        Args:
            documents: Documents to forward to the client's ``add_documents``.
                An empty or ``None`` input is reported and skipped without
                calling the client.
        """
        documents = list(documents or [])
        if not documents:
            print("⚠️ No documents to add")
            return
        self.client.add_documents(documents)
        print(f"✅ Added {len(documents)} documents")

    def search(self, query, k=3):
        """Search for similar documents.

        Args:
            query: Query text.
            k: Number of results requested from the client.

        Returns:
            Whatever the client's ``similarity_search`` returns.
        """
        return self.client.similarity_search(query, k=k)
# Build a store with the default collection name and persist directory;
# the bare name on the last line displays the object's repr as the cell output.
vectorstore = VectorStore()
vectorstore


✅ Vector store initialized at ./data/vector_store


<__main__.VectorStore at 0x175018980>

In [10]:
# Preview the first chunks only; echoing all of them bloats the saved notebook.
chunks[:2]

[Document(metadata={'producer': 'MiKTeX GPL Ghostscript  9.0', 'creator': 'dvips(k) 5.991 Copyright 2011 Radical Eye Software', 'creationdate': '2013-05-05T13:32:37+01:00', 'moddate': '2013-05-05T13:32:37+01:00', 'title': 'all.dvi', 'source': '../data/pdf/qb.pdf', 'total_pages': 310, 'page': 0, 'page_label': '1', 'source_file': 'qb.pdf', 'file_type': 'pdf'}, page_content='The Physics of Quantum Mechanics\nJames Binney\nand\nDavid Skinner'),
 Document(metadata={'producer': 'MiKTeX GPL Ghostscript  9.0', 'creator': 'dvips(k) 5.991 Copyright 2011 Radical Eye Software', 'creationdate': '2013-05-05T13:32:37+01:00', 'moddate': '2013-05-05T13:32:37+01:00', 'title': 'all.dvi', 'source': '../data/pdf/qb.pdf', 'total_pages': 310, 'page': 1, 'page_label': '2', 'source_file': 'qb.pdf', 'file_type': 'pdf'}, page_content='iv\nThis book is a consequence of the vision and muniﬁcence of\nWalter of Merton, who in 1264 launched something good\nCopyright c⃝ 2008–2013 James Binney and David Skinner\nPublis

In [11]:
# Embeddings do not have to be generated by hand here:
# the chunks are passed directly to the vector store.

# Step 1: Create the vector store
vector_store = VectorStore()

# Step 2: Add documents (VectorStore.add_documents forwards them to the Chroma client)
# NOTE(review): re-running this cell calls add_documents on the same persisted
# collection again; presumably that stores the chunks a second time — confirm.
vector_store.add_documents(chunks)

# VectorStore configures OpenAIEmbeddings as the Chroma embedding function

✅ Vector store initialized at ./data/vector_store
✅ Added 1662 documents


## RETRIEVER PIPELINE FROM VECTOR STORE

In [13]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""

    def __init__(self, vector_store):
        """
        Initialize the retriever
        Args:
            vector_store: Object whose ``client`` attribute provides
                ``similarity_search_with_score(query, k=...)``
        """
        self.vector_store = vector_store

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.22):
        """
        Retrieve relevant documents for a query

        Args:
            query: The search query
            top_k: Number of candidates requested from the vector store
            score_threshold: Minimum similarity score (``1 - distance``) a
                candidate needs in order to be kept

        Returns:
            A list of dicts, one per kept candidate, with the keys ``id``,
            ``content``, ``metadata``, ``similarity_score``, ``distance``,
            ``rank``, ``source``, ``author``, ``page`` and ``title``. ``rank``
            is the 1-based position in the store's result list, so it can have
            gaps when candidates are filtered out. The list is empty when
            nothing passes the threshold.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Search in vector store; returns a list of (Document, score) tuples.
        # The requested number of candidates follows top_k (it used to be a
        # fixed k=2, which silently capped every query at two results).
        results = self.vector_store.client.similarity_search_with_score(query, k=top_k)

        retrieved_docs = []

        for rank, (doc, distance) in enumerate(results, start=1):
            # Convert distance to similarity score.
            # NOTE(review): this treats the returned score as a cosine distance;
            # that only holds if the collection is configured for the cosine
            # space — confirm how the collection was created.
            similarity_score = 1.0 - distance

            if similarity_score < score_threshold:
                continue

            metadata = doc.metadata
            retrieved_docs.append({
                'id': f"doc_{rank}",  # Generate a display ID
                'content': doc.page_content,
                'metadata': metadata,
                'similarity_score': round(similarity_score, 4),
                'distance': round(distance, 4),
                'rank': rank,
                # Extract specific fields for easy access
                'source': metadata.get('source_file', 'Unknown'),
                'author': metadata.get('author', 'Unknown'),
                'page': metadata.get('page', 'Unknown'),
                'title': metadata.get('title', 'Unknown')
            })

        print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")

        if not retrieved_docs:
            print("No documents found")

        return retrieved_docs

# --- TEST CODE ---
# Smoke test: open the persisted store, run one query, and print every hit.
store = VectorStore()
retriever = RAGRetriever(store)
docs = retriever.retrieve(
    "what is Probability and probability Probability and probability amplitudes?"
)

for d in docs:
    print(
        f"\n {d['title']} (Page {d['page']})",
        f"   Author: {d['author']}",
        f"   Score: {d['similarity_score']}",
        f"   Content: {d['content'][:150]}...",
        sep="\n",
    )

✅ Vector store initialized at ./data/vector_store
Retrieving documents for query: 'what is Probability and probability Probability and probability amplitudes?'
Top K: 5, Score threshold: 0.22
Retrieved 1 documents (after filtering)

 all.dvi (Page 12)
   Author: Unknown
   Score: 0.319
   Content: 1.2 Probability amplitudes 5
Figure 1.1 The two-slit interference experiment.
1.2 Probability amplitudes
Many branches of the social, physical and med...


## LLM Agent

In [14]:
# 1. Define the Generator (if you haven't already included it)
class RAGGenerator:
    """Answers a question with a chat model, using retrieved chunks as context."""

    def __init__(self, model_name="gpt-3.5-turbo"):
        """Create the chat model (temperature 0) and the prompt template.

        Args:
            model_name: Model name passed to ``ChatOpenAI``.
        """
        self.llm = ChatOpenAI(model=model_name, temperature=0)
        self.prompt = ChatPromptTemplate.from_template("""
        You are a helpful AI assistant. Use the following context to answer the user's question.
        If the answer is not in the context, say "I don't know based on the context provided."
        
        Context:
        {context}
        
        Question: 
        {question}
        
        Answer:
        """)

    def generate_answer(self, question, retrieved_docs):
        """Run prompt -> model -> string parser and return the parsed answer.

        Args:
            question: The user's question, inserted into the prompt.
            retrieved_docs: Sequence of dicts; each one's ``'content'`` value
                becomes one section of the context.

        Returns:
            The value produced by invoking the chain.
        """
        print(f"🤖 Generating answer for: '{question}'")
        # Sections are separated by a horizontal-rule style divider
        joined_context = "\n\n---\n\n".join(item['content'] for item in retrieved_docs)
        pipeline = self.prompt | self.llm | StrOutputParser()
        payload = {"context": joined_context, "question": question}
        return pipeline.invoke(payload)

# 2. RUN WITH IMPROVED SETTINGS (k=8)
# Request the top 8 documents with a slightly lower threshold so the right one is not missed
docs = retriever.retrieve(
    "what is Probability and probability Probability and probability amplitudes?",
    top_k=8,
    score_threshold=0.20,
)

generator = RAGGenerator()
answer = generator.generate_answer(
    "what is Probability and probability Probability and probability amplitudes?",
    docs,
)

print(f"\n✨ FINAL ANSWER:\n{answer}")

Retrieving documents for query: 'what is Probability and probability Probability and probability amplitudes?'
Top K: 8, Score threshold: 0.2
Retrieved 2 documents (after filtering)
🤖 Generating answer for: 'what is Probability and probability Probability and probability amplitudes?'

✨ FINAL ANSWER:
Probability and probability amplitudes in quantum mechanics are calculated using complex numbers, where the probability of an event happening is evaluated as the mod-square of a certain complex number A. The complex number A is referred to as the probability amplitude for that event. Quantum mechanics is unique in using probability amplitudes to calculate probabilities, and these amplitudes give rise to phenomena that have no analogues in classical physics.


## Enhanced RAG pipeline features


In [16]:

# 1. Set up the chat model directly; the test call at the end of this cell
#    passes it to rag_advanced() as llm_model.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

def rag_advanced(query, retriever, llm_model, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.

    Args:
        query: The user's question.
        retriever: Object with ``retrieve(query, top_k=..., score_threshold=...)``
            returning dicts that carry ``content``, ``metadata`` and
            ``similarity_score``.
        llm_model: Chat model piped between the prompt and the string parser.
        top_k: Forwarded to ``retriever.retrieve``.
        min_score: Forwarded to ``retriever.retrieve`` as ``score_threshold``.
        return_context: When true, the joined context is included in the output.

    Returns:
        A dict with ``answer``, ``sources`` and ``confidence``; ``context`` is
        added when ``return_context`` is true. When nothing is retrieved, a
        fixed "no context" result (which always includes an empty ``context``)
        is returned and the model is not called.
    """

    # 1. Retrieve Docs
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)

    if not results:
        return {
            'answer': 'No relevant context found.',
            'sources': [],
            'confidence': 0.0,
            'context': ''
        }

    # 2. Prepare Context and Sources
    context = "\n\n".join(doc['content'] for doc in results)

    sources = [{
        'source': doc['metadata'].get('source_file', 'unknown'),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:120] + '...'
    } for doc in results]

    # Confidence is the best similarity among the retrieved chunks. Taking the
    # max avoids relying on the retriever returning its results best-first.
    confidence = max(doc['similarity_score'] for doc in results)

    # 3. Generate Answer
    prompt = ChatPromptTemplate.from_template("""
    Use the following context to answer the question concisely.
    
    Context:
    {context}
    
    Question: 
    {question}
    
    Answer:
    """)

    chain = prompt | llm_model | StrOutputParser()
    answer_text = chain.invoke({"context": context, "question": query})

    # 4. Build Output
    output = {
        'answer': answer_text,
        'sources': sources,
        'confidence': round(confidence, 4)
    }

    if return_context:
        output['context'] = context

    return output

# --- TEST IT ---
# Reuses the 'retriever' and 'llm' objects created in earlier cells
result = rag_advanced(
    query="what is Probability and probability Probability and probability amplitudes?",
    retriever=retriever,
    llm_model=llm,
    top_k=8,        # Increased k for better recall
    min_score=0.2,  # Lower threshold
    return_context=False,
)

# Report the answer, the confidence, and then one line per source
print(
    "\n🤖 ANSWER:",
    result['answer'],
    f"\n📊 Confidence: {result['confidence']}",
    "\n📚 Sources:",
    sep="\n",
)
for s in result['sources']:
    print(f" - {s['source']} (Page {s['page']}) [Score: {s['score']}]")

Retrieving documents for query: 'what is Probability and probability Probability and probability amplitudes?'
Top K: 8, Score threshold: 0.2
Retrieved 2 documents (after filtering)

🤖 ANSWER:
Probability amplitudes are complex numbers in quantum mechanics that are used to calculate probabilities. The probability of an event happening is the modulus squared of the probability amplitude.

📊 Confidence: 0.319

📚 Sources:
 - qb.pdf (Page 12) [Score: 0.319]
 - qb.pdf (Page 19) [Score: 0.215]
