<a href="https://colab.research.google.com/github/GiovanniPasq/agentic-rag-for-dummies/blob/main/Agentic_Rag_For_Dummies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet --upgrade langgraph
!pip install -qU "langchain[google-genai]"
!pip install -qU langchain langchain-community langchain-qdrant langchain-huggingface qdrant-client fastembed flashrank langchain-core
!pip install --upgrade gradio

# Optional: if you want to use Ollama with local models
!pip install -qU langchain-ollama

In [None]:
import os
import getpass
from langchain_google_genai import ChatGoogleGenerativeAI

# Set your Google API key
# Prompt only when the variable is missing or empty, so a key that is already
# exported in the environment is reused and never typed into the notebook.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

# Initialize the LLM with zero temperature for consistent outputs
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0)

# Optional: use a local model through Ollama instead of Gemini.
# Kept as real comments rather than a bare string literal: a string that is the
# last expression of a cell is evaluated and echoed as that cell's output.
# To switch, uncomment the lines below and set `model` to a model you have pulled.
#
# from langchain_ollama.chat_models import ChatOllama
#
# llm = ChatOllama(
#     model="your-model",
#     temperature=0
# )

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant.fastembed_sparse import FastEmbedSparse
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant.qdrant import RetrievalMode

# Configuration
# DOCUMENT_DIR: folder that retrieve_full_documents() reads full documents from.
DOCUMENT_DIR = "docs"
# SUMMARY_DIR: folder scanned for "*_summary.md" files when summaries are indexed.
SUMMARY_DIR = "summaries"
# DB_PATH: passed to QdrantClient(path=...) below.
DB_PATH = "qdrant_db"
# SUMMARY_COLLECTION: name of the Qdrant collection that stores the summaries.
SUMMARY_COLLECTION = "document_summaries"

# Initialize embeddings
# Dense model used by the vector store for semantic similarity.
dense_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2" # alternative: "intfloat/multilingual-e5-large"
)
# Sparse (BM25) model; combined with the dense one for hybrid retrieval.
sparse_embeddings = FastEmbedSparse(
    model_name="Qdrant/bm25"
)

# Create Qdrant client
client = QdrantClient(path=DB_PATH)
# Probe the dense model once with a dummy query so the collection's vector size
# always matches whichever embedding model is configured above.
embedding_dimension = len(dense_embeddings.embed_query("test"))

def ensure_collection(collection_name):
    """Make sure `collection_name` exists on the shared Qdrant `client`.

    Nothing happens when the collection is already present. Otherwise it is
    created with one unnamed dense vector (cosine distance, sized by the
    module-level `embedding_dimension`) and one sparse vector named "sparse".
    """
    # Guard clause: an existing collection is left untouched.
    if client.collection_exists(collection_name):
        return

    dense_params = qmodels.VectorParams(
        size=embedding_dimension,
        distance=qmodels.Distance.COSINE,
    )
    sparse_params = {"sparse": qmodels.SparseVectorParams()}

    client.create_collection(
        collection_name=collection_name,
        vectors_config=dense_params,
        sparse_vectors_config=sparse_params,
    )

# Create collections
# Must run before the vector store is built so the collection already exists.
ensure_collection(SUMMARY_COLLECTION)

# Initialize vector stores
# Hybrid retrieval uses both the dense and the sparse embeddings.
# `sparse_vector_name` has to match the "sparse" key that ensure_collection()
# passes in `sparse_vectors_config`.
summary_vector_store = QdrantVectorStore(
    client=client,
    collection_name=SUMMARY_COLLECTION,
    embedding=dense_embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.HYBRID,
    sparse_vector_name="sparse"
)

In [4]:
import os
import glob
import re
from langchain_core.documents import Document

import uuid

summary_documents = []

# Load all summary files (sorted so the indexing order is deterministic)
for file_path in sorted(glob.glob(os.path.join(SUMMARY_DIR, "*_summary.md"))):
    base_name = os.path.basename(file_path)
    # Extract document ID from filename: strip the "_summary.md" suffix and
    # lower-case the remainder ("Blockchain_summary.md" -> "blockchain").
    document_id = re.sub(r"_summary\.md$", "", base_name, flags=re.I).lower()

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    summary_documents.append(
        Document(
            page_content=content,
            metadata={"document_id": document_id, "source": base_name}
        )
    )

# Index summaries in vector database
# The Qdrant collection lives on disk, so adding documents without explicit ids
# would store a fresh copy of every summary each time this cell is re-run, and
# the duplicates would then fill the top-k search results. Deriving a stable
# UUID from the document ID turns the insert into an upsert and keeps the cell
# idempotent. Copies already stored under random ids by earlier runs are not
# removed by this.
if summary_documents:
    summary_ids = [
        str(uuid.uuid5(uuid.NAMESPACE_URL, doc.metadata["document_id"]))
        for doc in summary_documents
    ]
    _ = summary_vector_store.add_documents(summary_documents, ids=summary_ids)
else:
    print(f"No '*_summary.md' files found in {SUMMARY_DIR!r}; nothing was indexed.")

In [5]:
from typing import List
from pathlib import Path

def search_summaries(query: str, k: int = 3) -> List[dict]:
    """
    Search for the top K most relevant document summaries.

    Args:
        query: The search query
        k: Number of results to return

    Returns:
        List of relevant summary documents with their metadata
    """
    # NOTE: this function is handed to the LLM as a tool, and LangChain builds
    # the tool description from the docstring above, so its wording is left
    # unchanged on purpose.
    hits = summary_vector_store.similarity_search(query, k=k)

    # Flatten each hit into a plain dict that can be passed between tools.
    # Missing metadata keys fall back to an empty string.
    flattened = []
    for hit in hits:
        record = {"content": hit.page_content}
        record["document_id"] = hit.metadata.get("document_id", "")
        record["source"] = hit.metadata.get("source", "")
        flattened.append(record)
    return flattened

def _locate_document(base_dir: Path, doc_id: str):
    """Return the file inside `base_dir` that `doc_id` refers to, or None."""
    file_name = doc_id if doc_id.endswith(".md") else f"{doc_id}.md"
    candidate = (base_dir / file_name).resolve()

    # IDs are produced by the model, so refuse anything that resolves outside
    # the documents folder (absolute paths, "../" segments).
    if base_dir not in candidate.parents:
        return None
    if candidate.is_file():
        return candidate

    # Document IDs are lower-cased when the summaries are indexed, so on a
    # case-sensitive filesystem "blockchain" must still find "Blockchain.md".
    wanted = candidate.name.lower()
    if candidate.parent.is_dir():
        for entry in sorted(candidate.parent.iterdir()):
            if entry.is_file() and entry.name.lower() == wanted:
                return entry
    return None


def retrieve_full_documents(document_ids: List[str]) -> List[str]:
    """
    Retrieve complete documents based on document IDs.

    Args:
        document_ids: List of document IDs to retrieve, with or without the '.md' extension

    Returns:
        List of full document contents
    """
    # Resolve the base folder once so the containment check in the helper
    # compares fully resolved paths.
    base_dir = Path(DOCUMENT_DIR).resolve()
    full_documents = []

    for doc_id in document_ids:
        # Skip empty or non-string entries instead of failing the whole call.
        if not doc_id or not isinstance(doc_id, str):
            continue

        document_path = _locate_document(base_dir, doc_id.strip())
        # Unknown IDs are skipped silently, as before.
        if document_path is None:
            continue

        full_documents.append(document_path.read_text(encoding="utf-8"))

    return full_documents

# Bind tools to LLM
# Both retrieval functions are exposed to the model; the same two functions are
# also the ones executed by the graph's tool node.
llm_with_tools = llm.bind_tools([search_summaries, retrieve_full_documents])

In [6]:
from langchain_core.messages import SystemMessage

# Instructions prepended to every model call by the agent node.
# The text describes a search-summaries -> retrieve-documents -> verify loop and
# names the two tools (search_summaries, retrieve_full_documents) exactly as the
# functions are called. The loop limit of 3 is only stated in this prompt;
# nothing in this cell enforces it in code.
SYSTEM_PROMPT = """You are an intelligent document retrieval assistant specialized in answering questions accurately using available documents.

Your task follows this precise workflow:

1. **Analyze the question**:
   - Understand what the user is asking
   - Identify the main topic and any sub-topics

2. **Rewrite and split if necessary**:
   - Rephrase the question if it's unclear
   - If the question covers multiple different topics, split it into sub-queries
   - Each sub-query should address a single, specific topic

3. **Retrieve top X summary documents**:
   - Decide how many summary documents to retrieve (the X value is your choice based on query complexity)
   - Use the search_summaries tool for each sub-query
   - Evaluate each retrieved summary to determine if it's relevant to the question
   - Discard irrelevant summaries

4. **Return exact document names**:
   - From the relevant summaries, extract the exact document_id with extension
   - List which documents you're going to retrieve

5. **Retrieve complete documents and provide answer**:
   - Use the retrieve_full_documents tool with the document_ids
   - Read the full documents to find the answer

6. **Verify document relevance**:
   - Check if each complete document is actually pertinent to the question
   - Discard documents that are not relevant
   - **If NONE of the documents are relevant, GO BACK TO STEP 1 and try again with different search terms**

7. **Provide clear and detailed answer**:
   - Give a comprehensive response based on the documents
   - Explain concepts clearly, assuming the user has no prior knowledge of the topic
   - Use simple language and avoid jargon when possible

8. **Verify answer completeness**:
   - Check that your complete answer is relevant and fully addresses the question
   - Ensure all sub-queries (if any) have been answered

9. **If answer is not satisfactory**:
   - **GO BACK TO STEP 1** and start the process again with a different approach

10. **Loop limit**:
    - **Repeat this entire loop a MAXIMUM of 3 times**
    - After 3 complete attempts, if you're still not confident in your answer, politely ask the user to rephrase their question more clearly

**Critical rules**:
- You MUST follow steps 1-10 in order
- You MUST go back to step 1 if documents are not relevant (step 6) or answer is not satisfactory (step 9)
- You MUST NOT exceed 3 complete loops through the entire process
- Always base your answers strictly on the retrieved documents
- Never make up information that isn't in the documents
"""

# Wrapped once here so the agent node can reuse the same message object.
system_message = SystemMessage(content=SYSTEM_PROMPT)

In [None]:
from langchain_core.messages import HumanMessage
from langgraph.graph import MessagesState, START, StateGraph
from langgraph.prebuilt import ToolNode, tools_condition

# Define the agent's decision-making node
def agent_node(state: MessagesState):
    """Run one model turn: either a tool request or the final answer.

    The system prompt is placed in front of the conversation held in
    `state["messages"]`, and the model's reply is returned under the
    "messages" key as this node's state update.
    """
    conversation = [system_message] + state["messages"]
    reply = llm_with_tools.invoke(conversation)
    return {"messages": reply}


# Build the graph
# The state is MessagesState, i.e. the running list of conversation messages.
graph_builder = StateGraph(MessagesState)

# Add nodes
# "agent" calls the model; "tools" executes whichever tool the model requested.
graph_builder.add_node("agent", agent_node)
graph_builder.add_node("tools", ToolNode([search_summaries, retrieve_full_documents]))

# Define edges
# Every run starts at the agent node.
graph_builder.add_edge(START, "agent")
graph_builder.add_conditional_edges(
    "agent",
    tools_condition,  # Decides if tools are needed or if we should end
)
graph_builder.add_edge("tools", "agent")  # After tool use, return to agent

# Compile the graph
# The compiled object is what the later cells invoke.
agent_graph = graph_builder.compile()

# Visualize the graph (optional)
from IPython.display import Image, display

graph_view = agent_graph.get_graph(xray=True)
try:
    # PNG rendering may depend on an external Mermaid rendering service, so it
    # can fail when the notebook is offline.
    display(Image(graph_view.draw_mermaid_png()))
except Exception as render_error:
    # This cell is purely illustrative: fall back to the Mermaid source text
    # instead of aborting "Run All" for the cells that follow.
    print(f"Could not render the graph as PNG ({render_error}); Mermaid source follows:\n")
    print(graph_view.draw_mermaid())

In [8]:
# Example run: a two-part question that should trigger several tool calls.
question = HumanMessage(content="""
What is the primary characteristic that makes the "Proof of Stake (PoS)" consensus mechanism more efficient than "Proof of Work (PoW)"
(as cited in their definitions), and what is the alternative consensus mechanism that uses approved validators and is faster,
 but less decentralized?
""")

# `messages` holds the final graph state; its "messages" entry is the full
# conversation (question, tool calls, tool results, answer).
messages = agent_graph.invoke({"messages": [question]})
for step in messages["messages"]:
    step.pretty_print()



What is the primary characteristic that makes the "Proof of Stake (PoS)" consensus mechanism more efficient than "Proof of Work (PoW)"
(as cited in their definitions), and what is the alternative consensus mechanism that uses approved validators and is faster,
 but less decentralized?


Okay, I understand the question. It has two parts:

1.  What makes Proof of Stake (PoS) more efficient than Proof of Work (PoW)?
2.  What is the alternative consensus mechanism that uses approved validators and is faster, but less decentralized?

I will start by searching for summaries related to these two questions.
Tool Calls:
  search_summaries (42214a10-1663-4334-a884-4ea82a95219b)
 Call ID: 42214a10-1663-4334-a884-4ea82a95219b
  Args:
    query: Proof of Stake efficiency vs Proof of Work
    k: 3
  search_summaries (b183c043-3ddb-4870-8a1a-ed7d104acb8d)
 Call ID: b183c043-3ddb-4870-8a1a-ed7d104acb8d
  Args:
    query: alternative consensus mechanism with approved validators faster but less decent

In [None]:
import gradio as gr

def _plain_text(content):
    """Reduce message content (a string or a list of parts) to one string."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "".join(pieces)
    return ""


def chat_with_agent(message, history):
    """Process user message and return agent's response.

    Args:
        message: Text the user just typed.
        history: Earlier turns as supplied by Gradio. Two layouts are accepted:
            a list of ``[user_text, assistant_text]`` pairs, or a list of
            ``{"role": ..., "content": ...}`` dicts. ``None`` or an empty list
            means this is the first turn.

    Returns:
        The content of the last message produced by the agent graph, as text.
    """
    # Replay earlier turns so follow-up questions keep their context. They are
    # passed as (role, text) tuples, which the graph's message state accepts
    # alongside message objects.
    prior_turns = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _plain_text(turn.get("content"))
            if role in ("user", "assistant") and text:
                prior_turns.append((role, text))
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            user_text, assistant_text = turn
            # Non-text entries (e.g. uploaded files) are left out.
            if isinstance(user_text, str) and user_text:
                prior_turns.append(("user", user_text))
            if isinstance(assistant_text, str) and assistant_text:
                prior_turns.append(("assistant", assistant_text))

    result = agent_graph.invoke({
        "messages": prior_turns + [HumanMessage(content=message)]
    })

    answer = result["messages"][-1].content
    # Some chat models return content as a list of parts; the chat UI needs text.
    return answer if isinstance(answer, str) else _plain_text(answer)

# Launch Gradio interface
# The chat UI calls chat_with_agent(message, history) for each user turn.
demo = gr.ChatInterface(fn=chat_with_agent)
# share=False: no public share link is requested.
demo.launch(share=False)