In [16]:
!pip install python-dotenv groq langchain-pinecone --quiet requests beautifulsoup4 langchain langchain_classic langchain_community -qU  rank_bm25 sentence_transformers


In [17]:

import os
from dotenv import load_dotenv

# Merge variables from a .env file (when one is found) into the process environment.
load_dotenv()

# The endpoint can be overridden through GROQ_BASE_URL; the key has no default.
GROQ_BASE_URL = os.environ.get("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Stop early when the key is missing or empty instead of failing on the first request.
if GROQ_API_KEY is None or GROQ_API_KEY == "":
    raise ValueError("Set GROQ_API_KEY in your environment before running this notebook.")

In [18]:
from langchain_openai import ChatOpenAI

# Connection settings for the Groq endpoint, kept together so they can be
# splatted into the client constructor.
_groq_client_kwargs = dict(api_key=GROQ_API_KEY, base_url=GROQ_BASE_URL)

# Chat model used by the answering chain and the router agent.
# temperature=0 requests the least random sampling; stream_usage=True asks the
# client to include token usage when streaming.
model = ChatOpenAI(
    **_groq_client_kwargs,
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0,
    stream_usage=True,
)

In [19]:
import os
from dotenv import load_dotenv

# The .env location defaults to the path this notebook was written with, but it
# can be redirected on other machines through the DOTENV_PATH environment variable.
DOTENV_PATH = os.getenv("DOTENV_PATH", "C:/Users/HP/Desktop/agentic-rag/.env")
load_dotenv(DOTENV_PATH, override=True)
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Fail fast with a clear message, in the same way the GROQ_API_KEY check does.
if not PINECONE_API_KEY:
    raise ValueError("Set PINECONE_API_KEY in your environment before running this notebook.")


from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "agentic-rag"

# Only create the index when it does not exist yet, so re-running the cell is safe.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        vector_type="dense",
        # NOTE(review): must equal the embedding model's output size; confirm 384
        # for the embedder configured later in the notebook.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
else:
    print(f"Index '{index_name}' already exists. Skipping creation.")


Index 'agentic-rag' already exists. Skipping creation.


In [20]:
# Cell 3 — Depth-limited Web Crawler (Clean Version, No Emojis)



import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
import time

def clean_text(text: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()

def extract_visible_text(soup: BeautifulSoup) -> str:
    """Return the text of a parsed page without script/style/noscript content.

    The matching tags are removed from ``soup`` in place, then the remaining
    text is extracted with a space between fragments and normalised by
    ``clean_text``.
    """
    for hidden_tag in soup.find_all(["script", "style", "noscript"]):
        hidden_tag.decompose()
    return clean_text(soup.get_text(separator=' '))

def is_internal_link(base_url: str, link: str) -> bool:
    """Tell whether ``link`` points at the same site as ``base_url``.

    A link is internal when it is relative (no scheme and no host) or when it
    is an http(s) URL whose host equals the host of ``base_url``; hosts are
    compared case-insensitively. Links with any other scheme (``mailto:``,
    ``javascript:``, ``tel:`` ...) have no host either, but they are not
    crawlable pages, so they are reported as not internal.

    Args:
        base_url: URL of the site being crawled.
        link: Candidate link, absolute or relative.

    Returns:
        True if the link belongs to the same site, False otherwise.
    """
    parsed_link = urlparse(link)
    # A non-web scheme would otherwise pass the "no host" test below.
    if parsed_link.scheme and parsed_link.scheme not in ("http", "https"):
        return False
    link_domain = parsed_link.netloc.lower()
    base_domain = urlparse(base_url).netloc.lower()
    return (not link_domain) or (link_domain == base_domain)

def crawl_website(base_url: str, max_pages: int = 10, max_depth: int = 2):
    """Crawl one site breadth-first and collect the visible text of its pages.

    Starting from ``base_url``, pages are fetched in breadth-first order. Only
    http(s) links that ``is_internal_link`` accepts are followed, nothing deeper
    than ``max_depth`` hops from the start page is fetched, and the crawl stops
    as soon as ``max_pages`` pages have been collected.

    URL fragments (``#section``) are stripped before a link is queued, so
    in-page anchors are not fetched again as if they were separate pages.

    Args:
        base_url: Page to start from; its host defines what counts as internal.
        max_pages: Maximum number of successfully fetched pages to return.
        max_depth: Maximum link distance from ``base_url`` (the start page is 0).

    Returns:
        A list of dicts, one per fetched page, with the keys ``"url"``,
        ``"depth"`` and ``"text"`` (the page's whitespace-normalised text).
    """
    from collections import deque
    from urllib.parse import urldefrag

    start_url = urldefrag(base_url).url
    # URLs are marked as seen when queued, so each one enters the queue once.
    seen = {start_url}
    to_visit = deque([(start_url, 0)])
    crawled_pages = []

    while to_visit and len(crawled_pages) < max_pages:
        url, depth = to_visit.popleft()

        if depth > max_depth:
            continue

        try:
            res = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
        except Exception:
            # Deliberately best-effort: one unreachable page must not abort the crawl.
            continue
        if res.status_code != 200:
            continue

        soup = BeautifulSoup(res.text, "html.parser")
        crawled_pages.append({
            "url": url,
            "depth": depth,
            "text": extract_visible_text(soup)
        })

        # Links found here sit one level deeper; queue them only while that
        # level is still within the depth limit.
        if depth < max_depth:
            for a in soup.find_all("a", href=True):
                link = urldefrag(urljoin(url, a["href"])).url
                if link in seen:
                    continue
                # mailto:, javascript: and similar links cannot be fetched.
                if urlparse(link).scheme not in ("http", "https"):
                    continue
                if is_internal_link(base_url, link):
                    seen.add(link)
                    to_visit.append((link, depth + 1))

        # Short pause between successful fetches to go easy on the server.
        time.sleep(0.2)

    return crawled_pages

# Quick test: crawl at most two pages starting from the India article and show
# the collected page records as the cell output.
pages = crawl_website("https://en.wikipedia.org/wiki/India", max_pages=2)
pages




[{'url': 'https://en.wikipedia.org/wiki/India',
  'depth': 0,
  'text': 'India - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main page Contents Current events Random article About Wikipedia Contact us Contribute Help Learn to edit Community portal Recent changes Upload file Special pages Search Search Appearance Donate Create account Log in Personal tools Donate Create account Log in Contents move to sidebar hide (Top) 1 Etymology 2 History Toggle History subsection 2.1 Ancient India 2.2 Medieval India 2.3 Early modern India 2.4 Modern India 3 Geography Toggle Geography subsection 3.1 Climate 3.2 Biodiversity 4 Government and politics Toggle Government and politics subsection 4.1 Politics 4.2 Government 4.3 Administrative divisions 4.3.1 States 4.3.2 Union territories 4.4 Foreign relations 4.5 Military 5 Economy Toggle Economy subsection 5.1 Industries 6 Demographics Toggle Demographics subsection 6.1 Languages 6.2 Religion 6.3 Education 6.4 Health 7 C

In [21]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding model settings: run on CPU and return normalised vectors.
model_name = "BAAI/bge-small-en"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

# Embedder shared by the ingestion and retrieval steps.
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [22]:

# Text chunks produced by the most recent ingestion_tool() run; retriever_tool()
# builds its BM25 index from them. Stays None until ingestion has run.
all_splits = None



In [23]:


def ingestion_tool(website: str = "https://en.wikipedia.org/wiki/India"):
    """Crawl a website, embed its text and upsert the chunks into Pinecone.

    Up to two pages are crawled from ``website``, their text is joined and
    split into overlapping chunks, each chunk is embedded with ``hf`` and the
    vectors are upserted into the "agentic-rag" index under the
    "rag-namespace" namespace. The chunk list is also stored in the global
    ``all_splits``.

    Args:
        website: URL to start crawling from.

    Returns:
        A status message: success, or a failure note when no text was crawled
        (in that case ``all_splits`` and the index are left untouched).
    """
    global all_splits
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    pages = crawl_website(website, max_pages=2)
    full_text = "\n\n".join(p["text"] for p in pages)

    # Nothing fetched (network error, non-200 response, empty page): report it
    # instead of claiming success.
    if not full_text.strip():
        return f"Ingestion failed: no text could be crawled from {website}."

    # chunking
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000, chunk_overlap=400, add_start_index=True
    )
    all_splits = text_splitter.split_text(full_text)

    # embeddings
    embeddings = hf.embed_documents(all_splits)

    index = pc.Index("agentic-rag")

    # NOTE(review): ids are plain chunk positions, so ingesting a second site
    # overwrites ids 0..n-1 and leaves any higher ids from an earlier, larger
    # ingestion in the namespace. Confirm whether that mix is intended.
    records = [
        {
            "id": str(i),
            "values": emb,   # dense embedding of the chunk
            "metadata": {
                # Record where the chunk came from rather than a fixed placeholder.
                "source_url": website,
                "chunk_index": i,
                "text": chunk
            }
        }
        for i, (chunk, emb) in enumerate(zip(all_splits, embeddings))
    ]

    # Upsert in batches of 100 records.
    for start in range(0, len(records), 100):
        index.upsert(
            vectors=records[start:start + 100],
            namespace="rag-namespace"
        )

    return "Ingestion completed successfully."


ingestion_tool()







'Ingestion completed successfully.'

In [40]:


def retriever_tool(mainquery: str):
    """Retrieve and rerank context passages for a query.

    Dense similarity search over the "agentic-rag" Pinecone index is combined
    with BM25 keyword search over the globally stored ``all_splits`` (weights
    0.6 and 0.4). The merged candidates are reranked with a cross-encoder and
    the top five are returned.

    When ``all_splits`` is ``None`` or empty (no ingestion has run in this
    kernel), BM25 has nothing to index, so only the dense retriever is used.

    Args:
        mainquery: Natural-language query.

    Returns:
        A dict with one key, ``"context"``, holding the passage texts.
    """
    from langchain_pinecone import PineconeVectorStore
    from langchain_community.retrievers import BM25Retriever
    from langchain_classic.retrievers import EnsembleRetriever
    from langchain_community.cross_encoders import HuggingFaceCrossEncoder
    from langchain_classic.retrievers.document_compressors import CrossEncoderReranker
    from langchain_classic.retrievers import ContextualCompressionRetriever

    vectorstore = PineconeVectorStore(
        index_name="agentic-rag",
        embedding=hf,
        namespace="rag-namespace"
    )
    dense_retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5}
    )

    if all_splits:
        bm25_retriever = BM25Retriever.from_texts(all_splits)
        bm25_retriever.k = 5   # return top 5 keyword matches
        base_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, dense_retriever],
            weights=[0.4, 0.6]
        )
    else:
        # No local chunks to build a keyword index from: dense search only.
        base_retriever = dense_retriever

    # NOTE(review): the cross-encoder is constructed on every call; consider
    # creating it once if retrieval latency matters.
    cross_encoder_model = HuggingFaceCrossEncoder(
        model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"
    )
    reranker = CrossEncoderReranker(
        model=cross_encoder_model,
        top_n=5
    )

    final_retriever = ContextualCompressionRetriever(
        base_compressor=reranker,
        base_retriever=base_retriever
    )
    docs = final_retriever.invoke(mainquery)

    # ---------- Return ONLY context ----------
    return {
        "context": [d.page_content for d in docs]
    }


retriever_tool("where is india")

{'context': ['political and historical reasons but is geographically in West Asia. Category Asia portal Portals : India Countries Asia Authority control databases International ISNI VIAF GND FAST WorldCat National United States France 2 3 BnF data 2 3 Japan Australia Czech Republic Spain Portugal Norway Chile Sweden Vatican Israel Geographic MusicBrainz area Academics CiNii Artists KulturNav People Trove UK Parliament Other IdRef Historical Dictionary of Switzerland NARA Encyclopedia of Modern Ukraine ƒ∞sl√¢m Ansiklopedisi Yale LUX 21¬∞N 78¬∞E \ufeff / \ufeff 21¬∞N 78¬∞E \ufeff / 21; 78 Retrieved from " https://en.wikipedia.org/w/index.php?title=India&oldid=1324024825 " Categories : India BRICS nations Countries and territories where English is an official language Countries and territories where Hindi is an official language Countries in Asia Federal constitutional republics Former British colonies and protectorates in Asia G15 nations G20 members Member states of the Commonwealth of 

In [37]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Prompt for the final answering step: the model is told to answer from the
# supplied context only and to reply "I don't know." when the answer is absent.
template = """
You are a helpful assistant.
Use ONLY the provided context to answer the question.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:
"""

# {context} and {question} are the two placeholders filled in at invoke time.
answer_prompt = PromptTemplate.from_template(template)

def answering_tool(context: list, question: str):
    """Answer a question from retrieved context passages.

    The passages are joined with blank lines, inserted into ``answer_prompt``
    together with the question, and sent through ``model``.

    Args:
        context: Passages to answer from. Normally a list of strings; the dict
            returned by ``retriever_tool`` (its ``"context"`` entry is used)
            and a single string are accepted as well.
        question: The user's question.

    Returns:
        The model's answer as a string.
    """
    # Accept retriever_tool's result dict directly.
    if isinstance(context, dict):
        context = context.get("context", [])
    # A bare string would otherwise be joined character by character.
    if isinstance(context, str):
        context = [context]

    # Join retrieved context into a single string; str() tolerates non-string items.
    context_text = "\n\n".join(str(passage) for passage in (context or []))

    chain = answer_prompt | model | StrOutputParser()

    return chain.invoke({
        "context": context_text,
        "question": question
    })


In [26]:
def calculator(expression: str) -> str:
    """Evaluate an arithmetic expression and return the result as text.

    SECURITY: this function used to call ``eval`` on its input, which comes
    from the user/LLM and could therefore run arbitrary Python. The expression
    is now parsed with ``ast`` and only numeric literals, parentheses, unary
    +/- and the operators + - * / // % ** are evaluated; anything else
    (names, calls, attribute access, ...) yields an error string.

    Args:
        expression: Arithmetic expression such as ``"200 - (15.5 + 1200)"``.

    Returns:
        ``str(result)`` on success, otherwise ``"Error: <message>"``.
    """
    import ast
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

    def evaluate(node):
        if isinstance(node, ast.Expression):
            return evaluate(node.body)
        # type() rather than isinstance() so that booleans are not accepted as numbers.
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = evaluate(node.left)
            right = evaluate(node.right)
            # Guard against inputs like 9**9**9 that would hang the kernel.
            if isinstance(node.op, ast.Pow) and abs(right) > 1000:
                raise ValueError("exponent too large")
            return binary_ops[type(node.op)](left, right)
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](evaluate(node.operand))
        raise ValueError("unsupported expression")

    try:
        # strip(): surrounding whitespace would otherwise be a syntax error here.
        tree = ast.parse(expression.strip(), filename="<string>", mode="eval")
        return str(evaluate(tree))
    except Exception as e:
        return f"Error: {str(e)}"
from langchain.tools import tool
from langchain_core.tools import BaseTool


def _as_tool(func, description):
    """Wrap ``func`` as a LangChain tool unless it already is one.

    The names below are rebound from plain functions to tool objects, so
    re-running this cell would otherwise feed tool objects back into ``tool()``.
    """
    if isinstance(func, BaseTool):
        return func
    # Pass the function as the first argument and the description by keyword.
    return tool(func, description=description)


retriever_tool = _as_tool(retriever_tool, "Retrieve relevant context chunks from the vector store. Input: natural language query.")
ingestion_tool = _as_tool(ingestion_tool, "Crawl and ingest a website. Input: website URL as a string.")
calculator_tool = _as_tool(calculator, "Evaluate a mathematical expression. Input: arithmetic expression as a string.")
answering_tool = _as_tool(answering_tool, "tool that gives the final output/answer")

# Tools handed to the router agent.
tools_array = [retriever_tool, ingestion_tool, calculator_tool, answering_tool]

In [41]:
from langchain.agents import create_agent
# System prompt for the router agent. Tool results are not stored in named
# variables, so the prompt tells the model to pass the retrieved passages
# themselves to answering_tool rather than a placeholder name.
router_system_prompt = """
You are the Router Agent.

YOUR JOB:
Decide which tools to call and return the FINAL USER ANSWER.
You never output JSON to the user. Only use tool calls behind the scenes.

TOOLS YOU CAN CALL:
- ingestion_tool(website) -> crawls and ingests the given URL and returns a status message.
- retriever_tool(mainquery) -> returns an object whose 'context' entry is a list of retrieved text passages.
- answering_tool(context: list, question: str) -> returns the FINAL ANSWER.
  **IMPORTANT**: The 'context' parameter MUST be a JSON list/array containing the actual passage strings returned by retriever_tool. Tool results are not stored in variables, so never pass a placeholder name such as "retriever_result".
- calculator(expression) -> evaluates an arithmetic expression and returns the result.

ROUTING RULES:

1. INGESTION:
- If the user input contains a real http/https URL -> call ingestion_tool(website)
- Remove the URL from the text
- Then call retriever_tool(question)
- Then call answering_tool(context=<the passages returned by retriever_tool>, question)

2. RETRIEVAL:
If the user asks a factual question:
- Call retriever_tool(question)
- Then call answering_tool(context=<the passages returned by retriever_tool>, question)

3. MATH:
If the input is a math expression -> call calculator

4. SIMPLE QUESTIONS:
If you can answer directly without retrieval -> answer directly.

5. CLARIFICATION:
If unclear -> ask: "What do you mean?"

IMPORTANT:
You ALWAYS return the final answer.
Never return JSON.
always return tool traces as well.
"""


# Agent that routes each user message to the tools registered in tools_array.
router_agent = create_agent(
    model=model,
    tools=tools_array,
    system_prompt=router_system_prompt
)

In [42]:
# Demo: a message that carries both a URL to ingest and a question about it.
user_query = "https://en.wikipedia.org/wiki/United_States explain the economy"
response = router_agent.invoke({"messages": [("user", user_query)]})

# The last message in the returned conversation holds the agent's final reply.
final_message = response["messages"][-1]
print(final_message.content)


The economy of the United States is a mixed economy, which is characterized by a combination of private enterprise and government intervention. It is the world's largest economy, with a nominal GDP of over $22 trillion and a purchasing power parity (PPP) GDP of over $22 trillion.

The economy is driven by a highly developed service sector, which accounts for approximately 80% of GDP, followed by the industrial sector, which accounts for around 20%. The agricultural sector accounts for a relatively small share of GDP.

The United States has a highly developed financial system, with a strong banking sector and a well-established stock market. The country is also a major player in international trade, with a large share of global exports and imports.

The economy has experienced significant growth over the past century, with an average annual GDP growth rate of around 2.5%. However, the economy has also experienced periods of recession, including the Great Depression of the 1930s and the 

In [53]:
# Demo: a multi-step arithmetic question for the calculator route.
# The question is a raw string because it contains "\$", which is an invalid
# escape sequence in a normal string literal; the text sent is unchanged.
response = router_agent.invoke({
    "messages": [("user", r"What is $15.50$ plus the cost of running an LLM at $\$0.005$ per token for $8,000$ tokens a day for $30$ days, and then subtract the total from $\$200$ to find the remaining budget?")]
})

print(response["messages"][-1].content)

  "messages": [("user", "What is $15.50$ plus the cost of running an LLM at $\$0.005$ per token for $8,000$ tokens a day for $30$ days, and then subtract the total from $\$200$ to find the remaining budget?")]


The remaining budget is $-1015.50.
