In [6]:
#Sri Rama Jayam
import os
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from rank_bm25 import BM25Okapi
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain.tools import Tool
from langchain.prompts import ChatPromptTemplate
import time
import numpy as np

# Set API Key in Environment
# Never hardcode the key: a literal here overwrites a real key that is already
# exported and leaks the secret whenever the notebook is shared. Keep an
# existing OPENAI_API_KEY and only prompt (without echo) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass

    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
# Web Search using DuckDuckGo
def web_search(query, num_results=3):
    """Fetch text search results for ``query`` from DuckDuckGo.

    Args:
        query: Search string passed to ``DDGS.text``.
        num_results: Maximum number of results to request.

    Returns:
        A list with the results produced by ``DDGS.text``, or an empty list
        when the search yields nothing or fails. Failures are reported with
        ``print`` rather than raised.
    """
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=num_results))
    except Exception as e:
        # Best-effort, like page_scrape: a failed search is reported and
        # treated as "no results" instead of aborting the whole run.
        print(f"[ERROR] Search failed for {query!r}: {e}")
        return []

    if not results:
        print("[ERROR] No search results found.")
    return results

# Scrape Web Page Content
def page_scrape(url):
    """Download ``url`` and return the text of its ``<p>`` elements.

    The request is sent with a desktop-browser User-Agent and a 10 second
    timeout. Paragraph texts are joined with newlines and the result is
    stripped. Any exception raised while fetching or parsing is reported
    with ``print`` and an empty string is returned instead.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }
    try:
        page = requests.get(url, headers=browser_headers, timeout=10)
        # Treat HTTP error statuses the same way as network failures.
        page.raise_for_status()

        parsed = BeautifulSoup(page.text, "html.parser")
        paragraph_texts = []
        for tag in parsed.find_all("p"):
            paragraph_texts.append(tag.get_text())
        joined = "\n".join(paragraph_texts)
        return joined.strip()
    except Exception as err:
        print(f"Error fetching {url}: {err}")
        return ""

# Load Documents from Search
def load_documents(query, num_results=3):
    """Search for ``query`` and scrape every hit into a document dict.

    Each result's ``"href"`` is fetched with ``page_scrape``; pages that
    yield no text are skipped. A one-second pause follows every fetch.

    Returns:
        A list of dicts with ``"url"``, ``"title"`` and ``"content"`` keys,
        in the order the search returned them.
    """
    collected = []
    for hit in web_search(query, num_results):
        link = hit["href"]
        print(f"Fetching {link}")
        page_text = page_scrape(link)

        # An empty string means the scrape failed or found no paragraphs.
        if page_text:
            entry = {"url": link, "title": hit["title"], "content": page_text}
            collected.append(entry)
        time.sleep(1)

    return collected

# BM25 Retriever
class BM25Retriever:
    """Lexical scorer over scraped documents, backed by ``BM25Okapi``.

    Each document is a dict whose ``"content"`` text is lower-cased and split
    on whitespace to form the BM25 corpus. Queries are tokenised the same
    way, so matching is case-insensitive.
    """

    def __init__(self, documents):
        """Index ``documents`` (a list of dicts with a ``"content"`` key)."""
        self.documents = documents
        self.texts = [doc["content"] for doc in documents]
        self.tokenized_texts = [self._tokenize(text) for text in self.texts]
        # rank_bm25 computes an average document length while indexing, which
        # raises ZeroDivisionError on an empty corpus, so skip the index then.
        self.bm25 = BM25Okapi(self.tokenized_texts) if self.tokenized_texts else None

    @staticmethod
    def _tokenize(text):
        """Lower-case ``text`` and split it on whitespace."""
        return text.lower().split()

    def retrieve(self, query, k=3):
        """Score every indexed document against ``query``.

        Args:
            query: Free-text query.
            k: Accepted for interface compatibility but not applied; scores
                for all documents are returned so the caller can rank them.

        Returns:
            A ``(scores, documents)`` pair where ``scores[i]`` belongs to
            ``documents[i]``. ``scores`` is an empty list when nothing was
            indexed.
        """
        if self.bm25 is None:
            return [], self.documents
        scores = self.bm25.get_scores(self._tokenize(query))
        return scores, self.documents

# Hybrid Retriever combining BM25 and FAISS
class HybridRetriever:
    """Ranks documents by a weighted blend of BM25 and FAISS similarity.

    ``bm25_weight`` is the share given to the lexical (BM25) score; the
    remainder goes to the embedding-based score. Documents are dicts with a
    ``"content"`` key, and ``retrieve`` returns those same dicts.
    """

    def __init__(self, documents, bm25_weight=0.5):
        """Build the BM25 and FAISS indexes over ``documents``."""
        self.documents = documents
        # BM25Okapi cannot be built from an empty corpus, so only create the
        # lexical retriever when there is something to index.
        self.bm25_retriever = BM25Retriever(documents) if documents else None
        self.embeddings = OpenAIEmbeddings()
        texts = [doc["content"] for doc in documents]
        if texts:
            # Tag every text with its position in ``documents`` so search hits
            # can be mapped back to the right document later.
            metadatas = [{"doc_index": i} for i in range(len(texts))]
            self.vectorstore = FAISS.from_texts(texts, self.embeddings, metadatas=metadatas)
        else:
            self.vectorstore = None
        self.bm25_weight = bm25_weight

    @staticmethod
    def _scale_to_unit_max(scores):
        """Divide ``scores`` by their maximum when that maximum is positive."""
        peak = max(scores, default=0.0)
        if peak > 0:
            return [score / peak for score in scores]
        return list(scores)

    def retrieve(self, query, k=3):
        """Return the ``k`` best documents for ``query``, best first.

        Returns an empty list when no documents were indexed.
        """
        if not self.documents:
            return []

        n_docs = len(self.documents)

        # Get BM25 scores (one per document, in document order).
        bm25_scores, _ = self.bm25_retriever.retrieve(query)
        bm25_scores = [float(score) for score in bm25_scores]

        # Get vector similarity scores. The hits come back ordered by
        # closeness to the query, not in document order, so each one is placed
        # by the index stored in its metadata. Documents absent from the hits
        # keep a score of 0.
        vector_scores = [0.0] * n_docs
        if self.vectorstore is not None:
            hits = self.vectorstore.similarity_search_with_score(query, k=n_docs)
            for hit, distance in hits:
                # With the default FAISS settings the score is a distance
                # (lower is closer); 1 / (1 + d) turns it into a similarity
                # that stays positive.
                similarity = 1.0 / (1.0 + max(float(distance), 0.0))
                vector_scores[hit.metadata["doc_index"]] = similarity

        # Normalize both score lists to a maximum of 1 before blending.
        bm25_scores = self._scale_to_unit_max(bm25_scores)
        vector_scores = self._scale_to_unit_max(vector_scores)

        # Combine scores
        combined_scores = [
            self.bm25_weight * lexical + (1 - self.bm25_weight) * semantic
            for lexical, semantic in zip(bm25_scores, vector_scores)
        ]

        # Get top k documents
        ranked = sorted(range(len(combined_scores)), key=lambda i: combined_scores[i], reverse=True)
        return [self.documents[i] for i in ranked[:k]]

# Retrieve Documents using Hybrid Search
def search_docs(query, hybrid_retriever):
    """Run a hybrid search and format the hits as one text block.

    Each hit is rendered as a ``[Source: <url>]`` line followed by the first
    500 characters of its content and a trailing ``...``; hits are separated
    by blank lines. A fixed message is returned when nothing is retrieved.
    """
    hits = hybrid_retriever.retrieve(query)
    if hits:
        sections = []
        for hit in hits:
            sections.append(f"[Source: {hit['url']}]\n{hit['content'][:500]}...")
        return "\n\n".join(sections)
    return "No relevant information found."

# Main execution
if __name__ == "__main__":
    # Ask for a question, scrape the top search hits for it, then let a
    # tool-calling agent answer from those pages via hybrid retrieval.
    query = input("Enter your question: ")
    print("Loading documents...")
    documents = load_documents(query)

    # Define Prompt for LLM Agent
    system_prompt = """You are an AI assistant that finds information from web sources.
    - Use the WebContentSearch tool to find relevant information.
    - If no relevant information is found, return 'No relevant information available.'
    - Do not call WebContentSearch more than once per query.
    - Do not repeat queries or enter a loop."""

    if not documents:
        # Nothing was scraped, so there is nothing to index or search; report
        # that directly instead of building empty indexes and calling the LLM.
        print("\nFINAL ANSWER:")
        print("No relevant information available.")
    else:
        # Build hybrid retriever
        hybrid_retriever = HybridRetriever(documents, bm25_weight=0.6)  # Adjust weight as needed

        # Define Search Tool
        tools = [
            Tool(
                name="WebContentSearch",
                func=lambda q: search_docs(q, hybrid_retriever),
                description="Searches for information in the web content database using hybrid search."
            )
        ]

        # Initialize LLM
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", "{input}"),
            # The agent fills the scratchpad with a list of tool-call and
            # tool-result messages, so it needs a messages placeholder. An
            # ("ai", "{agent_scratchpad}") string template would flatten that
            # list into the text of a single AI message.
            ("placeholder", "{agent_scratchpad}"),
        ])

        # Create LLM Agent
        agent = create_openai_tools_agent(llm, tools, prompt)
        agent_executor = AgentExecutor(
            agent=agent,
            tools=tools,
            verbose=True,
            handle_parsing_errors=True,
            max_iterations=10
        )

        # Run the agent. The executor manages agent_scratchpad itself, so only
        # the user input is supplied.
        response = agent_executor.invoke({"input": query})
        print("\nFINAL ANSWER:")
        print(response["output"])


Enter your question: Difference between saving and investment
Loading documents...
Fetching https://keydifferences.com/difference-between-savings-and-investment.html
Fetching https://www.usbank.com/financialiq/invest-your-money/investment-strategies/saving-vs-investing-whats-the-difference.html
Fetching https://www.bankrate.com/investing/saving-vs-investing/


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `WebContentSearch` with `difference between saving and investment`


[0m[36;1m[1;3m[Source: https://www.bankrate.com/investing/saving-vs-investing/]
Compare accounts
Get guidance
Compare accounts
Get guidance
Money market accounts
Money market accounts are similar to savings accounts, but offer some checking features as well.
Get guidance
Banking
Unlock financial rewards by signing up for a savings or checking account with a bonus offer.
Get guidance
Compare rates
Get guidance
Compare rates
Get guidance
Buying & selling
Find an expert who knows the market. C

In [1]:
!pip install langchain langchain_community langchain_openai

Collecting langchain_community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.12-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-core<1.0.0,>=0.3.49 (from langchain)
  Downloading langchain_core-0.3.51-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting tiktoken<1

In [2]:
!pip install rank_bm25 duckduckgo_search

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting duckduckgo_search
  Downloading duckduckgo_search-2025.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting primp>=0.14.0 (from duckduckgo_search)
  Downloading primp-0.14.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Downloading duckduckgo_search-2025.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading primp-0.14.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rank_bm25, primp, duckduckgo_search
Successfully installed duckduckgo_search-2025.4.4 pr

In [5]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
