## MCP based Web-Scraping

In [1]:
import http.client
import json
import os
from dotenv import load_dotenv

In [2]:
# query = "TDENGINE DB CONNECTION PYTHON"
load_dotenv() # to load all api keys from .env

True

### Testing the web search functionality in serper

Serper is a third-party Google Search API for developers which gives structured, real-time Google search results in a fast and affordable way. It allows developers to integrate Google search data into their applications designed for getting organic data for AI Agents, Knowledge Graphs or pull data from internet etc.

In [16]:
def web_Search(query: str) -> dict | None:     #type hints
    conn = http.client.HTTPSConnection("google.serper.dev")
    payload = json.dumps({
    "q": query,
    "num": 2
    })

    api_key = os.getenv("SERPER_API_KEY")
    if not api_key:
        raise ValueError("‚ùå SERPER_API_KEY not found in .env file")

    headers = {
    'X-API-KEY': str(api_key),
    'Content-Type': 'application/json'
    }

    conn.request("POST", "/search", payload, headers)
    res = conn.getresponse()
    data = res.read()
    # print(data.decode("utf-8"))
    result = json.loads(data.decode("utf-8"))
    return result

In [4]:
web_Search(query="Chroma DB")

{"searchParameters":{"q":"Chroma DB","type":"search","num":2,"engine":"google"},"organic":[{"title":"Chroma","link":"https://www.trychroma.com/","snippet":"Chroma is the open-source search and retrieval database for AI applications. ... Getting started is as easy as pip install... pip install chromadb ¬∑ Full ...","position":1},{"title":"chroma-core/chroma: Open-source search and retrieval ...","link":"https://github.com/chroma-core/chroma","snippet":"Chroma - the open-source embedding database. The fastest way to build Python or JavaScript LLM apps with memory!","position":2}],"credits":1}


In [18]:
msg = web_Search(query="langchain with QDrant")
print(msg) 

{'searchParameters': {'q': 'langchain with QDrant', 'type': 'search', 'num': 2, 'engine': 'google'}, 'organic': [{'title': 'Qdrant | ü¶úÔ∏èüîó LangChain', 'link': 'https://python.langchain.com/docs/integrations/vectorstores/qdrant/', 'snippet': 'Qdrant (read: quadrant ) is a vector similarity search engine. It provides a production-ready service with a convenient API to store, search, and manage ...', 'position': 1}, {'title': 'Langchain', 'link': 'https://qdrant.tech/documentation/frameworks/langchain/', 'snippet': 'Langchain is a library that makes developing Large Language Model-based applications much easier. It unifies the interfaces to different libraries.', 'position': 2}], 'credits': 1}


In [19]:
print(msg["organic"])

[{'title': 'Qdrant | ü¶úÔ∏èüîó LangChain', 'link': 'https://python.langchain.com/docs/integrations/vectorstores/qdrant/', 'snippet': 'Qdrant (read: quadrant ) is a vector similarity search engine. It provides a production-ready service with a convenient API to store, search, and manage ...', 'position': 1}, {'title': 'Langchain', 'link': 'https://qdrant.tech/documentation/frameworks/langchain/', 'snippet': 'Langchain is a library that makes developing Large Language Model-based applications much easier. It unifies the interfaces to different libraries.', 'position': 2}]


### Making the search asyncronous

In [2]:
import httpx
import asyncio

In [3]:
SERPER_URL = "https://google.serper.dev/search"

async def web_Search(query: str) -> dict | None:     #type hints
    payload = json.dumps({
    "q": query,
    "num": 2
    })

    api_key = os.getenv("SERPER_API_KEY")
    if not api_key:
        raise ValueError("‚ùå SERPER_API_KEY not found in .env file")

    headers = {
    'X-API-KEY': str(api_key),
    'Content-Type': 'application/json'
    }

    async with httpx.AsyncClient() as client:
        response = await client.post(
            SERPER_URL, headers=headers,
            data=payload, timeout=30.0
        )
        response.raise_for_status()
        return response.json()
    
query = "Chroma DB vs Pinecone"
resp = asyncio.run(web_Search(query))
print(resp)
#web_Search(query="Chroma DB") # awaited error 

RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
# creating ai web scraping tool
# step 1: search web

import http.client
import json
import os
import httpx
import asyncio
from utils import clean_html_to_txt
from dotenv import load_dotenv

# query = "TDENGINE DB CONNECTION PYTHON"
load_dotenv()

SERPER_URL = "https://google.serper.dev/search"
# query = "Chroma DB with Pinecone"

async def web_Search(query: str) -> dict | None:     #type hints
    payload = json.dumps({
    "q": query,
    "num": 2
    })

    api_key = os.getenv("SERPER_API_KEY")
    if not api_key:
        raise ValueError("‚ùå SERPER_API_KEY not found in .env file")

    headers = {
    'X-API-KEY': str(api_key),
    'Content-Type': 'application/json'
    }

    async with httpx.AsyncClient() as client:
        response = await client.post(
            SERPER_URL, headers=headers,
            data=payload, timeout=30.0
        )
        response.raise_for_status()
        return response.json()
    

res = asyncio.run(web_Search(query="Chroma DB"))
print(res)


# step 2: go and open official documentation only not any other due to security purposes

async def fetch_url(url: str) -> str:

    #client
    async with httpx.AsyncClient() as client:
        # hit request to url
        response = await client.post(url,timeout=30.0) #response from internet

        #parse n clean Data
        cleaned_reponse = clean_html_to_txt(response.text)

        #return cleaned data
        return cleaned_reponse

# step 3: after reading doc debug code accordingly - write tool function based on above supporting functions

docs_urls = [
        "chroma-db":"https://chromadb.readthedocs.io/en/latest/",
        "Pinecone":"https://docs.pinecone.io/docs/quickstart",
        "Langchain":"https://python.langchain.com/docs/",
        "uv":"https://www.uvicorn.org/",
        "Openai":"https://platform.openai.com/docs",
        "llama-index":"https://gpt-index.readthedocs.io/en/latest/",
        "FastAPI":"https://fastapi.tiangolo.com/"
]

async def get_docs(query :str, library: str):

    """ 
    Search to get official latest documentation content based on query and library name 
    supports langchain, openai, chromadb, pinecone, uvicorn, fastapi, llama-index.

    Args:
        query (str): The query to find e.g. "Build a REST API endpoint to upload files".
        library (str): The library to be searched is e.g. "fastapi".
    
    Returns:
        str: Summarized content from official documentation.
    """

    # we dont wanrt to scrape general docs, but official docs reponses should be received from web search
    # if tool not found in docs then create general responses

    if library not in docs_urls:
        raise ValueError(f"‚ùå Library {library}  documentation URL not found")

    # strategy to get what we want => https://python.langchain.com/docs/ chromadb connection
    query = f"site:{docs_urls[library]} {query}"

    results = await web_Search(query=query)

    # if we dont get any results
    if len(results['organic']) == 0:
        return f"‚ùå No results found for query: {query}"
    
    # looping thru results in organic key and fetching url content
    for r in results['organic']:

        link = r.get("link","") # getting link from each organic result

        text_part = []

        raw = await fetch_url(link)

        # we want to see authentic info, so need to see from which url we are getting content
        if raw:
            labeled = f"Source: {link}\n{raw}"
            print("Source :",link)
            text_part.append(labeled)
        
    return "\n\n".join(text_part)