# Web Q&A Agent with Thordata (SERP + Universal + LLM)

This notebook shows how to build a simple **Web Question‑Answering Agent**:

1. Take a natural language question.
2. Use **Thordata SERP API** to search the web (Google by default).
3. For the top results, use **Thordata Universal Scraping API** to fetch and clean
   the page content.
4. Call an LLM (e.g. OpenAI) to generate an answer based on the retrieved pages,
   including citations/links.

We also support:

- **Live mode** (`USE_LIVE_THORDATA = True`) to call real APIs.
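- **Offline mode** (`USE_LIVE_THORDATA = False`) to load cached documents
  from `data/web_qa_sample.json` without consuming credits. The cache is a
  single file that every live run overwrites, so it always holds the documents
  fetched for the most recent live question.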

In [1]:
import json
import os
from pathlib import Path
from typing import Any

import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from thordata import Engine, ThordataClient

# Optional: try to import OpenAI for LLM
try:
    from openai import OpenAI
except ImportError:
    OpenAI = None  # We will handle this gracefully later

# -----------------------------
# Resolve project root
# -----------------------------
# Case 1: running as a .py script -- `__file__` is defined, so the root is
#         taken as parents[2] of the resolved script path.
# Case 2: running inside a notebook -- there is no `__file__`, and the current
#         working directory is expected to be notebooks/ai, so:
#         notebooks/ai -> parent (notebooks) -> parent (repository root)
ROOT_DIR = (
    Path(__file__).resolve().parents[2]
    if "__file__" in globals()
    else Path.cwd().parents[1]
)

# Environment variables are read from the .env file at the project root;
# override=True lets values in that file replace ones already set.
ENV_PATH = ROOT_DIR.joinpath(".env")
load_dotenv(ENV_PATH, override=True)

# True  -> call the live APIs
# False -> read previously cached documents from disk
USE_LIVE_THORDATA = True

# Cached web-QA documents live under data/ at the repository root
CACHE_DIR = ROOT_DIR.joinpath("data")
DOCS_CACHE_PATH = CACHE_DIR.joinpath("web_qa_sample.json")

# Echo the resolved locations so a wrong working directory is easy to spot
print("CWD:", os.getcwd())
print("ROOT_DIR:", ROOT_DIR)
print("ENV_PATH exists? ->", ENV_PATH.is_file())
print("DOCS_CACHE_PATH:", DOCS_CACHE_PATH)

CWD: D:\Thordata_Work\thordata-cookbook\notebooks\ai
ROOT_DIR: D:\Thordata_Work\thordata-cookbook
ENV_PATH exists? -> True
DOCS_CACHE_PATH: D:\Thordata_Work\thordata-cookbook\data\web_qa_sample.json


In [2]:
# Thordata credentials are read from environment variables.
SCRAPER_TOKEN = os.environ.get("THORDATA_SCRAPER_TOKEN")
PUBLIC_TOKEN = os.environ.get("THORDATA_PUBLIC_TOKEN")
PUBLIC_KEY = os.environ.get("THORDATA_PUBLIC_KEY")

# Only the scraper token is validated here; the two public credentials are
# passed through as-is, even when unset.
if SCRAPER_TOKEN is None or SCRAPER_TOKEN == "":
    raise RuntimeError(
        "THORDATA_SCRAPER_TOKEN is missing. "
        "Please configure your .env file at the project root."
    )

# Shared client used by the search and scrape helpers defined below.
td_client = ThordataClient(
    public_key=PUBLIC_KEY,
    public_token=PUBLIC_TOKEN,
    scraper_token=SCRAPER_TOKEN,
)

td_client

<thordata.client.ThordataClient at 0x14a6ee0fb60>

In [3]:
def search_web_serp(
    query: str,
    num_results: int = 3,
    engine: Engine = Engine.GOOGLE,
    location: str | None = None,
) -> list[dict[str, Any]]:
    """
    Run a search through the Thordata SERP API and return trimmed results.

    Args:
        query: Search text sent to the engine.
        num_results: Forwarded to the API as ``num``.
        engine: Search engine to query.
        location: Optional location; forwarded only when non-empty.

    Returns:
        One dict per entry in the response's ``"organic"`` list, holding the
        keys ``"title"``, ``"link"`` and ``"snippet"`` (``None`` where the
        entry lacks the key). Empty when the response has no organic results.
    """
    # Send `location` only when the caller actually supplied one.
    optional_kwargs: dict[str, Any] = {"location": location} if location else {}

    print(f"Searching {engine.value} for: {query!r}")
    response = td_client.serp_search(
        query=query,
        engine=engine,
        num=num_results,
        **optional_kwargs,
    )

    # A missing or null "organic" entry is treated as "no results".
    hits: list[dict[str, Any]] = [
        {field: entry.get(field) for field in ("title", "link", "snippet")}
        for entry in (response.get("organic") or [])
    ]
    print(f"Got {len(hits)} organic results.")
    return hits

In [4]:
def clean_html_to_text(html: str) -> str:
    """
    Reduce an HTML document to its readable text, one non-empty line per row.

    The following elements are dropped together with everything inside them:
    ``script``, ``style``, ``nav``, ``footer``, ``svg``, ``iframe`` and
    ``noscript``. The remaining text is split into lines, each line is
    stripped of surrounding whitespace, and blank lines are discarded.

    Args:
        html: Raw HTML markup.

    Returns:
        The cleaned text with lines joined by ``"\\n"``.
    """
    parsed = BeautifulSoup(html, "html.parser")

    # Strip elements that carry no readable page content.
    noisy_tags = ["script", "style", "nav", "footer", "svg", "iframe", "noscript"]
    for element in parsed.find_all(noisy_tags):
        element.decompose()

    kept_lines = []
    for raw_line in parsed.get_text(separator="\n").splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept_lines.append(stripped)
    return "\n".join(kept_lines)

In [5]:
def fetch_docs_from_web(
    query: str,
    num_results: int = 3,
    engine: Engine = Engine.GOOGLE,
    location: str | None = None,
    js_render: bool = True,
    per_doc_max_chars: int = 4000,
    min_html_chars: int = 200,
) -> list[dict[str, Any]]:
    """
    High-level function: search the web and fetch cleaned text for top results.

    Args:
        query: Search text passed to ``search_web_serp``.
        num_results: Number of search results to request.
        engine: Search engine to query.
        location: Optional search location.
        js_render: Forwarded to the Universal Scraping API call.
        per_doc_max_chars: Cleaned text longer than this is truncated to this
            many characters. A falsy value (0 / None) disables truncation.
        min_html_chars: Responses whose raw HTML is shorter than this are
            skipped as too short to be a real page.

    Returns:
        List of documents: [{ "url", "title", "snippet", "content" }, ...]
        Results without a link, with too little HTML, or with no text left
        after cleaning are omitted, so the list may be shorter than
        ``num_results``.
    """
    serp_results = search_web_serp(
        query, num_results=num_results, engine=engine, location=location
    )

    docs: list[dict[str, Any]] = []
    for idx, item in enumerate(serp_results, start=1):
        url = item.get("link")
        if not url:
            continue

        print(f"\n[{idx}/{len(serp_results)}] Fetching via Universal API: {url}")
        html = td_client.universal_scrape(
            url=url,
            js_render=js_render,
            output_format="HTML",
        )

        if not html or len(html) < min_html_chars:
            print("  Skipping: content too short or empty.")
            continue

        text = clean_html_to_text(html)
        if not text:
            # The raw-HTML length check above cannot catch pages made only of
            # scripts/styles/navigation: they clean down to an empty string
            # and would otherwise become a source with no content.
            print("  Skipping: no readable text left after cleaning.")
            continue

        if per_doc_max_chars and len(text) > per_doc_max_chars:
            text = text[:per_doc_max_chars]

        docs.append(
            {
                "url": url,
                "title": item.get("title"),
                "snippet": item.get("snippet"),
                "content": text,
            }
        )
        print(f"  Collected {len(text)} characters of cleaned text.")

    print(f"\nTotal documents collected: {len(docs)}")
    return docs

In [6]:
def get_docs_for_question(
    question: str,
    num_results: int = 3,
    engine: Engine = Engine.GOOGLE,
    location: str | None = None,
) -> list[dict[str, Any]]:
    """
    Return the documents for a question, from the web or from the local cache.

    When ``USE_LIVE_THORDATA`` is true the documents are fetched with
    ``fetch_docs_from_web`` and written to ``DOCS_CACHE_PATH`` as JSON,
    replacing whatever was cached before. Otherwise they are read back from
    that file; the question is not consulted in this branch.

    Raises:
        FileNotFoundError: In offline mode, when the cache file does not exist.
    """
    if not USE_LIVE_THORDATA:
        # Offline: reuse the documents stored by an earlier live run.
        print(f"Loading docs from cache: {DOCS_CACHE_PATH}")
        if not DOCS_CACHE_PATH.is_file():
            raise FileNotFoundError(
                f"Cached docs not found at {DOCS_CACHE_PATH}. "
                "Set USE_LIVE_THORDATA = True and run once to create them."
            )
        docs = json.loads(DOCS_CACHE_PATH.read_text(encoding="utf-8"))
    else:
        # Live: search + scrape, then persist the result for offline reuse.
        docs = fetch_docs_from_web(
            query=question,
            num_results=num_results,
            engine=engine,
            location=location,
            js_render=True,
            per_doc_max_chars=4000,
        )
        CACHE_DIR.mkdir(parents=True, exist_ok=True)
        with DOCS_CACHE_PATH.open("w", encoding="utf-8") as cache_file:
            json.dump(docs, cache_file, ensure_ascii=False, indent=2)
        print(f"\nCached docs to {DOCS_CACHE_PATH}")

    print(f"Loaded {len(docs)} documents.")
    return docs


# Example run to inspect the document structure; whether this hits the live
# APIs or the local cache depends on USE_LIVE_THORDATA.
sample_question = "What is Thordata used for?"
docs = get_docs_for_question(question=sample_question, num_results=3)

# Show just the title and URL of each document as a table.
pd.DataFrame([{key: doc[key] for key in ("title", "url")} for doc in docs])

Searching google for: 'What is Thordata used for?'
Got 3 organic results.

[1/3] Fetching via Universal API: https://www.thordata.com/
  Collected 4000 characters of cleaned text.

[2/3] Fetching via Universal API: https://www.thordata.com/blog/proxies/thordata-review
  Collected 4000 characters of cleaned text.

[3/3] Fetching via Universal API: https://www.youtube.com/watch?v=9k10yMGKQAE
  Collected 4000 characters of cleaned text.

Total documents collected: 3

Cached docs to D:\Thordata_Work\thordata-cookbook\data\web_qa_sample.json
Loaded 3 documents.


Unnamed: 0,title,url
0,Thordata - High-Quality Proxy Service for Web ...,https://www.thordata.com/
1,The Best Proxies for AI and Streaming,https://www.thordata.com/blog/proxies/thordata...
2,Is This The BEST Web Scraping/Proxy Platform f...,https://www.youtube.com/watch?v=9k10yMGKQAE


In [7]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


def summarize_with_llm(
    question: str,
    docs: list[dict[str, Any]],
    model: str = "gpt-4o-mini",
) -> str:
    """
    Ask an LLM to answer the question based on the provided documents.

    Args:
        question: The user's natural-language question.
        docs: Documents with "title", "url", "snippet" and "content" keys;
            each becomes one numbered source in the prompt.
        model: Chat model name passed to the OpenAI client.

    Returns:
        The model's answer text. No LLM call is made, and an explanatory
        message is returned instead, when the ``openai`` package is not
        installed, when OPENAI_API_KEY is missing, or when ``docs`` is empty.
        An empty string is returned if the model response carries no text.
    """
    if OpenAI is None:
        return (
            "LLM backend is not configured. Please install the 'openai' package:\n"
            "  pip install openai\n"
            "and set OPENAI_API_KEY in your .env file."
        )

    if not OPENAI_API_KEY:
        return (
            "OPENAI_API_KEY is missing. Please set it in your .env file to enable LLM calls."
        )

    if not docs:
        # The prompt tells the model to use ONLY the provided sources, so
        # with no sources there is nothing to answer from; skip the call.
        return (
            "No documents were retrieved for this question, so there are no "
            "sources to answer from. Try a different question or check the fetch log."
        )

    client = OpenAI(api_key=OPENAI_API_KEY)

    # Build context from docs: one numbered section per source, so the
    # model's [n] citations can be matched back to a URL.
    context_text = "\n\n".join(
        f"[Source {idx}] {doc.get('title')}\n"
        f"URL: {doc.get('url')}\n"
        f"Snippet: {doc.get('snippet')}\n"
        f"Content:\n{doc.get('content')}\n"
        for idx, doc in enumerate(docs, start=1)
    )

    system_prompt = (
        "You are a helpful web research assistant. "
        "Use ONLY the provided sources to answer the user's question. "
        "Include citations like [1], [2] that refer to the sources listed at the end."
    )

    user_prompt = (
        f"Question:\n{question}\n\n"
        f"Sources:\n{context_text}\n\n"
        "Please provide a concise answer (in English or the question's language), "
        "with citations [1], [2], etc. Then list the sources with their URLs."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.3,
    )

    # `message.content` can be None in a chat completion; normalise it so the
    # declared `str` return type always holds.
    return response.choices[0].message.content or ""

In [8]:
# Replace this with any question you would like answered.
question = "What are the main use cases of Thordata for AI data pipelines?"

# `location` may be set to a place name such as "United States"; None means
# no location is sent with the search.
docs = get_docs_for_question(
    question=question,
    num_results=3,
    engine=Engine.GOOGLE,
    location=None,
)

print(f"\nCollected {len(docs)} docs. Asking LLM...")

answer = summarize_with_llm(question=question, docs=docs)
print("\n=== LLM Answer ===\n", answer, sep="\n")

Searching google for: 'What are the main use cases of Thordata for AI data pipelines?'
Got 3 organic results.

[1/3] Fetching via Universal API: https://www.thordata.com/blog/proxies/thordata-review
  Collected 4000 characters of cleaned text.

[2/3] Fetching via Universal API: https://skywork.ai/skypage/en/Thordata:-Unlocking-AI's-Potential---A-Comprehensive-2025-Guide-for-AI-Professionals/1972882733226061824
  Skipping: content too short or empty.

[3/3] Fetching via Universal API: https://techbullion.com/is-thordata-the-next-big-name-in-enterprise-data-infrastructure-a-comprehensive-look/
  Collected 4000 characters of cleaned text.

Total documents collected: 2

Cached docs to D:\Thordata_Work\thordata-cookbook\data\web_qa_sample.json
Loaded 2 documents.

Collected 2 docs. Asking LLM...

=== LLM Answer ===

LLM backend is not configured. Please install the 'openai' package:
  pip install openai
and set OPENAI_API_KEY in your .env file.
