# 🔬 Research Notebook — Powered by OpenClaw

This notebook uses your **Paw / OpenClaw** AI agent to browse the web, gather data from multiple sources, and compile structured findings into a comprehensive research report.

**How it works:**
1. Connects to your local OpenClaw gateway (started by Paw)
2. Sends a research prompt to the agent with web search & scraping tools
3. The agent autonomously searches, reads pages, and cross-references facts
4. Results are structured into a pandas DataFrame and exported as a report

> **Prerequisite:** Make sure Paw is running and the gateway is active on port `18789`.

## 1. Install and Import Dependencies

In [None]:
# Install dependencies (run once).
import subprocess
import sys

REQUIRED_PACKAGES = [
    "websockets",
    "pandas",
    "matplotlib",
    "beautifulsoup4",
    "requests",
    "duckduckgo-search",
]

# A single pip invocation lets pip resolve all packages together. Output is
# captured rather than discarded so that pip's own error text can be shown
# when the install fails.
pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", *REQUIRED_PACKAGES],
    capture_output=True,
    text=True,
)
if pip_result.returncode != 0:
    print(pip_result.stderr)
    # Same exception type that subprocess.check_call raised before.
    raise subprocess.CalledProcessError(
        pip_result.returncode, pip_result.args, pip_result.stdout, pip_result.stderr
    )
print("✅ All dependencies installed")

In [None]:
import asyncio
import json
import os
import re
import hashlib
from datetime import datetime
from urllib.parse import urlparse
from collections import Counter

import websockets
import requests
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from IPython.display import display, Markdown, HTML

# Confirmation that every import in this cell resolved.
print("✅ All imports loaded")

## 2. Configure Gateway Connection & Agent Settings

Connect to your local OpenClaw gateway. Paw starts this automatically on port `18789`.

In [None]:
# ── Gateway settings ──────────────────────────────────────────────────
# The port can be overridden with the OPENCLAW_PORT environment variable.
GATEWAY_PORT = int(os.environ.get("OPENCLAW_PORT", "18789"))
GATEWAY_WS_URL = f"ws://localhost:{GATEWAY_PORT}"
GATEWAY_HTTP_URL = f"http://localhost:{GATEWAY_PORT}"

# ── Agent settings ────────────────────────────────────────────────────
MAX_SEARCH_RESULTS = 8        # Max results per search query
MAX_PAGE_LENGTH = 4000         # Max chars to extract per page
MAX_AGENT_ITERATIONS = 15      # Safety limit on agent tool-call loops
REQUEST_TIMEOUT = 10           # Seconds for HTTP requests

# ── Gateway health check ──────────────────────────────────────────────
# Best-effort probe: a failure is reported but does not stop the notebook,
# so the remaining cells can still be defined.
try:
    resp = requests.get(f"{GATEWAY_HTTP_URL}/api/health", timeout=5)
    resp.raise_for_status()
    health = resp.json()
    # Only a JSON object can carry a version field; anything else still means
    # the gateway answered, so it is reported as running with unknown version.
    gateway_version = health.get("version", "unknown") if isinstance(health, dict) else "unknown"
    print(f"✅ Gateway is running — version {gateway_version}")
except Exception as e:
    print(f"⚠️  Cannot reach gateway at {GATEWAY_HTTP_URL}: {e}")
    print("   Make sure Paw is running and the gateway is started.")

## 3. Define Web Search Function

Uses DuckDuckGo search (no API key required) to find relevant sources for a query.

In [None]:
def web_search(query: str, max_results: int = MAX_SEARCH_RESULTS) -> list[dict]:
    """Search the web using DuckDuckGo and return structured results.

    Args:
        query: Search terms passed to DuckDuckGo's text search.
        max_results: Upper bound on the number of hits requested.

    Returns:
        A list of dicts with "title", "url" and "snippet" keys. Hits that
        carry no URL are dropped. An empty list is returned if the search
        or the normalization raises; the error is printed, not propagated.
    """
    try:
        with DDGS() as ddgs:
            raw_hits = list(ddgs.text(query, max_results=max_results))

        results = []
        for hit in raw_hits:
            # `or` (rather than a .get default) also falls through when the
            # preferred key is present but empty/None, so the URL stored here
            # is always the same value the "has a URL" check looked at.
            url = hit.get("href") or hit.get("link") or ""
            if not url:
                continue
            results.append({
                "title": hit.get("title") or "",
                "url": url,
                "snippet": hit.get("body") or hit.get("snippet") or "",
            })
        return results
    except Exception as e:
        # Deliberate best-effort: a failed search must not abort the session.
        print(f"  ⚠️  Search error: {e}")
        return []

# Quick test — performs a live search, so it needs network access.
test_results = web_search("OpenClaw AI agent gateway", max_results=3)
if test_results:
    print(f"✅ Web search working — got {len(test_results)} results")
    for r in test_results[:3]:
        print(f"   • {r['title'][:60]}")
else:
    # web_search returns [] both for "no hits" and for a swallowed error, so
    # an empty list must not be reported as a working search.
    print("⚠️  Web search returned no results — check your network connection")

## 4. Define Web Page Scraping Function

Extracts clean text content from a web page using `requests` + `BeautifulSoup`. Handles timeouts, errors, and strips irrelevant HTML elements.

In [None]:
def scrape_page(url: str, max_length: int = MAX_PAGE_LENGTH) -> str:
    """Fetch a URL and extract the main text content.

    Failures are reported in-band: instead of raising, the function returns
    a short parenthesized message describing what went wrong.

    Args:
        url: Address of the page to fetch.
        max_length: Maximum number of characters of page text to keep; longer
            text is cut and a "[…truncated]" marker is appended.

    Returns:
        The cleaned page text, or a parenthesized error message.
    """
    # NOTE(review): nothing here restricts `url` to public hosts; callers that
    # pass agent-supplied URLs may want to validate them first.
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Research Bot; OpenClaw/Paw)"}
        resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")

        # Remove non-content elements
        for tag in soup(["script", "style", "nav", "footer", "header",
                         "aside", "iframe", "noscript", "form"]):
            tag.decompose()

        # Prefer the most specific content container the page offers
        main = soup.find("main") or soup.find("article") or soup.find("body")
        if main is None:
            return "(Could not extract content)"

        text = main.get_text(separator="\n", strip=True)
        # Clean up excessive whitespace
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = re.sub(r" {2,}", " ", text)

        # A container that held only stripped elements leaves nothing to read;
        # report that the same way as a page with no container at all.
        if not text:
            return "(Could not extract content)"

        if len(text) > max_length:
            text = text[:max_length] + "\n\n[…truncated]"

        return text

    except requests.Timeout:
        return f"(Timeout fetching {url})"
    except requests.HTTPError as e:
        return f"(HTTP error {e.response.status_code} for {url})"
    except Exception as e:
        return f"(Error fetching {url}: {e})"

# Quick test — fetches example.com, so it needs network access.
test_text = scrape_page("https://example.com")
if test_text.startswith("("):
    # scrape_page reports every failure as a parenthesized message instead of
    # raising, so that shape is the only failure signal available here.
    print(f"⚠️  Scraper test failed: {test_text}")
else:
    print(f"✅ Scraper working — extracted {len(test_text)} chars from example.com")

## 5. Build the Research Agent Loop

This connects to your OpenClaw gateway via WebSocket and runs a research agent that can call `web_search` and `scrape_page` tools. The agent decides which sources to search, which pages to read, and when it has enough information.

In [None]:
# ── Tool registry ─────────────────────────────────────────────────────
# Maps the tool names advertised in the agent prompt to the local functions
# that implement them.
TOOLS = {
    "web_search": web_search,
    "scrape_page": scrape_page,
}

# Track all searches and scraped pages for analysis.
# execute_tool appends one dict per tool call: search entries carry "type",
# "query" and "results"; scrape entries carry "type", "url" and "length".
research_log: list[dict] = []

def execute_tool(name: str, args: dict) -> str:
    """Execute a tool call, log it in ``research_log``, and return its output.

    Args:
        name: Tool requested by the agent ("web_search" or "scrape_page").
        args: Tool arguments. "query" is read for web_search and "url" for
            scrape_page. Anything that is not a dict is treated as having no
            arguments instead of raising.

    Returns:
        JSON-encoded search results, the scraped page text, or an
        "Unknown tool: ..." message for any other tool name.
    """
    # Arguments originate from decoded JSON and may be a list, null, etc.
    if not isinstance(args, dict):
        args = {}

    if name == "web_search":
        query = str(args.get("query") or "")
        print(f"  🔍 Searching: {query}")
        results = web_search(query)
        research_log.append({"type": "search", "query": query, "results": results})
        # Keep non-ASCII text readable for the agent instead of \uXXXX escapes.
        return json.dumps(results, indent=2, ensure_ascii=False)

    if name == "scrape_page":
        url = str(args.get("url") or "")
        print(f"  📄 Reading: {url[:80]}")
        content = scrape_page(url)
        research_log.append({"type": "scrape", "url": url, "length": len(content)})
        return content

    return f"Unknown tool: {name}"


async def run_research_agent(topic: str) -> str:
    """
    Connect to the OpenClaw gateway and run one research session on *topic*.

    The research prompt is sent with ``chat.send``. Tool calls announced by
    the gateway are executed locally through ``execute_tool`` and their output
    is sent back with ``chat.inject``. ``research_log`` is cleared first, so
    it only describes the most recent run.

    At most ``MAX_AGENT_ITERATIONS`` tool calls are executed. Streamed tokens
    and other frames do not count against that limit, so a long streamed
    report is not cut short.

    Args:
        topic: Subject to research; interpolated into the agent prompt.

    Returns:
        The agent's report text, assembled from streamed tokens and/or the
        message of the final response frame. May be empty.

    Raises:
        ConnectionError: If the gateway does not accept the connect handshake.
    """
    research_log.clear()
    request_id = 0

    def next_id():
        nonlocal request_id
        request_id += 1
        return request_id

    system_prompt = f"""You are a thorough research assistant. Your task is to research the following topic and compile comprehensive findings.

TOPIC: {topic}

INSTRUCTIONS:
1. Start by searching for the topic from multiple angles (different keywords, perspectives)
2. Read the most relevant pages to gather detailed information
3. Cross-reference facts across sources — note agreements and contradictions
4. Collect at least 5 distinct sources before compiling your report
5. When you have enough information, compile a final research report

For each finding, note:
- The source URL and title
- Key facts or data points
- How reliable/authoritative the source appears

YOUR TOOLS:
- web_search(query): Search the web. Returns titles, URLs, and snippets.
- scrape_page(url): Read a web page's content. Returns extracted text.

When you're done researching, write your final report with these sections:
## Executive Summary
## Key Findings
## Detailed Analysis
## Sources & References

Format your final report in Markdown."""

    async with websockets.connect(GATEWAY_WS_URL) as ws:
        # Handshake — connect to gateway
        await ws.send(json.dumps([1, next_id(), "connect", {
            "name": "research-notebook",
            "scopes": ["operator.read", "operator.write", "operator.admin"],
            "protocol": 3
        }]))
        connect_resp = json.loads(await ws.recv())
        # The second element is read as the success flag; a reply of any other
        # shape is treated as a failed handshake rather than an IndexError.
        handshake_ok = (
            isinstance(connect_resp, list)
            and len(connect_resp) > 1
            and bool(connect_resp[1])
        )
        if not handshake_ok:
            raise ConnectionError(f"Gateway connect failed: {connect_resp}")
        print("✅ Connected to OpenClaw gateway")

        # Send the research prompt via chat.send
        await ws.send(json.dumps([1, next_id(), "chat.send", {
            "content": system_prompt,
            "stream": False
        }]))
        print("📨 Research prompt sent — agent is working...\n")

        # Collect the response (may include tool calls)
        response_parts: list[str] = []
        iterations = 0    # frames received, reported in the summary line
        tool_calls = 0    # tool calls executed, checked against the safety limit

        while True:
            try:
                raw = await asyncio.wait_for(ws.recv(), timeout=120)
            except asyncio.TimeoutError:
                print("  ⏰ Timeout waiting for response")
                break
            iterations += 1

            try:
                frame = json.loads(raw)
            except ValueError:
                # Not JSON (or undecodable bytes): nothing to act on.
                continue

            # Both frame kinds handled below need at least [kind, second].
            if not isinstance(frame, list) or len(frame) < 2:
                continue

            # Event frame: [2, event_name, data]
            if frame[0] == 2:
                event_name = frame[1]
                event_data = frame[2] if len(frame) > 2 else {}
                if not isinstance(event_data, dict):
                    event_data = {}

                if event_name == "chat.token":
                    token = event_data.get("token", "")
                    if isinstance(token, str):
                        response_parts.append(token)

                elif event_name == "chat.tool_call":
                    if tool_calls >= MAX_AGENT_ITERATIONS:
                        print(f"  ⚠️  Tool-call limit ({MAX_AGENT_ITERATIONS}) reached — stopping")
                        break
                    tool_calls += 1

                    tool_name = event_data.get("name", "")
                    tool_args = event_data.get("arguments", {})
                    if isinstance(tool_args, str):
                        try:
                            tool_args = json.loads(tool_args)
                        except json.JSONDecodeError:
                            # A bare string is taken as the tool's single
                            # argument, under the key that tool actually reads.
                            fallback_key = "url" if tool_name == "scrape_page" else "query"
                            tool_args = {fallback_key: tool_args}
                    result = execute_tool(tool_name, tool_args)
                    # Send tool result back
                    await ws.send(json.dumps([1, next_id(), "chat.inject", {
                        "role": "tool",
                        "content": result,
                        "name": tool_name
                    }]))

                elif event_name in ("chat.done", "chat.end", "chat.complete"):
                    print("\n✅ Agent finished research")
                    break

            # Response frame: [1, ok, result, error]
            # NOTE(review): responses carry no request id here, so a successful
            # reply to chat.inject ends the loop just like the reply to
            # chat.send — confirm against the gateway protocol.
            elif frame[0] == 1:
                ok = frame[1]
                result = frame[2] if len(frame) > 2 else None
                if ok:
                    if isinstance(result, dict):
                        msg = result.get("message", result.get("content", ""))
                        if msg and isinstance(msg, str):
                            response_parts.append(msg)
                    break
                # Surface failed requests instead of silently waiting on.
                error = frame[3] if len(frame) > 3 else result
                print(f"  ⚠️  Gateway returned an error: {error}")

        full_response = "".join(response_parts)
        print(f"\n📊 Research complete — {len(research_log)} tool calls, {iterations} iterations")
        return full_response

# Confirmation that the agent helpers in this cell are defined.
print("✅ Research agent ready")

## 6. Define Research Topic & Execute

Change the `RESEARCH_TOPIC` below to whatever you want to research. The agent will autonomously search, read pages, and compile findings.

In [None]:
# ‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
# ‚ïë  CHANGE THIS to your research topic                                  ‚ïë
# ‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
RESEARCH_TOPIC = "Current state of AI agent frameworks in 2026: comparing OpenClaw, LangGraph, CrewAI, and AutoGen"

print(f"üî¨ Research Topic: {RESEARCH_TOPIC}")
print(f"   Max iterations: {MAX_AGENT_ITERATIONS}")
print(f"   Max search results: {MAX_SEARCH_RESULTS}")
print("‚îÄ" * 60)

# Run the research agent
raw_report = await run_research_agent(RESEARCH_TOPIC)

## 7. Parse and Structure Raw Findings

Extract structured data from the research log — every search and every page scraped — into a clean DataFrame.

In [None]:
# Build structured findings from the research log
FINDING_COLUMNS = ["source", "query", "title", "url", "domain", "snippet", "content_length"]


def _domain_of(url: str) -> str:
    """Return the network location of *url*, or "unknown" if it has none."""
    if not url:
        return "unknown"
    try:
        # URLs without a scheme parse to an empty netloc; report those as
        # "unknown" too instead of an empty domain.
        return urlparse(url).netloc or "unknown"
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
        return "unknown"


findings = []
# Titles seen in search results, so that a later scrape of the same URL can
# be labeled instead of showing up with a blank title.
titles_by_url: dict[str, str] = {}

for entry in research_log:
    if entry["type"] == "search":
        for result in entry.get("results", []):
            url = result.get("url", "")
            title = result.get("title", "")
            snippet = result.get("snippet", "")
            if url and title:
                titles_by_url.setdefault(url, title)
            findings.append({
                "source": "search",
                "query": entry["query"],
                "title": title,
                "url": url,
                "domain": _domain_of(url),
                "snippet": snippet,
                "content_length": len(snippet),
            })
    elif entry["type"] == "scrape":
        url = entry.get("url", "")
        findings.append({
            "source": "scrape",
            "query": "",
            "title": titles_by_url.get(url, ""),
            "url": url,
            "domain": _domain_of(url),
            "snippet": "",
            "content_length": entry.get("length", 0),
        })

# Explicit columns keep the frame's schema stable even when nothing was found.
df_findings = pd.DataFrame(findings, columns=FINDING_COLUMNS)

if not df_findings.empty:
    is_search = df_findings["source"] == "search"
    print(f"📊 Structured {len(df_findings)} findings from {len(research_log)} tool calls")
    print(f"   Unique domains: {df_findings['domain'].nunique()}")
    print(f"   Search queries: {df_findings.loc[is_search, 'query'].nunique()}")
    print(f"   Pages scraped: {int((~is_search).sum())}")
    display(df_findings[["source", "domain", "title", "content_length"]].head(15))
else:
    print("⚠️  No findings collected — the agent may not have used tools.")

## 8. Deduplicate and Rank Sources

Remove duplicate URLs, rank sources by information density, and show a summary table of the best sources found.

In [None]:
if not df_findings.empty:
    def _first_nonempty(values):
        """Return the first truthy item of *values*, or "" if there is none."""
        return next((v for v in values if v), "")

    # Merge all rows that share a URL into one. Simply keeping the first row
    # would keep the search hit and discard the later scrape of the same
    # page, so the scrape bonus and the real page length would never count.
    df_unique = (
        df_findings
        .groupby("url", sort=False, as_index=False)
        .agg(
            source=("source", lambda kinds: "scrape" if (kinds == "scrape").any() else "search"),
            query=("query", _first_nonempty),
            title=("title", _first_nonempty),
            domain=("domain", "first"),
            snippet=("snippet", _first_nonempty),
            content_length=("content_length", "max"),
        )
    )
    # Restore the column order of the findings table.
    df_unique = df_unique[list(df_findings.columns)]

    # Compute a simple relevance score based on content availability
    df_unique["relevance_score"] = (
        df_unique["source"].eq("scrape").map({True: 3, False: 1})      # Scraped pages are more valuable
        + (df_unique["content_length"] / 1000).clip(upper=5)           # More content = more relevant
        + df_unique["snippet"].fillna("").ne("").astype(int)           # Having a snippet helps
    )

    # Sort by relevance; a stable sort keeps discovery order among ties
    df_ranked = (
        df_unique
        .sort_values("relevance_score", ascending=False, kind="stable")
        .reset_index(drop=True)
    )

    print(f"📋 {len(df_ranked)} unique sources (deduplicated from {len(df_findings)})\n")
    display(df_ranked[["domain", "title", "relevance_score", "content_length"]].head(10))

    # ── Visualize source distribution ─────────────────────────────────
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Domain distribution
    domain_counts = df_ranked["domain"].value_counts().head(10)
    domain_counts.plot(kind="barh", ax=axes[0], color="#6366f1")
    axes[0].set_title("Sources by Domain")
    axes[0].set_xlabel("Count")
    axes[0].invert_yaxis()

    # Source type distribution. Colors are looked up per category so a given
    # source type is drawn in the same color whichever type is more frequent.
    type_colors = {"search": "#6366f1", "scrape": "#22c55e"}
    type_counts = df_ranked["source"].value_counts()
    type_counts.plot(kind="pie", ax=axes[1], autopct="%1.0f%%",
                     colors=[type_colors.get(kind, "#94a3b8") for kind in type_counts.index],
                     startangle=90)
    axes[1].set_title("Search Results vs. Scraped Pages")
    axes[1].set_ylabel("")

    plt.tight_layout()
    plt.show()
else:
    df_ranked = pd.DataFrame()
    print("⚠️  No findings to deduplicate.")

## 9. Compile Findings into a Formatted Report

Display the agent's compiled research report with full Markdown rendering.

In [None]:
# Build the final report
source_count = len(df_ranked)   # 0 for the empty frame left by a run without findings
generated_at = datetime.now().strftime("%B %d, %Y at %H:%M")

report_header = f"""# Research Report
**Topic:** {RESEARCH_TOPIC}
**Date:** {generated_at}
**Sources consulted:** {source_count}
**Tool calls:** {len(research_log)} (searches + page reads)

---

"""

# Use the agent's compiled report if available, otherwise summarize from findings
if raw_report and len(raw_report.strip()) > 100:
    final_report = report_header + raw_report
else:
    # Fallback: build a basic report from the findings data.
    # The title column always exists but may be empty, so fall back to the
    # URL with `or`; a .get() default would never be used and would produce
    # links with no visible text.
    sources_section = "\n".join(
        f"- [{row['title'] or row['url']}]({row['url']}) — {row['domain']}"
        for _, row in df_ranked.head(15).iterrows()
    )
    final_report = report_header + f"""## Summary

Research was conducted on the topic above. The agent gathered information from {source_count} unique sources.

## Sources Found

{sources_section if sources_section else "(No sources collected)"}

> **Note:** The agent's full analysis was not captured in streaming mode.
> Re-run with the gateway connected for the full compiled report.
"""

# Render the report
display(Markdown(final_report))

## 10. Export Report & Data

Save the research report as a Markdown file and the raw findings as CSV for further analysis.

In [None]:
# Create output directory
os.makedirs("research_output", exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Fall back to a fixed slug when the topic has no ASCII letters or digits,
# so file names never end in a bare underscore.
topic_slug = re.sub(r"[^a-z0-9]+", "_", RESEARCH_TOPIC.lower())[:50].strip("_") or "research"

# Export report as Markdown.
# The report contains non-ASCII characters; an explicit encoding avoids
# depending on the platform's default, which may not be able to encode them.
report_path = f"research_output/{timestamp}_{topic_slug}.md"
with open(report_path, "w", encoding="utf-8") as f:
    f.write(final_report)
print(f"📝 Report saved: {report_path}")

# Export findings as CSV
if not df_ranked.empty:
    csv_path = f"research_output/{timestamp}_{topic_slug}_sources.csv"
    df_ranked.to_csv(csv_path, index=False)
    print(f"📊 Source data saved: {csv_path}")
    print(f"   {len(df_ranked)} sources, {df_ranked.columns.tolist()}")

# Export raw research log as JSON
log_path = f"research_output/{timestamp}_{topic_slug}_log.json"
with open(log_path, "w", encoding="utf-8") as f:
    json.dump({
        "topic": RESEARCH_TOPIC,
        "timestamp": datetime.now().isoformat(),
        "research_log": research_log,
        "agent_response_length": len(raw_report) if raw_report else 0,
    }, f, indent=2, default=str, ensure_ascii=False)
print(f"📋 Research log saved: {log_path}")

print("\n✅ All outputs saved to research_output/")