188 changes: 186 additions & 2 deletions src/scrapegraph_mcp/server.py
@@ -10,7 +10,8 @@
"""

import os
from typing import Any, Dict
import json
from typing import Any, Dict, Optional, List, Union

import httpx
from fastmcp import FastMCP
@@ -33,7 +34,7 @@ def __init__(self, api_key: str):
"SGAI-APIKEY": api_key,
"Content-Type": "application/json"
}
self.client = httpx.Client(timeout=60.0)
self.client = httpx.Client(timeout=httpx.Timeout(120.0))
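        # httpx.Timeout(120.0) applies a single 120s limit to the connect, read,
        # write, and pool phases alike. A sketch, if per-phase limits were wanted:
        #   self.client = httpx.Client(timeout=httpx.Timeout(120.0, connect=10.0))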

def markdownify(self, website_url: str) -> Dict[str, Any]:
"""
@@ -126,6 +127,85 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr

return response.json()

def scrape(self, website_url: str, render_heavy_js: Optional[bool] = None) -> Dict[str, Any]:
"""
Basic scrape endpoint to fetch page content.

Args:
website_url: URL to scrape
            render_heavy_js: Whether to enable heavy JavaScript rendering (optional)

Returns:
Dictionary containing the scraped result
"""
url = f"{self.BASE_URL}/scrape"
payload: Dict[str, Any] = {"website_url": website_url}
if render_heavy_js is not None:
payload["render_heavy_js"] = render_heavy_js

response = self.client.post(url, headers=self.headers, json=payload)
response.raise_for_status()
return response.json()
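
    # Usage sketch (assumes the module-level `scrapegraph_client` is initialized;
    # the URL and the "html" key are illustrative, not confirmed response shape):
    #   result = scrapegraph_client.scrape("https://example.com", render_heavy_js=True)
    #   html = result.get("html")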

def sitemap(self, website_url: str) -> Dict[str, Any]:
"""
Extract sitemap for a given website.

Args:
website_url: Base website URL

Returns:
Dictionary containing sitemap URLs/structure
"""
url = f"{self.BASE_URL}/sitemap"
payload: Dict[str, Any] = {"website_url": website_url}

response = self.client.post(url, headers=self.headers, json=payload)
response.raise_for_status()
return response.json()
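
    # Usage sketch (illustrative; the "urls" key is an assumption about the
    # response shape):
    #   result = scrapegraph_client.sitemap("https://example.com")
    #   for page_url in result.get("urls", []):
    #       print(page_url)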

def agentic_scrapper(
self,
url: str,
user_prompt: Optional[str] = None,
output_schema: Optional[Dict[str, Any]] = None,
steps: Optional[List[str]] = None,
ai_extraction: Optional[bool] = None,
persistent_session: Optional[bool] = None,
timeout_seconds: Optional[float] = None,
) -> Dict[str, Any]:
"""
Run the Agentic Scraper workflow (no live session/browser interaction).

Args:
url: Target website URL
user_prompt: Instructions for what to do/extract (optional)
output_schema: Desired structured output schema (optional)
steps: High-level steps/instructions for the agent (optional)
ai_extraction: Whether to enable AI extraction mode (optional)
persistent_session: Whether to keep session alive between steps (optional)
            timeout_seconds: Per-request timeout override in seconds (optional)

        Returns:
            Dictionary containing the agentic scraping result
        """
endpoint = f"{self.BASE_URL}/agentic-scrapper"
payload: Dict[str, Any] = {"url": url}
if user_prompt is not None:
payload["user_prompt"] = user_prompt
if output_schema is not None:
payload["output_schema"] = output_schema
if steps is not None:
payload["steps"] = steps
if ai_extraction is not None:
payload["ai_extraction"] = ai_extraction
if persistent_session is not None:
payload["persistent_session"] = persistent_session

if timeout_seconds is not None:
response = self.client.post(endpoint, headers=self.headers, json=payload, timeout=timeout_seconds)
else:
response = self.client.post(endpoint, headers=self.headers, json=payload)
response.raise_for_status()
return response.json()
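
    # Usage sketch (all values illustrative):
    #   result = scrapegraph_client.agentic_scrapper(
    #       url="https://example.com",
    #       user_prompt="Extract the page title",
    #       steps=["open the page", "read the title"],
    #       timeout_seconds=180.0,
    #   )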

def smartcrawler_initiate(
self,
url: str,
@@ -371,6 +451,110 @@ def smartcrawler_fetch_results(request_id: str) -> Dict[str, Any]:
return {"error": str(e)}


# Add tool for basic scrape
@mcp.tool()
def scrape(website_url: str, render_heavy_js: Optional[bool] = None) -> Dict[str, Any]:
"""
Fetch page content for a URL.

Args:
website_url: URL to scrape
        render_heavy_js: Whether to enable heavy JavaScript rendering (optional)
"""
if scrapegraph_client is None:
return {"error": "ScapeGraph client not initialized. Please provide an API key."}

try:
return scrapegraph_client.scrape(website_url=website_url, render_heavy_js=render_heavy_js)
except httpx.HTTPError as http_err:
return {"error": str(http_err)}
except ValueError as val_err:
return {"error": str(val_err)}


# Add tool for sitemap extraction
@mcp.tool()
def sitemap(website_url: str) -> Dict[str, Any]:
"""
Extract sitemap for a website.

Args:
website_url: Base website URL
"""
if scrapegraph_client is None:
return {"error": "ScapeGraph client not initialized. Please provide an API key."}

try:
return scrapegraph_client.sitemap(website_url=website_url)
except httpx.HTTPError as http_err:
return {"error": str(http_err)}
except ValueError as val_err:
return {"error": str(val_err)}


# Add tool for Agentic Scraper (no live session/browser interaction)
@mcp.tool()
def agentic_scrapper(
url: str,
user_prompt: Optional[str] = None,
output_schema: Optional[Union[str, Dict[str, Any]]] = None,
steps: Optional[Union[str, List[str]]] = None,
ai_extraction: Optional[bool] = None,
persistent_session: Optional[bool] = None,
timeout_seconds: Optional[float] = None,
) -> Dict[str, Any]:
"""
Run the Agentic Scraper workflow. Accepts flexible input forms for steps and schema.
"""
if scrapegraph_client is None:
return {"error": "ScapeGraph client not initialized. Please provide an API key."}

# Normalize inputs
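    # steps may arrive as a list, a JSON-encoded list, or a plain string
    # (treated as a single step); output_schema may be a dict or a JSON object string.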
normalized_steps: Optional[List[str]] = None
if isinstance(steps, list):
normalized_steps = steps
elif isinstance(steps, str):
parsed_steps: Optional[Any] = None
try:
parsed_steps = json.loads(steps)
except json.JSONDecodeError:
parsed_steps = None
if isinstance(parsed_steps, list):
normalized_steps = parsed_steps
else:
normalized_steps = [steps]

normalized_schema: Optional[Dict[str, Any]] = None
if isinstance(output_schema, dict):
normalized_schema = output_schema
elif isinstance(output_schema, str):
try:
parsed_schema = json.loads(output_schema)
if isinstance(parsed_schema, dict):
normalized_schema = parsed_schema
else:
return {"error": "output_schema must be a JSON object"}
except json.JSONDecodeError as e:
return {"error": f"Invalid JSON for output_schema: {str(e)}"}

try:
return scrapegraph_client.agentic_scrapper(
url=url,
user_prompt=user_prompt,
output_schema=normalized_schema,
steps=normalized_steps,
ai_extraction=ai_extraction,
persistent_session=persistent_session,
timeout_seconds=timeout_seconds,
)
except httpx.TimeoutException as timeout_err:
return {"error": f"Request timed out: {str(timeout_err)}"}
except httpx.HTTPError as http_err:
return {"error": str(http_err)}
except ValueError as val_err:
return {"error": str(val_err)}


def main() -> None:
"""Run the ScapeGraph MCP server."""
print("Starting ScapeGraph MCP server!")