diff --git a/install.md b/install.md index 07cb1d5..d057bb3 100644 --- a/install.md +++ b/install.md @@ -11,27 +11,29 @@ description: 'Install and get started with ScrapeGraphAI v2 SDKs' ## Python SDK +Requires **Python ≥ 3.12**. + ```bash -pip install scrapegraph-py +pip install "scrapegraph-py>=2.1.0" ``` **Usage:** ```python -from scrapegraph_py import Client +from scrapegraph_py import ScrapeGraphAI -client = Client(api_key="your-api-key-here") +sgai = ScrapeGraphAI(api_key="your-api-key-here") # Extract data from a website -response = client.extract( +res = sgai.extract( + "Extract information about the company", url="https://scrapegraphai.com", - prompt="Extract information about the company" ) -print(response) +print(res.data.json_data if res.status == "success" else res.error) ``` -You can also set the `SGAI_API_KEY` environment variable and initialize the client without parameters: `client = Client()` +You can also set the `SGAI_API_KEY` environment variable and initialize the client without parameters: `sgai = ScrapeGraphAI()`. For more advanced usage, see the [Python SDK documentation](/sdks/python). @@ -110,22 +112,25 @@ Both SDKs support structured output using schemas: ### Python Example ```python -from scrapegraph_py import Client -from pydantic import BaseModel, Field +from scrapegraph_py import ScrapeGraphAI -class CompanyInfo(BaseModel): - company_name: str = Field(description="The company name") - description: str = Field(description="Company description") - website: str = Field(description="Company website URL") - industry: str = Field(description="Industry sector") +sgai = ScrapeGraphAI(api_key="your-api-key") -client = Client(api_key="your-api-key") -response = client.extract( +res = sgai.extract( + "Extract company information", url="https://scrapegraphai.com", - prompt="Extract company information", - output_schema=CompanyInfo + schema={ + "type": "object", + "properties": { + "company_name": {"type": "string", "description": "The company name"}, + "description": {"type": "string", "description": "Company description"}, + "website": {"type": "string", "description": "Company website URL"}, + "industry": {"type": "string", "description": "Industry sector"}, + }, + "required": ["company_name"], + }, ) -print(response) +print(res.data.json_data if res.status == "success" else res.error) ``` ### JavaScript Example diff --git a/sdks/python.mdx b/sdks/python.mdx index 727cdc2..9b0cd3e 100644 --- a/sdks/python.mdx +++ b/sdks/python.mdx @@ -14,22 +14,22 @@ icon: 'python' -These docs cover **`scrapegraph-py` ≥ 2.0.1**. Earlier `1.x` releases expose the deprecated v1 API and point to a different backend — none of the snippets on this page work there. +These docs cover **`scrapegraph-py` ≥ 2.1.0** and require **Python ≥ 3.12**. Earlier `1.x` releases expose the deprecated v1 API and point to a different backend — none of the snippets on this page work there. The `2.0.x` series used typed request wrappers (`ScrapeRequest`, `ExtractRequest`, …); **2.1.0 removed those wrappers** in favour of direct positional/keyword arguments, so upgrade if you are pinned to `2.0.x`. ## Installation ```bash -pip install "scrapegraph-py>=2.0.1" +pip install "scrapegraph-py>=2.1.0" # or -uv add "scrapegraph-py>=2.0.1" +uv add "scrapegraph-py>=2.1.0" ``` ## What's New in v2 - **Complete rewrite** built on [Pydantic v2](https://docs.pydantic.dev) + [httpx](https://www.python-httpx.org). - **Client rename**: `Client` → `ScrapeGraphAI`, `AsyncClient` → `AsyncScrapeGraphAI`. 
-- **Typed request models**: every method takes a Pydantic request (`ScrapeRequest`, `ExtractRequest`, `SearchRequest`, `CrawlRequest`, `MonitorCreateRequest`, …). +- **Direct arguments** (v2.1.0): every method accepts positional/keyword args — no more `ScrapeRequest`/`ExtractRequest`/… wrappers. - **`ApiResult[T]` wrapper**: no exceptions on API errors — every call returns `status: "success" | "error"`, `data`, `error`, and `elapsed_ms`. - **Nested resources**: `sgai.crawl.*`, `sgai.monitor.*`, `sgai.history.*`. - **camelCase on the wire, snake_case in Python**: automatic via Pydantic's `alias_generator`. @@ -42,13 +42,13 @@ v2 is a breaking release. See the [Migration Guide](/transition-from-v1-to-v2) i ## Quick Start ```python -from scrapegraph_py import ScrapeGraphAI, ScrapeRequest +from scrapegraph_py import ScrapeGraphAI # reads SGAI_API_KEY from env, or pass it explicitly: # sgai = ScrapeGraphAI(api_key="sgai-...") sgai = ScrapeGraphAI() -result = sgai.scrape(ScrapeRequest(url="https://example.com")) +result = sgai.scrape("https://example.com") if result.status == "success": print(result.data.results["markdown"]["data"]) @@ -86,7 +86,7 @@ The client supports context managers for automatic session cleanup: ```python with ScrapeGraphAI() as sgai: - result = sgai.scrape(ScrapeRequest(url="https://example.com")) + result = sgai.scrape("https://example.com") ``` ## Services @@ -97,14 +97,14 @@ Fetch a page in one or more formats (markdown, html, screenshot, json, links, im ```python from scrapegraph_py import ( - ScrapeGraphAI, ScrapeRequest, FetchConfig, + ScrapeGraphAI, FetchConfig, MarkdownFormatConfig, ScreenshotFormatConfig, JsonFormatConfig, ) sgai = ScrapeGraphAI() -res = sgai.scrape(ScrapeRequest( - url="https://example.com", +res = sgai.scrape( + "https://example.com", formats=[ MarkdownFormatConfig(mode="reader"), ScreenshotFormatConfig(full_page=True, width=1440, height=900), @@ -118,18 +118,18 @@ res = sgai.scrape(ScrapeRequest( wait=2000, scrolls=3, ), -)) +) if res.status == "success": markdown = res.data.results["markdown"]["data"] ``` -#### `ScrapeRequest` fields +#### `scrape()` parameters -| Field | Type | Required | Description | +| Parameter | Type | Required | Description | | -------------- | ------------------------ | -------- | --------------------------------------------------------------------------- | -| `url` | `HttpUrl` | Yes | URL to scrape | -| `formats` | `list[ScrapeFormatEntry]`| No | Defaults to `[MarkdownFormatConfig()]` | +| `url` | `str` | Yes | URL to scrape (positional) | +| `formats` | `list[FormatConfig]` | No | Defaults to `[MarkdownFormatConfig()]` | | `content_type` | `str` | No | Override detected content type (e.g. `"application/pdf"`, `"text/html"`) | | `fetch_config` | `FetchConfig` | No | Fetch configuration (mode, stealth, timeout, cookies, country, …) | @@ -155,13 +155,13 @@ Duplicate `type` entries in `formats` are rejected by a Pydantic validator. Run structured extraction against a URL, HTML, or markdown using AI. 
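+
+If you already have the page content, you can pass `html` or `markdown` instead of `url` (both are listed in the parameter table below). A minimal sketch, with the inline HTML standing in for your own content:
+
+```python
+from scrapegraph_py import ScrapeGraphAI
+
+sgai = ScrapeGraphAI()
+
+# extract from raw HTML instead of fetching a URL
+res = sgai.extract(
+    "Extract the page title",
+    html="<html><head><title>Acme Inc.</title></head><body><h1>Acme Inc.</h1></body></html>",
+)
+print(res.data.json_data if res.status == "success" else res.error)
+```
+
+The URL-based form, with a JSON Schema describing the output: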
```python -from scrapegraph_py import ScrapeGraphAI, ExtractRequest +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.extract(ExtractRequest( +res = sgai.extract( + "Extract product names and prices", url="https://example.com", - prompt="Extract product names and prices", schema={ "type": "object", "properties": { @@ -177,28 +177,28 @@ res = sgai.extract(ExtractRequest( }, }, }, -)) +) if res.status == "success": print(res.data.json_data) print(f"Tokens: {res.data.usage.prompt_tokens} / {res.data.usage.completion_tokens}") ``` -#### `ExtractRequest` fields +#### `extract()` parameters -| Field | Type | Required | Description | +| Parameter | Type | Required | Description | | -------------- | ------------- | -------- | --------------------------------------------------------------------------------- | -| `url` | `HttpUrl` | Yes\* | Page URL | +| `prompt` | `str` | Yes | 1–10,000 chars (positional) | +| `url` | `str` | Yes\* | Page URL | | `html` | `str` | Yes\* | Raw HTML (alternative to `url`) | | `markdown` | `str` | Yes\* | Raw markdown (alternative to `url`) | -| `prompt` | `str` | Yes | 1–10,000 chars | | `schema` | `dict` | No | JSON Schema for the structured output | | `mode` | `str` | No | `"normal"` (default), `"reader"`, `"prune"` | | `content_type` | `str` | No | Override detected content type | | `fetch_config` | `FetchConfig` | No | Fetch configuration | -\*At least one of `url`, `html`, or `markdown` is required — enforced by a Pydantic validator. +\*At least one of `url`, `html`, or `markdown` is required. ### Search @@ -206,17 +206,17 @@ if res.status == "success": Run a web search and optionally extract structured data from the results. ```python -from scrapegraph_py import ScrapeGraphAI, SearchRequest +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.search(SearchRequest( - query="best programming languages 2024", +res = sgai.search( + "best programming languages 2024", num_results=5, prompt="Summarize the top languages and reasons", time_range="past_week", location_geo_code="us", -)) +) if res.status == "success": for hit in res.data.results: @@ -224,11 +224,11 @@ if res.status == "success": print(res.data.json_data) # when prompt/schema are set ``` -#### `SearchRequest` fields +#### `search()` parameters -| Field | Type | Required | Description | +| Parameter | Type | Required | Description | | ------------------- | ------------- | -------- | ------------------------------------------------------------------------- | -| `query` | `str` | Yes | 1–500 chars | +| `query` | `str` | Yes | 1–500 chars (positional) | | `num_results` | `int` | No | 1–20, default `3` | | `format` | `str` | No | `"markdown"` (default) or `"html"` | | `mode` | `str` | No | HTML processing: `"prune"` (default), `"normal"`, `"reader"` | @@ -243,20 +243,20 @@ if res.status == "success": Crawl a site and its linked pages asynchronously. Access via the `sgai.crawl` resource. 
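+A crawl is queued server-side: `start()` returns a job id that you poll with `crawl.get()` until the run reaches a terminal status (`completed`, `failed`, or `stopped`), then read the results:
+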
```python -from scrapegraph_py import ScrapeGraphAI, CrawlRequest, MarkdownFormatConfig +from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig sgai = ScrapeGraphAI() # Start -start = sgai.crawl.start(CrawlRequest( - url="https://example.com", +start = sgai.crawl.start( + "https://example.com", formats=[MarkdownFormatConfig()], max_depth=2, max_pages=50, max_links_per_page=10, include_patterns=["/blog/*"], exclude_patterns=["/admin/*"], -)) +) crawl_id = start.data.id @@ -270,12 +270,12 @@ sgai.crawl.resume(crawl_id) sgai.crawl.delete(crawl_id) ``` -#### `CrawlRequest` fields +#### `crawl.start()` parameters -| Field | Type | Required | Description | +| Parameter | Type | Required | Description | | -------------------- | ------------------------- | -------- | -------------------------------------------------------- | -| `url` | `HttpUrl` | Yes | Starting URL | -| `formats` | `list[ScrapeFormatEntry]` | No | Defaults to `[MarkdownFormatConfig()]` | +| `url` | `str` | Yes | Starting URL (positional) | +| `formats` | `list[FormatConfig]` | No | Defaults to `[MarkdownFormatConfig()]` | | `max_depth` | `int` | No | `≥ 0`, default `2` | | `max_pages` | `int` | No | `1–1000`, default `50` | | `max_links_per_page` | `int` | No | `≥ 1`, default `10` | @@ -290,25 +290,23 @@ sgai.crawl.delete(crawl_id) Scheduled extraction jobs. Access via the `sgai.monitor` resource. ```python -from scrapegraph_py import ( - ScrapeGraphAI, MonitorCreateRequest, MonitorUpdateRequest, MarkdownFormatConfig, -) +from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig sgai = ScrapeGraphAI() -mon = sgai.monitor.create(MonitorCreateRequest( - url="https://example.com", +mon = sgai.monitor.create( + "https://example.com", + "0 * * * *", # cron expression (positional) name="Price Monitor", - interval="0 * * * *", # cron expression formats=[MarkdownFormatConfig()], webhook_url="https://example.com/webhook", -)) +) cron_id = mon.data.cron_id sgai.monitor.list() sgai.monitor.get(cron_id) -sgai.monitor.update(cron_id, MonitorUpdateRequest(interval="0 */6 * * *")) +sgai.monitor.update(cron_id, interval="0 */6 * * *") sgai.monitor.pause(cron_id) sgai.monitor.resume(cron_id) sgai.monitor.delete(cron_id) @@ -319,9 +317,7 @@ sgai.monitor.delete(cron_id) Paginate through the per-run ticks a monitor has produced (what changed on each scheduled run). ```python -from scrapegraph_py import MonitorActivityRequest - -act = sgai.monitor.activity(cron_id, MonitorActivityRequest(limit=20)) +act = sgai.monitor.activity(cron_id, limit=20) if act.status == "success": for tick in act.data.ticks: @@ -329,22 +325,20 @@ if act.status == "success": print(f"[{tick.created_at}] {tick.status} - {status} ({tick.elapsed_ms}ms)") if act.data.next_cursor: - more = sgai.monitor.activity( - cron_id, MonitorActivityRequest(limit=20, cursor=act.data.next_cursor), - ) + more = sgai.monitor.activity(cron_id, limit=20, cursor=act.data.next_cursor) ``` -`MonitorActivityRequest` fields: `limit` (1–100, default `20`) and optional `cursor` for pagination. Each `MonitorTickEntry` exposes `id`, `created_at`, `status`, `changed`, `elapsed_ms`, and a `diffs` model with per-format deltas. +`monitor.activity()` accepts `limit` (1–100, default `20`) and optional `cursor` for pagination. Each `MonitorTickEntry` exposes `id`, `created_at`, `status`, `changed`, `elapsed_ms`, and a `diffs` model with per-format deltas. 
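+
+To walk the full history, keep requesting pages until `next_cursor` comes back empty. A minimal sketch reusing `sgai` and `cron_id` from above (passing `cursor=None` for the first page, since the parameter is optional):
+
+```python
+# collect every tick, one page of 20 at a time
+ticks, cursor = [], None
+while True:
+    page = sgai.monitor.activity(cron_id, limit=20, cursor=cursor)
+    if page.status != "success" or not page.data:
+        break
+    ticks.extend(page.data.ticks)
+    cursor = page.data.next_cursor
+    if not cursor:
+        break
+print(f"collected {len(ticks)} ticks")
+```
+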
-#### `MonitorCreateRequest` fields +#### `monitor.create()` parameters -| Field | Type | Required | Description | +| Parameter | Type | Required | Description | | -------------- | ------------------------- | -------- | ---------------------------------------------- | -| `url` | `HttpUrl` | Yes | URL to monitor | -| `interval` | `str` | Yes | Cron expression (1–100 chars) | +| `url` | `str` | Yes | URL to monitor (positional) | +| `interval` | `str` | Yes | Cron expression, 1–100 chars (positional) | | `name` | `str` | No | ≤ 200 chars | -| `formats` | `list[ScrapeFormatEntry]` | No | Defaults to `[MarkdownFormatConfig()]` | -| `webhook_url` | `HttpUrl` | No | Webhook invoked on change detection | +| `formats` | `list[FormatConfig]` | No | Defaults to `[MarkdownFormatConfig()]` | +| `webhook_url` | `str` | No | Webhook invoked on change detection | | `fetch_config` | `FetchConfig` | No | Fetch configuration | ### History @@ -352,11 +346,11 @@ if act.status == "success": Fetch recent request history. Access via the `sgai.history` resource. ```python -from scrapegraph_py import ScrapeGraphAI, HistoryFilter +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -page = sgai.history.list(HistoryFilter(service="scrape", page=1, limit=20)) +page = sgai.history.list(service="scrape", page=1, limit=20) for entry in page.data.data: print(entry.id, entry.service, entry.status, entry.elapsed_ms) @@ -401,17 +395,15 @@ Every sync method has an async equivalent on `AsyncScrapeGraphAI`: ```python import asyncio -from scrapegraph_py import AsyncScrapeGraphAI, ScrapeRequest, CrawlRequest +from scrapegraph_py import AsyncScrapeGraphAI async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.scrape(ScrapeRequest(url="https://example.com")) + res = await sgai.scrape("https://example.com") if res.status == "success": print(res.data.results["markdown"]["data"]) - start = await sgai.crawl.start(CrawlRequest( - url="https://example.com", max_pages=25, - )) + start = await sgai.crawl.start("https://example.com", max_pages=25) status = await sgai.crawl.get(start.data.id) print(status.data.status) diff --git a/tests/python-v2.1.0/README.md b/tests/python-v2.1.0/README.md new file mode 100644 index 0000000..c668a0f --- /dev/null +++ b/tests/python-v2.1.0/README.md @@ -0,0 +1,35 @@ +# Python SDK v2.1.0 — Endpoint Verification + +These scripts back the `sdks/python.mdx` refresh. Each exercises one +endpoint of `scrapegraph-py>=2.1.0` five or more times against the live +v2 API and prints `status` + `elapsed_ms` per call. + +## How to run + +```bash +python3 -m venv venv +./venv/bin/pip install "scrapegraph-py>=2.1.0" +export SGAI_API_KEY="sgai-..." # your key + +./venv/bin/python test_scrape.py +./venv/bin/python test_extract.py +./venv/bin/python test_search.py +./venv/bin/python test_utilities.py # credits + health +./venv/bin/python test_crawl.py +./venv/bin/python test_monitor.py +``` + +## Results (2026-04-21, sdk 2.1.0) + +Every run was `status=success` on every call. Captured output is in +[`results.txt`](./results.txt). 
Summary: + +| Endpoint | Calls | Success | Notes | +| --------------- | ----- | ------- | --------------------------------------------------- | +| `scrape` | 5 | 5 | 5 distinct URLs, `MarkdownFormatConfig` | +| `extract` | 5 | 5 | 5 URLs × distinct prompts, `json_data` populated | +| `search` | 5 | 5 | `num_results=3`, 2–3 hits per query | +| `credits` | 5 | 5 | `remaining` / `used` returned | +| `health` | 5 | 5 | `status=ok` | +| `crawl` | 5 | 5 | `max_pages=2, max_depth=1`, polled to `completed` | +| `monitor` | 5 | 5 | create → delete lifecycle, `cron_id` returned | diff --git a/tests/python-v2.1.0/results.txt b/tests/python-v2.1.0/results.txt new file mode 100644 index 0000000..bc43f93 --- /dev/null +++ b/tests/python-v2.1.0/results.txt @@ -0,0 +1,53 @@ +scrapegraph-py 2.1.0 — live endpoint verification (2026-04-21) + +=== test_scrape.py === +[scrape 1/5] https://example.com -> status=success len=168 elapsed_ms=66 +[scrape 2/5] https://scrapegraphai.com -> status=success len=25939 elapsed_ms=428 +[scrape 3/5] https://httpbin.org/html -> status=success len=3600 elapsed_ms=903 +[scrape 4/5] https://www.iana.org/ -> status=success len=2557 elapsed_ms=822 +[scrape 5/5] https://example.org -> status=success len=168 elapsed_ms=69 + +=== test_extract.py === +[extract 1/5] https://example.com -> status=success elapsed_ms=342 keys=['title', 'heading'] +[extract 2/5] https://scrapegraphai.com -> status=success elapsed_ms=4581 keys=['answer', 'description'] +[extract 3/5] https://www.iana.org/ -> status=success elapsed_ms=391 keys=['main_purpose'] +[extract 4/5] https://example.org -> status=success elapsed_ms=304 keys=['title', 'description'] +[extract 5/5] https://httpbin.org/html -> status=success elapsed_ms=474 keys=['summary'] + +=== test_search.py === +[search 1/5] 'best programming languages 2025' -> status=success results=2 elapsed_ms=16309 +[search 2/5] 'latest AI research breakthroughs' -> status=success results=3 elapsed_ms=2155 +[search 3/5] 'python web scraping libraries' -> status=success results=3 elapsed_ms=1874 +[search 4/5] 'top e-commerce platforms' -> status=success results=3 elapsed_ms=2161 +[search 5/5] 'climate change recent news' -> status=success results=3 elapsed_ms=2137 + +=== test_utilities.py (credits + health) === +[utils 1/5] credits.status=success remaining=998723 used=1663 | health.status=success service=ok +[utils 2/5] credits.status=success remaining=998723 used=1663 | health.status=success service=ok +[utils 3/5] credits.status=success remaining=998723 used=1663 | health.status=success service=ok +[utils 4/5] credits.status=success remaining=998723 used=1663 | health.status=success service=ok +[utils 5/5] credits.status=success remaining=998723 used=1663 | health.status=success service=ok + +=== test_crawl.py === +[crawl 1/5] https://example.com id=41ef82d7 final=completed +[crawl 2/5] https://scrapegraphai.com id=5cc234e7 final=completed +[crawl 3/5] https://example.org id=a29f5267 final=completed +[crawl 4/5] https://www.iana.org/ id=8d51e6e5 final=completed +[crawl 5/5] https://httpbin.org/ id=ee8d26ca final=completed + +=== test_monitor.py === +[monitor 1/5] created id=e4606611 interval=0 * * * * +[monitor 2/5] created id=7cb417bf interval=0 * * * * +[monitor 3/5] created id=8ec56850 interval=0 * * * * +[monitor 4/5] created id=793ff7ff interval=0 * * * * +[monitor 5/5] created id=353213ae interval=0 * * * * +Cleaned up 5 monitors + +--- Summary --- +scrape: 5/5 success +extract: 5/5 success +search: 5/5 success +credits: 5/5 success +health: 5/5 success 
+crawl: 5/5 success (all reached status=completed) +monitor: 5/5 success (create + delete lifecycle) diff --git a/tests/python-v2.1.0/test_crawl.py b/tests/python-v2.1.0/test_crawl.py new file mode 100644 index 0000000..73b2d77 --- /dev/null +++ b/tests/python-v2.1.0/test_crawl.py @@ -0,0 +1,33 @@ +"""Start + poll + cleanup a 2-page crawl against 5 URLs. Reads SGAI_API_KEY from env.""" +import time + +from scrapegraph_py import MarkdownFormatConfig, ScrapeGraphAI + +sgai = ScrapeGraphAI() + +urls = [ + "https://example.com", + "https://scrapegraphai.com", + "https://example.org", + "https://www.iana.org/", + "https://httpbin.org/", +] + +for i, u in enumerate(urls, 1): + start = sgai.crawl.start(u, formats=[MarkdownFormatConfig()], max_pages=2, max_depth=1) + if start.status != "success" or not start.data: + print(f"[crawl {i}/5] start failed: {start.error}") + continue + cid = start.data.id + final_status = start.data.status + for _ in range(15): + if final_status in ("completed", "failed", "stopped"): + break + time.sleep(2) + g = sgai.crawl.get(cid) + if g.status == "success" and g.data: + final_status = g.data.status + else: + break + print(f"[crawl {i}/5] {u} id={cid[:8]} final={final_status}") + sgai.crawl.delete(cid) diff --git a/tests/python-v2.1.0/test_extract.py b/tests/python-v2.1.0/test_extract.py new file mode 100644 index 0000000..d2ba393 --- /dev/null +++ b/tests/python-v2.1.0/test_extract.py @@ -0,0 +1,21 @@ +"""Extract against 5 URLs with scrapegraph-py>=2.1.0. Reads SGAI_API_KEY from env.""" +import time + +from scrapegraph_py import ScrapeGraphAI + +sgai = ScrapeGraphAI() + +cases = [ + ("https://example.com", "Extract the page title and main heading"), + ("https://scrapegraphai.com", "What does this company do in one sentence?"), + ("https://www.iana.org/", "Extract the main purpose of this organization"), + ("https://example.org", "Extract title and description"), + ("https://httpbin.org/html", "Summarize the page content in one line"), +] + +for i, (u, p) in enumerate(cases, 1): + res = sgai.extract(p, url=u) + j = res.data.json_data if res.status == "success" and res.data else None + keys = list(j.keys()) if isinstance(j, dict) else type(j).__name__ + print(f"[extract {i}/5] {u} -> status={res.status} elapsed_ms={res.elapsed_ms} keys={keys}") + time.sleep(0.5) diff --git a/tests/python-v2.1.0/test_monitor.py b/tests/python-v2.1.0/test_monitor.py new file mode 100644 index 0000000..c66a113 --- /dev/null +++ b/tests/python-v2.1.0/test_monitor.py @@ -0,0 +1,31 @@ +"""Create + delete 5 monitors. 
Reads SGAI_API_KEY from env.""" +from scrapegraph_py import MarkdownFormatConfig, ScrapeGraphAI + +sgai = ScrapeGraphAI() + +urls = [ + "https://example.com", + "https://example.org", + "https://www.iana.org/", + "https://httpbin.org/html", + "https://scrapegraphai.com", +] + +created = [] +for i, u in enumerate(urls, 1): + res = sgai.monitor.create( + u, + "0 * * * *", + name=f"doc-test-monitor-{i}", + formats=[MarkdownFormatConfig()], + ) + if res.status != "success" or not res.data: + print(f"[monitor {i}/5] create failed: {res.error}") + continue + cron_id = res.data.cron_id + created.append(cron_id) + print(f"[monitor {i}/5] created id={cron_id[:8]} interval={res.data.interval}") + +for cid in created: + sgai.monitor.delete(cid) +print(f"Cleaned up {len(created)} monitors") diff --git a/tests/python-v2.1.0/test_scrape.py b/tests/python-v2.1.0/test_scrape.py new file mode 100644 index 0000000..dab5e9d --- /dev/null +++ b/tests/python-v2.1.0/test_scrape.py @@ -0,0 +1,24 @@ +"""Scrape 5 distinct URLs with scrapegraph-py>=2.1.0. Reads SGAI_API_KEY from env.""" +import time + +from scrapegraph_py import MarkdownFormatConfig, ScrapeGraphAI + +sgai = ScrapeGraphAI() + +urls = [ + "https://example.com", + "https://scrapegraphai.com", + "https://httpbin.org/html", + "https://www.iana.org/", + "https://example.org", +] + +for i, u in enumerate(urls, 1): + res = sgai.scrape(u, formats=[MarkdownFormatConfig()]) + md = "" + if res.status == "success": + md = (res.data.results.get("markdown", {}) or {}).get("data") or "" + if isinstance(md, list): + md = md[0] if md else "" + print(f"[scrape {i}/5] {u} -> status={res.status} len={len(md)} elapsed_ms={res.elapsed_ms}") + time.sleep(0.5) diff --git a/tests/python-v2.1.0/test_search.py b/tests/python-v2.1.0/test_search.py new file mode 100644 index 0000000..e52a63c --- /dev/null +++ b/tests/python-v2.1.0/test_search.py @@ -0,0 +1,20 @@ +"""Search 5 queries with scrapegraph-py>=2.1.0. Reads SGAI_API_KEY from env.""" +import time + +from scrapegraph_py import ScrapeGraphAI + +sgai = ScrapeGraphAI() + +queries = [ + "best programming languages 2025", + "latest AI research breakthroughs", + "python web scraping libraries", + "top e-commerce platforms", + "climate change recent news", +] + +for i, q in enumerate(queries, 1): + res = sgai.search(q, num_results=3) + n = len(res.data.results) if res.status == "success" and res.data else 0 + print(f"[search {i}/5] {q!r} -> status={res.status} results={n} elapsed_ms={res.elapsed_ms}") + time.sleep(0.5) diff --git a/tests/python-v2.1.0/test_utilities.py b/tests/python-v2.1.0/test_utilities.py new file mode 100644 index 0000000..797d77f --- /dev/null +++ b/tests/python-v2.1.0/test_utilities.py @@ -0,0 +1,18 @@ +"""Call credits() + health() 5 times each. Reads SGAI_API_KEY from env.""" +import time + +from scrapegraph_py import ScrapeGraphAI + +sgai = ScrapeGraphAI() + +for i in range(1, 6): + c = sgai.credits() + h = sgai.health() + remaining = c.data.remaining if c.status == "success" and c.data else "?" + used = c.data.used if c.status == "success" and c.data else "?" + h_status = h.data.status if h.status == "success" and h.data else "?" + print( + f"[utils {i}/5] credits.status={c.status} remaining={remaining} used={used}" + f" | health.status={h.status} service={h_status}" + ) + time.sleep(0.3)