diff --git a/README.md b/README.md
index 0041e4b..4bae412 100644
--- a/README.md
+++ b/README.md
@@ -4,322 +4,397 @@
 
 Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) — AI-powered web scraping, data extraction, search, and crawling.
 
-## Installation
+## Project Structure
 
-### From npm (recommended)
+```
+just-scrape/
+├── src/
+│   ├── cli.ts                # Entry point, citty main command + subcommands
+│   ├── lib/
+│   │   ├── env.ts            # Zod-parsed env config (API key, debug, timeout)
+│   │   ├── folders.ts        # API key resolution + interactive prompt
+│   │   ├── scrapegraphai.ts  # SDK layer — all API functions
+│   │   ├── schemas.ts        # Zod validation schemas
+│   │   └── log.ts            # Logger factory + syntax-highlighted JSON output
+│   ├── types/
+│   │   └── index.ts          # Zod-derived types + ApiResult
+│   ├── commands/
+│   │   ├── smart-scraper.ts
+│   │   ├── search-scraper.ts
+│   │   ├── markdownify.ts
+│   │   ├── crawl.ts
+│   │   ├── sitemap.ts
+│   │   ├── scrape.ts
+│   │   ├── agentic-scraper.ts
+│   │   ├── generate-schema.ts
+│   │   ├── history.ts
+│   │   ├── credits.ts
+│   │   └── validate.ts
+│   └── utils/
+│       └── banner.ts         # ASCII banner + version from package.json
+├── tests/
+│   └── scrapegraphai.test.ts # SDK layer tests (mocked fetch)
+├── dist/                     # Build output (git-ignored)
+│   └── cli.mjs               # Bundled ESM with shebang
+├── package.json
+├── tsconfig.json
+├── tsup.config.ts
+├── biome.json
+└── .gitignore
+```
 
-Install globally to use `just-scrape` from anywhere:
+## Installation
 
 ```bash
-npm install -g just-scrape
+npm install -g just-scrape  # npm (recommended)
+pnpm add -g just-scrape     # pnpm
+yarn global add just-scrape # yarn
+bun add -g just-scrape      # bun
+npx just-scrape --help      # or run without installing
 ```
 
-Or use it directly without installing via `npx`:
+Package: [just-scrape](https://www.npmjs.com/package/just-scrape) on npm.
+
+## Configuration
+
+The CLI needs a ScrapeGraph API key. Get one at [dashboard.scrapegraphai.com](https://dashboard.scrapegraphai.com).
+
+Four ways to provide it (checked in order):
+
+1. **Environment variable**: `export SGAI_API_KEY="sgai-..."`
+2. **`.env` file**: `SGAI_API_KEY=sgai-...` in project root
+3. **Config file**: `~/.scrapegraphai/config.json`
+4. **Interactive prompt**: the CLI asks and saves to config
+
+Additional settings come from environment variables:
 
 ```bash
-npx just-scrape --help
+export JUST_SCRAPE_TIMEOUT_S=300    # Request/polling timeout in seconds (default: 120)
+JUST_SCRAPE_DEBUG=1 just-scrape ... # Debug logging to stderr
 ```
 
-You can also install with other package managers:
+## JSON Mode (`--json`)
+
+All commands support `--json` for machine-readable output. When set, the banner, spinners, and interactive prompts are suppressed — only raw JSON goes to stdout.
 
 ```bash
-# pnpm
-pnpm add -g just-scrape
+just-scrape credits --json | jq '.remaining_credits'
+just-scrape smart-scraper https://example.com -p "Extract data" --json > result.json
+just-scrape history smartscraper --json | jq '.requests[].status'
+```
+
+---
 
-# yarn
-yarn global add just-scrape
+## Smart Scraper
 
-# bun
-bun add -g just-scrape
-```
+Extract structured data from any URL using AI. [docs](https://docs.scrapegraphai.com/services/smartscraper)
+
-Package: [just-scrape](https://www.npmjs.com/package/just-scrape) on npm.
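+A minimal first call, with a placeholder URL and an illustrative prompt:
+
+```bash
+just-scrape smart-scraper https://example.com -p "Extract the page title and all outbound links"
+```
+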
+### Usage
 
-### From source (local development)
 
+```bash
+just-scrape smart-scraper <url> -p <prompt> # Extract data with AI
+just-scrape smart-scraper <url> -p <prompt> --schema <json> # Enforce output schema
+just-scrape smart-scraper <url> -p <prompt> --scrolls <n> # Infinite scroll (0-100)
+just-scrape smart-scraper <url> -p <prompt> --pages <n> # Multi-page (1-100)
+just-scrape smart-scraper <url> -p <prompt> --render-js # JS rendering (+1 credit)
+just-scrape smart-scraper <url> -p <prompt> --stealth # Anti-bot bypass (+4 credits)
+just-scrape smart-scraper <url> -p <prompt> --cookies <json> --headers <json>
+just-scrape smart-scraper <url> -p <prompt> --plain-text # Plain text instead of JSON
+```
 
-Requires [Bun](https://bun.sh) and Node.js 22+.
 
+### Examples
 
 ```bash
-# Clone the repository
-git clone https://github.com/ScrapeGraphAI/just-scrape.git
+# Extract product listings from an e-commerce page
+just-scrape smart-scraper https://store.example.com/shoes -p "Extract all product names, prices, and ratings"
 
-# Install dependencies
-bun install
+# Extract with a strict schema, scrolling to load more content
+just-scrape smart-scraper https://news.example.com -p "Get all article headlines and dates" \
+  --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}' \
+  --scrolls 5
 
-# Run directly from source (no build needed)
-bun run dev --help
+# Scrape a JS-heavy SPA behind anti-bot protection
+just-scrape smart-scraper https://app.example.com/dashboard -p "Extract user stats" \
+  --render-js --stealth
+```
 
-# Or build and link globally
-bun run build
-npm link
-just-scrape --help
+
+## Search Scraper
+
+Search the web and extract structured data from results. [docs](https://docs.scrapegraphai.com/services/searchscraper)
+
+### Usage
+
+```bash
+just-scrape search-scraper <query> # AI-powered web search
+just-scrape search-scraper <query> --num-results <n> # Sources to scrape (3-20, default 3)
+just-scrape search-scraper <query> --no-extraction # Markdown only (2 credits vs 10)
+just-scrape search-scraper <query> --schema <json> # Enforce output schema
+just-scrape search-scraper <query> --stealth --headers <json>
 ```
 
-### Verify installation
+### Examples
 
 ```bash
-just-scrape --help
-just-scrape validate # check your API key
+# Research a topic across multiple sources
+just-scrape search-scraper "What are the best Python web frameworks in 2025?" --num-results 10
+
+# Get raw markdown from search results (cheaper)
+just-scrape search-scraper "React vs Vue comparison" --no-extraction --num-results 5
+
+# Structured output with schema
+just-scrape search-scraper "Top 5 cloud providers pricing" \
+  --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}'
 ```
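+
+Because `--no-extraction` costs 2 credits per site instead of 10, a cheap markdown survey before a full extraction run can be a sensible workflow (queries are illustrative):
+
+```bash
+# Survey sources cheaply first, then pay for AI extraction only where it matters
+just-scrape search-scraper "static site generators compared" --no-extraction --num-results 5 --json > survey.json
+just-scrape search-scraper "Which static site generator builds fastest?" --num-results 3
+```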
 
-## Tech Stack
+## Markdownify
 
-| Concern | Tool |
-|---|---|
-| Language | **TypeScript 5.8** |
-| Dev Runtime | **Bun** |
-| Build | **tsup** (esbuild) |
-| CLI Framework | **citty** (unjs) |
-| Prompts | **@clack/prompts** |
-| Styling | **chalk** v5 (ESM) |
-| Validation | **zod** v4 |
-| Env | **dotenv** |
-| Lint / Format | **Biome** |
-| Testing | **Bun test** (built-in) |
-| Target | **Node.js 22+**, ESM-only |
+Convert any webpage to clean markdown. [docs](https://docs.scrapegraphai.com/services/markdownify)
 
-## Setup
+### Usage
 
 ```bash
-bun install
+just-scrape markdownify <url> # Convert to markdown
+just-scrape markdownify <url> --render-js # JS rendering (+1 credit)
+just-scrape markdownify <url> --stealth # Anti-bot bypass (+4 credits)
+just-scrape markdownify <url> --headers <json> # Custom headers
 ```
 
-## Configuration
+### Examples
 
-The CLI needs a ScrapeGraph API key. Get one at [dashboard.scrapegraphai.com](https://dashboard.scrapegraphai.com).
+```bash
+# Convert a blog post to markdown
+just-scrape markdownify https://blog.example.com/my-article
 
-Four ways to provide it (checked in order):
+# Convert a JS-rendered page behind Cloudflare
+just-scrape markdownify https://protected.example.com --render-js --stealth
 
-1. **Environment variable**: `export SGAI_API_KEY="sgai-..."`
-2. **`.env` file**: create a `.env` file in the project root with `SGAI_API_KEY=sgai-...`
-3. **Config file**: stored in `~/.scrapegraphai/config.json`
-4. **Interactive prompt**: if none of the above are set, the CLI prompts you and saves it to the config file
+# Pipe markdown to a file
+just-scrape markdownify https://docs.example.com/api --json | jq -r '.result' > api-docs.md
+```
+
+## Crawl
 
-### Environment Variables
+Crawl multiple pages and extract data from each. [docs](https://docs.scrapegraphai.com/services/smartcrawler)
 
-| Variable | Default | Description |
-|---|---|---|
-| `JUST_SCRAPE_TIMEOUT_S` | `120` | Request/polling timeout in seconds |
-| `JUST_SCRAPE_DEBUG` | `0` | Set to `1` to enable debug logging (outputs to stderr) |
+### Usage
 
 ```bash
-export JUST_SCRAPE_TIMEOUT_S=300
-JUST_SCRAPE_DEBUG=1 just-scrape smart-scraper https://example.com -p "Extract data"
+just-scrape crawl <url> -p <prompt> # Crawl + extract
+just-scrape crawl <url> -p <prompt> --max-pages <n> # Max pages (default 10)
+just-scrape crawl <url> -p <prompt> --depth <n> # Crawl depth (default 1)
+just-scrape crawl <url> --no-extraction --max-pages <n> # Markdown only (2 credits/page)
+just-scrape crawl <url> -p <prompt> --schema <json> # Enforce output schema
+just-scrape crawl <url> -p <prompt> --rules <json> # Crawl rules (include_paths, same_domain)
+just-scrape crawl <url> -p <prompt> --no-sitemap # Skip sitemap discovery
+just-scrape crawl <url> -p <prompt> --render-js --stealth # JS + anti-bot
 ```
 
-## Commands
-
-### `smart-scraper` — Extract structured data from a URL [docs](https://docs.scrapegraphai.com/services/smartscraper)
+### Examples
 
 ```bash
-just-scrape smart-scraper <url> -p "Extract all product names and prices"
+# Crawl a docs site and extract all code examples
+just-scrape crawl https://docs.example.com -p "Extract all code snippets with their language" \
+  --max-pages 20 --depth 3
 
-# With JSON schema
-just-scrape smart-scraper https://example.com/products -p "Extract products" \
-  --schema '{"type":"object","properties":{"products":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"price":{"type":"number"}}}}}}'
+# Crawl only blog pages, skip everything else
+just-scrape crawl https://example.com -p "Extract article titles and summaries" \
+  --rules '{"include_paths":["/blog/*"],"same_domain":true}' --max-pages 50
 
-# With options
-just-scrape smart-scraper https://example.com -p "Extract data" \
-  --stealth --render-js --scrolls 10 --pages 5
+# Get raw markdown from all pages (no AI extraction, cheaper)
+just-scrape crawl https://example.com --no-extraction --max-pages 10
 ```
 
-| Option | Description |
-|---|---|
-| `-p, --prompt` | Extraction prompt (required) |
-| `--schema` | Output JSON schema (JSON string) |
-| `--scrolls` | Infinite scroll count (0-100) |
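+
+`--rules` composes with `--depth` and `--max-pages`; a sketch (site and paths are hypothetical) that stays on one domain and follows only documentation links two levels deep:
+
+```bash
+just-scrape crawl https://docs.example.com -p "Extract each page's title and summary" \
+  --rules '{"include_paths":["/docs/*"],"same_domain":true}' --depth 2 --max-pages 25
+```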
-| `--pages` | Total pages to scrape (1-100) |
-| `--render-js` | Enable JS rendering (+1 credit) |
-| `--stealth` | Bypass bot detection (+4 credits) |
-| `--cookies` | Cookies as JSON object string |
-| `--headers` | Custom headers as JSON object string |
-| `--plain-text` | Return plain text instead of JSON |
-
-### `search-scraper` — Search the web and extract data [docs](https://docs.scrapegraphai.com/services/searchscraper)
+## Sitemap
+
+Get all URLs from a website's sitemap. [docs](https://docs.scrapegraphai.com/services/sitemap)
+
+### Usage
 
 ```bash
-just-scrape search-scraper "What are the top Python web frameworks?"
+just-scrape sitemap <url>
+```
+
+### Examples
+
+```bash
+# List all pages on a site
+just-scrape sitemap https://example.com
 
-# Markdown only (cheaper)
-just-scrape search-scraper "Python frameworks" --no-extraction --num-results 5
+# Pipe URLs to another tool
+just-scrape sitemap https://example.com --json | jq -r '.urls[]'
 ```
 
-| Option | Description |
-|---|---|
-| `--num-results` | Number of websites (3-20, default 3) |
-| `--no-extraction` | Markdown only (2 credits/site vs 10) |
-| `--schema` | Output JSON schema (JSON string) |
-| `--stealth` | Bypass bot detection (+4 credits) |
-| `--headers` | Custom headers as JSON object string |
+## Scrape
 
-### `markdownify` — Convert a webpage to markdown [docs](https://docs.scrapegraphai.com/services/markdownify)
+Get raw HTML content from a URL. [docs](https://docs.scrapegraphai.com/services/scrape)
+
+### Usage
 
 ```bash
-just-scrape markdownify https://example.com/article
-just-scrape markdownify https://example.com --render-js --stealth
+just-scrape scrape <url> # Raw HTML
+just-scrape scrape <url> --render-js # JS rendering (+1 credit)
+just-scrape scrape <url> --stealth # Anti-bot bypass (+4 credits)
+just-scrape scrape <url> --branding # Extract branding (+2 credits)
+just-scrape scrape <url> --country-code <code> # Geo-targeting
 ```
 
-| Option | Description |
-|---|---|
-| `--render-js` | Enable JS rendering (+1 credit) |
-| `--stealth` | Bypass bot detection (+4 credits) |
-| `--headers` | Custom headers as JSON object string |
-
-### `crawl` — Crawl and extract from multiple pages [docs](https://docs.scrapegraphai.com/services/smartcrawler)
+### Examples
 
 ```bash
-just-scrape crawl https://example.com -p "Extract article titles" --max-pages 5 --depth 2
+# Get raw HTML of a page
+just-scrape scrape https://example.com
 
-# Markdown only
-just-scrape crawl https://example.com --no-extraction --max-pages 10
+# Scrape a geo-restricted page with anti-bot bypass
+just-scrape scrape https://store.example.com --stealth --country-code DE
 
-# With crawl rules
-just-scrape crawl https://example.com -p "Extract data" \
-  --rules '{"include_paths":["/blog/*"],"same_domain":true}'
+# Extract branding info (logos, colors, fonts)
+just-scrape scrape https://example.com --branding
 ```
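+
+In JSON mode the raw HTML arrives inside the JSON envelope, so it can be captured to disk for offline parsing (inspect the envelope's field names before scripting against them):
+
+```bash
+just-scrape scrape https://example.com --render-js --json > page.json
+```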
-| Option | Description |
-|---|---|
-| `-p, --prompt` | Extraction prompt (required when extraction is on) |
-| `--no-extraction` | Markdown only (2 credits/page vs 10) |
-| `--max-pages` | Max pages to crawl (default 10) |
-| `--depth` | Crawl depth (default 1) |
-| `--schema` | Output JSON schema (JSON string) |
-| `--rules` | Crawl rules as JSON object string |
-| `--no-sitemap` | Disable sitemap-based discovery |
-| `--render-js` | Enable JS rendering (+1 credit/page) |
-| `--stealth` | Bypass bot detection (+4 credits) |
-
-### `sitemap` — Get all URLs from a website's sitemap [docs](https://docs.scrapegraphai.com/services/sitemap)
+## Agentic Scraper
+
+Browser automation with AI — login, click, navigate, fill forms. [docs](https://docs.scrapegraphai.com/services/agenticscraper)
+
+### Usage
 
 ```bash
-just-scrape sitemap https://example.com
+just-scrape agentic-scraper <url> -s <steps> # Run browser steps
+just-scrape agentic-scraper <url> -s <steps> --ai-extraction -p <prompt>
+just-scrape agentic-scraper <url> -s <steps> --schema <json>
+just-scrape agentic-scraper <url> -s <steps> --use-session # Persist browser session
 ```
 
-### `scrape` — Get raw HTML content [docs](https://docs.scrapegraphai.com/services/scrape)
+### Examples
 
 ```bash
-just-scrape scrape https://example.com
-just-scrape scrape https://example.com --stealth --branding --country-code US
+# Log in and extract dashboard data
+just-scrape agentic-scraper https://app.example.com/login \
+  -s "Fill email with user@test.com,Fill password with secret,Click Sign In" \
+  --ai-extraction -p "Extract all dashboard metrics"
+
+# Navigate through a multi-step form
+just-scrape agentic-scraper https://example.com/wizard \
+  -s "Click Next,Select Premium plan,Fill name with John,Click Submit"
+
+# Persistent session across multiple runs
+just-scrape agentic-scraper https://app.example.com \
+  -s "Click Settings" --use-session
 ```
 
-| Option | Description |
-|---|---|
-| `--render-js` | Enable JS rendering (+1 credit) |
-| `--stealth` | Bypass bot detection (+4 credits) |
-| `--branding` | Extract branding info (+2 credits) |
-| `--country-code` | ISO country code for geo-targeting |
+## Generate Schema
 
-### `agentic-scraper` — Browser automation with AI [docs](https://docs.scrapegraphai.com/services/agenticscraper)
+Generate a JSON schema from a natural language description.
+
+### Usage
 
 ```bash
-just-scrape agentic-scraper https://example.com/login \
-  -s "Fill email with user@test.com,Fill password with pass123,Click Sign In" \
-  --ai-extraction -p "Extract dashboard data"
+just-scrape generate-schema <prompt> # AI generates a schema
+just-scrape generate-schema <prompt> --existing-schema <json>
 ```
 
-| Option | Description |
-|---|---|
-| `-s, --steps` | Comma-separated browser steps |
-| `-p, --prompt` | Extraction prompt (with `--ai-extraction`) |
-| `--schema` | Output JSON schema (JSON string) |
-| `--ai-extraction` | Enable AI extraction after steps |
-| `--use-session` | Persist browser session |
-
-### `generate-schema` — Generate JSON schema from a prompt
+### Examples
 
 ```bash
-just-scrape generate-schema "Schema for an e-commerce product with name, price, and reviews"
+# Generate a schema for product data
+just-scrape generate-schema "E-commerce product with name, price, ratings, and reviews array"
+
+# Refine an existing schema
+just-scrape generate-schema "Add an availability field" \
+  --existing-schema '{"type":"object","properties":{"name":{"type":"string"},"price":{"type":"number"}}}'
 ```
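+
+A generated schema can feed straight back into `smart-scraper`. The `jq` path below is an assumption about the response field name, so check the real output first:
+
+```bash
+# Hypothetical two-step pipeline: generate a schema, then enforce it
+schema=$(just-scrape generate-schema "Product with name and price" --json | jq -c '.generated_schema') # field name assumed
+just-scrape smart-scraper https://store.example.com -p "Extract all products" --schema "$schema"
+```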
-| Option | Description |
-|---|---|
-| `--existing-schema` | Existing schema to modify (JSON string) |
+## History
 
-### `credits` — Check credit balance
+Browse request history for any service. Interactive by default — arrow keys to navigate, select to view details, "Load more" for infinite scroll.
+
+### Usage
 
 ```bash
-just-scrape credits
+just-scrape history <service> # Interactive browser
+just-scrape history <service> <request-id> # Fetch specific request
+just-scrape history <service> --page <n> # Start from page (default 1)
+just-scrape history <service> --page-size <n> # Results per page (default 10, max 100)
+just-scrape history <service> --json # Raw JSON (pipeable)
+```
+
-### `validate` — Validate your API key
+Services: `markdownify`, `smartscraper`, `searchscraper`, `scrape`, `crawl`, `agentic-scraper`, `sitemap`
+
+### Examples
 
 ```bash
-just-scrape validate
+# Browse your smart-scraper history interactively
+just-scrape history smartscraper
+
+# Jump to a specific request by ID
+just-scrape history smartscraper abc123-def456-7890
+
+# Export crawl history as JSON
+just-scrape history crawl --json --page-size 100 | jq '.requests[] | {id: .request_id, status}'
 ```
 
-## Testing
+## Credits
 
-Tests use Bun's built-in test runner with `spyOn(globalThis, "fetch")` to mock all API calls — no network requests, no API key needed.
+Check your credit balance.
 
 ```bash
-bun test
+just-scrape credits
+just-scrape credits --json | jq '.remaining_credits'
 ```
 
-Covers all SDK functions: success paths, polling, HTTP error mapping (401/402/422/429/500), Zod validation, timeouts, and network failures.
+## Validate
 
-## Project Structure
+Validate your API key (health check).
 
-```
-just-scrape/
-├── src/
-│   ├── cli.ts               # Entry point, citty main command + subcommands
-│   ├── lib/
-│   │   ├── env.ts           # Zod-parsed env config (API key, debug, timeout)
-│   │   ├── folders.ts       # API key resolution + interactive prompt
-│   │   ├── scrapegraphai.ts # SDK layer — all API functions
-│   │   ├── schemas.ts       # Zod validation schemas
-│   │   └── log.ts           # Syntax-highlighted JSON output
-│   ├── types/
-│   │   └── index.ts         # Zod-derived types + ApiResult
-│   ├── commands/
-│   │   ├── smart-scraper.ts
-│   │   ├── search-scraper.ts
-│   │   ├── markdownify.ts
-│   │   ├── crawl.ts
-│   │   ├── sitemap.ts
-│   │   ├── scrape.ts
-│   │   ├── agentic-scraper.ts
-│   │   ├── generate-schema.ts
-│   │   ├── credits.ts
-│   │   └── validate.ts
-│   └── utils/
-│       └── banner.ts        # ASCII banner + version from package.json
-├── tests/
-│   └── scrapegraphai.test.ts # SDK layer tests (mocked fetch)
-├── dist/                    # Build output (git-ignored)
-│   └── cli.mjs              # Bundled ESM with shebang
-├── package.json
-├── tsconfig.json
-├── tsup.config.ts
-├── biome.json
-└── .gitignore
+```bash
+just-scrape validate
 ```
 
-## Scripts
+---
 
-| Script | Command | Description |
-|---|---|---|
-| `dev` | `bun run src/cli.ts` | Run CLI from TS source |
-| `build` | `tsup` | Bundle ESM to `dist/cli.mjs` |
-| `lint` | `biome check .` | Lint + format check |
-| `format` | `biome format . --write` | Auto-format |
-| `test` | `bun test` | Run tests |
-| `check` | `tsc --noEmit && biome check .` | Type-check + lint |
+## Contributing
 
-## Output
+### From Source
 
-All commands output pretty-printed JSON to stdout (pipeable). Errors go to stderr via `@clack/prompts`.
+Requires [Bun](https://bun.sh) and Node.js 22+.
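+
+Verify the toolchain first (version output will vary):
+
+```bash
+bun --version && node --version
+```
+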
 ```bash
-# Pipe output to jq
-just-scrape credits | jq '.remaining_credits'
+git clone https://github.com/ScrapeGraphAI/just-scrape.git
+cd just-scrape
+bun install
+bun run dev --help
+```
+
+### Tech Stack
 
-# Save to file
-just-scrape smart-scraper https://example.com -p "Extract data" > result.json
-```
+| Concern | Tool |
+|---|---|
+| Language | **TypeScript 5.8** |
+| Dev Runtime | **Bun** |
+| Build | **tsup** (esbuild) |
+| CLI Framework | **citty** (unjs) |
+| Prompts | **@clack/prompts** |
+| Styling | **chalk** v5 (ESM) |
+| Validation | **zod** v4 |
+| Env | **dotenv** |
+| Lint / Format | **Biome** |
+| Testing | **Bun test** (built-in) |
+| Target | **Node.js 22+**, ESM-only |
+
+### Scripts
+
+```bash
+bun run dev    # Run CLI from TS source
+bun run build  # Bundle ESM to dist/cli.mjs
+bun run lint   # Lint + format check
+bun run format # Auto-format
+bun test       # Run tests
+bun run check  # Type-check + lint
 ```
 
+### Testing
+
+Tests mock all API calls via `spyOn(globalThis, "fetch")` — no network, no API key needed.
+
+Covers: success paths, polling, HTTP error mapping (401/402/422/429/500), Zod validation, timeouts, and network failures.
+
 ## License
 
 ISC
diff --git a/src/cli.ts b/src/cli.ts
index ea3ab30..0ff7f84 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -19,6 +19,7 @@ const main = defineCommand({
     scrape: () => import("./commands/scrape.js").then((m) => m.default),
     "agentic-scraper": () => import("./commands/agentic-scraper.js").then((m) => m.default),
     "generate-schema": () => import("./commands/generate-schema.js").then((m) => m.default),
+    history: () => import("./commands/history.js").then((m) => m.default),
     credits: () => import("./commands/credits.js").then((m) => m.default),
     validate: () => import("./commands/validate.js").then((m) => m.default),
   },
diff --git a/src/commands/agentic-scraper.ts b/src/commands/agentic-scraper.ts
index 32c5270..d5d0993 100644
--- a/src/commands/agentic-scraper.ts
+++ b/src/commands/agentic-scraper.ts
@@ -1,4 +1,3 @@
-import * as p from "@clack/prompts";
 import { defineCommand } from "citty";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
@@ -25,26 +24,17 @@ export default defineCommand({
       alias: "p",
       description: "Extraction prompt (used with --ai-extraction)",
     },
-    schema: {
-      type: "string",
-      description: "Output JSON schema (as JSON string)",
-    },
-    "ai-extraction": {
-      type: "boolean",
-      description: "Enable AI extraction after steps",
-    },
-    "use-session": {
-      type: "boolean",
-      description: "Persist browser session across requests",
-    },
+    schema: { type: "string", description: "Output JSON schema (as JSON string)" },
+    "ai-extraction": { type: "boolean", description: "Enable AI extraction after steps" },
+    "use-session": { type: "boolean", description: "Persist browser session across requests" },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
   },
   run: async ({ args }) => {
-    log.docs("https://docs.scrapegraphai.com/services/agenticscraper");
-    const key = await resolveApiKey();
+    const out = log.create(!!args.json);
+    out.docs("https://docs.scrapegraphai.com/services/agenticscraper");
+    const key = await resolveApiKey(!!args.json);
 
-    const params: scrapegraphai.AgenticScraperParams = {
-      url: args.url,
-    };
+    const params: scrapegraphai.AgenticScraperParams = { url: args.url };
 
     if (args.steps) params.steps = args.steps.split(",").map((s) => s.trim());
     if (args.prompt) params.user_prompt = args.prompt;
@@ -52,14 +42,11 @@ export default defineCommand({
     if (args["ai-extraction"]) params.ai_extraction = true;
(args["ai-extraction"]) params.ai_extraction = true; if (args["use-session"]) params.use_session = true; - const s = p.spinner(); - s.start("Running browser automation"); - const result = await scrapegraphai.agenticScraper(key, params, (status) => { - s.message(`Status: ${status}`); - }); - s.stop(`Done in ${log.elapsed(result.elapsedMs)}`); + out.start("Running browser automation"); + const result = await scrapegraphai.agenticScraper(key, params, out.poll); + out.stop(result.elapsedMs); - if (result.data) log.result(result.data); - else log.error(result.error); + if (result.data) out.result(result.data); + else out.error(result.error); }, }); diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index 0d20238..4b294c6 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ -1,4 +1,3 @@ -import * as p from "@clack/prompts"; import { defineCommand } from "citty"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; @@ -24,42 +23,21 @@ export default defineCommand({ type: "boolean", description: "Return markdown only (2 credits/page instead of 10)", }, - "max-pages": { - type: "string", - description: "Maximum pages to crawl (default 10)", - }, - depth: { - type: "string", - description: "Crawl depth (default 1)", - }, - schema: { - type: "string", - description: "Output JSON schema (as JSON string)", - }, - rules: { - type: "string", - description: "Crawl rules as JSON object string", - }, - "no-sitemap": { - type: "boolean", - description: "Disable sitemap-based URL discovery", - }, - "render-js": { - type: "boolean", - description: "Enable heavy JS rendering (+1 credit/page)", - }, - stealth: { - type: "boolean", - description: "Bypass bot detection (+4 credits)", - }, + "max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" }, + depth: { type: "string", description: "Crawl depth (default 1)" }, + schema: { type: "string", description: "Output JSON schema (as JSON string)" }, + rules: { type: "string", description: "Crawl rules as JSON object string" }, + "no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" }, + "render-js": { type: "boolean", description: "Enable heavy JS rendering (+1 credit/page)" }, + stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { - log.docs("https://docs.scrapegraphai.com/services/smartcrawler"); - const key = await resolveApiKey(); + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/services/smartcrawler"); + const key = await resolveApiKey(!!args.json); - const params: scrapegraphai.CrawlParams = { - url: args.url, - }; + const params: scrapegraphai.CrawlParams = { url: args.url }; if (args.prompt) params.prompt = args.prompt; if (args["no-extraction"]) params.extraction_mode = false; @@ -71,14 +49,11 @@ export default defineCommand({ if (args["render-js"]) params.render_heavy_js = true; if (args.stealth) params.stealth = true; - const s = p.spinner(); - s.start("Crawling"); - const result = await scrapegraphai.crawl(key, params, (status) => { - s.message(`Status: ${status}`); - }); - s.stop(`Done in ${log.elapsed(result.elapsedMs)}`); + out.start("Crawling"); + const result = await scrapegraphai.crawl(key, params, out.poll); + out.stop(result.elapsedMs); - if (result.data) log.result(result.data); - else log.error(result.error); + if (result.data) out.result(result.data); + else 
   },
 });
diff --git a/src/commands/credits.ts b/src/commands/credits.ts
index 366abdf..720d856 100644
--- a/src/commands/credits.ts
+++ b/src/commands/credits.ts
@@ -1,4 +1,3 @@
-import * as p from "@clack/prompts";
 import { defineCommand } from "citty";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
@@ -9,14 +8,18 @@ export default defineCommand({
     name: "credits",
     description: "Check your credit balance",
   },
-  run: async () => {
-    const key = await resolveApiKey();
-    const s = p.spinner();
-    s.start("Fetching credits");
+  args: {
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
+  },
+  run: async ({ args }) => {
+    const out = log.create(!!args.json);
+    const key = await resolveApiKey(!!args.json);
+
+    out.start("Fetching credits");
     const result = await scrapegraphai.getCredits(key);
-    s.stop(`Done in ${log.elapsed(result.elapsedMs)}`);
+    out.stop(result.elapsedMs);
 
-    if (result.data) log.result(result.data);
-    else log.error(result.error);
+    if (result.data) out.result(result.data);
+    else out.error(result.error);
   },
 });
diff --git a/src/commands/generate-schema.ts b/src/commands/generate-schema.ts
index 05d1568..eef4795 100644
--- a/src/commands/generate-schema.ts
+++ b/src/commands/generate-schema.ts
@@ -1,4 +1,3 @@
-import * as p from "@clack/prompts";
 import { defineCommand } from "citty";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
@@ -19,24 +18,20 @@ export default defineCommand({
       type: "string",
       description: "Existing schema to modify (as JSON string)",
     },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
   },
   run: async ({ args }) => {
-    const key = await resolveApiKey();
-
-    const params: scrapegraphai.GenerateSchemaParams = {
-      user_prompt: args.prompt,
-    };
+    const out = log.create(!!args.json);
+    const key = await resolveApiKey(!!args.json);
 
+    const params: scrapegraphai.GenerateSchemaParams = { user_prompt: args.prompt };
     if (args["existing-schema"]) params.existing_schema = JSON.parse(args["existing-schema"]);
 
-    const s = p.spinner();
-    s.start("Generating schema");
-    const result = await scrapegraphai.generateSchema(key, params, (status) => {
-      s.message(`Status: ${status}`);
-    });
-    s.stop(`Done in ${log.elapsed(result.elapsedMs)}`);
+    out.start("Generating schema");
+    const result = await scrapegraphai.generateSchema(key, params, out.poll);
+    out.stop(result.elapsedMs);
 
-    if (result.data) log.result(result.data);
-    else log.error(result.error);
+    if (result.data) out.result(result.data);
+    else out.error(result.error);
   },
 });
diff --git a/src/commands/history.ts b/src/commands/history.ts
new file mode 100644
index 0000000..80a95da
--- /dev/null
+++ b/src/commands/history.ts
@@ -0,0 +1,146 @@
+import * as p from "@clack/prompts";
+import chalk from "chalk";
+import { defineCommand } from "citty";
+import { resolveApiKey } from "../lib/folders.js";
+import * as log from "../lib/log.js";
+import { HISTORY_SERVICES } from "../lib/schemas.js";
+import * as scrapegraphai from "../lib/scrapegraphai.js";
+
+const VALID = HISTORY_SERVICES.join(", ");
+const LOAD_MORE = "__load_more__";
+
+function getId(row: Record<string, unknown>): string {
+  return String(row.request_id ?? row.crawl_id ?? row.id ?? "unknown");
+}
+
+function label(row: Record<string, unknown>): string {
+  const id = getId(row);
+  const short = id.length > 12 ? `${id.slice(0, 12)}…` : id;
+  const status = String(row.status ?? "—");
+  const url = String(row.website_url ?? row.url ?? row.user_prompt ?? "");
+  const urlShort = url.length > 50 ? `${url.slice(0, 49)}…` : url;
+
+  const color =
+    status === "completed" || status === "done"
+      ? chalk.green
+      : status === "failed"
+        ? chalk.red
+        : chalk.yellow;
+
+  return `${chalk.dim(short)} ${color(status)} ${urlShort}`;
+}
+
+function hint(row: Record<string, unknown>): string {
+  const ts = row.created_at ?? row.timestamp ?? row.updated_at;
+  if (!ts) return "";
+  const d = new Date(String(ts));
+  return Number.isNaN(d.getTime()) ? String(ts) : d.toLocaleString();
+}
+
+export default defineCommand({
+  meta: {
+    name: "history",
+    description: "View request history for a service",
+  },
+  args: {
+    service: {
+      type: "positional",
+      description: `Service name (${VALID})`,
+      required: true,
+    },
+    page: { type: "string", description: "Page number (default: 1)" },
+    "page-size": { type: "string", description: "Results per page (default: 10, max: 100)" },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
+  },
+  run: async ({ args }) => {
+    const quiet = !!args.json;
+    const out = log.create(quiet);
+    const key = await resolveApiKey(quiet);
+    const service = args.service as scrapegraphai.HistoryParams["service"];
+    const requestId = (args as { _: string[] })._.at(1);
+    const pageSize = args["page-size"] ? Number(args["page-size"]) : 10;
+    let page = args.page ? Number(args.page) : 1;
+
+    const fetchPage = async (pg: number) => {
+      const r = await scrapegraphai.history(key, { service, page: pg, page_size: pageSize });
+      if (r.status === "error") out.error(r.error);
+      const d = r.data as { requests: Record<string, unknown>[]; next_key?: string };
+      return { rows: d.requests ?? [], hasMore: !!d.next_key, ms: r.elapsedMs };
+    };
+
+    if (quiet || requestId) {
+      const { rows } = await fetchPage(page);
+      if (requestId) {
+        const match = rows.find((r) => getId(r) === requestId);
+        if (!match) out.error(`Request ${requestId} not found on page ${page}`);
+        out.result(match);
+        return;
+      }
+      out.result(rows);
+      return;
+    }
+
+    out.start(`Fetching ${service} history`);
+    const first = await fetchPage(page);
+    out.stop(first.ms);
+
+    if (first.rows.length === 0) {
+      p.log.warning("No history found.");
+      return;
+    }
+
+    const allRows = [...first.rows];
+    let hasMore = first.hasMore;
+
+    while (true) {
+      const options = allRows.map((row) => ({
+        value: getId(row),
+        label: label(row),
+        hint: hint(row),
+      }));
+
+      if (hasMore) {
+        options.push({
+          value: LOAD_MORE,
+          label: chalk.blue.bold("↓ Load more…"),
+          hint: `page ${page + 1}`,
+        });
+      }
+
+      const selected = await p.select({
+        message: `${allRows.length} requests — select one to view`,
+        options,
+        maxItems: 15,
+      });
+
+      if (p.isCancel(selected)) {
+        p.cancel("Cancelled");
+        return;
+      }
+
+      if (selected === LOAD_MORE) {
+        page++;
+        const ls = p.spinner();
+        ls.start(`Loading page ${page}`);
+        const next = await fetchPage(page);
+        ls.stop("Done");
+
+        if (next.rows.length === 0) {
+          hasMore = false;
+          p.log.warning("No more results.");
+          continue;
+        }
+
+        allRows.push(...next.rows);
+        hasMore = next.hasMore;
+        continue;
+      }
+
+      const match = allRows.find((r) => getId(r) === selected);
+      if (match) out.result(match);
+
+      const back = await p.confirm({ message: "Back to list?" });
+      if (p.isCancel(back) || !back) return;
+    }
+  },
+});
diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts
index df1a3ed..a7c0f7f 100644
--- a/src/commands/markdownify.ts
+++ b/src/commands/markdownify.ts
@@ -1,4 +1,3 @@
-import * as p from "@clack/prompts";
 import { defineCommand } from "citty";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
@@ -15,22 +14,15 @@ export default defineCommand({
       description: "Website URL to convert",
       required: true,
     },
-    "render-js": {
-      type: "boolean",
-      description: "Enable heavy JS rendering (+1 credit)",
-    },
-    stealth: {
-      type: "boolean",
-      description: "Bypass bot detection (+4 credits)",
-    },
-    headers: {
-      type: "string",
-      description: "Custom headers as JSON object string",
-    },
+    "render-js": { type: "boolean", description: "Enable heavy JS rendering (+1 credit)" },
+    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
+    headers: { type: "string", description: "Custom headers as JSON object string" },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
   },
   run: async ({ args }) => {
-    log.docs("https://docs.scrapegraphai.com/services/markdownify");
-    const key = await resolveApiKey();
+    const out = log.create(!!args.json);
+    out.docs("https://docs.scrapegraphai.com/services/markdownify");
+    const key = await resolveApiKey(!!args.json);
 
     const params: scrapegraphai.MarkdownifyParams = {
       website_url: args.url,
@@ -40,14 +32,11 @@ export default defineCommand({
     if (args.stealth) params.stealth = true;
     if (args.headers) params.headers = JSON.parse(args.headers);
 
-    const s = p.spinner();
-    s.start("Converting to markdown");
-    const result = await scrapegraphai.markdownify(key, params, (status) => {
-      s.message(`Status: ${status}`);
-    });
-    s.stop(`Done in ${log.elapsed(result.elapsedMs)}`);
+    out.start("Converting to markdown");
+    const result = await scrapegraphai.markdownify(key, params, out.poll);
+    out.stop(result.elapsedMs);
 
-    if (result.data) log.result(result.data);
-    else log.error(result.error);
+    if (result.data) out.result(result.data);
+    else out.error(result.error);
   },
 });
diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts
index f937db5..90ae718 100644
--- a/src/commands/scrape.ts
+++ b/src/commands/scrape.ts
@@ -1,4 +1,3 @@
-import * as p from "@clack/prompts";
 import { defineCommand } from "citty";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
@@ -15,44 +14,29 @@ export default defineCommand({
       description: "Website URL to scrape",
       required: true,
     },
-    "render-js": {
-      type: "boolean",
-      description: "Enable heavy JS rendering (+1 credit)",
-    },
-    stealth: {
-      type: "boolean",
-      description: "Bypass bot detection (+4 credits)",
-    },
-    branding: {
-      type: "boolean",
-      description: "Extract branding info (+2 credits)",
-    },
-    "country-code": {
-      type: "string",
-      description: "ISO country code for geo-targeting",
-    },
+    "render-js": { type: "boolean", description: "Enable heavy JS rendering (+1 credit)" },
+    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
+    branding: { type: "boolean", description: "Extract branding info (+2 credits)" },
+    "country-code": { type: "string", description: "ISO country code for geo-targeting" },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
   },
   run: async ({ args }) => {
-    log.docs("https://docs.scrapegraphai.com/services/scrape");
-    const key = await resolveApiKey();
+    const out = log.create(!!args.json);
+    out.docs("https://docs.scrapegraphai.com/services/scrape");
+    const key = await resolveApiKey(!!args.json);
 
-    const params: scrapegraphai.ScrapeParams = {
-      website_url: args.url,
-    };
+    const params: scrapegraphai.ScrapeParams = { website_url: args.url };
 
     if (args["render-js"]) params.render_heavy_js = true;
     if (args.stealth) params.stealth = true;
     if (args.branding) params.branding = true;
     if (args["country-code"]) params.country_code = args["country-code"];
 
-    const s = p.spinner();
-    s.start("Scraping");
-    const result = await scrapegraphai.scrape(key, params, (status) => {
-      s.message(`Status: ${status}`);
-    });
-    s.stop(`Done in ${log.elapsed(result.elapsedMs)}`);
+    out.start("Scraping");
+    const result = await scrapegraphai.scrape(key, params, out.poll);
+    out.stop(result.elapsedMs);
 
-    if (result.data) log.result(result.data);
-    else log.error(result.error);
+    if (result.data) out.result(result.data);
+    else out.error(result.error);
   },
 });
diff --git a/src/commands/search-scraper.ts b/src/commands/search-scraper.ts
index 7fc07c3..ab2a4ff 100644
--- a/src/commands/search-scraper.ts
+++ b/src/commands/search-scraper.ts
@@ -1,4 +1,3 @@
-import * as p from "@clack/prompts";
 import { defineCommand } from "citty";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
@@ -23,22 +22,15 @@ export default defineCommand({
       type: "boolean",
       description: "Return markdown only (2 credits/site instead of 10)",
     },
-    schema: {
-      type: "string",
-      description: "Output JSON schema (as JSON string)",
-    },
-    stealth: {
-      type: "boolean",
-      description: "Bypass bot detection (+4 credits)",
-    },
-    headers: {
-      type: "string",
-      description: "Custom headers as JSON object string",
-    },
+    schema: { type: "string", description: "Output JSON schema (as JSON string)" },
+    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
+    headers: { type: "string", description: "Custom headers as JSON object string" },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
   },
   run: async ({ args }) => {
-    log.docs("https://docs.scrapegraphai.com/services/searchscraper");
-    const key = await resolveApiKey();
+    const out = log.create(!!args.json);
+    out.docs("https://docs.scrapegraphai.com/services/searchscraper");
+    const key = await resolveApiKey(!!args.json);
 
     const params: scrapegraphai.SearchScraperParams = {
       user_prompt: args.prompt,
@@ -50,14 +42,11 @@ export default defineCommand({
     if (args.stealth) params.stealth = true;
     if (args.headers) params.headers = JSON.parse(args.headers);
 
-    const s = p.spinner();
-    s.start("Searching");
-    const result = await scrapegraphai.searchScraper(key, params, (status) => {
-      s.message(`Status: ${status}`);
-    });
-    s.stop(`Done in ${log.elapsed(result.elapsedMs)}`);
+    out.start("Searching");
+    const result = await scrapegraphai.searchScraper(key, params, out.poll);
+    out.stop(result.elapsedMs);
 
-    if (result.data) log.result(result.data);
-    else log.error(result.error);
+    if (result.data) out.result(result.data);
+    else out.error(result.error);
   },
 });
diff --git a/src/commands/sitemap.ts b/src/commands/sitemap.ts
index 0b1c619..8e1b170 100644
--- a/src/commands/sitemap.ts
+++ b/src/commands/sitemap.ts
@@ -1,4 +1,3 @@
-import * as p from "@clack/prompts";
 import { defineCommand } from "citty";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
@@ -15,16 +14,18 @@ export default defineCommand({
       description: "Website URL",
       required: true,
     },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { - log.docs("https://docs.scrapegraphai.com/services/sitemap"); - const key = await resolveApiKey(); - const s = p.spinner(); - s.start("Fetching sitemap"); + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/services/sitemap"); + const key = await resolveApiKey(!!args.json); + + out.start("Fetching sitemap"); const result = await scrapegraphai.sitemap(key, { website_url: args.url }); - s.stop(`Done in ${log.elapsed(result.elapsedMs)}`); + out.stop(result.elapsedMs); - if (result.data) log.result(result.data); - else log.error(result.error); + if (result.data) out.result(result.data); + else out.error(result.error); }, }); diff --git a/src/commands/smart-scraper.ts b/src/commands/smart-scraper.ts index 61088ee..f754c14 100644 --- a/src/commands/smart-scraper.ts +++ b/src/commands/smart-scraper.ts @@ -1,4 +1,3 @@ -import * as p from "@clack/prompts"; import { defineCommand } from "citty"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; @@ -21,42 +20,20 @@ export default defineCommand({ description: "Extraction prompt", required: true, }, - schema: { - type: "string", - description: "Output JSON schema (as JSON string)", - }, - scrolls: { - type: "string", - description: "Number of infinite scrolls (0-100)", - }, - pages: { - type: "string", - description: "Total pages to scrape (1-100)", - }, - "render-js": { - type: "boolean", - description: "Enable heavy JS rendering (+1 credit)", - }, - stealth: { - type: "boolean", - description: "Bypass bot detection (+4 credits)", - }, - cookies: { - type: "string", - description: "Cookies as JSON object string", - }, - headers: { - type: "string", - description: "Custom headers as JSON object string", - }, - "plain-text": { - type: "boolean", - description: "Return plain text instead of JSON", - }, + schema: { type: "string", description: "Output JSON schema (as JSON string)" }, + scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, + pages: { type: "string", description: "Total pages to scrape (1-100)" }, + "render-js": { type: "boolean", description: "Enable heavy JS rendering (+1 credit)" }, + stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, + cookies: { type: "string", description: "Cookies as JSON object string" }, + headers: { type: "string", description: "Custom headers as JSON object string" }, + "plain-text": { type: "boolean", description: "Return plain text instead of JSON" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { - log.docs("https://docs.scrapegraphai.com/services/smartscraper"); - const key = await resolveApiKey(); + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/services/smartscraper"); + const key = await resolveApiKey(!!args.json); const params: scrapegraphai.SmartScraperParams = { website_url: args.url, @@ -72,14 +49,11 @@ export default defineCommand({ if (args.headers) params.headers = JSON.parse(args.headers); if (args["plain-text"]) params.plain_text = true; - const s = p.spinner(); - s.start("Scraping"); - const result = await scrapegraphai.smartScraper(key, params, (status) => { - s.message(`Status: ${status}`); - }); - s.stop(`Done in ${log.elapsed(result.elapsedMs)}`); + out.start("Scraping"); + const result = await scrapegraphai.smartScraper(key, params, out.poll); + out.stop(result.elapsedMs); - if (result.data) 
-    else log.error(result.error);
+    if (result.data) out.result(result.data);
+    else out.error(result.error);
   },
 });
diff --git a/src/commands/validate.ts b/src/commands/validate.ts
index b5d514e..db956d7 100644
--- a/src/commands/validate.ts
+++ b/src/commands/validate.ts
@@ -1,4 +1,3 @@
-import * as p from "@clack/prompts";
 import { defineCommand } from "citty";
 import { resolveApiKey } from "../lib/folders.js";
 import * as log from "../lib/log.js";
@@ -9,14 +8,18 @@ export default defineCommand({
     name: "validate",
     description: "Validate your API key (health check)",
   },
-  run: async () => {
-    const key = await resolveApiKey();
-    const s = p.spinner();
-    s.start("Checking API health");
+  args: {
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
+  },
+  run: async ({ args }) => {
+    const out = log.create(!!args.json);
+    const key = await resolveApiKey(!!args.json);
+
+    out.start("Checking API health");
     const result = await scrapegraphai.checkHealth(key);
-    s.stop(`Done in ${log.elapsed(result.elapsedMs)}`);
+    out.stop(result.elapsedMs);
 
-    if (result.data) log.result(result.data);
-    else log.error(result.error);
+    if (result.data) out.result(result.data);
+    else out.error(result.error);
   },
 });
diff --git a/src/lib/env.ts b/src/lib/env.ts
index 07c6f84..e0e1f5c 100644
--- a/src/lib/env.ts
+++ b/src/lib/env.ts
@@ -31,7 +31,7 @@ function resolve(): Env {
     debug: process.env.JUST_SCRAPE_DEBUG === "1",
     timeoutS: process.env.JUST_SCRAPE_TIMEOUT_S
       ? Number(process.env.JUST_SCRAPE_TIMEOUT_S)
-      : undefined,
+      : 60 * 2,
   });
 }
diff --git a/src/lib/folders.ts b/src/lib/folders.ts
index 5c86665..84eddef 100644
--- a/src/lib/folders.ts
+++ b/src/lib/folders.ts
@@ -4,18 +4,20 @@ import { CONFIG_DIR, CONFIG_PATH, env } from "./env.js";
 
 let cachedKey: string | null = null;
 
-export async function resolveApiKey(): Promise<string> {
+export async function resolveApiKey(quiet = false): Promise<string> {
   if (cachedKey) return cachedKey;
 
   if (env.apiKey) {
-    let source = "config (~/.scrapegraphai/config.json)";
-    if (process.env.SGAI_API_KEY) {
-      source = "SGAI_API_KEY env var";
-      try {
-        if (/^SGAI_API_KEY\s*=/m.test(readFileSync(".env", "utf-8"))) source = ".env file";
-      } catch {}
+    if (!quiet) {
+      let source = "config (~/.scrapegraphai/config.json)";
+      if (process.env.SGAI_API_KEY) {
+        source = "SGAI_API_KEY env var";
+        try {
+          if (/^SGAI_API_KEY\s*=/m.test(readFileSync(".env", "utf-8"))) source = ".env file";
+        } catch {}
+      }
+      p.log.info(`Using API key from ${source}`);
     }
-    p.log.info(`Using API key from ${source}`);
     cachedKey = env.apiKey;
     return env.apiKey;
   }
diff --git a/src/lib/log.ts b/src/lib/log.ts
index f133141..68f8660 100644
--- a/src/lib/log.ts
+++ b/src/lib/log.ts
@@ -9,20 +9,35 @@ function highlight(json: string): string {
     .replace(/:\s*(true|false|null)\b/g, (_, lit) => `: ${chalk.magenta(lit)}`);
 }
 
-export function elapsed(ms: number): string {
+function elapsed(ms: number): string {
   if (ms < 1000) return `${ms}ms`;
   return `${(ms / 1000).toFixed(1)}s`;
 }
 
-export function result(data: unknown) {
-  console.log(`\n${highlight(JSON.stringify(data, null, 2))}\n`);
+export function create(quiet = false) {
+  const s = p.spinner();
+  return {
+    docs(url: string) {
+      if (!quiet) console.log(chalk.dim(`Docs: ${url}`));
+    },
+    start(msg: string) {
+      if (!quiet) s.start(msg);
+    },
+    stop(ms: number) {
+      if (!quiet) s.stop(`Done in ${elapsed(ms)}`);
+    },
+    poll(status: string) {
+      if (!quiet) s.message(`Status: ${status}`);
+    },
+    result(data: unknown) {
+      if (quiet) console.log(JSON.stringify(data, null, 2));
+      else console.log(`\n${highlight(JSON.stringify(data, null, 2))}\n`);
+    },
+    error(message?: string) {
+      p.log.error(message ?? "Unknown error");
+      process.exit(1);
+    },
+  };
 }
 
-export function docs(url: string) {
-  console.log(chalk.dim(`Docs: ${url}`));
-}
-
-export function error(message?: string) {
-  p.log.error(message ?? "Unknown error");
-  process.exit(1);
-}
+export type Logger = ReturnType<typeof create>;
diff --git a/src/lib/schemas.ts b/src/lib/schemas.ts
index d13a2b8..5054498 100644
--- a/src/lib/schemas.ts
+++ b/src/lib/schemas.ts
@@ -76,3 +76,19 @@ export const AgenticScraperSchema = z.object({
   ai_extraction: z.boolean().optional(),
   use_session: z.boolean().optional(),
 });
+
+export const HISTORY_SERVICES = [
+  "markdownify",
+  "smartscraper",
+  "searchscraper",
+  "scrape",
+  "crawl",
+  "agentic-scraper",
+  "sitemap",
+] as const;
+
+export const HistorySchema = z.object({
+  service: z.enum(HISTORY_SERVICES),
+  page: z.number().int().positive().default(1),
+  page_size: z.number().int().positive().max(100).default(10),
+});
diff --git a/src/lib/scrapegraphai.ts b/src/lib/scrapegraphai.ts
index 5cc85ea..4d5f499 100644
--- a/src/lib/scrapegraphai.ts
+++ b/src/lib/scrapegraphai.ts
@@ -3,6 +3,7 @@ import type {
   ApiResult,
   CrawlParams,
   GenerateSchemaParams,
+  HistoryParams,
   MarkdownifyParams,
   ScrapeParams,
   SearchScraperParams,
@@ -14,6 +15,7 @@ import {
   AgenticScraperSchema,
   CrawlSchema,
   GenerateSchemaSchema,
+  HistorySchema,
   MarkdownifySchema,
   ScrapeSchema,
   SearchScraperSchema,
@@ -26,6 +28,7 @@ export type {
   ApiResult,
   CrawlParams,
   GenerateSchemaParams,
+  HistoryParams,
   MarkdownifyParams,
   ScrapeParams,
   SearchScraperParams,
@@ -325,3 +328,16 @@ export async function checkHealth(apiKey: string): Promise<ApiResult<unknown>> {
     return fail(err);
   }
 }
+
+export async function history(apiKey: string, params: HistoryParams): Promise<ApiResult<unknown>> {
+  try {
+    const parsed = HistorySchema.parse(params);
+    const qs = new URLSearchParams();
+    qs.set("page", String(parsed.page));
+    qs.set("page_size", String(parsed.page_size));
+    const { data, elapsedMs } = await request("GET", `/history/${parsed.service}?${qs}`, apiKey);
+    return ok(data, elapsedMs);
+  } catch (err) {
+    return fail(err);
+  }
+}
diff --git a/src/types/index.ts b/src/types/index.ts
index 67648df..fa6760c 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -3,6 +3,7 @@ import type {
   AgenticScraperSchema,
   CrawlSchema,
   GenerateSchemaSchema,
+  HistorySchema,
   MarkdownifySchema,
   ScrapeSchema,
   SearchScraperSchema,
@@ -18,6 +19,7 @@ export type GenerateSchemaParams = z.infer<typeof GenerateSchemaSchema>;
 export type SitemapParams = z.infer<typeof SitemapSchema>;
 export type ScrapeParams = z.infer<typeof ScrapeSchema>;
 export type AgenticScraperParams = z.infer<typeof AgenticScraperSchema>;
+export type HistoryParams = z.input<typeof HistorySchema>;
 
 export type ApiResult<T = unknown> = {
   status: "success" | "error";
diff --git a/src/utils/banner.ts b/src/utils/banner.ts
index da71b89..9a8c4f9 100644
--- a/src/utils/banner.ts
+++ b/src/utils/banner.ts
@@ -24,8 +24,9 @@ const TAGLINE = " made with ♥ from scrapegraphai team";
 const BANNER_COLOR = "#bd93f9";
 
 export function showBanner() {
-  const text = BANNER.map((line) => chalk.hex(BANNER_COLOR)(line)).join("\n");
+  if (process.argv.includes("--json")) return;
+
+  const text = BANNER.map((line) => chalk.hex(BANNER_COLOR)(line)).join("\n");
   console.log(text);
   console.log(chalk.hex(BANNER_COLOR)(TAGLINE));
   console.log(chalk.hex(BANNER_COLOR)(`v${getVersion()}`));