Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
491 changes: 283 additions & 208 deletions README.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ const main = defineCommand({
scrape: () => import("./commands/scrape.js").then((m) => m.default),
"agentic-scraper": () => import("./commands/agentic-scraper.js").then((m) => m.default),
"generate-schema": () => import("./commands/generate-schema.js").then((m) => m.default),
history: () => import("./commands/history.js").then((m) => m.default),
credits: () => import("./commands/credits.js").then((m) => m.default),
validate: () => import("./commands/validate.js").then((m) => m.default),
},
Expand Down
39 changes: 13 additions & 26 deletions src/commands/agentic-scraper.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import * as p from "@clack/prompts";
import { defineCommand } from "citty";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";
Expand All @@ -25,41 +24,29 @@ export default defineCommand({
alias: "p",
description: "Extraction prompt (used with --ai-extraction)",
},
schema: {
type: "string",
description: "Output JSON schema (as JSON string)",
},
"ai-extraction": {
type: "boolean",
description: "Enable AI extraction after steps",
},
"use-session": {
type: "boolean",
description: "Persist browser session across requests",
},
schema: { type: "string", description: "Output JSON schema (as JSON string)" },
"ai-extraction": { type: "boolean", description: "Enable AI extraction after steps" },
"use-session": { type: "boolean", description: "Persist browser session across requests" },
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
},
run: async ({ args }) => {
log.docs("https://docs.scrapegraphai.com/services/agenticscraper");
const key = await resolveApiKey();
const out = log.create(!!args.json);
out.docs("https://docs.scrapegraphai.com/services/agenticscraper");
const key = await resolveApiKey(!!args.json);

const params: scrapegraphai.AgenticScraperParams = {
url: args.url,
};
const params: scrapegraphai.AgenticScraperParams = { url: args.url };

if (args.steps) params.steps = args.steps.split(",").map((s) => s.trim());
if (args.prompt) params.user_prompt = args.prompt;
if (args.schema) params.output_schema = JSON.parse(args.schema);
if (args["ai-extraction"]) params.ai_extraction = true;
if (args["use-session"]) params.use_session = true;

const s = p.spinner();
s.start("Running browser automation");
const result = await scrapegraphai.agenticScraper(key, params, (status) => {
s.message(`Status: ${status}`);
});
s.stop(`Done in ${log.elapsed(result.elapsedMs)}`);
out.start("Running browser automation");
const result = await scrapegraphai.agenticScraper(key, params, out.poll);
out.stop(result.elapsedMs);

if (result.data) log.result(result.data);
else log.error(result.error);
if (result.data) out.result(result.data);
else out.error(result.error);
},
});
59 changes: 17 additions & 42 deletions src/commands/crawl.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import * as p from "@clack/prompts";
import { defineCommand } from "citty";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";
Expand All @@ -24,42 +23,21 @@ export default defineCommand({
type: "boolean",
description: "Return markdown only (2 credits/page instead of 10)",
},
"max-pages": {
type: "string",
description: "Maximum pages to crawl (default 10)",
},
depth: {
type: "string",
description: "Crawl depth (default 1)",
},
schema: {
type: "string",
description: "Output JSON schema (as JSON string)",
},
rules: {
type: "string",
description: "Crawl rules as JSON object string",
},
"no-sitemap": {
type: "boolean",
description: "Disable sitemap-based URL discovery",
},
"render-js": {
type: "boolean",
description: "Enable heavy JS rendering (+1 credit/page)",
},
stealth: {
type: "boolean",
description: "Bypass bot detection (+4 credits)",
},
"max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" },
depth: { type: "string", description: "Crawl depth (default 1)" },
schema: { type: "string", description: "Output JSON schema (as JSON string)" },
rules: { type: "string", description: "Crawl rules as JSON object string" },
"no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" },
"render-js": { type: "boolean", description: "Enable heavy JS rendering (+1 credit/page)" },
stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
},
run: async ({ args }) => {
log.docs("https://docs.scrapegraphai.com/services/smartcrawler");
const key = await resolveApiKey();
const out = log.create(!!args.json);
out.docs("https://docs.scrapegraphai.com/services/smartcrawler");
const key = await resolveApiKey(!!args.json);

const params: scrapegraphai.CrawlParams = {
url: args.url,
};
const params: scrapegraphai.CrawlParams = { url: args.url };

if (args.prompt) params.prompt = args.prompt;
if (args["no-extraction"]) params.extraction_mode = false;
Expand All @@ -71,14 +49,11 @@ export default defineCommand({
if (args["render-js"]) params.render_heavy_js = true;
if (args.stealth) params.stealth = true;

const s = p.spinner();
s.start("Crawling");
const result = await scrapegraphai.crawl(key, params, (status) => {
s.message(`Status: ${status}`);
});
s.stop(`Done in ${log.elapsed(result.elapsedMs)}`);
out.start("Crawling");
const result = await scrapegraphai.crawl(key, params, out.poll);
out.stop(result.elapsedMs);

if (result.data) log.result(result.data);
else log.error(result.error);
if (result.data) out.result(result.data);
else out.error(result.error);
},
});
19 changes: 11 additions & 8 deletions src/commands/credits.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import * as p from "@clack/prompts";
import { defineCommand } from "citty";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";
Expand All @@ -9,14 +8,18 @@ export default defineCommand({
name: "credits",
description: "Check your credit balance",
},
run: async () => {
const key = await resolveApiKey();
const s = p.spinner();
s.start("Fetching credits");
args: {
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
},
run: async ({ args }) => {
const out = log.create(!!args.json);
const key = await resolveApiKey(!!args.json);

out.start("Fetching credits");
const result = await scrapegraphai.getCredits(key);
s.stop(`Done in ${log.elapsed(result.elapsedMs)}`);
out.stop(result.elapsedMs);

if (result.data) log.result(result.data);
else log.error(result.error);
if (result.data) out.result(result.data);
else out.error(result.error);
},
});
23 changes: 9 additions & 14 deletions src/commands/generate-schema.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import * as p from "@clack/prompts";
import { defineCommand } from "citty";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";
Expand All @@ -19,24 +18,20 @@ export default defineCommand({
type: "string",
description: "Existing schema to modify (as JSON string)",
},
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
},
run: async ({ args }) => {
const key = await resolveApiKey();

const params: scrapegraphai.GenerateSchemaParams = {
user_prompt: args.prompt,
};
const out = log.create(!!args.json);
const key = await resolveApiKey(!!args.json);

const params: scrapegraphai.GenerateSchemaParams = { user_prompt: args.prompt };
if (args["existing-schema"]) params.existing_schema = JSON.parse(args["existing-schema"]);

const s = p.spinner();
s.start("Generating schema");
const result = await scrapegraphai.generateSchema(key, params, (status) => {
s.message(`Status: ${status}`);
});
s.stop(`Done in ${log.elapsed(result.elapsedMs)}`);
out.start("Generating schema");
const result = await scrapegraphai.generateSchema(key, params, out.poll);
out.stop(result.elapsedMs);

if (result.data) log.result(result.data);
else log.error(result.error);
if (result.data) out.result(result.data);
else out.error(result.error);
},
});
146 changes: 146 additions & 0 deletions src/commands/history.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import * as p from "@clack/prompts";
import chalk from "chalk";
import { defineCommand } from "citty";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";
import { HISTORY_SERVICES } from "../lib/schemas.js";
import * as scrapegraphai from "../lib/scrapegraphai.js";

// Human-readable, comma-separated list of accepted service names; interpolated
// into the positional arg's help text below.
const VALID = HISTORY_SERVICES.join(", ");
// Sentinel option value for the interactive "load more" row in the select list;
// chosen so it cannot collide with a real request id.
const LOAD_MORE = "__load_more__";

/**
 * Extract a stable identifier from a history row, trying the known id
 * fields in priority order (request_id, then crawl_id, then id).
 * Falls back to the literal "unknown" when none is present.
 */
function getId(row: Record<string, unknown>): string {
  const candidate = row.request_id ?? row.crawl_id ?? row.id;
  return candidate == null ? "unknown" : String(candidate);
}

/**
 * Build the one-line display label for a history row: a dimmed, truncated id,
 * a status word colored by outcome, and a truncated URL (or prompt) preview.
 */
function label(row: Record<string, unknown>): string {
  const fullId = getId(row);
  const idPart = fullId.length > 12 ? `${fullId.slice(0, 12)}…` : fullId;
  const statusText = String(row.status ?? "—");
  const target = String(row.website_url ?? row.url ?? row.user_prompt ?? "");
  const targetPart = target.length > 50 ? `${target.slice(0, 49)}…` : target;

  // Green for terminal success states, red for failure, yellow for anything
  // still in flight (or an unrecognized status).
  let paint = chalk.yellow;
  if (statusText === "completed" || statusText === "done") paint = chalk.green;
  else if (statusText === "failed") paint = chalk.red;

  return `${chalk.dim(idPart)} ${paint(statusText)} ${targetPart}`;
}

/**
 * Produce the secondary hint text for a history row: a locale-formatted
 * timestamp when one of the known timestamp fields parses as a date, the raw
 * value when it does not, and "" when no (truthy) timestamp is present.
 */
function hint(row: Record<string, unknown>): string {
  const stamp = row.created_at ?? row.timestamp ?? row.updated_at;
  if (!stamp) return "";
  const text = String(stamp);
  const parsed = new Date(text);
  if (Number.isNaN(parsed.getTime())) return text;
  return parsed.toLocaleString();
}

/**
 * `history` command: list past requests for a ScrapeGraphAI service.
 *
 * Modes:
 *  - `--json` (or a second positional request id): non-interactive; prints raw
 *    JSON for piping.
 *  - otherwise: interactive @clack/prompts select list with incremental
 *    "load more" pagination and a drill-down view per request.
 */
export default defineCommand({
  meta: {
    name: "history",
    description: "View request history for a service",
  },
  args: {
    service: {
      type: "positional",
      description: `Service name (${VALID})`,
      required: true,
    },
    page: { type: "string", description: "Page number (default: 1)" },
    "page-size": { type: "string", description: "Results per page (default: 10, max: 100)" },
    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
  },
  run: async ({ args }) => {
    const quiet = !!args.json;
    const out = log.create(quiet);
    const key = await resolveApiKey(quiet);
    const service = args.service as scrapegraphai.HistoryParams["service"];
    // Optional second positional (after `service`) is an individual request id;
    // citty exposes raw positionals on `_` — hence the cast. TODO confirm this
    // matches citty's typed-args shape across versions.
    const requestId = (args as { _: string[] })._.at(1);
    const pageSize = args["page-size"] ? Number(args["page-size"]) : 10;
    let page = args.page ? Number(args.page) : 1;

    // Fetch one page of history and normalize it to { rows, hasMore, ms }.
    // NOTE(review): on r.status === "error" this calls out.error and then keeps
    // going — presumably out.error terminates the process; if it doesn't,
    // `r.data` may be undefined here and `d.requests` would throw. Verify
    // log.create's error semantics.
    const fetchPage = async (pg: number) => {
      const r = await scrapegraphai.history(key, { service, page: pg, page_size: pageSize });
      if (r.status === "error") out.error(r.error);
      const d = r.data as { requests: Record<string, unknown>[]; next_key?: string };
      return { rows: d.requests ?? [], hasMore: !!d.next_key, ms: r.elapsedMs };
    };

    // Non-interactive path: --json output, or direct lookup of one request id.
    if (quiet || requestId) {
      const { rows } = await fetchPage(page);
      if (requestId) {
        // Only searches the requested page — an id on another page reports
        // "not found". NOTE(review): same out.error-continues concern as above.
        const match = rows.find((r) => getId(r) === requestId);
        if (!match) out.error(`Request ${requestId} not found on page ${page}`);
        out.result(match);
        return;
      }
      out.result(rows);
      return;
    }

    // Interactive path: fetch the first page under a spinner.
    out.start(`Fetching ${service} history`);
    const first = await fetchPage(page);
    out.stop(first.ms);

    if (first.rows.length === 0) {
      p.log.warning("No history found.");
      return;
    }

    // Accumulate rows across "load more" fetches so the list keeps growing.
    const allRows = [...first.rows];
    let hasMore = first.hasMore;

    // Select loop: pick a row to inspect, load another page, or exit.
    while (true) {
      const options = allRows.map((row) => ({
        value: getId(row),
        label: label(row),
        hint: hint(row),
      }));

      // Append the pagination sentinel only while the API reports more pages.
      if (hasMore) {
        options.push({
          value: LOAD_MORE,
          label: chalk.blue.bold("↓ Load more…"),
          hint: `page ${page + 1}`,
        });
      }

      const selected = await p.select({
        message: `${allRows.length} requests — select one to view`,
        options,
        maxItems: 15,
      });

      // Ctrl-C / escape from the prompt.
      if (p.isCancel(selected)) {
        p.cancel("Cancelled");
        return;
      }

      if (selected === LOAD_MORE) {
        page++;
        const ls = p.spinner();
        ls.start(`Loading page ${page}`);
        const next = await fetchPage(page);
        ls.stop("Done");

        // Defensive: next_key promised more but the page came back empty —
        // stop offering "load more" rather than looping on empty fetches.
        if (next.rows.length === 0) {
          hasMore = false;
          p.log.warning("No more results.");
          continue;
        }

        allRows.push(...next.rows);
        hasMore = next.hasMore;
        continue;
      }

      // Show the selected row's full payload, then optionally return to the list.
      const match = allRows.find((r) => getId(r) === selected);
      if (match) out.result(match);

      const back = await p.confirm({ message: "Back to list?" });
      if (p.isCancel(back) || !back) return;
    }
  },
});
Loading