19 changes: 19 additions & 0 deletions components/scrapfly/actions/account-info/account-info.mjs
@@ -0,0 +1,19 @@
import scrapfly from "../../scrapfly.app.mjs";

export default {
key: "scrapfly-account-info",
name: "Retrieve Scrapfly Account Info",
description: "Retrieve current subscription and account usage details from Scrapfly. [See the documentation](https://scrapfly.io/docs/account#api)",
version: "0.0.1",
type: "action",
props: {
scrapfly,
},
async run({ $ }) {
const response = await this.scrapfly.getAccountInfo({
$,
});
$.export("$summary", "Successfully retrieved account information");
return response;
},
};
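For context, the `getAccountInfo` call above is defined in `scrapfly.app.mjs`, which is not shown in this diff. A minimal sketch of what that method might look like, assuming a `_makeRequest` helper, the `https://api.scrapfly.io` base URL, and an `api_key` auth field (all illustrative, not confirmed by this PR):

import { axios } from "@pipedream/platform";

export default {
  type: "app",
  app: "scrapfly",
  methods: {
    _baseUrl() {
      return "https://api.scrapfly.io";
    },
    _makeRequest({ $ = this, path, params = {}, ...opts }) {
      // Assumption: Scrapfly authenticates via the `key` query parameter.
      return axios($, {
        url: `${this._baseUrl()}${path}`,
        params: {
          key: this.$auth.api_key, // assumed auth field name
          ...params,
        },
        ...opts,
      });
    },
    getAccountInfo(opts = {}) {
      // GET /account returns subscription and usage details.
      return this._makeRequest({
        path: "/account",
        ...opts,
      });
    },
  },
};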
88 changes: 88 additions & 0 deletions components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs
@@ -0,0 +1,88 @@
import { ConfigurationError } from "@pipedream/platform";
import fs from "fs";
import { checkTmp } from "../../common/utils.mjs";
import scrapfly from "../../scrapfly.app.mjs";

export default {
key: "scrapfly-ai-data-extraction",
name: "AI Data Extraction",
description: "Automate content extraction from any text-based source using AI, LLM, and custom parsing. [See the documentation](https://scrapfly.io/docs/extraction-api/getting-started)",
version: "0.0.1",
type: "action",
props: {
scrapfly,
body: {
propDefinition: [
scrapfly,
"body",
],
},
contentType: {
propDefinition: [
scrapfly,
"contentType",
],
},
url: {
propDefinition: [
scrapfly,
"url",
],
},
charset: {
type: "string",
label: "Charset",
description: "Charset of the document pass in the body. If you are not sure, you can use the `auto` value and we will try to detect it. Bad charset can lead to bad extraction, so it's important to set it correctly. **The most common charset is `utf-8` for text document and `ascii` for binary**. The symptom of a bad charset is that the text is not correctly displayed (accent, special characters, etc).",
default: "auto",
optional: true,
},
extractionTemplate: {
type: "string",
label: "Extraction Template",
description: "Define an extraction template to get structured data. Use an ephemeral template (declared on the fly on the API call) or a stored template (declared in the dashboard) by using the template name.",
optional: true,
},
extractionPrompt: {
type: "string",
label: "Extraction Prompt",
description: "Instruction to extract data or ask a question on the scraped content with an LLM (Large Language Model). [Must be url encoded](https://scrapfly.io/web-scraping-tools/urlencode).",
optional: true,
},
extractionModel: {
type: "string",
label: "Extraction Model",
description: "AI Extraction to auto parse document to get structured data. E.g., `product`, `review`, `real-estate`, `article`.",
optional: true,
},
webhookName: {
type: "string",
label: "Webhook Name",
description: "Queue you scrape request and redirect API response to a provided webhook endpoint. You can create a webhook endpoint from your `dashboard`, it takes the name of the webhook. Webhooks are scoped to the given project/env.",
optional: true,
},
},
async run({ $ }) {
if (!this.extractionTemplate && !this.extractionPrompt && !this.extractionModel) {
throw new ConfigurationError("You must provide at least **Extraction Template**, **Extraction Prompt** or **Extraction Model**");
}
const response = await this.scrapfly.automateContentExtraction({
$,
headers: {
"content-type": this.contentType,
},
maxBodyLength: Infinity,
params: {
url: this.url,
charset: this.charset,
extraction_template: this.extractionTemplate,
extraction_prompt: this.extractionPrompt,
extraction_model: this.extractionModel,
webhook_name: this.webhookName,
},
data: fs.readFileSync(checkTmp(this.body)).toString(),
});

$.export("$summary", "Successfully extracted content");
return response;
},
};
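The `checkTmp` helper imported from `common/utils.mjs` is not part of this diff. In Pipedream components it is typically a small path guard, since actions read files from the `/tmp` directory; a plausible sketch (illustrative only):

export const checkTmp = (filename) => {
  // Prefix bare or relative paths with /tmp so fs.readFileSync resolves them.
  if (!filename.startsWith("/tmp")) {
    return `/tmp/${filename}`;
  }
  return filename;
};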
156 changes: 156 additions & 0 deletions components/scrapfly/actions/scrape-page/scrape-page.mjs
@@ -0,0 +1,156 @@
import { ConfigurationError } from "@pipedream/platform";
import {
FORMAT_OPTIONS,
PROXY_COUNTRY_OPTIONS,
PROXY_POOL_OPTIONS,
} from "../../common/constants.mjs";
import { parseObject } from "../../common/utils.mjs";
import scrapfly from "../../scrapfly.app.mjs";

export default {
key: "scrapfly-scrape-page",
name: "Scrape Page",
description: "Extract data from a specified web page. [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started)",
version: "0.0.1",
type: "action",
props: {
scrapfly,
url: {
propDefinition: [
scrapfly,
"url",
],
},
headers: {
type: "object",
label: "Headers",
description: "Pass custom headers to the request.",
optional: true,
},
lang: {
type: "string",
label: "Language",
description: "Select page language. By default it uses the language of the selected proxy location. Behind the scenes, it configures the `Accept-Language` HTTP header. If the website support the language, the content will be in that lang. **Note: you cannot set headers `Accept-Language` header manually**. [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)",
optional: true,
},
os: {
type: "string",
label: "Operating System",
description: "Operating System, if not selected it's random. **Note: you cannot set os parameter and `User-Agent` header at the same time.** [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)",
optional: true,
},
timeout: {
type: "integer",
label: "Timeout",
description: "Timeout in milliseconds. It represents the maximum time allowed for Scrapfly to perform the scrape. Since `timeout` is not trivial to understand see our [extended documentation on timeouts](https://scrapfly.io/docs/scrape-api/understand-timeout)",
optional: true,
},
format: {
type: "string",
label: "Format",
description: "Format of the response.",
options: FORMAT_OPTIONS,
optional: true,
},
retry: {
type: "boolean",
label: "Retry",
description: "Improve reliability with retries on failure.",
optional: true,
},
proxifiedResponse: {
type: "boolean",
label: "Proxified Response",
description: "Return the content of the page directly.",
optional: true,
},
debug: {
type: "boolean",
label: "Debug",
description: "Store the API result and take a screenshot if rendering js is enabled.",
optional: true,
},
correlationId: {
type: "string",
label: "Correlation ID",
description: "Helper ID for correlating a group of scrapes.",
optional: true,
},
tags: {
type: "string[]",
label: "Tags",
description: "Add tags to your scrapes to group them.",
optional: true,
},
dns: {
type: "boolean",
label: "DNS",
description: "Query and retrieve target DNS information.",
optional: true,
},
ssl: {
type: "boolean",
label: "SSL",
description: "SSL option.",
optional: true,
},
proxyPool: {
type: "string",
label: "Proxy Pool",
description: "Select the proxy pool to use.",
optional: true,
options: PROXY_POOL_OPTIONS,
reloadProps: true,
},
},
async additionalProps() {
const props = {};
props.country = {
type: "string",
label: "Country",
description: "Proxy country location. If not set it chooses a random location available. A reference to a country must be ISO 3166 alpha-2 (2 letters). The available countries are defined by the proxy pool you use. [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)",
optional: true,
options: PROXY_COUNTRY_OPTIONS[this.proxyPool],
};
return props;
},
async run({ $ }) {
try {
let headers = "";
if (this.headers) {
headers = Object.keys(parseObject(this.headers))
.reduce((acc, key) => {
acc.push(`headers[${key}]=${encodeURIComponent(this.headers[key])}`);
return acc;
}, []);
}
const params = {
url: this.url,
proxy_pool: this.proxyPool,
country: this.country,
lang: this.lang,
os: this.os,
timeout: this.timeout,
format: this.format,
retry: this.retry,
proxified_response: this.proxifiedResponse,
debug: this.debug,
correlation_id: this.correlationId,
tags: parseObject(this.tags),
dns: this.dns,
ssl: this.ssl,
...headers,
};

const response = await this.scrapfly.extractWebPageContent({
$,
params,
});

$.export("$summary", `Successfully scraped content from ${this.url}`);
return response;
} catch (error) {
// Surface the API error message when available; fall back to the generic message.
throw new ConfigurationError(error.response?.data?.message || error.message);
}
},
};
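Both the `tags` and `headers` props above go through `parseObject` from `common/utils.mjs`, which is also outside this diff. A typical sketch of that utility, which normalizes props that may arrive as JSON strings, arrays, or plain objects (illustrative, not the actual file):

export const parseObject = (obj) => {
  if (!obj) {
    return undefined;
  }
  if (Array.isArray(obj)) {
    // Parse each array item individually, leaving non-JSON strings untouched.
    return obj.map((item) => {
      try {
        return JSON.parse(item);
      } catch (e) {
        return item;
      }
    });
  }
  if (typeof obj === "string") {
    try {
      return JSON.parse(obj);
    } catch (e) {
      return obj;
    }
  }
  return obj;
};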