19 changes: 19 additions & 0 deletions components/scrapfly/actions/account-info/account-info.mjs
@@ -0,0 +1,19 @@
import scrapfly from "../../scrapfly.app.mjs";

export default {
key: "scrapfly-account-info",
name: "Retrieve Scrapfly Account Info",
description: "Retrieve current subscription and account usage details from Scrapfly. [See the documentation](https://scrapfly.io/docs/account#api)",
version: "0.0.1",
type: "action",
props: {
scrapfly,
},
async run({ $ }) {
const response = await this.scrapfly.getAccountInfo({
$,
});
$.export("$summary", "Successfully retrieved account information");
return response;
},
};
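For context, the `getAccountInfo` call above is defined in `scrapfly.app.mjs`, which is not shown in this diff. A minimal sketch of what that method might look like, assuming a `_makeRequest` helper, the `https://api.scrapfly.io` base URL, and an `api_key` auth field (all illustrative, not confirmed by this PR):

import { axios } from "@pipedream/platform";

export default {
  type: "app",
  app: "scrapfly",
  methods: {
    _baseUrl() {
      return "https://api.scrapfly.io";
    },
    _makeRequest({ $ = this, path, params = {}, ...opts }) {
      // Assumption: Scrapfly authenticates via the `key` query parameter.
      return axios($, {
        url: `${this._baseUrl()}${path}`,
        params: {
          key: this.$auth.api_key, // assumed auth field name
          ...params,
        },
        ...opts,
      });
    },
    getAccountInfo(opts = {}) {
      // GET /account returns subscription and usage details.
      return this._makeRequest({
        path: "/account",
        ...opts,
      });
    },
  },
};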
88 changes: 88 additions & 0 deletions components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs
@@ -0,0 +1,88 @@
import { ConfigurationError } from "@pipedream/platform";
import fs from "fs";
import { checkTmp } from "../../common/utils.mjs";
import scrapfly from "../../scrapfly.app.mjs";

export default {
key: "scrapfly-ai-data-extraction",
name: "AI Data Extraction",
description: "Automate content extraction from any text-based source using AI, LLM, and custom parsing. [See the documentation](https://scrapfly.io/docs/extraction-api/getting-started)",
version: "0.0.1",
type: "action",
props: {
scrapfly,
body: {
propDefinition: [
scrapfly,
"body",
],
},
contentType: {
propDefinition: [
scrapfly,
"contentType",
],
},
url: {
propDefinition: [
scrapfly,
"url",
],
},
charset: {
type: "string",
label: "Charset",
description: "Charset of the document pass in the body. If you are not sure, you can use the `auto` value and we will try to detect it. Bad charset can lead to bad extraction, so it's important to set it correctly. **The most common charset is `utf-8` for text document and `ascii` for binary**. The symptom of a bad charset is that the text is not correctly displayed (accent, special characters, etc).",
default: "auto",
optional: true,
},
extractionTemplate: {
type: "string",
label: "Extraction Template",
description: "Define an extraction template to get structured data. Use an ephemeral template (declared on the fly on the API call) or a stored template (declared in the dashboard) by using the template name.",
optional: true,
},
extractionPrompt: {
type: "string",
label: "Extraction Prompt",
description: "Instruction to extract data or ask a question on the scraped content with an LLM (Large Language Model). [Must be url encoded](https://scrapfly.io/web-scraping-tools/urlencode).",
optional: true,
},
extractionModel: {
type: "string",
label: "Extraction Model",
description: "AI Extraction to auto parse document to get structured data. E.g., `product`, `review`, `real-estate`, `article`.",
optional: true,
},
webhookName: {
type: "string",
label: "Webhook Name",
description: "Queue you scrape request and redirect API response to a provided webhook endpoint. You can create a webhook endpoint from your `dashboard`, it takes the name of the webhook. Webhooks are scoped to the given project/env.",
optional: true,
},
},
async run({ $ }) {
if (!this.extractionTemplate && !this.extractionPrompt && !this.extractionModel) {
throw new ConfigurationError("You must provide at least **Extraction Template**, **Extraction Prompt** or **Extraction Model**");
}
const response = await this.scrapfly.automateContentExtraction({
$,
headers: {
"content-type": this.contentType,
},
maxBodyLength: Infinity,
params: {
url: this.url,
charset: this.charset,
extraction_template: this.extractionTemplate,
extraction_prompt: this.extractionPrompt,
extraction_model: this.extractionModel,
webhook_name: this.webhookName,
},
data: fs.readFileSync(checkTmp(this.body)).toString(),
});

$.export("$summary", "Successfully extracted content");
return response;
},
};
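The `checkTmp` helper imported from `common/utils.mjs` is not part of this diff. In Pipedream components it is typically a small path guard, since actions read files from the `/tmp` directory; a plausible sketch (illustrative only):

export const checkTmp = (filename) => {
  // Prefix bare or relative paths with /tmp so fs.readFileSync resolves them.
  if (!filename.startsWith("/tmp")) {
    return `/tmp/${filename}`;
  }
  return filename;
};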
156 changes: 156 additions & 0 deletions components/scrapfly/actions/scrape-page/scrape-page.mjs
@@ -0,0 +1,156 @@
import { ConfigurationError } from "@pipedream/platform";
import {
FORMAT_OPTIONS,
PROXY_COUNTRY_OPTIONS,
PROXY_POOL_OPTIONS,
} from "../../common/constants.mjs";
import { parseObject } from "../../common/utils.mjs";
import scrapfly from "../../scrapfly.app.mjs";

export default {
key: "scrapfly-scrape-page",
name: "Scrape Page",
description: "Extract data from a specified web page. [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started)",
version: "0.0.1",
type: "action",
props: {
scrapfly,
url: {
propDefinition: [
scrapfly,
"url",
],
},
headers: {
type: "object",
label: "Headers",
description: "Pass custom headers to the request.",
optional: true,
},
lang: {
type: "string",
label: "Language",
description: "Select page language. By default it uses the language of the selected proxy location. Behind the scenes, it configures the `Accept-Language` HTTP header. If the website support the language, the content will be in that lang. **Note: you cannot set headers `Accept-Language` header manually**. [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)",
optional: true,
},
os: {
type: "string",
label: "Operating System",
description: "Operating System, if not selected it's random. **Note: you cannot set os parameter and `User-Agent` header at the same time.** [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)",
optional: true,
},
timeout: {
type: "integer",
label: "Timeout",
description: "Timeout in milliseconds. It represents the maximum time allowed for Scrapfly to perform the scrape. Since `timeout` is not trivial to understand see our [extended documentation on timeouts](https://scrapfly.io/docs/scrape-api/understand-timeout)",
optional: true,
},
format: {
type: "string",
label: "Format",
description: "Format of the response.",
options: FORMAT_OPTIONS,
optional: true,
},
retry: {
type: "boolean",
label: "Retry",
description: "Improve reliability with retries on failure.",
optional: true,
},
proxifiedResponse: {
type: "boolean",
label: "Proxified Response",
description: "Return the content of the page directly.",
optional: true,
},
debug: {
type: "boolean",
label: "Debug",
description: "Store the API result and take a screenshot if rendering js is enabled.",
optional: true,
},
correlationId: {
type: "string",
label: "Correlation ID",
description: "Helper ID for correlating a group of scrapes.",
optional: true,
},
tags: {
type: "string[]",
label: "Tags",
description: "Add tags to your scrapes to group them.",
optional: true,
},
dns: {
type: "boolean",
label: "DNS",
description: "Query and retrieve target DNS information.",
optional: true,
},
ssl: {
type: "boolean",
label: "SSL",
description: "SSL option.",
optional: true,
},
proxyPool: {
type: "string",
label: "Proxy Pool",
description: "Select the proxy pool to use.",
optional: true,
options: PROXY_POOL_OPTIONS,
reloadProps: true,
},
},
async additionalProps() {
const props = {};
props.country = {
type: "string",
label: "Country",
description: "Proxy country location. If not set it chooses a random location available. A reference to a country must be ISO 3166 alpha-2 (2 letters). The available countries are defined by the proxy pool you use. [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)",
optional: true,
options: PROXY_COUNTRY_OPTIONS[this.proxyPool],
};
return props;
},
async run({ $ }) {
try {
let headers = "";
if (this.headers) {
headers = Object.keys(parseObject(this.headers))
.reduce((acc, key) => {
acc.push(`headers[${key}]=${encodeURIComponent(this.headers[key])}`);
return acc;
}, []);
}
const params = {
url: this.url,
proxy_pool: this.proxyPool,
country: this.country,
lang: this.lang,
os: this.os,
timeout: this.timeout,
format: this.format,
retry: this.retry,
proxified_response: this.proxifiedResponse,
debug: this.debug,
correlation_id: this.correlationId,
tags: parseObject(this.tags),
dns: this.dns,
ssl: this.ssl,
...headers,
};

const response = await this.scrapfly.extractWebPageContent({
$,
params,
});

$.export("$summary", `Successfully scraped content from ${this.url}`);
return response;
} catch (error) {
// Surface the API error message when available; fall back to the generic message.
throw new ConfigurationError(error.response?.data?.message || error.message);
}
},
};
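Both the `tags` and `headers` props above go through `parseObject` from `common/utils.mjs`, which is also outside this diff. A typical sketch of that utility, which normalizes props that may arrive as JSON strings, arrays, or plain objects (illustrative, not the actual file):

export const parseObject = (obj) => {
  if (!obj) {
    return undefined;
  }
  if (Array.isArray(obj)) {
    // Parse each array item individually, leaving non-JSON strings untouched.
    return obj.map((item) => {
      try {
        return JSON.parse(item);
      } catch (e) {
        return item;
      }
    });
  }
  if (typeof obj === "string") {
    try {
      return JSON.parse(obj);
    } catch (e) {
      return obj;
    }
  }
  return obj;
};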