From e8379302f688e4d38fd08475a02cc78dc0823510 Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Fri, 30 Aug 2024 12:09:13 -0300 Subject: [PATCH 1/5] scrapfly init --- .../actions/account-info/account-info.mjs | 18 ++ .../ai-data-extraction/ai-data-extraction.mjs | 74 ++++++++ .../actions/scrape-page/scrape-page.mjs | 172 ++++++++++++++++++ components/scrapfly/package.json | 2 +- components/scrapfly/scrapfly.app.mjs | 82 ++++++++- 5 files changed, 342 insertions(+), 6 deletions(-) create mode 100644 components/scrapfly/actions/account-info/account-info.mjs create mode 100644 components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs create mode 100644 components/scrapfly/actions/scrape-page/scrape-page.mjs diff --git a/components/scrapfly/actions/account-info/account-info.mjs b/components/scrapfly/actions/account-info/account-info.mjs new file mode 100644 index 0000000000000..9736c0e83edff --- /dev/null +++ b/components/scrapfly/actions/account-info/account-info.mjs @@ -0,0 +1,18 @@ +import scrapfly from "../../scrapfly.app.mjs"; +import { axios } from "@pipedream/platform"; + +export default { + key: "scrapfly-account-info", + name: "Retrieve Scrapfly Account Info", + description: "Retrieve current subscription and account usage details from Scrapfly. 
[See the documentation](https://scrapfly.io/docs/account#api)", + version: "0.0.{{ts}}", + type: "action", + props: { + scrapfly, + }, + async run({ $ }) { + const response = await this.scrapfly.getSubscriptionAndUsageDetails(); + $.export("$summary", "Successfully retrieved account information"); + return response; + }, +}; diff --git a/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs b/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs new file mode 100644 index 0000000000000..050256a364737 --- /dev/null +++ b/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs @@ -0,0 +1,74 @@ +import scrapfly from "../../scrapfly.app.mjs"; +import { axios } from "@pipedream/platform"; + +export default { + key: "scrapfly-ai-data-extraction", + name: "AI Data Extraction", + description: "Automate content extraction from any text-based source using AI, LLM, and custom parsing. [See the documentation](https://scrapfly.io/docs/extraction-api/getting-started)", + version: "0.0.{{ts}}", + type: "action", + props: { + scrapfly, + key: { + propDefinition: [ + scrapfly, + "key", + ], + }, + body: { + propDefinition: [ + scrapfly, + "body", + ], + }, + contentType: { + propDefinition: [ + scrapfly, + "contentType", + ], + }, + extractionPrompt: { + type: "string", + label: "Extraction Prompt", + description: "Instruction to extract data or ask a question on the scraped content with an LLM (Large Language Model).", + optional: true, + }, + extractionTemplate: { + type: "string", + label: "Extraction Template", + description: "Define an extraction template to get structured data. Use an ephemeral template (declared on the fly on the API call) or a stored template (declared in the dashboard) by using the template name.", + optional: true, + }, + extractionModel: { + type: "string", + label: "Extraction Model", + description: "AI Extraction to auto parse document to get structured data. 
E.g., `product`, `review`, `real-estate`, `article`.", + optional: true, + }, + charset: { + type: "string", + label: "Charset", + description: "Charset of the document passed in the body. If you are not sure, you can use the `auto` value and Scrapfly will try to detect it.", + default: "auto", + optional: true, + }, + }, + async run({ $ }) { + const params = { + extraction_prompt: this.extractionPrompt, + extraction_template: this.extractionTemplate, + extraction_model: this.extractionModel, + charset: this.charset, + }; + + const response = await this.scrapfly.automateContentExtraction({ + key: this.key, + body: this.body, + contentType: this.contentType, + ...params, + }); + + $.export("$summary", "Successfully extracted content"); + return response; + }, +}; diff --git a/components/scrapfly/actions/scrape-page/scrape-page.mjs b/components/scrapfly/actions/scrape-page/scrape-page.mjs new file mode 100644 index 0000000000000..683d694b17bb3 --- /dev/null +++ b/components/scrapfly/actions/scrape-page/scrape-page.mjs @@ -0,0 +1,172 @@ +import scrapfly from "../../scrapfly.app.mjs"; +import { axios } from "@pipedream/platform"; + +export default { + key: "scrapfly-scrape-page", + name: "Scrape Page", + description: "Extract data from a specified web page. [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started)", + version: "0.0.{{ts}}", + type: "action", + props: { + scrapfly, + url: { + propDefinition: [ + scrapfly, + "url", + ], + }, + key: { + propDefinition: [ + scrapfly, + "key", + ], + }, + contentType: { + propDefinition: [ + scrapfly, + "contentType", + ], + optional: true, + }, + body: { + propDefinition: [ + scrapfly, + "body", + ], + optional: true, + }, + proxyPool: { + type: "string", + label: "Proxy Pool", + description: "Select the proxy pool to use.", + optional: true, + }, + headers: { + type: "string[]", + label: "Headers", + description: "Pass custom headers to the request. 
Must be URL encoded.", + optional: true, + }, + country: { + type: "string", + label: "Country", + description: "Proxy country location.", + optional: true, + }, + lang: { + type: "string", + label: "Language", + description: "Select page language.", + optional: true, + }, + os: { + type: "string", + label: "Operating System", + description: "Operating System, if not selected it's random.", + optional: true, + }, + timeout: { + type: "integer", + label: "Timeout", + description: "Timeout in milliseconds.", + optional: true, + }, + format: { + type: "string", + label: "Format", + description: "Format of the response.", + options: [ + "raw", + "text", + "markdown", + "clean_html", + "json", + ], + optional: true, + }, + retry: { + type: "boolean", + label: "Retry", + description: "Improve reliability with retries on failure.", + optional: true, + default: true, + }, + proxifiedResponse: { + type: "boolean", + label: "Proxified Response", + description: "Return the content of the page directly.", + optional: true, + default: false, + }, + debug: { + type: "boolean", + label: "Debug", + description: "Store the API result and take a screenshot if rendering js is enabled.", + optional: true, + default: false, + }, + correlationId: { + type: "string", + label: "Correlation ID", + description: "Helper ID for correlating a group of scrapes.", + optional: true, + }, + tags: { + type: "string[]", + label: "Tags", + description: "Add tags to your scrapes to group them.", + optional: true, + }, + dns: { + type: "boolean", + label: "DNS", + description: "Query and retrieve target DNS information.", + optional: true, + default: false, + }, + ssl: { + type: "boolean", + label: "SSL", + description: "SSL option.", + optional: true, + default: true, + }, + }, + async run({ $ }) { + const params = { + proxy_pool: this.proxyPool, + country: this.country, + lang: this.lang, + os: this.os, + timeout: this.timeout, + format: this.format, + retry: this.retry, + proxified_response: 
this.proxifiedResponse, + debug: this.debug, + correlation_id: this.correlationId, + tags: this.tags, + dns: this.dns, + ssl: this.ssl, + }; + + if (this.headers) { + params.headers = this.headers.reduce((acc, header) => { + const [ + key, + value, + ] = header.split("="); + acc[key] = value; + return acc; + }, {}); + } + + const response = await this.scrapfly.extractWebPageContent({ + url: this.url, + key: this.key, + ...params, + }); + + $.export("$summary", `Successfully scraped content from ${this.url}`); + return response; + }, +}; diff --git a/components/scrapfly/package.json b/components/scrapfly/package.json index 3f16ae334d2ad..baaaf2eded7a4 100644 --- a/components/scrapfly/package.json +++ b/components/scrapfly/package.json @@ -12,4 +12,4 @@ "publishConfig": { "access": "public" } -} \ No newline at end of file +} diff --git a/components/scrapfly/scrapfly.app.mjs b/components/scrapfly/scrapfly.app.mjs index fdde809425126..de6cc2e549d8a 100644 --- a/components/scrapfly/scrapfly.app.mjs +++ b/components/scrapfly/scrapfly.app.mjs @@ -1,11 +1,83 @@ +import { axios } from "@pipedream/platform"; + export default { type: "app", app: "scrapfly", - propDefinitions: {}, + propDefinitions: { + url: { + type: "string", + label: "URL", + description: "The URL of the web page to extract data from", + }, + key: { + type: "string", + label: "API Key", + description: "Your Scrapfly API key", + }, + body: { + type: "string", + label: "Body", + description: "The content of the page you want to extract data from", + }, + contentType: { + type: "string", + label: "Content Type", + description: "The content type of the document passed in the body", + options: [ + "text/html", + "text/markdown", + "text/plain", + "application/xml", + ], + }, + }, methods: { - // this.$auth contains connected account data - authKeys() { - console.log(Object.keys(this.$auth)); + _baseUrl() { + return "https://api.scrapfly.io"; + }, + async _makeRequest(opts = {}) { + const { + $ = this, method = 
"GET", path = "/", headers, ...otherOpts + } = opts; + return axios($, { + ...otherOpts, + method, + url: this._baseUrl() + path, + headers: { + ...headers, + Authorization: `Bearer ${this.$auth.api_key}`, + }, + }); + }, + async getSubscriptionAndUsageDetails() { + return this._makeRequest({ + path: "/account", + }); + }, + async extractWebPageContent({ + url, key, ...params + }) { + return this._makeRequest({ + method: "GET", + path: `/scrape?url=${encodeURIComponent(url)}&key=${key}`, + params, + }); + }, + async automateContentExtraction({ + key, body, contentType, ...params + }) { + return this._makeRequest({ + method: "POST", + path: "/extraction", + headers: { + "Content-Type": contentType, + }, + data: body, + params: { + key, + ...params, + }, + }); }, }, -}; \ No newline at end of file +}; From 46323660c239fb9da3ef7fd9b20d7d71d193da69 Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Mon, 2 Sep 2024 13:52:47 -0300 Subject: [PATCH 2/5] [Components] scrapfly #13774 Actions - Account Info - Scrape Page - AI Data Extraction --- .../actions/account-info/account-info.mjs | 7 +- .../ai-data-extraction/ai-data-extraction.mjs | 61 +++++---- .../actions/scrape-page/scrape-page.mjs | 123 +++++++----------- components/scrapfly/common/constants.mjs | 24 ++++ components/scrapfly/common/utils.mjs | 31 +++++ components/scrapfly/package.json | 6 +- components/scrapfly/scrapfly.app.mjs | 63 ++++----- 7 files changed, 173 insertions(+), 142 deletions(-) create mode 100644 components/scrapfly/common/constants.mjs create mode 100644 components/scrapfly/common/utils.mjs diff --git a/components/scrapfly/actions/account-info/account-info.mjs b/components/scrapfly/actions/account-info/account-info.mjs index 9736c0e83edff..4d7c4969cdfd7 100644 --- a/components/scrapfly/actions/account-info/account-info.mjs +++ b/components/scrapfly/actions/account-info/account-info.mjs @@ -1,17 +1,18 @@ import scrapfly from "../../scrapfly.app.mjs"; -import { axios } from "@pipedream/platform"; 
export default { key: "scrapfly-account-info", name: "Retrieve Scrapfly Account Info", description: "Retrieve current subscription and account usage details from Scrapfly. [See the documentation](https://scrapfly.io/docs/account#api)", - version: "0.0.{{ts}}", + version: "0.0.1", type: "action", props: { scrapfly, }, async run({ $ }) { - const response = await this.scrapfly.getSubscriptionAndUsageDetails(); + const response = await this.scrapfly.getAccountInfo({ + $, + }); $.export("$summary", "Successfully retrieved account information"); return response; }, diff --git a/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs b/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs index 050256a364737..32aeb46450c94 100644 --- a/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs +++ b/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs @@ -1,20 +1,15 @@ +import fs from "fs"; +import { checkTmp } from "../../common/utils.mjs"; import scrapfly from "../../scrapfly.app.mjs"; -import { axios } from "@pipedream/platform"; export default { key: "scrapfly-ai-data-extraction", name: "AI Data Extraction", description: "Automate content extraction from any text-based source using AI, LLM, and custom parsing. [See the documentation](https://scrapfly.io/docs/extraction-api/getting-started)", - version: "0.0.{{ts}}", + version: "0.0.1", type: "action", props: { scrapfly, - key: { - propDefinition: [ - scrapfly, - "key", - ], - }, body: { propDefinition: [ scrapfly, @@ -27,10 +22,17 @@ export default { "contentType", ], }, - extractionPrompt: { + url: { + propDefinition: [ + scrapfly, + "url", + ], + }, + charset: { type: "string", - label: "Extraction Prompt", - description: "Instruction to extract data or ask a question on the scraped content with an LLM (Large Language Model).", + label: "Charset", + description: "Charset of the document pass in the body. 
If you are not sure, you can use the `auto` value and we will try to detect it. Bad charset can lead to bad extraction, so it's important to set it correctly. **The most common charset is `utf-8` for text document and `ascii` for binary**. The symptom of a bad charset is that the text is not correctly displayed (accent, special characters, etc).", + default: "auto", optional: true, }, extractionTemplate: { @@ -39,33 +41,40 @@ export default { description: "Define an extraction template to get structured data. Use an ephemeral template (declared on the fly on the API call) or a stored template (declared in the dashboard) by using the template name.", optional: true, }, + extractionPrompt: { + type: "string", + label: "Extraction Prompt", + description: "Instruction to extract data or ask a question on the scraped content with an LLM (Large Language Model). [Must be url encoded](https://scrapfly.io/web-scraping-tools/urlencode).", + }, extractionModel: { type: "string", label: "Extraction Model", description: "AI Extraction to auto parse document to get structured data. E.g., `product`, `review`, `real-estate`, `article`.", optional: true, }, - charset: { + webhookName: { type: "string", - label: "Charset", - description: "Charset of the document passed in the body. If you are not sure, you can use the `auto` value and Scrapfly will try to detect it.", - default: "auto", + label: "Webhook Name", + description: "Queue you scrape request and redirect API response to a provided webhook endpoint. You can create a webhook endpoint from your `dashboard`, it takes the name of the webhook. 
Webhooks are scoped to the given project/env.", optional: true, }, }, async run({ $ }) { - const params = { - extraction_prompt: this.extractionPrompt, - extraction_template: this.extractionTemplate, - extraction_model: this.extractionModel, - charset: this.charset, - }; - const response = await this.scrapfly.automateContentExtraction({ - key: this.key, - body: this.body, - contentType: this.contentType, - ...params, + $, + headers: { + "content-type": this.contentType, + }, + maxBodyLength: Infinity, + params: { + url: this.url, + charset: this.charset, + extraction_template: this.extractionTemplate, + extraction_prompt: this.extractionPrompt, + extraction_model: this.extractionModel, + webhook_name: this.webhookName, + }, + data: fs.readFileSync(checkTmp(this.body)).toString(), }); $.export("$summary", "Successfully extracted content"); diff --git a/components/scrapfly/actions/scrape-page/scrape-page.mjs b/components/scrapfly/actions/scrape-page/scrape-page.mjs index 683d694b17bb3..42cb0be647f0a 100644 --- a/components/scrapfly/actions/scrape-page/scrape-page.mjs +++ b/components/scrapfly/actions/scrape-page/scrape-page.mjs @@ -1,11 +1,16 @@ +import { ConfigurationError } from "@pipedream/platform"; +import { + FORMAT_OPTIONS, + PROXY_POOL_OPTIONS, +} from "../../common/constants.mjs"; +import { parseObject } from "../../common/utils.mjs"; import scrapfly from "../../scrapfly.app.mjs"; -import { axios } from "@pipedream/platform"; export default { key: "scrapfly-scrape-page", name: "Scrape Page", description: "Extract data from a specified web page. 
[See the documentation](https://scrapfly.io/docs/scrape-api/getting-started)", - version: "0.0.{{ts}}", + version: "0.0.1", type: "action", props: { scrapfly, @@ -15,73 +20,48 @@ export default { "url", ], }, - key: { - propDefinition: [ - scrapfly, - "key", - ], - }, - contentType: { - propDefinition: [ - scrapfly, - "contentType", - ], - optional: true, - }, - body: { - propDefinition: [ - scrapfly, - "body", - ], - optional: true, - }, proxyPool: { type: "string", label: "Proxy Pool", description: "Select the proxy pool to use.", optional: true, + options: PROXY_POOL_OPTIONS, }, headers: { - type: "string[]", + type: "object", label: "Headers", - description: "Pass custom headers to the request. Must be URL encoded.", + description: "Pass custom headers to the request.", optional: true, }, country: { type: "string", label: "Country", - description: "Proxy country location.", + description: "Proxy country location. If not set it chooses a random location available. A reference to a country must be ISO 3166 alpha-2 (2 letters). The available countries are defined by the proxy pool you use. [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)", optional: true, }, lang: { type: "string", label: "Language", - description: "Select page language.", + description: "Select page language. By default it uses the language of the selected proxy location. Behind the scenes, it configures the `Accept-Language` HTTP header. If the website support the language, the content will be in that lang. **Note: you cannot set headers `Accept-Language` header manually**. [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)", optional: true, }, os: { type: "string", label: "Operating System", - description: "Operating System, if not selected it's random.", + description: "Operating System, if not selected it's random. 
**Note: you cannot set os parameter and `User-Agent` header at the same time.** [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)", optional: true, }, timeout: { type: "integer", label: "Timeout", - description: "Timeout in milliseconds.", + description: "Timeout in milliseconds. It represents the maximum time allowed for Scrapfly to perform the scrape. Since `timeout` is not trivial to understand see our [extended documentation on timeouts](https://scrapfly.io/docs/scrape-api/understand-timeout)", optional: true, }, format: { type: "string", label: "Format", description: "Format of the response.", - options: [ - "raw", - "text", - "markdown", - "clean_html", - "json", - ], + options: FORMAT_OPTIONS, optional: true, }, retry: { @@ -89,21 +69,18 @@ export default { label: "Retry", description: "Improve reliability with retries on failure.", optional: true, - default: true, }, proxifiedResponse: { type: "boolean", label: "Proxified Response", description: "Return the content of the page directly.", optional: true, - default: false, }, debug: { type: "boolean", label: "Debug", description: "Store the API result and take a screenshot if rendering js is enabled.", optional: true, - default: false, }, correlationId: { type: "string", @@ -122,51 +99,51 @@ export default { label: "DNS", description: "Query and retrieve target DNS information.", optional: true, - default: false, }, ssl: { type: "boolean", label: "SSL", description: "SSL option.", optional: true, - default: true, }, }, async run({ $ }) { - const params = { - proxy_pool: this.proxyPool, - country: this.country, - lang: this.lang, - os: this.os, - timeout: this.timeout, - format: this.format, - retry: this.retry, - proxified_response: this.proxifiedResponse, - debug: this.debug, - correlation_id: this.correlationId, - tags: this.tags, - dns: this.dns, - ssl: this.ssl, - }; - - if (this.headers) { - params.headers = this.headers.reduce((acc, header) => { - const [ - key, - value, 
- ] = header.split("="); - acc[key] = value; - return acc; - }, {}); - } + try { + let headers = ""; + if (this.headers) { + headers = Object.keys(parseObject(this.headers)) + .reduce((acc, key) => { + acc += `headers[${key}]=${encodeURIComponent(this.headers[key])}`; + return acc; + }, ""); + } + const params = { + url: this.url, + proxy_pool: this.proxyPool, + country: this.country, + lang: this.lang, + os: this.os, + timeout: this.timeout, + format: this.format, + retry: this.retry, + proxified_response: this.proxifiedResponse, + debug: this.debug, + correlation_id: this.correlationId, + tags: parseObject(this.tags), + dns: this.dns, + ssl: this.ssl, + ...headers, + }; - const response = await this.scrapfly.extractWebPageContent({ - url: this.url, - key: this.key, - ...params, - }); + const response = await this.scrapfly.extractWebPageContent({ + $, + params, + }); - $.export("$summary", `Successfully scraped content from ${this.url}`); - return response; + $.export("$summary", `Successfully scraped content from ${this.url}`); + return response; + } catch ({ response: { data: { message } } }) { + throw new ConfigurationError(message); + } }, }; diff --git a/components/scrapfly/common/constants.mjs b/components/scrapfly/common/constants.mjs new file mode 100644 index 0000000000000..640d307bef44a --- /dev/null +++ b/components/scrapfly/common/constants.mjs @@ -0,0 +1,24 @@ +export const PROXY_POOL_OPTIONS = [ + "public_datacenter_pool", + "public_resitential_pool", +]; + +export const FORMAT_OPTIONS = [ + "raw", + "text", + "markdown", + "markdown:no_links,no_imagesLLM", + "clean_html", + "json", +]; + +export const CONTENT_TYPE_OPTIONS = [ + "application/json", + "application/jsonld", + "application/xml", + "text/plain", + "text/html", + "text/markdown", + "text/csv", + "application/xhtml+xml", +]; diff --git a/components/scrapfly/common/utils.mjs b/components/scrapfly/common/utils.mjs new file mode 100644 index 0000000000000..0cd1a12b6a4ba --- /dev/null +++ 
b/components/scrapfly/common/utils.mjs @@ -0,0 +1,31 @@ +export const checkTmp = (filename) => { + if (!filename.startsWith("/tmp")) { + return `/tmp/${filename}`; + } + return filename; +}; + +export const parseObject = (obj) => { + if (!obj) return undefined; + + if (Array.isArray(obj)) { + return obj.map((item) => { + if (typeof item === "string") { + try { + return JSON.parse(item); + } catch (e) { + return item; + } + } + return item; + }); + } + if (typeof obj === "string") { + try { + return JSON.parse(obj); + } catch (e) { + return obj; + } + } + return obj; +}; diff --git a/components/scrapfly/package.json b/components/scrapfly/package.json index baaaf2eded7a4..f2bc0dcc759f4 100644 --- a/components/scrapfly/package.json +++ b/components/scrapfly/package.json @@ -1,6 +1,6 @@ { "name": "@pipedream/scrapfly", - "version": "0.0.1", + "version": "0.1.0", "description": "Pipedream Scrapfly Components", "main": "scrapfly.app.mjs", "keywords": [ @@ -11,5 +11,9 @@ "author": "Pipedream (https://pipedream.com/)", "publishConfig": { "access": "public" + }, + "dependencies": { + "@pipedream/platform": "^3.0.1" } } + diff --git a/components/scrapfly/scrapfly.app.mjs b/components/scrapfly/scrapfly.app.mjs index de6cc2e549d8a..53e4c77fa946b 100644 --- a/components/scrapfly/scrapfly.app.mjs +++ b/components/scrapfly/scrapfly.app.mjs @@ -1,4 +1,5 @@ import { axios } from "@pipedream/platform"; +import { CONTENT_TYPE_OPTIONS } from "./common/constants.mjs"; export default { type: "app", @@ -7,76 +8,60 @@ export default { url: { type: "string", label: "URL", - description: "The URL of the web page to extract data from", - }, - key: { - type: "string", - label: "API Key", - description: "Your Scrapfly API key", + description: "This URL is used to transform any relative URLs in the document into absolute URLs automatically. It can be either the base URL or the exact URL of the document. 
[Must be url encoded](https://scrapfly.io/web-scraping-tools/urlencode).", }, body: { type: "string", label: "Body", - description: "The content of the page you want to extract data from", + description: "The request body must contain the content of the page you want to extract data from. The content must be in the format specified by the `content-type` header or via the `content_type` HTTP parameter. Provide a file from `/tmp`. To upload a file to `/tmp` folder, please follow the doc [here](https://pipedream.com/docs/code/nodejs/working-with-files/#writing-a-file-to-tmp)", }, contentType: { type: "string", label: "Content Type", - description: "The content type of the document passed in the body", - options: [ - "text/html", - "text/markdown", - "text/plain", - "application/xml", - ], + description: "Content type of the document pass in the body - You must specify the content type of the document by using this parameter or via the `content-type` header. This parameter has priority over the `content-type` header.", + options: CONTENT_TYPE_OPTIONS, }, }, methods: { _baseUrl() { return "https://api.scrapfly.io"; }, - async _makeRequest(opts = {}) { - const { - $ = this, method = "GET", path = "/", headers, ...otherOpts - } = opts; + _params(params = {}) { + return { + ...params, + key: `${this.$auth.api_key}`, + }; + }, + _makeRequest({ + $ = this, params, path, ...opts + }) { return axios($, { - ...otherOpts, - method, url: this._baseUrl() + path, - headers: { - ...headers, - Authorization: `Bearer ${this.$auth.api_key}`, - }, + params: this._params(params), + ...opts, }); }, - async getSubscriptionAndUsageDetails() { + getAccountInfo(opts = {}) { return this._makeRequest({ path: "/account", + ...opts, }); }, - async extractWebPageContent({ - url, key, ...params + extractWebPageContent({ + params, ...opts }) { return this._makeRequest({ method: "GET", - path: `/scrape?url=${encodeURIComponent(url)}&key=${key}`, + path: "/scrape", params, + ...opts, }); }, - async 
automateContentExtraction({ - key, body, contentType, ...params - }) { + automateContentExtraction(opts = {}) { return this._makeRequest({ method: "POST", path: "/extraction", - headers: { - "Content-Type": contentType, - }, - data: body, - params: { - key, - ...params, - }, + ...opts, }); }, }, From 2afb3ddef2347678d4751a493bb5db77f772a0b7 Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Mon, 2 Sep 2024 13:54:11 -0300 Subject: [PATCH 3/5] pnpm update --- pnpm-lock.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b94c3b11b319e..21b86670bb842 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8221,7 +8221,10 @@ importers: specifiers: {} components/scrapfly: - specifiers: {} + specifiers: + '@pipedream/platform': ^3.0.1 + dependencies: + '@pipedream/platform': 3.0.1 components/scrapingant: specifiers: {} From a314f743e0686a63f05110d1ebe39d9eefa6d516 Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Mon, 2 Sep 2024 14:13:39 -0300 Subject: [PATCH 4/5] Update components/scrapfly/common/constants.mjs Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- components/scrapfly/common/constants.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/scrapfly/common/constants.mjs b/components/scrapfly/common/constants.mjs index 640d307bef44a..5abdd1402a944 100644 --- a/components/scrapfly/common/constants.mjs +++ b/components/scrapfly/common/constants.mjs @@ -1,6 +1,6 @@ export const PROXY_POOL_OPTIONS = [ "public_datacenter_pool", - "public_resitential_pool", + "public_residential_pool", ]; export const FORMAT_OPTIONS = [ From 56e2f2f1de315dd4092a2ef9798f56d55024b40c Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Wed, 4 Sep 2024 15:30:59 -0300 Subject: [PATCH 5/5] some adjusts --- .../ai-data-extraction/ai-data-extraction.mjs | 5 + .../actions/scrape-page/scrape-page.mjs | 39 +- components/scrapfly/common/constants.mjs | 707 
++++++++++++++++++ 3 files changed, 735 insertions(+), 16 deletions(-) diff --git a/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs b/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs index 32aeb46450c94..0e2078c4135c7 100644 --- a/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs +++ b/components/scrapfly/actions/ai-data-extraction/ai-data-extraction.mjs @@ -1,3 +1,4 @@ +import { ConfigurationError } from "@pipedream/platform"; import fs from "fs"; import { checkTmp } from "../../common/utils.mjs"; import scrapfly from "../../scrapfly.app.mjs"; @@ -45,6 +46,7 @@ export default { type: "string", label: "Extraction Prompt", description: "Instruction to extract data or ask a question on the scraped content with an LLM (Large Language Model). [Must be url encoded](https://scrapfly.io/web-scraping-tools/urlencode).", + optional: true, }, extractionModel: { type: "string", @@ -60,6 +62,9 @@ export default { }, }, async run({ $ }) { + if (!this.extractionTemplate && !this.extractionPrompt && !this.extractionModel) { + throw new ConfigurationError("You must provide at least **Extraction Template**, **Extraction Prompt** or **Extraction Model**"); + } const response = await this.scrapfly.automateContentExtraction({ $, headers: { diff --git a/components/scrapfly/actions/scrape-page/scrape-page.mjs b/components/scrapfly/actions/scrape-page/scrape-page.mjs index 42cb0be647f0a..554872c4a1c1f 100644 --- a/components/scrapfly/actions/scrape-page/scrape-page.mjs +++ b/components/scrapfly/actions/scrape-page/scrape-page.mjs @@ -1,6 +1,7 @@ import { ConfigurationError } from "@pipedream/platform"; import { FORMAT_OPTIONS, + PROXY_COUNTRY_OPTIONS, PROXY_POOL_OPTIONS, } from "../../common/constants.mjs"; import { parseObject } from "../../common/utils.mjs"; @@ -20,25 +21,12 @@ export default { "url", ], }, - proxyPool: { - type: "string", - label: "Proxy Pool", - description: "Select the proxy pool to use.", - optional: 
true, - options: PROXY_POOL_OPTIONS, - }, headers: { type: "object", label: "Headers", description: "Pass custom headers to the request.", optional: true, }, - country: { - type: "string", - label: "Country", - description: "Proxy country location. If not set it chooses a random location available. A reference to a country must be ISO 3166 alpha-2 (2 letters). The available countries are defined by the proxy pool you use. [See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)", - optional: true, - }, lang: { type: "string", label: "Language", @@ -106,6 +94,25 @@ export default { description: "SSL option.", optional: true, }, + proxyPool: { + type: "string", + label: "Proxy Pool", + description: "Select the proxy pool to use.", + optional: true, + options: PROXY_POOL_OPTIONS, + reloadProps: true, + }, + }, + async additionalProps() { + const props = {}; + props.country = { + type: "string", + label: "Country", + description: "Proxy country location. If not set it chooses a random location available. A reference to a country must be ISO 3166 alpha-2 (2 letters). The available countries are defined by the proxy pool you use. 
[See the documentation](https://scrapfly.io/docs/scrape-api/getting-started#spec)", + optional: true, + options: PROXY_COUNTRY_OPTIONS[this.proxyPool], + }; + return props; }, async run({ $ }) { try { @@ -113,9 +120,9 @@ export default { if (this.headers) { headers = Object.keys(parseObject(this.headers)) .reduce((acc, key) => { - acc += `headers[${key}]=${encodeURIComponent(this.headers[key])}`; + acc.push(`headers[${key}]=${encodeURIComponent(this.headers[key])}`); return acc; - }, ""); + }, []); } const params = { url: this.url, @@ -132,7 +139,7 @@ export default { tags: parseObject(this.tags), dns: this.dns, ssl: this.ssl, - ...headers, + headers, }; const response = await this.scrapfly.extractWebPageContent({ diff --git a/components/scrapfly/common/constants.mjs b/components/scrapfly/common/constants.mjs index 5abdd1402a944..625839df3ff85 100644 --- a/components/scrapfly/common/constants.mjs +++ b/components/scrapfly/common/constants.mjs @@ -22,3 +22,710 @@ export const CONTENT_TYPE_OPTIONS = [ "text/csv", "application/xhtml+xml", ]; + +export const PROXY_COUNTRY_OPTIONS = { + public_datacenter_pool: [ + { + label: "Albania", + value: "al", + }, + { + label: "Armenia", + value: "am", + }, + { + label: "Argentina", + value: "ar", + }, + { + label: "Austria", + value: "at", + }, + { + label: "Australia", + value: "au", + }, + { + label: "Belgium", + value: "be", + }, + { + label: "Bulgaria", + value: "bg", + }, + { + label: "Bolivia", + value: "bo", + }, + { + label: "Brazil", + value: "br", + }, + { + label: "Belarus", + value: "by", + }, + { + label: "Canada", + value: "ca", + }, + { + label: "Switzerland", + value: "ch", + }, + { + label: "Chile", + value: "cl", + }, + { + label: "China", + value: "cn", + }, + { + label: "Colombia", + value: "co", + }, + { + label: "Czechia", + value: "cz", + }, + { + label: "Germany", + value: "de", + }, + { + label: "Denmark", + value: "dk", + }, + { + label: "Ecuador", + value: "ec", + }, + { + label: "Estonia", + value: 
"ee", + }, + { + label: "Spain", + value: "es", + }, + { + label: "Finland", + value: "fi", + }, + { + label: "France", + value: "fr", + }, + { + label: "United Kingdom", + value: "gb", + }, + { + label: "Georgia", + value: "ge", + }, + { + label: "Greece", + value: "gr", + }, + { + label: "Croatia", + value: "hr", + }, + { + label: "Hungary", + value: "hu", + }, + { + label: "Ireland", + value: "ie", + }, + { + label: "Israel", + value: "il", + }, + { + label: "India", + value: "in", + }, + { + label: "Iceland", + value: "is", + }, + { + label: "Italy", + value: "it", + }, + { + label: "Japan", + value: "jp", + }, + { + label: "South Korea", + value: "kr", + }, + { + label: "Lithuania", + value: "lt", + }, + { + label: "Latvia", + value: "lv", + }, + { + label: "Mexico", + value: "mx", + }, + { + label: "Netherlands", + value: "nl", + }, + { + label: "Norway", + value: "no", + }, + { + label: "New Zealand", + value: "nz", + }, + { + label: "Peru", + value: "pe", + }, + { + label: "Pakistan", + value: "pk", + }, + { + label: "Poland", + value: "pl", + }, + { + label: "Portugal", + value: "pt", + }, + { + label: "Romania", + value: "ro", + }, + { + label: "Russia", + value: "ru", + }, + { + label: "Saudi Arabia", + value: "sa", + }, + { + label: "Sweden", + value: "se", + }, + { + label: "Slovakia", + value: "sk", + }, + { + label: "Türkiye", + value: "tr", + }, + { + label: "Ukraine", + value: "ua", + }, + { + label: "United States", + value: "us", + }, + ], + public_residential_pool: [ + { + label: "Andorra", + value: "ad", + }, + { + label: "United Arab Emirates", + value: "ae", + }, + { + label: "Afghanistan", + value: "af", + }, + { + label: "Albania", + value: "al", + }, + { + label: "Armenia", + value: "am", + }, + { + label: "Angola", + value: "ao", + }, + { + label: "Argentina", + value: "ar", + }, + { + label: "Austria", + value: "at", + }, + { + label: "Australia", + value: "au", + }, + { + label: "Aruba", + value: "aw", + }, + { + label: "Azerbaijan", + 
value: "az", + }, + { + label: "Bosnia & Herzegovina", + value: "ba", + }, + { + label: "Bangladesh", + value: "bd", + }, + { + label: "Belgium", + value: "be", + }, + { + label: "Bulgaria", + value: "bg", + }, + { + label: "Bahrain", + value: "bh", + }, + { + label: "Benin", + value: "bj", + }, + { + label: "Bolivia", + value: "bo", + }, + { + label: "Brazil", + value: "br", + }, + { + label: "Bahamas", + value: "bs", + }, + { + label: "Bhutan", + value: "bt", + }, + { + label: "Belarus", + value: "by", + }, + { + label: "Belize", + value: "bz", + }, + { + label: "Canada", + value: "ca", + }, + { + label: "Switzerland", + value: "ch", + }, + { + label: "Côte d’Ivoire", + value: "ci", + }, + { + label: "Chile", + value: "cl", + }, + { + label: "China", + value: "cn", + }, + { + label: "Colombia", + value: "co", + }, + { + label: "Costa Rica", + value: "cr", + }, + { + label: "Cuba", + value: "cu", + }, + { + label: "Czechia", + value: "cz", + }, + { + label: "Germany", + value: "de", + }, + { + label: "Denmark", + value: "dk", + }, + { + label: "Dominica", + value: "dm", + }, + { + label: "Ecuador", + value: "ec", + }, + { + label: "Estonia", + value: "ee", + }, + { + label: "Egypt", + value: "eg", + }, + { + label: "Spain", + value: "es", + }, + { + label: "Ethiopia", + value: "et", + }, + { + label: "Finland", + value: "fi", + }, + { + label: "Fiji", + value: "fj", + }, + { + label: "France", + value: "fr", + }, + { + label: "United Kingdom", + value: "gb", + }, + { + label: "Georgia", + value: "ge", + }, + { + label: "Ghana", + value: "gh", + }, + { + label: "Gambia", + value: "gm", + }, + { + label: "Greece", + value: "gr", + }, + { + label: "Hong Kong SAR China", + value: "hk", + }, + { + label: "Honduras", + value: "hn", + }, + { + label: "Croatia", + value: "hr", + }, + { + label: "Haiti", + value: "ht", + }, + { + label: "Hungary", + value: "hu", + }, + { + label: "Indonesia", + value: "id", + }, + { + label: "Ireland", + value: "ie", + }, + { + label: 
"Israel", + value: "il", + }, + { + label: "India", + value: "in", + }, + { + label: "Iraq", + value: "iq", + }, + { + label: "Iran", + value: "ir", + }, + { + label: "Iceland", + value: "is", + }, + { + label: "Italy", + value: "it", + }, + { + label: "Jordan", + value: "jo", + }, + { + label: "Japan", + value: "jp", + }, + { + label: "Kenya", + value: "ke", + }, + { + label: "Cambodia", + value: "kh", + }, + { + label: "South Korea", + value: "kr", + }, + { + label: "Kazakhstan", + value: "kz", + }, + { + label: "Lebanon", + value: "lb", + }, + { + label: "Liberia", + value: "lr", + }, + { + label: "Lithuania", + value: "lt", + }, + { + label: "Latvia", + value: "lv", + }, + { + label: "Morocco", + value: "ma", + }, + { + label: "Monaco", + value: "mc", + }, + { + label: "Madagascar", + value: "mg", + }, + { + label: "North Macedonia", + value: "mk", + }, + { + label: "Mongolia", + value: "mn", + }, + { + label: "Mauritania", + value: "mr", + }, + { + label: "Malta", + value: "mt", + }, + { + label: "Mauritius", + value: "mu", + }, + { + label: "Maldives", + value: "mv", + }, + { + label: "Mexico", + value: "mx", + }, + { + label: "Malaysia", + value: "my", + }, + { + label: "Mozambique", + value: "mz", + }, + { + label: "Nigeria", + value: "ng", + }, + { + label: "Netherlands", + value: "nl", + }, + { + label: "Norway", + value: "no", + }, + { + label: "New Zealand", + value: "nz", + }, + { + label: "Oman", + value: "om", + }, + { + label: "Panama", + value: "pa", + }, + { + label: "Peru", + value: "pe", + }, + { + label: "Philippines", + value: "ph", + }, + { + label: "Pakistan", + value: "pk", + }, + { + label: "Poland", + value: "pl", + }, + { + label: "Puerto Rico", + value: "pr", + }, + { + label: "Portugal", + value: "pt", + }, + { + label: "Paraguay", + value: "py", + }, + { + label: "Qatar", + value: "qa", + }, + { + label: "Romania", + value: "ro", + }, + { + label: "Serbia", + value: "rs", + }, + { + label: "Russia", + value: "ru", + }, + { + label: 
"Saudi Arabia", + value: "sa", + }, + { + label: "Seychelles", + value: "sc", + }, + { + label: "Sudan", + value: "sd", + }, + { + label: "Sweden", + value: "se", + }, + { + label: "Singapore", + value: "sg", + }, + { + label: "Slovenia", + value: "si", + }, + { + label: "Slovakia", + value: "sk", + }, + { + label: "Senegal", + value: "sn", + }, + { + label: "South Sudan", + value: "ss", + }, + { + label: "Tunisia", + value: "tn", + }, + { + label: "Türkiye", + value: "tr", + }, + { + label: "Taiwan", + value: "tw", + }, + { + label: "Ukraine", + value: "ua", + }, + { + label: "Uganda", + value: "ug", + }, + { + label: "United States", + value: "us", + }, + { + label: "Uruguay", + value: "uy", + }, + { + label: "Uzbekistan", + value: "uz", + }, + { + label: "Venezuela", + value: "ve", + }, + { + label: "British Virgin Islands", + value: "vg", + }, + { + label: "Vietnam", + value: "vn", + }, + { + label: "Yemen", + value: "ye", + }, + { + label: "South Africa", + value: "za", + }, + ], +};