From 680a81550011a2d63a8e595863e7c44b1ce20282 Mon Sep 17 00:00:00 2001 From: Michelle Bergeron Date: Thu, 28 Aug 2025 13:36:59 -0400 Subject: [PATCH 1/2] update scrape-single-url --- .../scrape-single-url/scrape-single-url.mjs | 44 +++---------------- components/apify/package.json | 5 ++- 2 files changed, 9 insertions(+), 40 deletions(-) diff --git a/components/apify/actions/scrape-single-url/scrape-single-url.mjs b/components/apify/actions/scrape-single-url/scrape-single-url.mjs index b4c73f0e540cc..ebe1877452162 100644 --- a/components/apify/actions/scrape-single-url/scrape-single-url.mjs +++ b/components/apify/actions/scrape-single-url/scrape-single-url.mjs @@ -1,11 +1,11 @@ import apify from "../../apify.app.mjs"; -import { ACTOR_ID } from "../../common/constants.mjs"; +import { gotScraping } from "got-scraping"; export default { key: "apify-scrape-single-url", name: "Scrape Single URL", - description: "Executes a scraper on a specific website and returns its content as text. This action is perfect for extracting content from a single page.", - version: "0.0.4", + description: "Executes a scraper on a specific website and returns its content as HTML. This action is perfect for extracting content from a single page. [See the documentation](https://docs.apify.com/sdk/js/docs/examples/crawl-single-url)", + version: "0.1.0", type: "action", props: { apify, @@ -13,45 +13,13 @@ export default { type: "string", label: "URL", description: "The URL of the web page to scrape.", - optional: false, - }, - crawlerType: { - type: "string", - label: "Crawler Type", - description: "Select the crawling engine:\n- **Headless web browser** - Useful for modern websites with anti-scraping protections and JavaScript rendering. It recognizes common blocking patterns like CAPTCHAs and automatically retries blocked requests through new sessions. However, running web browsers is more expensive as it requires more computing resources and is slower. It is recommended to use at least 8 GB of RAM.\n- **Stealthy web browser** (default) - Another headless web browser with anti-blocking measures enabled. Try this if you encounter bot protection while scraping. For best performance, use with Apify Proxy residential IPs. \n- **Raw HTTP client** - High-performance crawling mode that uses raw HTTP requests to fetch the pages. It is faster and cheaper, but it might not work on all websites.", - options: [ - { - label: "Headless browser (stealthy Firefox+Playwright) - Very reliable, best in avoiding blocking, but might be slow", - value: "playwright:firefox", - }, - { - label: "Headless browser (Chrome+Playwright) - Reliable, but might be slow", - value: "playwright:chrome", - }, - { - label: "Raw HTTP client (Cheerio) - Extremely fast, but cannot handle dynamic content", - value: "cheerio", - }, - ], }, }, async run({ $ }) { - const response = await this.apify.runActor({ - $, - actorId: ACTOR_ID, - data: { - crawlerType: this.crawlerType, - maxCrawlDepth: 0, - maxCrawlPages: 1, - maxResults: 1, - startUrls: [ - { - url: this.url, - }, - ], - }, + const { body } = await gotScraping({ + url: this.url, }); $.export("$summary", `Successfully scraped content from ${this.url}`); - return response; + return body; }, }; diff --git a/components/apify/package.json b/components/apify/package.json index 6fa6e927daca3..9d0348256ff9e 100644 --- a/components/apify/package.json +++ b/components/apify/package.json @@ -1,6 +1,6 @@ { "name": "@pipedream/apify", - "version": "0.2.2", + "version": "0.3.0", "description": "Pipedream Apify Components", "main": "apify.app.mjs", "keywords": [ @@ -14,6 +14,7 @@ }, "dependencies": { "@apify/consts": "^2.41.0", - "@pipedream/platform": "^3.0.3" + "@pipedream/platform": "^3.0.3", + "got-scraping": "^4.1.2" } } From 84938b5f2cad57ca088692af183ef1d9325a5394 Mon Sep 17 00:00:00 2001 From: Michelle Bergeron Date: Thu, 28 Aug 2025 13:37:37 -0400 Subject: [PATCH 2/2] pnpm-lock.yaml --- pnpm-lock.yaml | 98 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 4 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c3b944f6d5eb3..421da13767ff3 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -962,6 +962,9 @@ importers: '@pipedream/platform': specifier: ^3.0.3 version: 3.0.3 + got-scraping: + specifier: ^4.1.2 + version: 4.1.2 components/apify_oauth: {} @@ -10938,8 +10941,7 @@ importers: specifier: ^1.5.1 version: 1.6.6 - components/postbin: - specifiers: {} + components/postbin: {} components/postgresql: dependencies: @@ -14995,8 +14997,7 @@ importers: components/upollo: {} - components/uproc: - specifiers: {} + components/uproc: {} components/upstash_redis: dependencies: @@ -23490,6 +23491,10 @@ packages: resolution: {integrity: sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==} engines: {node: '>=6'} + callsites@4.2.0: + resolution: {integrity: sha512-kfzR4zzQtAE9PC7CzZsjl3aBNbXWuXiSeOCdLcPpBfGW8YuCqQHcRPFDbr/BPVmd3EEPVpuFzLyuT/cUhPr4OQ==} + engines: {node: '>=12.20'} + camelcase-keys@9.1.3: resolution: {integrity: sha512-Rircqi9ch8AnZscQcsA1C47NFdaO3wukpmIRzYcDOrmvgt78hM/sj5pZhZNec2NM12uk5vTwRHZ4anGcrC4ZTg==} engines: {node: '>=16'} @@ -24541,6 +24546,10 @@ packages: resolution: {integrity: sha512-tE7ztYzXHIeyvc7N+hR3oi7FIbf/NIjVP9hmAt3yMXzrQ072/fpjGLx2GxNxGxUl5V73MEqYzioOMoVhGMJ5cA==} engines: {node: '>=10'} + dot-prop@7.2.0: + resolution: {integrity: sha512-Ol/IPXUARn9CSbkrdV4VJo7uCy1I3VuSiWCaFSg+8BdUOzF9n3jefIpcgAydvUZbTdEBZs2vEiTiS9m61ssiDA==} + engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + dot-prop@9.0.0: resolution: {integrity: sha512-1gxPBJpI/pcjQhKgIU91II6Wkay+dLcN3M6rf2uwP8hRur3HtQXjVrdAK3sjC0piaEuxzMwjXChcETiJl47lAQ==} engines: {node: '>=18'} @@ -25731,6 +25740,9 @@ packages: generate-function@2.3.1: resolution: {integrity: sha512-eeB5GfMNeevm/GRYq20ShmsaGcmI81kIX2K9XQx5miC8KdHaC6Jm0qQ8ZNeGOi7wYB8OsdxKs+Y2oVuTFuVwKQ==} + generative-bayesian-network@2.1.70: + resolution: {integrity: sha512-nP0CNiVs/QS5ppMsGiEYN3dgAe3UTT1mpDth0wTh9uEyEO4e7y1Yr5PGDcTJsU0Lm3YM21yNzhuPbUg7etKHbQ==} + generic-pool@3.9.0: resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==} engines: {node: '>= 4'} @@ -26016,6 +26028,10 @@ packages: resolution: {integrity: sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==} engines: {node: '>= 0.4'} + got-scraping@4.1.2: + resolution: {integrity: sha512-LtVwPM5YLnNY7HVT/AK/yDBUg/4yOZSlAjjug2ovrHQseS43QCmO1XosKKXcXrfc6OMX8OnDbAWIauFMcaJ5TQ==} + engines: {node: '>=16'} + got@11.8.6: resolution: {integrity: sha512-6tfZ91bOr7bOXnK7PRDCGBLa1H4U080YHNaAQ2KsMGlLEzRbk44nsZF2E1IeRc3vtJHPVbKCYgdFbaGO2ljd8g==} engines: {node: '>=10.19.0'} @@ -26174,6 +26190,10 @@ packages: resolution: {integrity: sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==} hasBin: true + header-generator@2.1.70: + resolution: {integrity: sha512-s2/jN4hIr/pDRZhXA1D2T72eO4f8Gi1mwYEIFLbU+OR7cjo+Tayrw4RlTN3dXPahrU/MBdjk9gv//MwxLoCpGQ==} + engines: {node: '>=16.0.0'} + heap-js@2.5.0: resolution: {integrity: sha512-kUGoI3p7u6B41z/dp33G6OaL7J4DRqRYwVmeIlwLClx7yaaAy7hoDExnuejTKtuDwfcatGmddHDEOjf6EyIxtQ==} engines: {node: '>=10.0.0'} @@ -28846,6 +28866,14 @@ packages: resolution: {integrity: sha512-D2FR03Vir7FIu45XBY20mTb+/ZSWB00sjU9jdQXt83gDrI4Ztz5Fs7/yy74g2N5SVQY4xY1qDr4rNddwYRVX0g==} engines: {node: '>=0.10.0'} + ow@0.28.2: + resolution: {integrity: sha512-dD4UpyBh/9m4X2NVjA+73/ZPBRF+uF4zIMFvvQsabMiEK8x41L3rQ8EENOi35kyyoaJwNxEeJcP6Fj1H4U409Q==} + engines: {node: '>=12'} + + ow@1.1.1: + resolution: {integrity: sha512-sJBRCbS5vh1Jp9EOgwp1Ws3c16lJrUkJYlvWTYC03oyiYVwS/ns7lKRWow4w4XjDyTrA2pplQv4B2naWSR6yDA==} + engines: {node: '>=14.16'} + p-cancelable@2.1.1: resolution: {integrity: sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==} engines: {node: '>=8'} @@ -29639,6 +29667,10 @@ packages: resolution: {integrity: sha512-AAFUA5O1d83pIHEhJwWCq/RQcRukCkn/NSm2QsTEMle5f2hP0ChI2+3Xb051PZCkLryI/Ir1MVKviT2FIloaTQ==} engines: {node: '>=12'} + quick-lru@7.1.0: + resolution: {integrity: sha512-Pzd/4IFnTb8E+I1P5rbLQoqpUHcXKg48qTYKi4EANg+sTPwGFEMOcYGiiZz6xuQcOMZP7MPsrdAPx+16Q8qahg==} + engines: {node: '>=18'} + quote-unquote@1.0.0: resolution: {integrity: sha512-twwRO/ilhlG/FIgYeKGFqyHhoEhqgnKVkcmqMKi2r524gz3ZbDTcyFt38E9xjJI2vT+KbRNHVbnJ/e0I25Azwg==} @@ -31343,6 +31375,10 @@ packages: resolution: {integrity: sha512-q+MB8nYR1KDLrgr4G5yemftpMC7/QLqVndBmEEdqzmNj5dcFOO4Oo8qlwZE3ULT3+Zim1F8Kq4cBnikNhlCMlg==} engines: {node: '>=8'} + type-fest@2.19.0: + resolution: {integrity: sha512-RAH822pAdBgcNMAfWnCBU3CFZcfZ/i1eZjwFU/dsLKumyuuP3niueg2UAukXYF0E2AAoc82ZSSf9J0WQBinzHA==} + engines: {node: '>=12.20'} + type-fest@4.27.0: resolution: {integrity: sha512-3IMSWgP7C5KSQqmo1wjhKrwsvXAtF33jO3QY+Uy++ia7hqvgSK6iXbbg5PbDBc1P2ZbNEDgejOrN4YooXvhwCw==} engines: {node: '>=16'} @@ -31796,6 +31832,10 @@ packages: resolution: {integrity: sha512-kiGUalWN+rgBJ/1OHZsBtU4rXZOfj/7rKQxULKlIzwzQSvMJUUNgPwJEEh7gU6xEVxC0ahoOBvN2YI8GH6FNgA==} engines: {node: '>=10.12.0'} + vali-date@1.0.0: + resolution: {integrity: sha512-sgECfZthyaCKW10N0fm27cg8HYTFK5qMWgypqkXMQ4Wbl/zZKx7xZICgcoxIIE+WFAP/MBL2EFwC/YvLxw3Zeg==} + engines: {node: '>=0.10.0'} + valid-data-url@4.0.1: resolution: {integrity: sha512-t0oA6VCnlQ/MPKP/Ie9ZD3biEpB2JTxK1Hx4KC72RbhubL9HsXznoBn228UQTazL7cPvsY36bhzt3fk424TjyA==} engines: {node: '>=10'} @@ -42451,6 +42491,8 @@ snapshots: callsites@3.1.0: {} + callsites@4.2.0: {} + camelcase-keys@9.1.3: dependencies: camelcase: 8.0.0 @@ -43546,6 +43588,10 @@ snapshots: dependencies: is-obj: 2.0.0 + dot-prop@7.2.0: + dependencies: + type-fest: 2.19.0 + dot-prop@9.0.0: dependencies: type-fest: 4.27.0 @@ -45220,6 +45266,11 @@ snapshots: dependencies: is-property: 1.0.2 + generative-bayesian-network@2.1.70: + dependencies: + adm-zip: 0.5.16 + tslib: 2.8.1 + generic-pool@3.9.0: {} gensync@1.0.0-beta.2: {} @@ -45668,6 +45719,16 @@ snapshots: gopd@1.2.0: {} + got-scraping@4.1.2: + dependencies: + got: 14.4.6 + header-generator: 2.1.70 + http2-wrapper: 2.2.1 + mimic-response: 4.0.0 + ow: 1.1.1 + quick-lru: 7.1.0 + tslib: 2.8.1 + got@11.8.6: dependencies: '@sindresorhus/is': 4.6.0 @@ -45907,6 +45968,13 @@ snapshots: he@1.2.0: {} + header-generator@2.1.70: + dependencies: + browserslist: 4.24.2 + generative-bayesian-network: 2.1.70 + ow: 0.28.2 + tslib: 2.8.1 + heap-js@2.5.0: {} help-me@3.0.0: @@ -49486,6 +49554,22 @@ snapshots: os-tmpdir@1.0.2: {} + ow@0.28.2: + dependencies: + '@sindresorhus/is': 4.6.0 + callsites: 3.1.0 + dot-prop: 6.0.1 + lodash.isequal: 4.5.0 + vali-date: 1.0.0 + + ow@1.1.1: + dependencies: + '@sindresorhus/is': 5.6.0 + callsites: 4.2.0 + dot-prop: 7.2.0 + lodash.isequal: 4.5.0 + vali-date: 1.0.0 + p-cancelable@2.1.1: {} p-cancelable@3.0.0: {} @@ -50523,6 +50607,8 @@ snapshots: quick-lru@6.1.2: {} + quick-lru@7.1.0: {} + quote-unquote@1.0.0: {} quotemeta@0.0.0: {} @@ -52982,6 +53068,8 @@ snapshots: type-fest@0.6.0: {} + type-fest@2.19.0: {} + type-fest@4.27.0: {} type-fest@4.41.0: {} @@ -53394,6 +53482,8 @@ snapshots: '@types/istanbul-lib-coverage': 2.0.6 convert-source-map: 2.0.0 + vali-date@1.0.0: {} + valid-data-url@4.0.1: {} validate-npm-package-license@3.0.4: