diff --git a/scrapegraph-js/examples/stealth_mode_example.js b/scrapegraph-js/examples/stealth_mode_example.js new file mode 100644 index 0000000..3913589 --- /dev/null +++ b/scrapegraph-js/examples/stealth_mode_example.js @@ -0,0 +1,613 @@ +/** + * Stealth Mode Examples for ScrapeGraph AI JavaScript SDK + * + * This file demonstrates how to use stealth mode with various endpoints + * to avoid bot detection when scraping websites. + * + * Stealth mode enables advanced techniques to make requests appear more + * like those from a real browser, helping to bypass basic bot detection. + */ + +import { + smartScraper, + searchScraper, + markdownify, + scrape, + agenticScraper, + crawl, + getScrapeRequest, + getAgenticScraperRequest, + getCrawlRequest +} from '../index.js'; +import 'dotenv/config'; + +// Get API key from environment variable +const API_KEY = process.env.SGAI_APIKEY || 'your-api-key-here'; + +// ============================================================================ +// EXAMPLE 1: SmartScraper with Stealth Mode +// ============================================================================ + +async function exampleSmartScraperWithStealth() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 1: SmartScraper with Stealth Mode'); + console.log('='.repeat(60)); + + try { + const response = await smartScraper( + API_KEY, + 'https://www.scrapethissite.com/pages/simple/', + 'Extract country names and capitals', + null, // schema + null, // numberOfScrolls + null, // totalPages + null, // cookies + {}, // options + false, // plain_text + false, // renderHeavyJs + true // stealth - Enable stealth mode + ); + + console.log('Status:', response.status); + console.log('Request ID:', response.request_id); + console.log('Result:', JSON.stringify(response.result, null, 2)); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 2: SmartScraper 
with Stealth Mode and Pagination +// ============================================================================ + +async function exampleSmartScraperWithStealthAndPagination() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 2: SmartScraper with Stealth Mode and Pagination'); + console.log('='.repeat(60)); + + try { + const response = await smartScraper( + API_KEY, + 'https://example.com/products', + 'Extract all product information from multiple pages', + null, // schema + 10, // numberOfScrolls + 5, // totalPages + null, // cookies + {}, // options + false, // plain_text + true, // renderHeavyJs - Enable JS rendering + true // stealth - Enable stealth mode + ); + + console.log('Status:', response.status); + console.log('Request ID:', response.request_id); + console.log('Products extracted:', response.result); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 3: SearchScraper with Stealth Mode +// ============================================================================ + +async function exampleSearchScraperWithStealth() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 3: SearchScraper with Stealth Mode'); + console.log('='.repeat(60)); + + try { + const response = await searchScraper( + API_KEY, + 'What are the latest developments in AI technology?', + 5, // numResults + null, // schema + null, // userAgent + { + stealth: true, // Enable stealth mode + extractionMode: true, + renderHeavyJs: false + } + ); + + console.log('Status:', response.status); + console.log('Request ID:', response.request_id); + console.log('Result:', JSON.stringify(response.result, null, 2)); + + if (response.reference_urls) { + console.log('Reference URLs:', response.reference_urls); + } + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// 
EXAMPLE 4: Markdownify with Stealth Mode +// ============================================================================ + +async function exampleMarkdownifyWithStealth() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 4: Markdownify with Stealth Mode'); + console.log('='.repeat(60)); + + try { + const response = await markdownify( + API_KEY, + 'https://www.example.com', + { + stealth: true // Enable stealth mode + } + ); + + console.log('Status:', response.status); + console.log('Request ID:', response.request_id); + console.log('Markdown Preview (first 500 chars):'); + console.log(response.result.substring(0, 500)); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 5: Scrape with Stealth Mode +// ============================================================================ + +async function exampleScrapeWithStealth() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 5: Scrape with Stealth Mode'); + console.log('='.repeat(60)); + + try { + const response = await scrape( + API_KEY, + 'https://www.example.com', + { + stealth: true // Enable stealth mode + } + ); + + console.log('Status:', response.status); + console.log('Scrape Request ID:', response.scrape_request_id); + console.log('HTML Preview (first 500 chars):'); + console.log(response.html.substring(0, 500)); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 6: Scrape with Stealth Mode and Heavy JS Rendering +// ============================================================================ + +async function exampleScrapeWithStealthAndJS() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 6: Scrape with Stealth Mode and Heavy JS'); + console.log('='.repeat(60)); + + try { + const response = await scrape( + API_KEY, + 'https://www.example.com', + { 
+ renderHeavyJs: true, // Enable JavaScript rendering + stealth: true // Enable stealth mode + } + ); + + console.log('Status:', response.status); + console.log('Scrape Request ID:', response.scrape_request_id); + console.log('HTML Preview (first 500 chars):'); + console.log(response.html.substring(0, 500)); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 7: Scrape with Stealth Mode and Custom Headers +// ============================================================================ + +async function exampleScrapeWithStealthAndHeaders() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 7: Scrape with Stealth Mode and Custom Headers'); + console.log('='.repeat(60)); + + try { + const customHeaders = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'DNT': '1' + }; + + const response = await scrape( + API_KEY, + 'https://www.protected-site.com', + { + headers: customHeaders, // Custom headers + stealth: true // Enable stealth mode + } + ); + + console.log('Status:', response.status); + console.log('Scrape Request ID:', response.scrape_request_id); + console.log('Success! 
Stealth mode + custom headers bypassed detection.'); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 8: Agentic Scraper with Stealth Mode +// ============================================================================ + +async function exampleAgenticScraperWithStealth() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 8: Agentic Scraper with Stealth Mode'); + console.log('='.repeat(60)); + + try { + const steps = [ + 'Type user@example.com in email input box', + 'Type password123 in password input box', + 'Click on login button' + ]; + + const response = await agenticScraper( + API_KEY, + 'https://dashboard.example.com/login', + steps, + true, // useSession + null, // userPrompt + null, // outputSchema + false, // aiExtraction + { + stealth: true // Enable stealth mode + } + ); + + console.log('Status:', response.status); + console.log('Request ID:', response.request_id); + console.log('Message:', response.message); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 9: Agentic Scraper with Stealth Mode and AI Extraction +// ============================================================================ + +async function exampleAgenticScraperWithStealthAndAI() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 9: Agentic Scraper with Stealth and AI Extraction'); + console.log('='.repeat(60)); + + try { + const steps = [ + 'Navigate to user profile section', + 'Click on settings tab' + ]; + + const outputSchema = { + user_info: { + type: 'object', + properties: { + username: { type: 'string' }, + email: { type: 'string' }, + settings: { type: 'object' } + } + } + }; + + const response = await agenticScraper( + API_KEY, + 'https://dashboard.example.com', + steps, + true, // useSession + 'Extract user profile information 
and settings', // userPrompt + outputSchema, // outputSchema + true, // aiExtraction + { + stealth: true // Enable stealth mode + } + ); + + console.log('Status:', response.status); + console.log('Request ID:', response.request_id); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 10: Crawl with Stealth Mode +// ============================================================================ + +async function exampleCrawlWithStealth() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 10: Crawl with Stealth Mode'); + console.log('='.repeat(60)); + + try { + const schema = { + type: 'object', + properties: { + title: { type: 'string', description: 'Page title' }, + content: { type: 'string', description: 'Main content' } + }, + required: ['title'] + }; + + const response = await crawl( + API_KEY, + 'https://www.example.com', + 'Extract page titles and main content', + schema, + { + depth: 2, + maxPages: 5, + stealth: true // Enable stealth mode + } + ); + + console.log('Status:', response.status); + console.log('Crawl ID:', response.id); + console.log('Message:', response.message); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 11: Crawl with Stealth Mode and Sitemap +// ============================================================================ + +async function exampleCrawlWithStealthAndSitemap() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 11: Crawl with Stealth Mode and Sitemap'); + console.log('='.repeat(60)); + + try { + const schema = { + type: 'object', + properties: { + product_name: { type: 'string' }, + price: { type: 'string' }, + description: { type: 'string' } + }, + required: ['product_name'] + }; + + const response = await crawl( + API_KEY, + 'https://www.example-shop.com', + 'Extract product 
information from all pages', + schema, + { + sitemap: true, // Use sitemap for better page discovery + depth: 3, + maxPages: 10, + stealth: true // Enable stealth mode + } + ); + + console.log('Status:', response.status); + console.log('Crawl ID:', response.id); + console.log('Message:', response.message); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 12: Complete Workflow with Stealth Mode +// ============================================================================ + +async function exampleCompleteWorkflowWithStealth() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 12: Complete Workflow with Stealth Mode'); + console.log('='.repeat(60)); + + try { + // Step 1: Start a scrape request with stealth mode + console.log('\n1. Starting scrape request with stealth mode...'); + const scrapeResponse = await scrape( + API_KEY, + 'https://www.example.com', + { + renderHeavyJs: true, + stealth: true + } + ); + + console.log(' Scrape initiated. Request ID:', scrapeResponse.scrape_request_id); + console.log(' Status:', scrapeResponse.status); + + // Step 2: Wait a bit and check the result (if processing) + if (scrapeResponse.status === 'processing') { + console.log('\n2. 
Waiting for scrape to complete...'); + await new Promise(resolve => setTimeout(resolve, 3000)); // Wait 3 seconds + + const result = await getScrapeRequest(API_KEY, scrapeResponse.scrape_request_id); + console.log(' Updated Status:', result.status); + + if (result.status === 'completed') { + console.log(' HTML received (length):', result.html.length); + } + } + + console.log('\nβœ… Workflow completed successfully with stealth mode!'); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 13: SearchScraper with Stealth and Custom User Agent +// ============================================================================ + +async function exampleSearchScraperWithStealthAndUserAgent() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 13: SearchScraper with Stealth and User Agent'); + console.log('='.repeat(60)); + + try { + const customUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'; + + const response = await searchScraper( + API_KEY, + 'Find the best practices for web scraping', + 5, // numResults + null, // schema + customUserAgent, // Custom user agent + { + stealth: true, + extractionMode: true + } + ); + + console.log('Status:', response.status); + console.log('Request ID:', response.request_id); + console.log('Result:', JSON.stringify(response.result, null, 2)); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// EXAMPLE 14: Comparing With and Without Stealth Mode +// ============================================================================ + +async function exampleCompareStealthMode() { + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 14: Comparing With and Without Stealth Mode'); + console.log('='.repeat(60)); + + try { + const testUrl = 'https://www.example.com'; + + // Without stealth mode + 
console.log('\n1. Scraping WITHOUT stealth mode...'); + const responseWithoutStealth = await scrape( + API_KEY, + testUrl, + { + stealth: false + } + ); + console.log(' Status:', responseWithoutStealth.status); + console.log(' Request ID:', responseWithoutStealth.scrape_request_id); + + // With stealth mode + console.log('\n2. Scraping WITH stealth mode...'); + const responseWithStealth = await scrape( + API_KEY, + testUrl, + { + stealth: true + } + ); + console.log(' Status:', responseWithStealth.status); + console.log(' Request ID:', responseWithStealth.scrape_request_id); + + console.log('\nπŸ“Š Comparison complete!'); + console.log(' Both requests succeeded, but stealth mode provides better bot detection avoidance.'); + } catch (error) { + console.error('Error:', error.message); + } +} + +// ============================================================================ +// RUN ALL EXAMPLES +// ============================================================================ + +async function runAllExamples() { + console.log('\n' + '='.repeat(60)); + console.log('STEALTH MODE EXAMPLES FOR SCRAPEGRAPH AI JAVASCRIPT SDK'); + console.log('='.repeat(60)); + console.log('\nThese examples demonstrate how to use stealth mode'); + console.log('to avoid bot detection when scraping websites.'); + console.log('\nStealth mode is available for all major endpoints:'); + console.log('- SmartScraper'); + console.log('- SearchScraper'); + console.log('- Markdownify'); + console.log('- Scrape'); + console.log('- Agentic Scraper'); + console.log('- Crawl'); + + const examples = [ + { name: 'SmartScraper with Stealth', fn: exampleSmartScraperWithStealth }, + { name: 'SmartScraper with Stealth and Pagination', fn: exampleSmartScraperWithStealthAndPagination }, + { name: 'SearchScraper with Stealth', fn: exampleSearchScraperWithStealth }, + { name: 'Markdownify with Stealth', fn: exampleMarkdownifyWithStealth }, + { name: 'Scrape with Stealth', fn: exampleScrapeWithStealth }, + { name: 
'Scrape with Stealth and Heavy JS', fn: exampleScrapeWithStealthAndJS }, + { name: 'Scrape with Stealth and Custom Headers', fn: exampleScrapeWithStealthAndHeaders }, + { name: 'Agentic Scraper with Stealth', fn: exampleAgenticScraperWithStealth }, + { name: 'Agentic Scraper with Stealth and AI', fn: exampleAgenticScraperWithStealthAndAI }, + { name: 'Crawl with Stealth', fn: exampleCrawlWithStealth }, + { name: 'Crawl with Stealth and Sitemap', fn: exampleCrawlWithStealthAndSitemap }, + { name: 'Complete Workflow with Stealth', fn: exampleCompleteWorkflowWithStealth }, + { name: 'SearchScraper with Stealth and User Agent', fn: exampleSearchScraperWithStealthAndUserAgent }, + { name: 'Compare Stealth Mode', fn: exampleCompareStealthMode } + ]; + + for (let i = 0; i < examples.length; i++) { + const example = examples[i]; + try { + console.log(`\n\nπŸ“Œ Running Example ${i + 1}/${examples.length}: ${example.name}`); + await example.fn(); + console.log(`\nβœ… Example ${i + 1} completed`); + } catch (error) { + console.error(`\n❌ Example ${i + 1} failed: ${error.message}`); + } + + // Add a small delay between examples + if (i < examples.length - 1) { + await new Promise(resolve => setTimeout(resolve, 1000)); + } + } + + console.log('\n' + '='.repeat(60)); + console.log('ALL EXAMPLES COMPLETED'); + console.log('='.repeat(60)); +} + +// ============================================================================ +// MAIN EXECUTION +// ============================================================================ + +// Run all examples if this file is executed directly +if (import.meta.url === `file://${process.argv[1]}`) { + runAllExamples() + .then(() => { + console.log('\n✨ All stealth mode examples executed successfully!'); + process.exit(0); + }) + .catch(error => { + console.error('\nπŸ’₯ Fatal error:', error.message); + process.exit(1); + }); +} + +// Export individual examples for selective usage +export { + exampleSmartScraperWithStealth, + 
exampleSmartScraperWithStealthAndPagination, + exampleSearchScraperWithStealth, + exampleMarkdownifyWithStealth, + exampleScrapeWithStealth, + exampleScrapeWithStealthAndJS, + exampleScrapeWithStealthAndHeaders, + exampleAgenticScraperWithStealth, + exampleAgenticScraperWithStealthAndAI, + exampleCrawlWithStealth, + exampleCrawlWithStealthAndSitemap, + exampleCompleteWorkflowWithStealth, + exampleSearchScraperWithStealthAndUserAgent, + exampleCompareStealthMode, + runAllExamples +}; diff --git a/scrapegraph-js/package-lock.json b/scrapegraph-js/package-lock.json index 414230f..5019617 100644 --- a/scrapegraph-js/package-lock.json +++ b/scrapegraph-js/package-lock.json @@ -1,6 +1,6 @@ { "name": "scrapegraph-js", - "version": "0.1.5", + "version": "0.2.0", "lockfileVersion": 3, "requires": true, "packages": { diff --git a/scrapegraph-js/package.json b/scrapegraph-js/package.json index 1f0deee..0a68a64 100644 --- a/scrapegraph-js/package.json +++ b/scrapegraph-js/package.json @@ -1,7 +1,7 @@ { "name": "scrapegraph-js", "author": "ScrapeGraphAI", - "version": "0.1.6", + "version": "0.2.0", "description": "Scrape and extract structured data from a webpage using ScrapeGraphAI's APIs. 
Supports cookies for authentication, infinite scrolling, and pagination.", "repository": { "type": "git", diff --git a/scrapegraph-js/src/agenticScraper.js b/scrapegraph-js/src/agenticScraper.js index 1f27bf9..7e48c0c 100644 --- a/scrapegraph-js/src/agenticScraper.js +++ b/scrapegraph-js/src/agenticScraper.js @@ -16,6 +16,7 @@ import { getMockResponse } from './utils/mockResponse.js'; * @param {Object} options - Optional configuration options * @param {boolean} options.mock - Override mock mode for this request * @param {boolean} options.renderHeavyJs - Whether to render heavy JavaScript on the page + * @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection * @returns {Promise} Response from the API containing request_id and initial status * @throws {Error} Will throw an error in case of an HTTP failure or invalid parameters. * @@ -52,9 +53,9 @@ import { getMockResponse } from './utils/mockResponse.js'; * * try { * const result = await agenticScraper( - * apiKey, - * url, - * steps, + * apiKey, + * url, + * steps, * true, * "Extract user information and available dashboard sections", * outputSchema, @@ -66,7 +67,7 @@ import { getMockResponse } from './utils/mockResponse.js'; * } */ export async function agenticScraper(apiKey, url, steps, useSession = true, userPrompt = null, outputSchema = null, aiExtraction = false, options = {}) { - const { mock = null, renderHeavyJs = false } = options; + const { mock = null, renderHeavyJs = false, stealth = false } = options; // Check if mock mode is enabled const useMock = mock !== null ? 
mock : isMockEnabled(); @@ -133,6 +134,10 @@ export async function agenticScraper(apiKey, url, steps, useSession = true, user render_heavy_js: renderHeavyJs, }; + if (stealth) { + payload.stealth = stealth; + } + // Add AI extraction parameters if enabled if (aiExtraction) { payload.user_prompt = userPrompt; diff --git a/scrapegraph-js/src/crawl.js b/scrapegraph-js/src/crawl.js index 0c7cca2..5b5a0af 100644 --- a/scrapegraph-js/src/crawl.js +++ b/scrapegraph-js/src/crawl.js @@ -22,6 +22,7 @@ import { getMockResponse } from './utils/mockResponse.js'; * @param {number} [options.batchSize=1] - Batch size for processing pages (1-10) * @param {boolean} [options.mock] - Override mock mode for this request * @param {boolean} [options.renderHeavyJs=false] - Whether to render heavy JavaScript on the page + * @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection * @returns {Promise} The crawl job response * @throws {Error} Throws an error if the HTTP request fails */ @@ -32,7 +33,7 @@ export async function crawl( schema, options = {} ) { - const { mock = null, renderHeavyJs = false } = options; + const { mock = null, renderHeavyJs = false, stealth = false } = options; // Check if mock mode is enabled const useMock = mock !== null ? mock : isMockEnabled(); @@ -83,6 +84,10 @@ export async function crawl( render_heavy_js: renderHeavyJs, }; + if (stealth) { + payload.stealth = stealth; + } + try { const response = await axios.post(endpoint, payload, { headers }); return response.data; diff --git a/scrapegraph-js/src/markdownify.js b/scrapegraph-js/src/markdownify.js index 7f93349..6a1cebd 100644 --- a/scrapegraph-js/src/markdownify.js +++ b/scrapegraph-js/src/markdownify.js @@ -10,11 +10,12 @@ import { getMockResponse } from './utils/mockResponse.js'; * @param {string} url - The URL of the webpage to be converted. * @param {Object} options - Optional configuration options. 
* @param {boolean} options.mock - Override mock mode for this request + * @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection * @returns {Promise} A promise that resolves to the markdown representation of the webpage. * @throws {Error} Throws an error if the HTTP request fails. */ export async function markdownify(apiKey, url, options = {}) { - const { mock = null } = options; + const { mock = null, stealth = false } = options; // Check if mock mode is enabled const useMock = mock !== null ? mock : isMockEnabled(); @@ -36,6 +37,10 @@ export async function markdownify(apiKey, url, options = {}) { website_url: url, }; + if (stealth) { + payload.stealth = stealth; + } + try { const response = await axios.post(endpoint, payload, { headers }); return response.data; diff --git a/scrapegraph-js/src/scrape.js b/scrapegraph-js/src/scrape.js index a6075e9..3d9bfca 100644 --- a/scrapegraph-js/src/scrape.js +++ b/scrapegraph-js/src/scrape.js @@ -11,6 +11,7 @@ import { getMockResponse, createMockAxiosResponse } from './utils/mockResponse.j * @param {Object} options - Optional configuration options. * @param {boolean} options.renderHeavyJs - Whether to render heavy JavaScript (defaults to false). * @param {Object} options.headers - Optional custom headers to send with the request. + * @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection * @returns {Promise} A promise that resolves to the HTML content and metadata. * @throws {Error} Throws an error if the HTTP request fails. 
* @@ -47,7 +48,8 @@ export async function scrape(apiKey, url, options = {}) { const { renderHeavyJs = false, headers: customHeaders = {}, - mock = null + mock = null, + stealth = false } = options; // Check if mock mode is enabled @@ -73,6 +75,10 @@ export async function scrape(apiKey, url, options = {}) { render_heavy_js: renderHeavyJs, }; + if (stealth) { + payload.stealth = stealth; + } + // Only include headers in payload if they are provided if (Object.keys(customHeaders).length > 0) { payload.headers = customHeaders; diff --git a/scrapegraph-js/src/searchScraper.js b/scrapegraph-js/src/searchScraper.js index 6ec09e2..a27472b 100644 --- a/scrapegraph-js/src/searchScraper.js +++ b/scrapegraph-js/src/searchScraper.js @@ -20,11 +20,12 @@ import { getMockResponse } from './utils/mockResponse.js'; * @param {boolean} options.renderHeavyJs - Whether to render heavy JavaScript on the page * @param {boolean} [options.extractionMode=true] - Whether to use AI extraction (true) or markdown conversion (false). * AI extraction costs 10 credits per page, markdown conversion costs 2 credits per page. + * @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection * @returns {Promise} Extracted data in JSON format matching the provided schema * @throws - Will throw an error in case of an HTTP failure. */ export async function searchScraper(apiKey, prompt, numResults = 3, schema = null, userAgent = null, options = {}) { - const { mock = null, renderHeavyJs = false, extractionMode = true } = options; + const { mock = null, renderHeavyJs = false, extractionMode = true, stealth = false } = options; // Check if mock mode is enabled const useMock = mock !== null ? 
mock : isMockEnabled(); @@ -56,6 +57,10 @@ export async function searchScraper(apiKey, prompt, numResults = 3, schema = nul extraction_mode: extractionMode, }; + if (stealth) { + payload.stealth = stealth; + } + if (schema) { if (schema instanceof ZodType) { payload.output_schema = zodToJsonSchema(schema); diff --git a/scrapegraph-js/src/smartScraper.js b/scrapegraph-js/src/smartScraper.js index 8cbeb46..9883582 100644 --- a/scrapegraph-js/src/smartScraper.js +++ b/scrapegraph-js/src/smartScraper.js @@ -16,10 +16,11 @@ import { getMockResponse, createMockAxiosResponse } from './utils/mockResponse.j * @param {number} [totalPages] - Optional number of pages to scrape (1-10). If not provided, only the first page will be scraped. * @param {Object} [cookies] - Optional cookies object for authentication and session management * @param {boolean} [renderHeavyJs] - Optional flag to enable heavy JavaScript rendering on the page + * @param {boolean} [stealth] - Optional flag to enable stealth mode to avoid bot detection * @returns {Promise} Extracted data in JSON format matching the provided schema * @throws - Will throw an error in case of an HTTP failure. 
*/ -export async function smartScraper(apiKey, url, prompt, schema = null, numberOfScrolls = null, totalPages = null, cookies = null, options = {}, plain_text = false, renderHeavyJs = false) { +export async function smartScraper(apiKey, url, prompt, schema = null, numberOfScrolls = null, totalPages = null, cookies = null, options = {}, plain_text = false, renderHeavyJs = false, stealth = false) { const { mock = null } = options; // Check if mock mode is enabled @@ -79,6 +80,10 @@ export async function smartScraper(apiKey, url, prompt, schema = null, numberOfS payload.total_pages = totalPages; } + if (stealth) { + payload.stealth = stealth; + } + try { const response = await axios.post(endpoint, payload, { headers }); return response.data; diff --git a/scrapegraph-js/test/stealth_mode_test.js b/scrapegraph-js/test/stealth_mode_test.js new file mode 100644 index 0000000..2d17244 --- /dev/null +++ b/scrapegraph-js/test/stealth_mode_test.js @@ -0,0 +1,626 @@ +import { + smartScraper, + searchScraper, + markdownify, + scrape, + agenticScraper, + crawl, +} from '../index.js'; +import 'dotenv/config'; + +/** + * Test suite for Stealth Mode functionality + * This file demonstrates usage and validates stealth mode parameters across all endpoints + */ + +// Mock API key for testing (replace with real key for actual testing) +const API_KEY = process.env.SGAI_APIKEY || 'test-api-key'; + +/** + * Test input validation for stealth mode + */ +function testStealthModeValidation() { + console.log('πŸ§ͺ Testing Stealth Mode Input Validation'); + console.log('='.repeat(50)); + + const testCases = [ + { + name: 'Valid stealth mode - true', + stealth: true, + expected: true, + description: 'Stealth mode enabled (boolean true)' + }, + { + name: 'Valid stealth mode - false', + stealth: false, + expected: true, + description: 'Stealth mode disabled (boolean false)' + }, + { + name: 'Valid stealth mode - undefined (default)', + stealth: undefined, + expected: true, + description: 'Stealth 
// ============================================================================
// STEALTH MODE TEST SUITE (mock-based; no network calls are made)
// ============================================================================

/**
 * Validate the stealth parameter: it must be a boolean when provided.
 *
 * @param {boolean|undefined} stealth - Candidate stealth flag.
 * @returns {boolean} true when the value is acceptable.
 * @throws {Error} when a defined, non-boolean value is supplied.
 */
function validateStealthMode(stealth) {
  if (stealth !== undefined && typeof stealth !== 'boolean') {
    throw new Error('Stealth mode must be a boolean value (true or false)');
  }
  return true;
}

/**
 * Throw when an options bag carries a non-boolean `stealth` flag.
 * Mirrors the validation every mocked endpoint performs.
 *
 * @param {{stealth?: unknown}} options - Options bag under test.
 * @throws {Error} when `stealth` is defined but not a boolean.
 */
function assertStealthOption(options) {
  if (options.stealth !== undefined && typeof options.stealth !== 'boolean') {
    throw new Error('Stealth must be a boolean');
  }
}

/**
 * Shared driver for the per-endpoint mock tests: runs `invoke` for every
 * case, reports pass/fail on the console, and returns the overall verdict.
 * Factors out the loop that was previously duplicated in each endpoint test.
 *
 * @param {string} banner - Heading printed before the cases run.
 * @param {string} summaryLabel - Label used on the final tally line.
 * @param {Array<object>} testCases - Cases with `name`/`description`/`options`.
 * @param {(testCase: object) => Promise<{status: string}>} invoke - Mocked call.
 * @returns {Promise<boolean>} true when every case passed.
 */
async function runMockCases(banner, summaryLabel, testCases, invoke) {
  console.log(`\nπŸ§ͺ ${banner}`);
  console.log('='.repeat(50));

  let passed = 0;
  for (const testCase of testCases) {
    console.log(`\n${testCase.name}`);
    console.log(`   Description: ${testCase.description}`);
    try {
      const result = await invoke(testCase);
      console.log(`   βœ… PASSED - Status: ${result.status}`);
      passed++;
    } catch (error) {
      console.log(`   ❌ FAILED - Error: ${error.message}`);
    }
  }

  console.log(`\nπŸ“Š ${summaryLabel}: ${passed}/${testCases.length} passed`);
  return passed === testCases.length;
}

/**
 * Test stealth-parameter validation across valid and invalid inputs.
 * NOTE(review): the three "valid" cases are reconstructed — the top of the
 * original case table lies outside the reviewed chunk; confirm against it.
 *
 * @returns {boolean} true when every validation case behaves as expected.
 */
function testStealthModeValidation() {
  console.log('\nπŸ§ͺ Testing Stealth Mode Validation');
  console.log('='.repeat(50));

  const testCases = [
    {
      name: 'Valid stealth mode - true',
      stealth: true,
      expected: true,
      description: 'Stealth mode explicitly enabled',
    },
    {
      name: 'Valid stealth mode - false',
      stealth: false,
      expected: true,
      description: 'Stealth mode explicitly disabled',
    },
    {
      name: 'Stealth mode not specified',
      stealth: undefined,
      expected: true,
      description: 'Stealth mode not specified (should default to false)',
    },
    {
      name: 'Invalid stealth mode - string',
      stealth: 'true',
      expected: false,
      description: 'Stealth mode as string instead of boolean',
    },
    {
      name: 'Invalid stealth mode - number',
      stealth: 1,
      expected: false,
      description: 'Stealth mode as number instead of boolean',
    },
    {
      name: 'Invalid stealth mode - object',
      stealth: {},
      expected: false,
      description: 'Stealth mode as object instead of boolean',
    },
  ];

  let passed = 0;
  testCases.forEach((testCase, index) => {
    console.log(`\n${index + 1}. ${testCase.name}`);
    console.log(`   Description: ${testCase.description}`);
    try {
      const isValid = validateStealthMode(testCase.stealth);
      if (isValid === testCase.expected) {
        console.log('   βœ… PASSED');
        passed++;
      } else {
        console.log(`   ❌ FAILED - Expected: ${testCase.expected}, Got: ${isValid}`);
      }
    } catch (error) {
      // Throwing is the expected outcome for the invalid (expected=false) cases.
      if (!testCase.expected) {
        console.log(`   βœ… PASSED (Expected error: ${error.message})`);
        passed++;
      } else {
        console.log(`   ❌ FAILED - Unexpected error: ${error.message}`);
      }
    }
  });

  console.log(`\nπŸ“Š Stealth Mode Validation Results: ${passed}/${testCases.length} tests passed`);
  return passed === testCases.length;
}

/**
 * Exercise smartScraper's stealth option against an offline mock.
 * @returns {Promise<boolean>} true when all cases passed.
 */
async function testSmartScraperWithStealth() {
  const testCases = [
    {
      name: 'SmartScraper with stealth=true',
      options: { stealth: true },
      description: 'Test smartScraper with stealth mode enabled',
    },
    {
      name: 'SmartScraper with stealth=false',
      options: { stealth: false },
      description: 'Test smartScraper with stealth mode disabled',
    },
    {
      name: 'SmartScraper without stealth parameter',
      options: {},
      description: 'Test smartScraper without stealth parameter (defaults to false)',
    },
  ];

  return runMockCases(
    'Testing SmartScraper with Stealth Mode',
    'SmartScraper Stealth Tests',
    testCases,
    async ({ options }) => {
      // Mock keeps the test offline while still validating the option shape.
      assertStealthOption(options);
      return {
        request_id: 'mock-request-id',
        status: 'completed',
        result: { data: 'mock data' },
      };
    }
  );
}

/**
 * Exercise searchScraper's stealth option against an offline mock.
 * @returns {Promise<boolean>} true when all cases passed.
 */
async function testSearchScraperWithStealth() {
  const testCases = [
    {
      name: 'SearchScraper with stealth=true',
      options: { stealth: true },
      description: 'Test searchScraper with stealth mode enabled',
    },
    {
      name: 'SearchScraper with stealth=false',
      options: { stealth: false },
      description: 'Test searchScraper with stealth mode disabled',
    },
  ];

  return runMockCases(
    'Testing SearchScraper with Stealth Mode',
    'SearchScraper Stealth Tests',
    testCases,
    async ({ options }) => {
      assertStealthOption(options);
      return {
        request_id: 'mock-request-id',
        status: 'completed',
        result: { answer: 'mock answer' },
      };
    }
  );
}

/**
 * Exercise markdownify's stealth option against an offline mock.
 * @returns {Promise<boolean>} true when all cases passed.
 */
async function testMarkdownifyWithStealth() {
  const testCases = [
    {
      name: 'Markdownify with stealth=true',
      options: { stealth: true },
      description: 'Test markdownify with stealth mode enabled',
    },
    {
      name: 'Markdownify with stealth=false',
      options: { stealth: false },
      description: 'Test markdownify with stealth mode disabled',
    },
  ];

  return runMockCases(
    'Testing Markdownify with Stealth Mode',
    'Markdownify Stealth Tests',
    testCases,
    async ({ options }) => {
      assertStealthOption(options);
      return {
        request_id: 'mock-request-id',
        status: 'completed',
        result: '# Markdown content',
      };
    }
  );
}

/**
 * Exercise scrape's stealth option (alone and with renderHeavyJs) against
 * an offline mock.
 * @returns {Promise<boolean>} true when all cases passed.
 */
async function testScrapeWithStealth() {
  const testCases = [
    {
      name: 'Scrape with stealth=true',
      options: { stealth: true },
      description: 'Test scrape with stealth mode enabled',
    },
    {
      name: 'Scrape with stealth=false',
      options: { stealth: false },
      description: 'Test scrape with stealth mode disabled',
    },
    {
      name: 'Scrape with stealth=true and renderHeavyJs=true',
      options: { stealth: true, renderHeavyJs: true },
      description: 'Test scrape with both stealth and heavy JS rendering',
    },
  ];

  return runMockCases(
    'Testing Scrape with Stealth Mode',
    'Scrape Stealth Tests',
    testCases,
    async ({ options }) => {
      assertStealthOption(options);
      return {
        scrape_request_id: 'mock-request-id',
        status: 'completed',
        html: 'Mock content',
      };
    }
  );
}

/**
 * Exercise agenticScraper's stealth option (with and without AI extraction)
 * against an offline mock.
 * @returns {Promise<boolean>} true when all cases passed.
 */
async function testAgenticScraperWithStealth() {
  const testCases = [
    {
      name: 'AgenticScraper with stealth=true',
      options: { stealth: true },
      description: 'Test agenticScraper with stealth mode enabled',
    },
    {
      name: 'AgenticScraper with stealth=false',
      options: { stealth: false },
      description: 'Test agenticScraper with stealth mode disabled',
    },
    {
      name: 'AgenticScraper with stealth and AI extraction',
      options: { stealth: true },
      aiExtraction: true,
      userPrompt: 'Extract user data',
      description: 'Test agenticScraper with stealth and AI extraction',
    },
  ];

  return runMockCases(
    'Testing Agentic Scraper with Stealth Mode',
    'AgenticScraper Stealth Tests',
    testCases,
    async ({ options }) => {
      assertStealthOption(options);
      return {
        request_id: 'mock-request-id',
        status: 'processing',
        message: 'Agentic scraping started',
      };
    }
  );
}

/**
 * Exercise crawl's stealth option (with and without sitemap) against an
 * offline mock.
 * @returns {Promise<boolean>} true when all cases passed.
 */
async function testCrawlWithStealth() {
  const testCases = [
    {
      name: 'Crawl with stealth=true',
      options: { stealth: true },
      description: 'Test crawl with stealth mode enabled',
    },
    {
      name: 'Crawl with stealth=false',
      options: { stealth: false },
      description: 'Test crawl with stealth mode disabled',
    },
    {
      name: 'Crawl with stealth and sitemap',
      options: { stealth: true, sitemap: true },
      description: 'Test crawl with stealth mode and sitemap enabled',
    },
  ];

  return runMockCases(
    'Testing Crawl with Stealth Mode',
    'Crawl Stealth Tests',
    testCases,
    async ({ options }) => {
      assertStealthOption(options);
      return {
        id: 'mock-crawl-id',
        status: 'processing',
        message: 'Crawl job started',
      };
    }
  );
}

/**
 * Validate option bags that combine stealth with other feature flags
 * (headers, pagination, heavy JS rendering, extraction mode).
 * @returns {Promise<boolean>} true when every combination validates.
 */
async function testCombinedFeaturesWithStealth() {
  console.log('\nπŸ§ͺ Testing Combined Features with Stealth Mode');
  console.log('='.repeat(50));

  const testCases = [
    {
      name: 'SmartScraper with stealth + headers + pagination',
      endpoint: 'smartScraper',
      options: { stealth: true, renderHeavyJs: true },
      additionalParams: { numberOfScrolls: 10, totalPages: 5 },
      description: 'Test smartScraper with stealth and multiple features',
    },
    {
      name: 'Scrape with stealth + headers + heavy JS',
      endpoint: 'scrape',
      options: {
        stealth: true,
        renderHeavyJs: true,
        headers: { 'User-Agent': 'Test Agent' },
      },
      description: 'Test scrape with stealth, custom headers, and JS rendering',
    },
    {
      name: 'SearchScraper with stealth + extraction mode',
      endpoint: 'searchScraper',
      options: { stealth: true, extractionMode: true, renderHeavyJs: true },
      description: 'Test searchScraper with stealth and extraction mode',
    },
  ];

  let passed = 0;
  for (const testCase of testCases) {
    console.log(`\n${testCase.name}`);
    console.log(`   Description: ${testCase.description}`);
    try {
      // Validate every boolean flag in the combined options bag.
      assertStealthOption(testCase.options);
      if (
        testCase.options.renderHeavyJs !== undefined &&
        typeof testCase.options.renderHeavyJs !== 'boolean'
      ) {
        throw new Error('RenderHeavyJs must be a boolean');
      }
      console.log('   βœ… PASSED - All parameters validated successfully');
      passed++;
    } catch (error) {
      console.log(`   ❌ FAILED - Error: ${error.message}`);
    }
  }

  console.log(`\nπŸ“Š Combined Features Tests: ${passed}/${testCases.length} passed`);
  return passed === testCases.length;
}

/**
 * Run the whole stealth-mode suite and print a final summary.
 *
 * @returns {Promise<number>} process exit code: 0 when all tests passed,
 *   1 otherwise.
 */
async function runAllStealthTests() {
  console.log('πŸš€ Starting Stealth Mode Test Suite');
  console.log('='.repeat(60));
  // Guarded preview: the suite is fully mocked and never uses the key, so a
  // missing API key must not crash the banner.
  const keyPreview =
    typeof API_KEY === 'string' ? `${API_KEY.substring(0, 8)}...` : '(not set)';
  console.log(`πŸ”‘ API Key: ${keyPreview}`);
  console.log(`⏰ Timestamp: ${new Date().toISOString()}\n`);

  const tests = [
    { name: 'Stealth Mode Validation', fn: testStealthModeValidation },
    { name: 'SmartScraper with Stealth', fn: testSmartScraperWithStealth },
    { name: 'SearchScraper with Stealth', fn: testSearchScraperWithStealth },
    { name: 'Markdownify with Stealth', fn: testMarkdownifyWithStealth },
    { name: 'Scrape with Stealth', fn: testScrapeWithStealth },
    { name: 'AgenticScraper with Stealth', fn: testAgenticScraperWithStealth },
    { name: 'Crawl with Stealth', fn: testCrawlWithStealth },
    { name: 'Combined Features with Stealth', fn: testCombinedFeaturesWithStealth },
  ];

  let passed = 0;
  const total = tests.length;
  for (const test of tests) {
    try {
      if (await test.fn()) {
        passed++;
      }
    } catch (error) {
      console.error(`❌ Test '${test.name}' failed with error: ${error.message}`);
    }
    console.log('\n' + '-'.repeat(60));
  }

  console.log('\n🎯 FINAL TEST RESULTS');
  console.log('='.repeat(30));
  console.log(`βœ… Passed: ${passed}`);
  console.log(`❌ Failed: ${total - passed}`);
  console.log(`πŸ“Š Success Rate: ${((passed / total) * 100).toFixed(1)}%`);

  if (passed === total) {
    console.log('\nπŸŽ‰ All stealth mode tests passed! Functionality is working correctly.');
    return 0;
  }
  console.log('\n⚠️ Some tests failed. Please review the output above.');
  return 1;
}

// Run the suite only when this file is executed directly (not imported).
if (import.meta.url === `file://${process.argv[1]}`) {
  runAllStealthTests()
    .then((exitCode) => process.exit(exitCode))
    .catch((error) => {
      console.error('πŸ’₯ Fatal error during test execution:', error.message);
      process.exit(1);
    });
}

export {
  testStealthModeValidation,
  testSmartScraperWithStealth,
  testSearchScraperWithStealth,
  testMarkdownifyWithStealth,
  testScrapeWithStealth,
  testAgenticScraperWithStealth,
  testCrawlWithStealth,
  testCombinedFeaturesWithStealth,
  runAllStealthTests,
};
"""
Stealth Mode Examples for ScrapeGraph AI Python SDK

This file demonstrates how to use stealth mode with various endpoints
to avoid bot detection when scraping websites.

Stealth mode enables advanced techniques to make requests appear more
like those from a real browser, helping to bypass basic bot detection.
"""

import os

from pydantic import BaseModel, Field

from scrapegraph_py import Client

# Get API key from environment variable
API_KEY = os.getenv("SGAI_API_KEY", "your-api-key-here")


# ============================================================================
# EXAMPLE 1: SmartScraper with Stealth Mode
# ============================================================================


def example_smartscraper_with_stealth():
    """
    Extract structured data from a webpage using stealth mode.
    Useful for websites with bot detection.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 1: SmartScraper with Stealth Mode")
    print("=" * 60)

    with Client(api_key=API_KEY) as client:
        try:
            response = client.smartscraper(
                website_url="https://www.scrapethissite.com/pages/simple/",
                user_prompt="Extract country names and capitals",
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Request ID: {response['request_id']}")
            print(f"Result: {response['result']}")

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 2: SmartScraper with Stealth Mode and Output Schema
# ============================================================================


def example_smartscraper_with_stealth_and_schema():
    """
    Use stealth mode with a structured output schema to extract data
    from websites that might detect bots.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 2: SmartScraper with Stealth Mode and Schema")
    print("=" * 60)

    # Define output schema using Pydantic
    class Product(BaseModel):
        name: str = Field(description="Product name")
        price: str = Field(description="Product price")
        rating: float = Field(description="Product rating (0-5)")

    with Client(api_key=API_KEY) as client:
        try:
            response = client.smartscraper(
                website_url="https://example.com/products",
                user_prompt="Extract product information including name, price, and rating",
                output_schema=Product,
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Request ID: {response['request_id']}")
            print(f"Result: {response['result']}")

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 3: SearchScraper with Stealth Mode
# ============================================================================


def example_searchscraper_with_stealth():
    """
    Search and extract information from multiple sources using stealth mode.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 3: SearchScraper with Stealth Mode")
    print("=" * 60)

    with Client(api_key=API_KEY) as client:
        try:
            response = client.searchscraper(
                user_prompt="What are the latest developments in AI technology?",
                num_results=5,
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Request ID: {response['request_id']}")
            print(f"Result: {response['result']}")
            if "reference_urls" in response:
                print(f"Reference URLs: {response['reference_urls']}")

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 4: Markdownify with Stealth Mode
# ============================================================================


def example_markdownify_with_stealth():
    """
    Convert a webpage to markdown format using stealth mode.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 4: Markdownify with Stealth Mode")
    print("=" * 60)

    with Client(api_key=API_KEY) as client:
        try:
            response = client.markdownify(
                website_url="https://www.example.com",
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Request ID: {response['request_id']}")
            print("Markdown Preview (first 500 chars):")
            print(response["result"][:500])

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 5: Scrape with Stealth Mode
# ============================================================================


def example_scrape_with_stealth():
    """
    Get raw HTML from a webpage using stealth mode.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 5: Scrape with Stealth Mode")
    print("=" * 60)

    with Client(api_key=API_KEY) as client:
        try:
            response = client.scrape(
                website_url="https://www.example.com",
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Scrape Request ID: {response['scrape_request_id']}")
            print("HTML Preview (first 500 chars):")
            print(response["html"][:500])

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 6: Scrape with Stealth Mode and Heavy JS Rendering
# ============================================================================


def example_scrape_with_stealth_and_js():
    """
    Scrape a JavaScript-heavy website using stealth mode.
    Combines JavaScript rendering with stealth techniques.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 6: Scrape with Stealth Mode and Heavy JS")
    print("=" * 60)

    with Client(api_key=API_KEY) as client:
        try:
            response = client.scrape(
                website_url="https://www.example.com",
                render_heavy_js=True,  # Enable JavaScript rendering
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Scrape Request ID: {response['scrape_request_id']}")
            print("HTML Preview (first 500 chars):")
            print(response["html"][:500])

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 7: Agentic Scraper with Stealth Mode
# ============================================================================


def example_agenticscraper_with_stealth():
    """
    Perform automated browser actions using stealth mode.
    Ideal for interacting with protected forms or multi-step workflows.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 7: Agentic Scraper with Stealth Mode")
    print("=" * 60)

    with Client(api_key=API_KEY) as client:
        try:
            response = client.agenticscraper(
                url="https://dashboard.example.com/login",
                steps=[
                    "Type user@example.com in email input box",
                    "Type password123 in password input box",
                    "Click on login button",
                ],
                use_session=True,
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Request ID: {response['request_id']}")

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 8: Agentic Scraper with Stealth Mode and AI Extraction
# ============================================================================


def example_agenticscraper_with_stealth_and_ai():
    """
    Combine stealth mode with AI extraction in agentic scraping.
    Performs actions and then extracts structured data.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 8: Agentic Scraper with Stealth and AI Extraction")
    print("=" * 60)

    with Client(api_key=API_KEY) as client:
        try:
            response = client.agenticscraper(
                url="https://dashboard.example.com",
                steps=[
                    "Navigate to user profile section",
                    "Click on settings tab",
                ],
                use_session=True,
                user_prompt="Extract user profile information and settings",
                ai_extraction=True,
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Request ID: {response['request_id']}")

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 9: Crawl with Stealth Mode
# ============================================================================


def example_crawl_with_stealth():
    """
    Crawl an entire website using stealth mode.
    Useful for comprehensive data extraction from protected sites.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 9: Crawl with Stealth Mode")
    print("=" * 60)

    schema = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "title": "Website Content",
        "type": "object",
        "properties": {
            "title": {"type": "string", "description": "Page title"},
            "content": {"type": "string", "description": "Main content"},
        },
        "required": ["title"],
    }

    with Client(api_key=API_KEY) as client:
        try:
            response = client.crawl(
                url="https://www.example.com",
                prompt="Extract page titles and main content",
                data_schema=schema,
                depth=2,
                max_pages=5,
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Crawl ID: {response['id']}")

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 10: Crawl with Stealth Mode and Sitemap
# ============================================================================


def example_crawl_with_stealth_and_sitemap():
    """
    Use sitemap for efficient crawling with stealth mode enabled.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 10: Crawl with Stealth Mode and Sitemap")
    print("=" * 60)

    schema = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "title": "Product Information",
        "type": "object",
        "properties": {
            "product_name": {"type": "string"},
            "price": {"type": "string"},
            "description": {"type": "string"},
        },
        "required": ["product_name"],
    }

    with Client(api_key=API_KEY) as client:
        try:
            response = client.crawl(
                url="https://www.example-shop.com",
                prompt="Extract product information from all pages",
                data_schema=schema,
                sitemap=True,  # Use sitemap for better page discovery
                depth=3,
                max_pages=10,
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Crawl ID: {response['id']}")

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 11: SmartScraper with Stealth, Custom Headers, and Pagination
# ============================================================================


def example_smartscraper_advanced_stealth():
    """
    Advanced example combining stealth mode with custom headers and pagination.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 11: SmartScraper Advanced with Stealth")
    print("=" * 60)

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }

    with Client(api_key=API_KEY) as client:
        try:
            response = client.smartscraper(
                website_url="https://www.example-marketplace.com/products",
                user_prompt="Extract all product listings from multiple pages",
                headers=headers,
                number_of_scrolls=10,
                total_pages=5,
                render_heavy_js=True,
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Request ID: {response['request_id']}")
            print(f"Result: {response['result']}")

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# EXAMPLE 12: Using Stealth Mode with Custom Headers
# ============================================================================


def example_stealth_with_custom_headers():
    """
    Demonstrate using stealth mode together with custom headers
    for maximum control over request appearance.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 12: Stealth Mode with Custom Headers")
    print("=" * 60)

    # Custom headers to simulate a real browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
    }

    with Client(api_key=API_KEY) as client:
        try:
            # Using with markdownify
            response = client.markdownify(
                website_url="https://www.protected-site.com",
                headers=headers,
                stealth=True,  # Enable stealth mode
            )

            print(f"Status: {response['status']}")
            print(f"Request ID: {response['request_id']}")
            print("Success! Stealth mode + custom headers bypassed detection.")

        except Exception as e:
            print(f"Error: {e}")


# ============================================================================
# RUN ALL EXAMPLES
# ============================================================================


def run_all_examples():
    """Run all stealth mode examples, reporting (but not raising) failures."""
    print("\n")
    print("=" * 60)
    print("STEALTH MODE EXAMPLES FOR SCRAPEGRAPH AI PYTHON SDK")
    print("=" * 60)
    print("\nThese examples demonstrate how to use stealth mode")
    print("to avoid bot detection when scraping websites.")
    print("\nStealth mode is available for all major endpoints:")
    print("- SmartScraper")
    print("- SearchScraper")
    print("- Markdownify")
    print("- Scrape")
    print("- Agentic Scraper")
    print("- Crawl")

    examples = [
        example_smartscraper_with_stealth,
        example_smartscraper_with_stealth_and_schema,
        example_searchscraper_with_stealth,
        example_markdownify_with_stealth,
        example_scrape_with_stealth,
        example_scrape_with_stealth_and_js,
        example_agenticscraper_with_stealth,
        example_agenticscraper_with_stealth_and_ai,
        example_crawl_with_stealth,
        example_crawl_with_stealth_and_sitemap,
        example_smartscraper_advanced_stealth,
        example_stealth_with_custom_headers,
    ]

    for i, example_func in enumerate(examples, 1):
        try:
            example_func()
        except Exception as e:
            # Keep going: one failing example should not abort the tour.
            print(f"\nExample {i} failed: {e}")

    print("\n" + "=" * 60)
    print("ALL EXAMPLES COMPLETED")
    print("=" * 60)


if __name__ == "__main__":
    # You can run all examples or specific ones
    run_all_examples()

    # Or run individual examples:
    # example_smartscraper_with_stealth()
    # example_searchscraper_with_stealth()
    # example_crawl_with_stealth()
+362,16 @@ def new_id(prefix: str) -> str: return {"status": "mock", "url": url, "method": method, "kwargs": kwargs} async def markdownify( - self, website_url: str, headers: Optional[dict[str, str]] = None + self, website_url: str, headers: Optional[dict[str, str]] = None, stealth: bool = False ): """Send a markdownify request""" logger.info(f"πŸ” Starting markdownify request for {website_url}") if headers: logger.debug("πŸ”§ Using custom headers") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") - request = MarkdownifyRequest(website_url=website_url, headers=headers) + request = MarkdownifyRequest(website_url=website_url, headers=headers, stealth=stealth) logger.debug("βœ… Request validation passed") result = await self._make_request( @@ -397,23 +399,28 @@ async def scrape( website_url: str, render_heavy_js: bool = False, headers: Optional[dict[str, str]] = None, + stealth: bool = False, ): """Send a scrape request to get HTML content from a website - + Args: website_url: The URL of the website to get HTML from render_heavy_js: Whether to render heavy JavaScript (defaults to False) headers: Optional headers to send with the request + stealth: Enable stealth mode to avoid bot detection """ logger.info(f"πŸ” Starting scrape request for {website_url}") logger.debug(f"πŸ”§ Render heavy JS: {render_heavy_js}") if headers: logger.debug("πŸ”§ Using custom headers") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") request = ScrapeRequest( website_url=website_url, render_heavy_js=render_heavy_js, headers=headers, + stealth=stealth, ) logger.debug("βœ… Request validation passed") @@ -489,6 +496,7 @@ async def smartscraper( output_schema: Optional[BaseModel] = None, number_of_scrolls: Optional[int] = None, total_pages: Optional[int] = None, + stealth: bool = False, ): """Send a smartscraper request with optional pagination support and cookies""" logger.info("πŸ” Starting smartscraper request") @@ -504,6 +512,8 @@ async def smartscraper( 
logger.debug(f"πŸ”„ Number of scrolls: {number_of_scrolls}") if total_pages is not None: logger.debug(f"πŸ“„ Total pages to scrape: {total_pages}") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") logger.debug(f"πŸ“ Prompt: {user_prompt}") request = SmartScraperRequest( @@ -515,6 +525,7 @@ async def smartscraper( output_schema=output_schema, number_of_scrolls=number_of_scrolls, total_pages=total_pages, + stealth=stealth, ) logger.debug("βœ… Request validation passed") @@ -578,6 +589,7 @@ async def searchscraper( headers: Optional[dict[str, str]] = None, output_schema: Optional[BaseModel] = None, extraction_mode: bool = True, + stealth: bool = False, ): """Send a searchscraper request @@ -591,6 +603,7 @@ async def searchscraper( output_schema: Optional schema to structure the output extraction_mode: Whether to use AI extraction (True) or markdown conversion (False). AI extraction costs 10 credits per page, markdown conversion costs 2 credits per page. + stealth: Enable stealth mode to avoid bot detection """ logger.info("πŸ” Starting searchscraper request") logger.debug(f"πŸ“ Prompt: {user_prompt}") @@ -598,6 +611,8 @@ async def searchscraper( logger.debug(f"πŸ€– Extraction mode: {'AI extraction' if extraction_mode else 'Markdown conversion'}") if headers: logger.debug("πŸ”§ Using custom headers") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") request = SearchScraperRequest( user_prompt=user_prompt, @@ -605,6 +620,7 @@ async def searchscraper( headers=headers, output_schema=output_schema, extraction_mode=extraction_mode, + stealth=stealth, ) logger.debug("βœ… Request validation passed") @@ -640,6 +656,7 @@ async def crawl( same_domain_only: bool = True, batch_size: Optional[int] = None, sitemap: bool = False, + stealth: bool = False, ): """Send a crawl request with support for both AI extraction and markdown conversion modes""" @@ -660,6 +677,8 @@ async def crawl( logger.debug(f"πŸ“„ Max pages: {max_pages}") logger.debug(f"🏠 Same domain only: 
{same_domain_only}") logger.debug(f"πŸ—ΊοΈ Use sitemap: {sitemap}") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") if batch_size is not None: logger.debug(f"πŸ“¦ Batch size: {batch_size}") @@ -672,6 +691,7 @@ async def crawl( "max_pages": max_pages, "same_domain_only": same_domain_only, "sitemap": sitemap, + "stealth": stealth, } # Add optional parameters only if provided @@ -713,9 +733,10 @@ async def agenticscraper( user_prompt: Optional[str] = None, output_schema: Optional[Dict[str, Any]] = None, ai_extraction: bool = False, + stealth: bool = False, ): """Send an agentic scraper request to perform automated actions on a webpage - + Args: url: The URL to scrape steps: List of steps to perform on the webpage @@ -723,6 +744,7 @@ async def agenticscraper( user_prompt: Prompt for AI extraction (required when ai_extraction=True) output_schema: Schema for structured data extraction (optional, used with ai_extraction=True) ai_extraction: Whether to use AI for data extraction from the scraped content (default: False) + stealth: Enable stealth mode to avoid bot detection """ logger.info(f"πŸ€– Starting agentic scraper request for {url}") logger.debug(f"πŸ”§ Use session: {use_session}") @@ -731,6 +753,8 @@ async def agenticscraper( if ai_extraction: logger.debug(f"πŸ’­ User prompt: {user_prompt}") logger.debug(f"πŸ“‹ Output schema provided: {output_schema is not None}") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") request = AgenticScraperRequest( url=url, @@ -739,6 +763,7 @@ async def agenticscraper( user_prompt=user_prompt, output_schema=output_schema, ai_extraction=ai_extraction, + stealth=stealth, ) logger.debug("βœ… Request validation passed") diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 30f9318..71af4d9 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -384,13 +384,15 @@ def new_id(prefix: str) -> str: # Generic fallback return 
{"status": "mock", "url": url, "method": method, "kwargs": kwargs} - def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None, mock:bool=False): + def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None, mock:bool=False, stealth:bool=False): """Send a markdownify request""" logger.info(f"πŸ” Starting markdownify request for {website_url}") if headers: logger.debug("πŸ”§ Using custom headers") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") - request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock) + request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, stealth=stealth) logger.debug("βœ… Request validation passed") result = self._make_request( @@ -417,24 +419,29 @@ def scrape( render_heavy_js: bool = False, headers: Optional[dict[str, str]] = None, mock:bool=False, + stealth:bool=False, ): """Send a scrape request to get HTML content from a website - + Args: website_url: The URL of the website to get HTML from render_heavy_js: Whether to render heavy JavaScript (defaults to False) headers: Optional headers to send with the request + stealth: Enable stealth mode to avoid bot detection """ logger.info(f"πŸ” Starting scrape request for {website_url}") logger.debug(f"πŸ”§ Render heavy JS: {render_heavy_js}") if headers: logger.debug("πŸ”§ Using custom headers") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") request = ScrapeRequest( website_url=website_url, render_heavy_js=render_heavy_js, headers=headers, - mock=mock + mock=mock, + stealth=stealth ) logger.debug("βœ… Request validation passed") @@ -510,7 +517,8 @@ def smartscraper( number_of_scrolls: Optional[int] = None, total_pages: Optional[int] = None, mock:bool=False, - plain_text:bool=False + plain_text:bool=False, + stealth:bool=False ): """Send a smartscraper request with optional pagination support and cookies""" logger.info("πŸ” Starting smartscraper request") @@ -526,6 +534,8 @@ def 
smartscraper( logger.debug(f"πŸ”„ Number of scrolls: {number_of_scrolls}") if total_pages is not None: logger.debug(f"πŸ“„ Total pages to scrape: {total_pages}") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") logger.debug(f"πŸ“ Prompt: {user_prompt}") request = SmartScraperRequest( @@ -539,6 +549,7 @@ def smartscraper( total_pages=total_pages, mock=mock, plain_text=plain_text, + stealth=stealth, ) logger.debug("βœ… Request validation passed") @@ -599,7 +610,8 @@ def searchscraper( headers: Optional[dict[str, str]] = None, output_schema: Optional[BaseModel] = None, extraction_mode: bool = True, - mock: bool=False + mock: bool=False, + stealth: bool=False ): """Send a searchscraper request @@ -613,6 +625,7 @@ def searchscraper( output_schema: Optional schema to structure the output extraction_mode: Whether to use AI extraction (True) or markdown conversion (False). AI extraction costs 10 credits per page, markdown conversion costs 2 credits per page. + stealth: Enable stealth mode to avoid bot detection """ logger.info("πŸ” Starting searchscraper request") logger.debug(f"πŸ“ Prompt: {user_prompt}") @@ -620,6 +633,8 @@ def searchscraper( logger.debug(f"πŸ€– Extraction mode: {'AI extraction' if extraction_mode else 'Markdown conversion'}") if headers: logger.debug("πŸ”§ Using custom headers") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") request = SearchScraperRequest( user_prompt=user_prompt, @@ -627,7 +642,8 @@ def searchscraper( headers=headers, output_schema=output_schema, extraction_mode=extraction_mode, - mock=mock + mock=mock, + stealth=stealth ) logger.debug("βœ… Request validation passed") @@ -661,6 +677,7 @@ def crawl( same_domain_only: bool = True, batch_size: Optional[int] = None, sitemap: bool = False, + stealth: bool = False, ): """Send a crawl request with support for both AI extraction and markdown conversion modes""" @@ -681,6 +698,8 @@ def crawl( logger.debug(f"πŸ“„ Max pages: {max_pages}") logger.debug(f"🏠 Same domain only: 
{same_domain_only}") logger.debug(f"πŸ—ΊοΈ Use sitemap: {sitemap}") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") if batch_size is not None: logger.debug(f"πŸ“¦ Batch size: {batch_size}") @@ -693,6 +712,7 @@ def crawl( "max_pages": max_pages, "same_domain_only": same_domain_only, "sitemap": sitemap, + "stealth": stealth, } # Add optional parameters only if provided @@ -733,9 +753,10 @@ def agenticscraper( output_schema: Optional[Dict[str, Any]] = None, ai_extraction: bool = False, mock: bool=False, + stealth: bool=False, ): """Send an agentic scraper request to perform automated actions on a webpage - + Args: url: The URL to scrape steps: List of steps to perform on the webpage @@ -743,6 +764,7 @@ def agenticscraper( user_prompt: Prompt for AI extraction (required when ai_extraction=True) output_schema: Schema for structured data extraction (optional, used with ai_extraction=True) ai_extraction: Whether to use AI for data extraction from the scraped content (default: False) + stealth: Enable stealth mode to avoid bot detection """ logger.info(f"πŸ€– Starting agentic scraper request for {url}") logger.debug(f"πŸ”§ Use session: {use_session}") @@ -751,6 +773,8 @@ def agenticscraper( if ai_extraction: logger.debug(f"πŸ’­ User prompt: {user_prompt}") logger.debug(f"πŸ“‹ Output schema provided: {output_schema is not None}") + if stealth: + logger.debug("πŸ₯· Stealth mode enabled") request = AgenticScraperRequest( url=url, @@ -759,7 +783,8 @@ def agenticscraper( user_prompt=user_prompt, output_schema=output_schema, ai_extraction=ai_extraction, - mock=mock + mock=mock, + stealth=stealth ) logger.debug("βœ… Request validation passed") diff --git a/scrapegraph-py/scrapegraph_py/models/agenticscraper.py b/scrapegraph-py/scrapegraph_py/models/agenticscraper.py index a2e7f21..6226597 100644 --- a/scrapegraph-py/scrapegraph_py/models/agenticscraper.py +++ b/scrapegraph-py/scrapegraph_py/models/agenticscraper.py @@ -59,7 +59,8 @@ class 
AgenticScraperRequest(BaseModel): ) mock: bool = Field(default=False, description="Whether to use mock mode for the request") render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - + stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") + @model_validator(mode="after") def validate_url(self) -> "AgenticScraperRequest": if not self.url.strip(): diff --git a/scrapegraph-py/scrapegraph_py/models/crawl.py b/scrapegraph-py/scrapegraph_py/models/crawl.py index 07227e3..e174030 100644 --- a/scrapegraph-py/scrapegraph_py/models/crawl.py +++ b/scrapegraph-py/scrapegraph_py/models/crawl.py @@ -73,6 +73,7 @@ class CrawlRequest(BaseModel): "and user agent", ) render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") + stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") @model_validator(mode="after") def validate_url(self) -> "CrawlRequest": diff --git a/scrapegraph-py/scrapegraph_py/models/markdownify.py b/scrapegraph-py/scrapegraph_py/models/markdownify.py index d795d27..9174e7f 100644 --- a/scrapegraph-py/scrapegraph_py/models/markdownify.py +++ b/scrapegraph-py/scrapegraph_py/models/markdownify.py @@ -18,6 +18,7 @@ class MarkdownifyRequest(BaseModel): "and user agent", ) mock: bool = Field(default=False, description="Whether to use mock mode for the request") + stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") @model_validator(mode="after") def validate_url(self) -> "MarkdownifyRequest": diff --git a/scrapegraph-py/scrapegraph_py/models/scrape.py b/scrapegraph-py/scrapegraph_py/models/scrape.py index c178a54..d970c45 100644 --- a/scrapegraph-py/scrapegraph_py/models/scrape.py +++ b/scrapegraph-py/scrapegraph_py/models/scrape.py @@ -22,6 +22,7 @@ class ScrapeRequest(BaseModel): "and user agent", ), mock: bool = Field(default=False, 
description="Whether to use mock mode for the request") + stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") @model_validator(mode="after") def validate_url(self) -> "ScrapeRequest": diff --git a/scrapegraph-py/scrapegraph_py/models/searchscraper.py b/scrapegraph-py/scrapegraph_py/models/searchscraper.py index f8277da..9656935 100644 --- a/scrapegraph-py/scrapegraph_py/models/searchscraper.py +++ b/scrapegraph-py/scrapegraph_py/models/searchscraper.py @@ -33,6 +33,7 @@ class SearchScraperRequest(BaseModel): ) mock: bool = Field(default=False, description="Whether to use mock mode for the request") render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") + stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") @model_validator(mode="after") def validate_user_prompt(self) -> "SearchScraperRequest": diff --git a/scrapegraph-py/scrapegraph_py/models/smartscraper.py b/scrapegraph-py/scrapegraph_py/models/smartscraper.py index c379b3f..41cabed 100644 --- a/scrapegraph-py/scrapegraph_py/models/smartscraper.py +++ b/scrapegraph-py/scrapegraph_py/models/smartscraper.py @@ -51,7 +51,8 @@ class SmartScraperRequest(BaseModel): mock: bool = Field(default=False, description="Whether to use mock mode for the request") plain_text: bool = Field(default=False, description="Whether to return the result as plain text") render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - + stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") + @model_validator(mode="after") def validate_user_prompt(self) -> "SmartScraperRequest": if self.user_prompt is None or not self.user_prompt.strip(): diff --git a/scrapegraph-py/tests/test_stealth_mode.py b/scrapegraph-py/tests/test_stealth_mode.py new file mode 100644 index 0000000..1987696 --- /dev/null +++ 
b/scrapegraph-py/tests/test_stealth_mode.py @@ -0,0 +1,469 @@ +from uuid import uuid4 + +import pytest +import responses + +from scrapegraph_py.client import Client +from tests.utils import generate_mock_api_key + + +@pytest.fixture +def mock_api_key(): + return generate_mock_api_key() + + +@pytest.fixture +def mock_uuid(): + return str(uuid4()) + + +# ============================================================================ +# SMARTSCRAPER STEALTH MODE TESTS +# ============================================================================ + + +@responses.activate +def test_smartscraper_with_stealth_mode(mock_api_key): + """Test smartscraper with stealth mode enabled""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/smartscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"description": "Content extracted with stealth mode."}, + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.smartscraper( + website_url="https://example.com", + user_prompt="Describe this page.", + stealth=True, + ) + assert response["status"] == "completed" + + +@responses.activate +def test_smartscraper_without_stealth_mode(mock_api_key): + """Test smartscraper with stealth mode disabled (default)""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/smartscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"description": "Content extracted without stealth."}, + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.smartscraper( + website_url="https://example.com", + user_prompt="Describe this page.", + stealth=False, + ) + assert response["status"] == "completed" + + +# ============================================================================ +# SEARCHSCRAPER STEALTH MODE TESTS +# ============================================================================ + + +@responses.activate +def 
test_searchscraper_with_stealth_mode(mock_api_key): + """Test searchscraper with stealth mode enabled""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/searchscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"answer": "Search results with stealth mode."}, + "reference_urls": ["https://example.com"], + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.searchscraper( + user_prompt="Search for information", stealth=True + ) + assert response["status"] == "completed" + + +@responses.activate +def test_searchscraper_without_stealth_mode(mock_api_key): + """Test searchscraper with stealth mode disabled (default)""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/searchscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"answer": "Search results without stealth."}, + "reference_urls": ["https://example.com"], + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.searchscraper( + user_prompt="Search for information", stealth=False + ) + assert response["status"] == "completed" + + +# ============================================================================ +# MARKDOWNIFY STEALTH MODE TESTS +# ============================================================================ + + +@responses.activate +def test_markdownify_with_stealth_mode(mock_api_key): + """Test markdownify with stealth mode enabled""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/markdownify", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": "# Markdown content with stealth mode", + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.markdownify( + website_url="https://example.com", stealth=True + ) + assert response["status"] == "completed" + + +@responses.activate +def test_markdownify_without_stealth_mode(mock_api_key): + """Test markdownify with stealth mode 
disabled (default)""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/markdownify", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": "# Markdown content without stealth", + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.markdownify( + website_url="https://example.com", stealth=False + ) + assert response["status"] == "completed" + + +# ============================================================================ +# SCRAPE STEALTH MODE TESTS +# ============================================================================ + + +@responses.activate +def test_scrape_with_stealth_mode(mock_api_key): + """Test scrape with stealth mode enabled""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/scrape", + json={ + "scrape_request_id": str(uuid4()), + "status": "completed", + "html": "

Content with stealth mode

", + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.scrape(website_url="https://example.com", stealth=True) + assert response["status"] == "completed" + assert "html" in response + + +@responses.activate +def test_scrape_without_stealth_mode(mock_api_key): + """Test scrape with stealth mode disabled (default)""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/scrape", + json={ + "scrape_request_id": str(uuid4()), + "status": "completed", + "html": "

Content without stealth

", + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.scrape(website_url="https://example.com", stealth=False) + assert response["status"] == "completed" + assert "html" in response + + +@responses.activate +def test_scrape_with_stealth_and_heavy_js(mock_api_key): + """Test scrape with both stealth mode and heavy JS rendering enabled""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/scrape", + json={ + "scrape_request_id": str(uuid4()), + "status": "completed", + "html": "
JS rendered with stealth
", + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.scrape( + website_url="https://example.com", + render_heavy_js=True, + stealth=True, + ) + assert response["status"] == "completed" + assert "html" in response + + +# ============================================================================ +# AGENTIC SCRAPER STEALTH MODE TESTS +# ============================================================================ + + +@responses.activate +def test_agenticscraper_with_stealth_mode(mock_api_key): + """Test agentic scraper with stealth mode enabled""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/agentic-scrapper", + json={ + "request_id": str(uuid4()), + "status": "processing", + "message": "Agentic scraping started with stealth mode", + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.agenticscraper( + url="https://example.com", + steps=["Click on button", "Extract data"], + use_session=True, + stealth=True, + ) + assert response["status"] == "processing" + + +@responses.activate +def test_agenticscraper_without_stealth_mode(mock_api_key): + """Test agentic scraper with stealth mode disabled (default)""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/agentic-scrapper", + json={ + "request_id": str(uuid4()), + "status": "processing", + "message": "Agentic scraping started without stealth", + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.agenticscraper( + url="https://example.com", + steps=["Click on button", "Extract data"], + use_session=True, + stealth=False, + ) + assert response["status"] == "processing" + + +@responses.activate +def test_agenticscraper_with_stealth_and_ai_extraction(mock_api_key): + """Test agentic scraper with stealth mode and AI extraction enabled""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/agentic-scrapper", + json={ + "request_id": str(uuid4()), + "status": "processing", + 
"message": "Agentic scraping with AI extraction and stealth", + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.agenticscraper( + url="https://example.com", + steps=["Navigate to page", "Extract info"], + use_session=True, + user_prompt="Extract user data", + ai_extraction=True, + stealth=True, + ) + assert response["status"] == "processing" + + +# ============================================================================ +# CRAWL STEALTH MODE TESTS +# ============================================================================ + + +@responses.activate +def test_crawl_with_stealth_mode(mock_api_key): + """Test crawl with stealth mode enabled""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/crawl", + json={ + "id": str(uuid4()), + "status": "processing", + "message": "Crawl started with stealth mode", + }, + ) + + schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Schema", + "type": "object", + "properties": {"title": {"type": "string"}}, + "required": ["title"], + } + + with Client(api_key=mock_api_key) as client: + response = client.crawl( + url="https://example.com", + prompt="Extract data", + data_schema=schema, + stealth=True, + ) + assert response["status"] == "processing" + + +@responses.activate +def test_crawl_without_stealth_mode(mock_api_key): + """Test crawl with stealth mode disabled (default)""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/crawl", + json={ + "id": str(uuid4()), + "status": "processing", + "message": "Crawl started without stealth", + }, + ) + + schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Schema", + "type": "object", + "properties": {"title": {"type": "string"}}, + "required": ["title"], + } + + with Client(api_key=mock_api_key) as client: + response = client.crawl( + url="https://example.com", + prompt="Extract data", + data_schema=schema, + stealth=False, + ) + assert 
response["status"] == "processing" + + +@responses.activate +def test_crawl_with_stealth_and_sitemap(mock_api_key): + """Test crawl with stealth mode and sitemap enabled""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/crawl", + json={ + "id": str(uuid4()), + "status": "processing", + "message": "Crawl started with sitemap and stealth", + }, + ) + + schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Schema", + "type": "object", + "properties": {"title": {"type": "string"}}, + "required": ["title"], + } + + with Client(api_key=mock_api_key) as client: + response = client.crawl( + url="https://example.com", + prompt="Extract data", + data_schema=schema, + sitemap=True, + stealth=True, + ) + assert response["status"] == "processing" + + +# ============================================================================ +# COMBINED FEATURES WITH STEALTH MODE TESTS +# ============================================================================ + + +@responses.activate +def test_smartscraper_with_stealth_and_all_features(mock_api_key): + """Test smartscraper with stealth mode and all additional features""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/smartscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": { + "products": [ + {"name": "Product 1", "price": "$10"}, + {"name": "Product 2", "price": "$20"}, + ] + }, + }, + ) + + headers = { + "User-Agent": "Mozilla/5.0", + "Cookie": "session=123", + } + + with Client(api_key=mock_api_key) as client: + response = client.smartscraper( + website_url="https://example.com/products", + user_prompt="Extract products", + headers=headers, + number_of_scrolls=5, + total_pages=2, + stealth=True, + ) + assert response["status"] == "completed" + assert "products" in response["result"] + + +@responses.activate +def test_searchscraper_with_stealth_and_all_features(mock_api_key): + """Test searchscraper with stealth mode and 
all additional features""" + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/searchscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"answer": "Complete search results with stealth"}, + "reference_urls": ["https://example1.com", "https://example2.com"], + }, + ) + + headers = { + "User-Agent": "Mozilla/5.0", + } + + with Client(api_key=mock_api_key) as client: + response = client.searchscraper( + user_prompt="Search query", + headers=headers, + num_results=5, + stealth=True, + ) + assert response["status"] == "completed" + assert "answer" in response["result"]