diff --git a/.agent/README.md b/.agent/README.md
index c5ccaa2..ef55d6b 100644
--- a/.agent/README.md
+++ b/.agent/README.md
@@ -212,6 +212,7 @@ Both SDKs support the following endpoints:
 | SmartScraper | ✅ | ✅ | AI-powered data extraction |
 | SearchScraper | ✅ | ✅ | Multi-website search extraction |
 | Markdownify | ✅ | ✅ | HTML to Markdown conversion |
+| Sitemap | ❌ | ✅ | Sitemap URL extraction |
 | SmartCrawler | ✅ | ✅ | Sitemap generation & crawling |
 | AgenticScraper | ✅ | ✅ | Browser automation |
 | Scrape | ✅ | ✅ | Basic HTML extraction |
@@ -259,6 +260,7 @@ Both SDKs support the following endpoints:
   - `searchScraper.js`
   - `crawl.js`
   - `markdownify.js`
+  - `sitemap.js`
   - `agenticScraper.js`
   - `scrape.js`
   - `scheduledJobs.js`
diff --git a/scrapegraph-js/README.md b/scrapegraph-js/README.md
index bfb7c50..2d2e19e 100644
--- a/scrapegraph-js/README.md
+++ b/scrapegraph-js/README.md
@@ -451,6 +451,27 @@ const url = 'https://scrapegraphai.com/';
 })();
 ```
 
+### Sitemap
+
+Extract all URLs from a website's sitemap. Automatically discovers sitemap from robots.txt or common sitemap locations.
+
+```javascript
+import { sitemap } from 'scrapegraph-js';
+
+const apiKey = 'your-api-key';
+const websiteUrl = 'https://example.com';
+
+(async () => {
+  try {
+    const response = await sitemap(apiKey, websiteUrl);
+    console.log('Total URLs found:', response.urls.length);
+    console.log('URLs:', response.urls);
+  } catch (error) {
+    console.error('Error:', error);
+  }
+})();
+```
+
 ### Checking API Credits
 
 ```javascript
@@ -688,6 +709,21 @@ Starts a crawl job to extract structured data from a website and its linked pages.
 
 Converts a webpage into clean, well-structured markdown format.
 
+### Sitemap
+
+#### `sitemap(apiKey, websiteUrl, options)`
+
+Extracts all URLs from a website's sitemap. Automatically discovers sitemap from robots.txt or common sitemap locations.
+
+**Parameters:**
+- `apiKey` (string): Your ScrapeGraph AI API key
+- `websiteUrl` (string): The URL of the website to extract sitemap from
+- `options` (object, optional): Additional options
+  - `mock` (boolean): Override mock mode for this request
+
+**Returns:** Promise resolving to an object containing:
+- `urls` (array): List of URLs extracted from the sitemap
+
 ### Agentic Scraper
 
 #### `agenticScraper(apiKey, url, steps, useSession, userPrompt, outputSchema, aiExtraction)`
diff --git a/scrapegraph-js/examples/sitemap/README.md b/scrapegraph-js/examples/sitemap/README.md
new file mode 100644
index 0000000..472f53e
--- /dev/null
+++ b/scrapegraph-js/examples/sitemap/README.md
@@ -0,0 +1,128 @@
+# Sitemap Examples
+
+This directory contains examples demonstrating how to use the `sitemap` endpoint to extract URLs from website sitemaps.
+
+## 📁 Examples
+
+### 1. Basic Sitemap Extraction (`sitemap_example.js`)
+
+Demonstrates the basic usage of the sitemap endpoint:
+- Extract all URLs from a website's sitemap
+- Display the URLs
+- Save URLs to a text file
+- Save complete response as JSON
+
+**Usage:**
+```bash
+node sitemap_example.js
+```
+
+**What it does:**
+1. Calls the sitemap API with a target website URL
+2. Retrieves all URLs from the sitemap
+3. Displays the first 10 URLs in the console
+4. Saves all URLs to `sitemap_urls.txt`
+5. Saves the full response to `sitemap_urls.json`
+
+### 2. Advanced: Sitemap + SmartScraper (`sitemap_with_smartscraper.js`)
+
+Shows how to combine sitemap extraction with smartScraper for batch processing:
+- Extract sitemap URLs
+- Filter URLs based on patterns (e.g., blog posts)
+- Scrape selected URLs with smartScraper
+- Display results and summary
+
+**Usage:**
+```bash
+node sitemap_with_smartscraper.js
+```
+
+**What it does:**
+1. Extracts all URLs from a website's sitemap
+2. Filters URLs (example: only blog posts or specific sections)
+3. Scrapes each filtered URL using smartScraper
+4. Extracts structured data from each page
+5. Displays a summary of successful and failed scrapes
+
+**Use Cases:**
+- Bulk content extraction from blogs
+- E-commerce product catalog scraping
+- News article aggregation
+- Content migration and archival
+
+## 🔑 Setup
+
+Before running the examples, make sure you have:
+
+1. **API Key**: Set your ScrapeGraph AI API key as an environment variable:
+   ```bash
+   export SGAI_APIKEY="your-api-key-here"
+   ```
+
+   Or create a `.env` file in the project root:
+   ```
+   SGAI_APIKEY=your-api-key-here
+   ```
+
+2. **Dependencies**: Install required packages:
+   ```bash
+   npm install
+   ```
+
+## 📊 Expected Output
+
+### Basic Sitemap Example Output:
+```
+🗺️ Extracting sitemap from: https://example.com/
+⏳ Please wait...
+
+✅ Sitemap extracted successfully!
+📊 Total URLs found: 150
+
+📄 First 10 URLs:
+  1. https://example.com/
+  2. https://example.com/about
+  3. https://example.com/products
+  ...
+
+💾 URLs saved to: sitemap_urls.txt
+💾 JSON saved to: sitemap_urls.json
+```
+
+### Advanced Example Output:
+```
+🗺️ Step 1: Extracting sitemap from: https://example.com/
+⏳ Please wait...
+
+✅ Sitemap extracted successfully!
+📊 Total URLs found: 150
+
+🎯 Selected 3 URLs to scrape:
+  1. https://example.com/blog/post-1
+  2. https://example.com/blog/post-2
+  3. https://example.com/blog/post-3
+
+🤖 Step 2: Scraping selected URLs...
+
+📄 Scraping (1/3): https://example.com/blog/post-1
+  ✅ Success
+...
+
+📈 Summary:
+  ✅ Successful: 3
+  ❌ Failed: 0
+  📊 Total: 3
+```
+
+## 💡 Tips
+
+1. **Rate Limiting**: When scraping multiple URLs, add delays between requests to avoid rate limiting
+2. **Error Handling**: Always use try/catch blocks to handle API errors gracefully
+3. **Filtering**: Use URL patterns to filter specific sections (e.g., `/blog/`, `/products/`)
+4. **Batch Size**: Start with a small batch to test before processing hundreds of URLs
+
+## 🔗 Related Documentation
+
+- [Sitemap API Documentation](../../README.md#sitemap)
+- [SmartScraper Documentation](../../README.md#smart-scraper)
+- [ScrapeGraph AI API Docs](https://docs.scrapegraphai.com)
diff --git a/scrapegraph-js/examples/sitemap/sitemap_example.js b/scrapegraph-js/examples/sitemap/sitemap_example.js
new file mode 100644
index 0000000..99b84b1
--- /dev/null
+++ b/scrapegraph-js/examples/sitemap/sitemap_example.js
@@ -0,0 +1,72 @@
+import { sitemap } from 'scrapegraph-js';
+import fs from 'fs';
+import 'dotenv/config';
+
+/**
+ * Example: Extract sitemap URLs from a website
+ *
+ * This example demonstrates how to use the sitemap endpoint to extract
+ * all URLs from a website's sitemap.xml file.
+ */ + +// Get API key from environment variable +const apiKey = process.env.SGAI_APIKEY; + +// Target website URL +const url = 'https://scrapegraphai.com/'; + +console.log('🗺️ Extracting sitemap from:', url); +console.log('⏳ Please wait...\n'); + +try { + // Call the sitemap endpoint + const response = await sitemap(apiKey, url); + + console.log('✅ Sitemap extracted successfully!'); + console.log(`📊 Total URLs found: ${response.urls.length}\n`); + + // Display first 10 URLs + console.log('📄 First 10 URLs:'); + response.urls.slice(0, 10).forEach((url, index) => { + console.log(` ${index + 1}. ${url}`); + }); + + if (response.urls.length > 10) { + console.log(` ... and ${response.urls.length - 10} more URLs`); + } + + // Save the complete list to a file + saveUrlsToFile(response.urls, 'sitemap_urls.txt'); + + // Save as JSON for programmatic use + saveUrlsToJson(response, 'sitemap_urls.json'); + +} catch (error) { + console.error('❌ Error:', error.message); + process.exit(1); +} + +/** + * Helper function to save URLs to a text file + */ +function saveUrlsToFile(urls, filename) { + try { + const content = urls.join('\n'); + fs.writeFileSync(filename, content); + console.log(`\n💾 URLs saved to: ${filename}`); + } catch (err) { + console.error('❌ Error saving file:', err.message); + } +} + +/** + * Helper function to save complete response as JSON + */ +function saveUrlsToJson(response, filename) { + try { + fs.writeFileSync(filename, JSON.stringify(response, null, 2)); + console.log(`💾 JSON saved to: ${filename}`); + } catch (err) { + console.error('❌ Error saving JSON:', err.message); + } +} diff --git a/scrapegraph-js/examples/sitemap/sitemap_with_smartscraper.js b/scrapegraph-js/examples/sitemap/sitemap_with_smartscraper.js new file mode 100644 index 0000000..962128e --- /dev/null +++ b/scrapegraph-js/examples/sitemap/sitemap_with_smartscraper.js @@ -0,0 +1,106 @@ +import { sitemap, smartScraper } from 'scrapegraph-js'; +import 'dotenv/config'; + +/** + * Advanced Example: Extract sitemap and scrape selected URLs + * + * This example demonstrates how to combine the sitemap endpoint + * with smartScraper to extract structured data from multiple pages. + */ + +const apiKey = process.env.SGAI_APIKEY; + +// Configuration +const websiteUrl = 'https://scrapegraphai.com/'; +const maxPagesToScrape = 3; // Limit number of pages to scrape +const userPrompt = 'Extract the page title and main heading'; + +console.log('🗺️ Step 1: Extracting sitemap from:', websiteUrl); +console.log('⏳ Please wait...\n'); + +try { + // Step 1: Get all URLs from sitemap + const sitemapResponse = await sitemap(apiKey, websiteUrl); + + console.log('✅ Sitemap extracted successfully!'); + console.log(`📊 Total URLs found: ${sitemapResponse.urls.length}\n`); + + // Step 2: Filter URLs (example: only blog posts) + const filteredUrls = sitemapResponse.urls + .filter(url => url.includes('/blog/') || url.includes('/post/')) + .slice(0, maxPagesToScrape); + + if (filteredUrls.length === 0) { + console.log('ℹ️ No blog URLs found, using first 3 URLs instead'); + filteredUrls.push(...sitemapResponse.urls.slice(0, maxPagesToScrape)); + } + + console.log(`🎯 Selected ${filteredUrls.length} URLs to scrape:`); + filteredUrls.forEach((url, index) => { + console.log(` ${index + 1}. 
${url}`);
+  });
+
+  // Step 3: Scrape each selected URL
+  console.log('\n🤖 Step 2: Scraping selected URLs...\n');
+
+  const results = [];
+
+  for (let i = 0; i < filteredUrls.length; i++) {
+    const url = filteredUrls[i];
+    console.log(`📄 Scraping (${i + 1}/${filteredUrls.length}): ${url}`);
+
+    try {
+      const scrapeResponse = await smartScraper(
+        apiKey,
+        url,
+        userPrompt
+      );
+
+      results.push({
+        url: url,
+        data: scrapeResponse.result,
+        status: 'success'
+      });
+
+      console.log('  ✅ Success');
+
+      // Add a small delay between requests to avoid rate limiting
+      if (i < filteredUrls.length - 1) {
+        await new Promise(resolve => setTimeout(resolve, 1000));
+      }
+
+    } catch (error) {
+      console.log(`  ❌ Failed: ${error.message}`);
+      results.push({
+        url: url,
+        error: error.message,
+        status: 'failed'
+      });
+    }
+  }
+
+  // Step 4: Display results
+  console.log('\n📊 Scraping Results:\n');
+  results.forEach((result, index) => {
+    console.log(`${index + 1}. ${result.url}`);
+    if (result.status === 'success') {
+      console.log('   Status: ✅ Success');
+      console.log('   Data:', JSON.stringify(result.data, null, 2));
+    } else {
+      console.log('   Status: ❌ Failed');
+      console.log('   Error:', result.error);
+    }
+    console.log('');
+  });
+
+  // Summary
+  const successCount = results.filter(r => r.status === 'success').length;
+  console.log('📈 Summary:');
+  console.log(`  ✅ Successful: ${successCount}`);
+  console.log(`  ❌ Failed: ${results.length - successCount}`);
+  console.log(`  📊 Total: ${results.length}`);
+
+} catch (error) {
+  console.error('❌ Error:', error.message);
+  process.exit(1);
+}
diff --git a/scrapegraph-js/index.js b/scrapegraph-js/index.js
index 303b0c5..f49ca2e 100644
--- a/scrapegraph-js/index.js
+++ b/scrapegraph-js/index.js
@@ -7,17 +7,18 @@ export { getCredits } from './src/credits.js';
 export { sendFeedback } from './src/feedback.js';
 export { crawl, getCrawlRequest } from './src/crawl.js';
 export { generateSchema, getSchemaStatus, pollSchemaGeneration } from './src/schema.js';
-export {
-  createScheduledJob,
-  getScheduledJobs,
-  getScheduledJob,
-  updateScheduledJob,
-  replaceScheduledJob,
-  deleteScheduledJob,
-  pauseScheduledJob,
-  resumeScheduledJob,
-  triggerScheduledJob,
-  getJobExecutions
+export { sitemap } from './src/sitemap.js';
+export {
+  createScheduledJob,
+  getScheduledJobs,
+  getScheduledJob,
+  updateScheduledJob,
+  replaceScheduledJob,
+  deleteScheduledJob,
+  pauseScheduledJob,
+  resumeScheduledJob,
+  triggerScheduledJob,
+  getJobExecutions
 } from './src/scheduledJobs.js';
 
 // Mock utilities
diff --git a/scrapegraph-js/src/schema.js b/scrapegraph-js/src/schema.js
index a814cc8..d245328 100644
--- a/scrapegraph-js/src/schema.js
+++ b/scrapegraph-js/src/schema.js
@@ -2,7 +2,7 @@
  * Schema generation functionality for ScrapeGraph JavaScript SDK
  */
 
-import { handleError } from './utils/handleError.js';
+import handleError from './utils/handleError.js';
 
 /**
  * Generate a JSON schema from a user prompt
diff --git a/scrapegraph-js/src/sitemap.js b/scrapegraph-js/src/sitemap.js
new file mode 100644
index 0000000..41afb93
--- /dev/null
+++ b/scrapegraph-js/src/sitemap.js
@@ -0,0 +1,68 @@
+import axios from 'axios';
+import handleError from './utils/handleError.js';
+import { isMockEnabled, getMockConfig } from './utils/mockConfig.js';
+import { getMockResponse } from './utils/mockResponse.js';
+
+/**
+ * Extract all URLs from a website's sitemap.
+ * Automatically discovers sitemap from robots.txt or common sitemap locations.
+ *
+ * @param {string} apiKey - Your ScrapeGraph AI API key.
+ * @param {string} websiteUrl - The URL of the website to extract sitemap from.
+ * @param {Object} options - Optional configuration options.
+ * @param {boolean} options.mock - Override mock mode for this request.
+ * @returns {Promise} A promise that resolves to an object containing:
+ *   - urls: Array of URLs extracted from the sitemap
+ * @throws {Error} Throws an error if the HTTP request fails.
+ *
+ * @example
+ * // Basic usage:
+ * const apiKey = 'your-api-key';
+ * const websiteUrl = 'https://example.com';
+ *
+ * try {
+ *   const result = await sitemap(apiKey, websiteUrl);
+ *   console.log('Sitemap URLs:', result.urls);
+ *   console.log('Total URLs found:', result.urls.length);
+ * } catch (error) {
+ *   console.error('Error:', error);
+ * }
+ *
+ * @example
+ * // Processing sitemap URLs:
+ * const result = await sitemap(apiKey, 'https://example.com');
+ * result.urls.forEach(url => {
+ *   console.log('Found URL:', url);
+ * });
+ */
+export async function sitemap(apiKey, websiteUrl, options = {}) {
+  const { mock = null } = options;
+
+  // Check if mock mode is enabled
+  const useMock = mock !== null ? mock : isMockEnabled();
+
+  if (useMock) {
+    console.log('🧪 Mock mode active. Returning stub for sitemap request');
+    const mockConfig = getMockConfig();
+    const mockData = getMockResponse('POST', 'https://api.scrapegraphai.com/v1/sitemap', mockConfig.customResponses, mockConfig.customHandler);
+    return mockData;
+  }
+
+  const endpoint = 'https://api.scrapegraphai.com/v1/sitemap';
+  const headers = {
+    'accept': 'application/json',
+    'SGAI-APIKEY': apiKey,
+    'Content-Type': 'application/json',
+  };
+
+  const payload = {
+    website_url: websiteUrl,
+  };
+
+  try {
+    const response = await axios.post(endpoint, payload, { headers });
+    return response.data;
+  } catch (error) {
+    handleError(error);
+  }
+}
diff --git a/scrapegraph-js/test/sitemap_test.js b/scrapegraph-js/test/sitemap_test.js
new file mode 100644
index 0000000..3aa64a9
--- /dev/null
+++ b/scrapegraph-js/test/sitemap_test.js
@@ -0,0 +1,371 @@
+import { sitemap } from '../index.js';
+import 'dotenv/config';
+
+/**
+ * Test suite for Sitemap functionality
+ * This file demonstrates usage and validates the Sitemap parameters
+ */
+
+// Mock API key for testing (replace with real key for actual testing)
+const API_KEY = process.env.SGAI_APIKEY || 'test-api-key';
+
+/**
+ * Test input validation for sitemap
+ */
+function testInputValidation() {
+  console.log('🧪 Testing Sitemap Input Validation');
+  console.log('='.repeat(50));
+
+  const testCases = [
+    {
+      name: 'Valid inputs - basic',
+      apiKey: 'valid-key',
+      websiteUrl: 'https://example.com',
+      options: {},
+      expected: true,
+      description: 'All valid parameters with default options'
+    },
+    {
+      name: 'Valid inputs - subdomain',
+      apiKey: 'valid-key',
+      websiteUrl: 'https://blog.example.com',
+      options: {},
+      expected: true,
+      description: 'Valid subdomain URL'
+    },
+    {
+      name: 'Valid inputs - with path',
+      apiKey: 'valid-key',
+      websiteUrl: 'https://example.com/section',
+      options: {},
+      expected: true,
+      description: 'URL with path component'
+    },
+    {
+      name: 'Invalid URL - no protocol',
+      apiKey: 'valid-key',
+      websiteUrl: 'example.com',
+      options: {},
+      expected: false,
+      description: 'URL without http/https protocol'
+    },
+    {
+      name: 'Invalid URL - relative path',
+      apiKey: 'valid-key',
+      websiteUrl: '/path/to/page',
+      options: {},
+      expected: false,
+      description: 'Relative path instead of absolute URL'
+    },
+    {
+      name: 'Invalid URL - empty string',
+      apiKey: 'valid-key',
+ websiteUrl: '', + options: {}, + expected: false, + description: 'Empty URL string' + }, + { + name: 'Invalid URL - null', + apiKey: 'valid-key', + websiteUrl: null, + options: {}, + expected: false, + description: 'Null URL' + }, + { + name: 'Empty API key', + apiKey: '', + websiteUrl: 'https://example.com', + options: {}, + expected: false, + description: 'Empty API key string' + }, + { + name: 'Invalid API key type', + apiKey: 123, + websiteUrl: 'https://example.com', + options: {}, + expected: false, + description: 'API key as number instead of string' + } + ]; + + let passed = 0; + let total = testCases.length; + + testCases.forEach((testCase, index) => { + console.log(`\n${index + 1}. ${testCase.name}`); + console.log(` Description: ${testCase.description}`); + + try { + // Validate inputs + const isValid = validateSitemapInputs( + testCase.apiKey, + testCase.websiteUrl, + testCase.options + ); + + if (isValid === testCase.expected) { + console.log(` ✅ PASSED`); + passed++; + } else { + console.log(` ❌ FAILED - Expected: ${testCase.expected}, Got: ${isValid}`); + } + } catch (error) { + if (!testCase.expected) { + console.log(` ✅ PASSED (Expected error: ${error.message})`); + passed++; + } else { + console.log(` ❌ FAILED - Unexpected error: ${error.message}`); + } + } + }); + + console.log(`\n📊 Input Validation Results: ${passed}/${total} tests passed`); + return passed === total; +} + +/** + * Validate sitemap function inputs + */ +function validateSitemapInputs(apiKey, websiteUrl, options) { + // Check API key + if (!apiKey || typeof apiKey !== 'string' || apiKey.trim() === '') { + throw new Error('Invalid API key'); + } + + // Check URL + if (!websiteUrl || typeof websiteUrl !== 'string' || websiteUrl.trim() === '') { + throw new Error('Invalid URL'); + } + + // Check URL format + if (!websiteUrl.startsWith('http://') && !websiteUrl.startsWith('https://')) { + throw new Error('URL must start with http:// or https://'); + } + + // Check options + if (options && typeof options !== 'object') { + throw new Error('Options must be an object'); + } + + return true; +} + +/** + * Test sitemap function with mock data + */ +async function testSitemapFunction() { + console.log('\n🧪 Testing Sitemap Function (Mock)'); + console.log('='.repeat(50)); + + try { + // Mock the sitemap function to avoid actual API calls during testing + const mockSitemap = async (apiKey, websiteUrl, options = {}) => { + // Simulate API delay + await new Promise(resolve => setTimeout(resolve, 100)); + + // Return mock response + return { + urls: [ + 'https://example.com/', + 'https://example.com/about', + 'https://example.com/products', + 'https://example.com/contact', + 'https://example.com/blog/post-1', + 'https://example.com/blog/post-2' + ] + }; + }; + + console.log('1. Testing basic sitemap call...'); + const result1 = await mockSitemap(API_KEY, 'https://example.com'); + console.log(` ✅ URLs found: ${result1.urls.length}`); + console.log(` ✅ First URL: ${result1.urls[0]}`); + + console.log('\n2. Testing sitemap for subdomain...'); + const result2 = await mockSitemap(API_KEY, 'https://blog.example.com'); + console.log(` ✅ URLs found: ${result2.urls.length}`); + + console.log('\n3. 
Testing sitemap for URL with path...'); + const result3 = await mockSitemap(API_KEY, 'https://example.com/section'); + console.log(` ✅ URLs found: ${result3.urls.length}`); + + console.log('\n✅ All sitemap function tests passed'); + return true; + + } catch (error) { + console.error(`❌ Sitemap function test failed: ${error.message}`); + return false; + } +} + +/** + * Test error handling + */ +function testErrorHandling() { + console.log('\n🧪 Testing Error Handling'); + console.log('='.repeat(50)); + + let passed = 0; + let total = 0; + + // Test 1: Invalid API key + total++; + try { + validateSitemapInputs('', 'https://example.com', {}); + console.log('1. Empty API key test: ❌ FAILED (should have thrown error)'); + } catch (error) { + console.log('1. Empty API key test: ✅ PASSED'); + passed++; + } + + // Test 2: Invalid URL + total++; + try { + validateSitemapInputs('valid-key', 'invalid-url', {}); + console.log('2. Invalid URL test: ❌ FAILED (should have thrown error)'); + } catch (error) { + console.log('2. Invalid URL test: ✅ PASSED'); + passed++; + } + + // Test 3: Invalid options + total++; + try { + validateSitemapInputs('valid-key', 'https://example.com', 'invalid-options'); + console.log('3. Invalid options test: ❌ FAILED (should have thrown error)'); + } catch (error) { + console.log('3. Invalid options test: ✅ PASSED'); + passed++; + } + + console.log(`\n📊 Error Handling Results: ${passed}/${total} tests passed`); + return passed === total; +} + +/** + * Test URL validation + */ +function testUrlValidation() { + console.log('\n🧪 Testing URL Validation'); + console.log('='.repeat(50)); + + const testUrls = [ + { url: 'https://example.com', expected: true, description: 'HTTPS URL' }, + { url: 'http://example.com', expected: true, description: 'HTTP URL' }, + { url: 'https://sub.example.com', expected: true, description: 'Subdomain HTTPS' }, + { url: 'https://example.com/path', expected: true, description: 'HTTPS with path' }, + { url: 'https://example.com?param=value', expected: true, description: 'HTTPS with query params' }, + { url: 'https://example.com#fragment', expected: true, description: 'HTTPS with fragment' }, + { url: 'example.com', expected: false, description: 'No protocol' }, + { url: '/path/to/page', expected: false, description: 'Relative path' }, + { url: 'ftp://example.com', expected: false, description: 'FTP protocol' }, + { url: '', expected: false, description: 'Empty string' }, + { url: null, expected: false, description: 'Null value' }, + { url: undefined, expected: false, description: 'Undefined value' } + ]; + + let passed = 0; + let total = testUrls.length; + + testUrls.forEach((testCase, index) => { + console.log(`${index + 1}. 
${testCase.description}: ${testCase.url}`); + + try { + if (testCase.url) { + const isValid = testCase.url.startsWith('http://') || testCase.url.startsWith('https://'); + if (isValid === testCase.expected) { + console.log(` ✅ PASSED`); + passed++; + } else { + console.log(` ❌ FAILED - Expected: ${testCase.expected}, Got: ${isValid}`); + } + } else { + if (!testCase.expected) { + console.log(` ✅ PASSED`); + passed++; + } else { + console.log(` ❌ FAILED - Expected: ${testCase.expected}, Got: false`); + } + } + } catch (error) { + if (!testCase.expected) { + console.log(` ✅ PASSED (Expected error)`); + passed++; + } else { + console.log(` ❌ FAILED - Unexpected error: ${error.message}`); + } + } + }); + + console.log(`\n📊 URL Validation Results: ${passed}/${total} tests passed`); + return passed === total; +} + +/** + * Run all tests + */ +async function runAllTests() { + console.log('🚀 Starting Sitemap Test Suite'); + console.log('='.repeat(60)); + console.log(`🔑 API Key: ${API_KEY.substring(0, 8)}...`); + console.log(`⏰ Timestamp: ${new Date().toISOString()}\n`); + + const tests = [ + { name: 'Input Validation', fn: testInputValidation }, + { name: 'Sitemap Function', fn: testSitemapFunction }, + { name: 'Error Handling', fn: testErrorHandling }, + { name: 'URL Validation', fn: testUrlValidation } + ]; + + let passed = 0; + let total = tests.length; + + for (const test of tests) { + try { + const result = await test.fn(); + if (result) { + passed++; + } + } catch (error) { + console.error(`❌ Test '${test.name}' failed with error: ${error.message}`); + } + console.log('\n' + '-'.repeat(60)); + } + + console.log('\n🎯 FINAL TEST RESULTS'); + console.log('='.repeat(30)); + console.log(`✅ Passed: ${passed}`); + console.log(`❌ Failed: ${total - passed}`); + console.log(`📊 Success Rate: ${((passed / total) * 100).toFixed(1)}%`); + + if (passed === total) { + console.log('\n🎉 All tests passed! Sitemap functionality is working correctly.'); + return 0; + } else { + console.log('\n⚠️ Some tests failed. Please review the output above.'); + return 1; + } +} + +// Run tests if this file is executed directly +if (import.meta.url === `file://${process.argv[1]}`) { + runAllTests() + .then(exitCode => { + process.exit(exitCode); + }) + .catch(error => { + console.error('💥 Fatal error during test execution:', error.message); + process.exit(1); + }); +} + +export { + testInputValidation, + testSitemapFunction, + testErrorHandling, + testUrlValidation, + runAllTests +}; diff --git a/scrapegraph-py/examples/sitemap/async/async_sitemap_example.py b/scrapegraph-py/examples/sitemap/async/async_sitemap_example.py new file mode 100644 index 0000000..f9d986e --- /dev/null +++ b/scrapegraph-py/examples/sitemap/async/async_sitemap_example.py @@ -0,0 +1,276 @@ +""" +Asynchronous example demonstrating how to use the Sitemap API. + +This example shows: +1. How to extract URLs from a website's sitemap asynchronously +2. How to process multiple sitemaps concurrently +3. 
How to combine sitemap with async smartscraper operations + +The Sitemap API automatically discovers the sitemap from: +- robots.txt file +- Common locations like /sitemap.xml +- Sitemap index files + +Requirements: +- Python 3.10+ +- scrapegraph-py +- python-dotenv +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import asyncio +from pathlib import Path +from dotenv import load_dotenv + +from scrapegraph_py import AsyncClient + +# Load environment variables from .env file +load_dotenv() + + +async def basic_sitemap_example(): + """Demonstrate basic async sitemap extraction.""" + print("🗺️ Basic Async Sitemap Example") + print("=" * 40) + + async with AsyncClient.from_env() as client: + try: + # Extract sitemap URLs + print("Extracting sitemap from https://scrapegraphai.com...") + response = await client.sitemap(website_url="https://scrapegraphai.com") + + # Display results + print(f"✅ Success! Found {len(response.urls)} URLs\n") + + # Show first 10 URLs + print("First 10 URLs:") + for i, url in enumerate(response.urls[:10], 1): + print(f" {i}. {url}") + + if len(response.urls) > 10: + print(f" ... and {len(response.urls) - 10} more URLs") + + return response + + except Exception as e: + print(f"❌ Error: {str(e)}") + return None + + +async def save_urls_to_file(urls: list[str], filename: str): + """Save sitemap URLs to a text file asynchronously.""" + output_dir = Path("sitemap_output") + output_dir.mkdir(exist_ok=True) + + file_path = output_dir / f"{filename}.txt" + + # Use asyncio to write file asynchronously + loop = asyncio.get_event_loop() + await loop.run_in_executor( + None, + lambda: file_path.write_text("\n".join(urls), encoding="utf-8") + ) + + print(f"💾 URLs saved to: {file_path}") + return file_path + + +async def concurrent_sitemaps_example(): + """Demonstrate extracting multiple sitemaps concurrently.""" + print("\n⚡ Concurrent Sitemaps Example") + print("=" * 40) + + websites = [ + "https://scrapegraphai.com", + "https://example.com", + "https://python.org" + ] + + async with AsyncClient.from_env() as client: + try: + print(f"Extracting sitemaps from {len(websites)} websites concurrently...") + + # Create tasks for concurrent execution + tasks = [ + client.sitemap(website_url=url) + for url in websites + ] + + # Execute all tasks concurrently + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + successful = 0 + for url, result in zip(websites, results): + if isinstance(result, Exception): + print(f"❌ {url}: {str(result)}") + else: + print(f"✅ {url}: {len(result.urls)} URLs") + successful += 1 + + print(f"\n📊 Summary: {successful}/{len(websites)} successful") + + return [r for r in results if not isinstance(r, Exception)] + + except Exception as e: + print(f"❌ Error: {str(e)}") + return None + + +async def filter_and_scrape_example(): + """Demonstrate filtering sitemap URLs and scraping them asynchronously.""" + print("\n🤖 Filter + Async Scrape Example") + print("=" * 40) + + async with AsyncClient.from_env() as client: + try: + # Extract sitemap + print("Step 1: Extracting sitemap...") + response = await client.sitemap(website_url="https://scrapegraphai.com") + + # Filter for specific URLs + target_urls = [url for url in response.urls if '/blog/' in url][:3] + + if not target_urls: + target_urls = response.urls[:3] + + print(f"✅ Found {len(response.urls)} URLs") + print(f"🎯 Selected {len(target_urls)} URLs to scrape\n") + + # Create scraping tasks + print("Step 2: Scraping URLs 
concurrently...") + + async def scrape_url(url): + """Scrape a single URL.""" + try: + result = await client.smartscraper( + website_url=url, + user_prompt="Extract the page title and main heading" + ) + return { + 'url': url, + 'data': result.get('result'), + 'status': 'success' + } + except Exception as e: + return { + 'url': url, + 'error': str(e), + 'status': 'failed' + } + + # Execute scraping tasks concurrently + tasks = [scrape_url(url) for url in target_urls] + results = await asyncio.gather(*tasks) + + # Display results + successful = sum(1 for r in results if r['status'] == 'success') + print(f"\n📊 Summary:") + print(f" ✅ Successful: {successful}/{len(results)}") + print(f" ❌ Failed: {len(results) - successful}/{len(results)}") + + # Show sample results + print("\nSample results:") + for i, result in enumerate(results[:3], 1): + print(f"\n {i}. {result['url']}") + if result['status'] == 'success': + print(f" Data: {result['data']}") + else: + print(f" Error: {result['error']}") + + return results + + except Exception as e: + print(f"❌ Error: {str(e)}") + return None + + +async def batch_process_with_rate_limit(): + """Demonstrate batch processing with rate limiting.""" + print("\n⏱️ Batch Processing with Rate Limit") + print("=" * 40) + + async with AsyncClient.from_env() as client: + try: + # Extract sitemap + print("Extracting sitemap...") + response = await client.sitemap(website_url="https://scrapegraphai.com") + + # Get URLs to process + urls_to_process = response.urls[:10] + print(f"Processing {len(urls_to_process)} URLs with rate limiting...") + + # Process in batches to avoid overwhelming the API + batch_size = 3 + results = [] + + for i in range(0, len(urls_to_process), batch_size): + batch = urls_to_process[i:i + batch_size] + print(f"\nProcessing batch {i // batch_size + 1}...") + + # Process batch + batch_tasks = [ + client.smartscraper( + website_url=url, + user_prompt="Extract title" + ) + for url in batch + ] + + batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True) + results.extend(batch_results) + + # Rate limiting: wait between batches + if i + batch_size < len(urls_to_process): + print("Waiting 2 seconds before next batch...") + await asyncio.sleep(2) + + successful = sum(1 for r in results if not isinstance(r, Exception)) + print(f"\n✅ Processed {successful}/{len(results)} URLs successfully") + + return results + + except Exception as e: + print(f"❌ Error: {str(e)}") + return None + + +async def main(): + """Main function demonstrating async sitemap functionality.""" + print("🚀 Async Sitemap API Examples") + print("=" * 40) + + try: + # Basic sitemap extraction + response = await basic_sitemap_example() + + if response and response.urls: + # Save URLs to file + await save_urls_to_file(response.urls, "async_scrapegraphai_sitemap") + + # Concurrent sitemaps + await concurrent_sitemaps_example() + + # Filter and scrape + await filter_and_scrape_example() + + # Batch processing with rate limit + await batch_process_with_rate_limit() + + print("\n🎯 All examples completed!") + + except Exception as e: + print(f"❌ Unexpected error: {str(e)}") + + print("\n📚 Next steps:") + print("• Experiment with different websites") + print("• Adjust batch sizes for your use case") + print("• Combine with other async operations") + print("• Implement custom error handling and retry logic") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scrapegraph-py/examples/sitemap/sync/sitemap_example.py 
b/scrapegraph-py/examples/sitemap/sync/sitemap_example.py new file mode 100644 index 0000000..ea963f6 --- /dev/null +++ b/scrapegraph-py/examples/sitemap/sync/sitemap_example.py @@ -0,0 +1,251 @@ +""" +Basic synchronous example demonstrating how to use the Sitemap API. + +This example shows: +1. How to extract URLs from a website's sitemap +2. How to save sitemap URLs to a file +3. How to combine sitemap with other scraping operations + +The Sitemap API automatically discovers the sitemap from: +- robots.txt file +- Common locations like /sitemap.xml +- Sitemap index files + +Equivalent curl command: +curl -X POST https://api.scrapegraphai.com/v1/sitemap \ + -H "Content-Type: application/json" \ + -H "SGAI-APIKEY: your-api-key-here" \ + -d '{ + "website_url": "https://example.com" + }' + +Requirements: +- Python 3.10+ +- scrapegraph-py +- python-dotenv +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +from pathlib import Path +from dotenv import load_dotenv + +from scrapegraph_py import Client + +# Load environment variables from .env file +load_dotenv() + + +def basic_sitemap_example(): + """Demonstrate basic sitemap extraction.""" + print("🗺️ Basic Sitemap Example") + print("=" * 40) + + # Initialize client + client = Client.from_env() + + try: + # Extract sitemap URLs + print("Extracting sitemap from https://scrapegraphai.com...") + response = client.sitemap(website_url="https://scrapegraphai.com") + + # Display results + print(f"✅ Success! Found {len(response.urls)} URLs\n") + + # Show first 10 URLs + print("First 10 URLs:") + for i, url in enumerate(response.urls[:10], 1): + print(f" {i}. {url}") + + if len(response.urls) > 10: + print(f" ... and {len(response.urls) - 10} more URLs") + + return response + + except Exception as e: + print(f"❌ Error: {str(e)}") + return None + finally: + client.close() + + +def save_urls_to_file(urls: list[str], filename: str): + """Save sitemap URLs to a text file.""" + output_dir = Path("sitemap_output") + output_dir.mkdir(exist_ok=True) + + file_path = output_dir / f"{filename}.txt" + with open(file_path, "w", encoding="utf-8") as f: + for url in urls: + f.write(url + "\n") + + print(f"💾 URLs saved to: {file_path}") + return file_path + + +def filter_urls_example(): + """Demonstrate filtering sitemap URLs by pattern.""" + print("\n🔍 Filtering URLs Example") + print("=" * 40) + + client = Client.from_env() + + try: + # Extract sitemap + print("Extracting sitemap...") + response = client.sitemap(website_url="https://scrapegraphai.com") + + # Filter URLs containing specific patterns + blog_urls = [url for url in response.urls if '/blog/' in url] + doc_urls = [url for url in response.urls if '/docs/' in url or '/documentation/' in url] + + print(f"✅ Total URLs: {len(response.urls)}") + print(f"📝 Blog URLs: {len(blog_urls)}") + print(f"📚 Documentation URLs: {len(doc_urls)}") + + # Show sample blog URLs + if blog_urls: + print("\nSample blog URLs:") + for url in blog_urls[:5]: + print(f" • {url}") + + return { + 'all_urls': response.urls, + 'blog_urls': blog_urls, + 'doc_urls': doc_urls + } + + except Exception as e: + print(f"❌ Error: {str(e)}") + return None + finally: + client.close() + + +def combine_with_smartscraper(): + """Demonstrate combining sitemap with smartscraper.""" + print("\n🤖 Sitemap + SmartScraper Example") + print("=" * 40) + + client = Client.from_env() + + try: + # First, get sitemap URLs + print("Step 1: Extracting sitemap...") + sitemap_response = 
client.sitemap(website_url="https://scrapegraphai.com") + + # Filter for specific pages (e.g., blog posts) + target_urls = [url for url in sitemap_response.urls if '/blog/' in url][:3] + + if not target_urls: + # If no blog URLs, use first 3 URLs + target_urls = sitemap_response.urls[:3] + + print(f"✅ Found {len(sitemap_response.urls)} URLs") + print(f"🎯 Selected {len(target_urls)} URLs to scrape\n") + + # Scrape selected URLs + print("Step 2: Scraping selected URLs...") + results = [] + + for i, url in enumerate(target_urls, 1): + print(f" Scraping ({i}/{len(target_urls)}): {url}") + + try: + # Use smartscraper to extract data + scrape_result = client.smartscraper( + website_url=url, + user_prompt="Extract the page title and main heading" + ) + + results.append({ + 'url': url, + 'data': scrape_result.get('result'), + 'status': 'success' + }) + print(f" ✅ Success") + + except Exception as e: + results.append({ + 'url': url, + 'error': str(e), + 'status': 'failed' + }) + print(f" ❌ Failed: {str(e)}") + + # Summary + successful = sum(1 for r in results if r['status'] == 'success') + print(f"\n📊 Summary:") + print(f" ✅ Successful: {successful}/{len(results)}") + print(f" ❌ Failed: {len(results) - successful}/{len(results)}") + + return results + + except Exception as e: + print(f"❌ Error: {str(e)}") + return None + finally: + client.close() + + +def demonstrate_curl_equivalent(): + """Show the equivalent curl command.""" + print("\n🌐 Equivalent curl command:") + print("=" * 40) + + print("curl -X POST https://api.scrapegraphai.com/v1/sitemap \\") + print(" -H \"Content-Type: application/json\" \\") + print(" -H \"SGAI-APIKEY: your-api-key-here\" \\") + print(" -d '{") + print(" \"website_url\": \"https://scrapegraphai.com\"") + print(" }'") + + +def main(): + """Main function demonstrating sitemap functionality.""" + print("🚀 Sitemap API Examples") + print("=" * 40) + + # Show curl equivalent first + demonstrate_curl_equivalent() + + try: + # Run examples + print("\n" + "=" * 40 + "\n") + + # Basic sitemap extraction + response = basic_sitemap_example() + + if response and response.urls: + # Save URLs to file + save_urls_to_file(response.urls, "scrapegraphai_sitemap") + + # Filter URLs by pattern + filtered = filter_urls_example() + + if filtered: + # Save filtered URLs + if filtered['blog_urls']: + save_urls_to_file(filtered['blog_urls'], "blog_urls") + if filtered['doc_urls']: + save_urls_to_file(filtered['doc_urls'], "doc_urls") + + # Advanced: Combine with smartscraper + combine_with_smartscraper() + + print("\n🎯 All examples completed!") + + except Exception as e: + print(f"❌ Unexpected error: {str(e)}") + + print("\n📚 Next steps:") + print("• Try the curl command in your terminal") + print("• Experiment with different websites") + print("• Combine sitemap with other scraping operations") + print("• Filter URLs based on your specific needs") + + +if __name__ == "__main__": + main() diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index ca51965..c33a6b0 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -27,6 +27,7 @@ GetSearchScraperRequest, SearchScraperRequest, ) +from scrapegraph_py.models.sitemap import SitemapRequest, SitemapResponse from scrapegraph_py.models.smartscraper import ( GetSmartScraperRequest, SmartScraperRequest, @@ -435,6 +436,49 @@ async def get_scrape(self, request_id: str): logger.info(f"✨ Successfully retrieved result for request {request_id}") 
         return result
 
+    async def sitemap(
+        self,
+        website_url: str,
+        mock: bool = False,
+    ) -> SitemapResponse:
+        """Extract all URLs from a website's sitemap.
+
+        Automatically discovers sitemap from robots.txt or common sitemap locations.
+
+        Args:
+            website_url: The URL of the website to extract sitemap from
+            mock: Whether to use mock mode for this request
+
+        Returns:
+            SitemapResponse: Object containing list of URLs extracted from sitemap
+
+        Raises:
+            ValueError: If website_url is invalid
+            APIError: If the API request fails
+
+        Examples:
+            >>> async with AsyncClient(api_key="your-api-key") as client:
+            ...     response = await client.sitemap("https://example.com")
+            ...     print(f"Found {len(response.urls)} URLs")
+            ...     for url in response.urls[:5]:
+            ...         print(url)
+        """
+        logger.info(f"🗺️ Starting sitemap extraction for {website_url}")
+
+        request = SitemapRequest(
+            website_url=website_url,
+            mock=mock
+        )
+        logger.debug("✅ Request validation passed")
+
+        result = await self._make_request(
+            "POST", f"{API_BASE_URL}/sitemap", json=request.model_dump()
+        )
+        logger.info(f"✨ Sitemap extraction completed successfully - found {len(result.get('urls', []))} URLs")
+
+        # Parse response into SitemapResponse model
+        return SitemapResponse(**result)
+
     async def smartscraper(
         self,
         user_prompt: str,
diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py
index 353df03..30f9318 100644
--- a/scrapegraph-py/scrapegraph_py/client.py
+++ b/scrapegraph-py/scrapegraph_py/client.py
@@ -28,6 +28,7 @@
     GetSearchScraperRequest,
     SearchScraperRequest,
 )
+from scrapegraph_py.models.sitemap import SitemapRequest, SitemapResponse
 from scrapegraph_py.models.smartscraper import (
     GetSmartScraperRequest,
     SmartScraperRequest,
@@ -455,6 +456,49 @@
         logger.info(f"✨ Successfully retrieved result for request {request_id}")
         return result
 
+    def sitemap(
+        self,
+        website_url: str,
+        mock: bool = False,
+    ) -> SitemapResponse:
+        """Extract all URLs from a website's sitemap.
+
+        Automatically discovers sitemap from robots.txt or common sitemap locations.
+
+        Args:
+            website_url: The URL of the website to extract sitemap from
+            mock: Whether to use mock mode for this request
+
+        Returns:
+            SitemapResponse: Object containing list of URLs extracted from sitemap
+
+        Raises:
+            ValueError: If website_url is invalid
+            APIError: If the API request fails
+
+        Examples:
+            >>> client = Client(api_key="your-api-key")
+            >>> response = client.sitemap("https://example.com")
+            >>> print(f"Found {len(response.urls)} URLs")
+            >>> for url in response.urls[:5]:
+            ...     print(url)
+        """
+        logger.info(f"🗺️ Starting sitemap extraction for {website_url}")
+
+        request = SitemapRequest(
+            website_url=website_url,
+            mock=mock
+        )
+        logger.debug("✅ Request validation passed")
+
+        result = self._make_request(
+            "POST", f"{API_BASE_URL}/sitemap", json=request.model_dump()
+        )
+        logger.info(f"✨ Sitemap extraction completed successfully - found {len(result.get('urls', []))} URLs")
+
+        # Parse response into SitemapResponse model
+        return SitemapResponse(**result)
+
     def smartscraper(
         self,
         user_prompt: str,
diff --git a/scrapegraph-py/scrapegraph_py/models/__init__.py b/scrapegraph-py/scrapegraph_py/models/__init__.py
index 2f8f810..e2b57ce 100644
--- a/scrapegraph-py/scrapegraph_py/models/__init__.py
+++ b/scrapegraph-py/scrapegraph_py/models/__init__.py
@@ -4,6 +4,7 @@
 from .scrape import GetScrapeRequest, ScrapeRequest
 from .markdownify import GetMarkdownifyRequest, MarkdownifyRequest
 from .searchscraper import GetSearchScraperRequest, SearchScraperRequest
+from .sitemap import SitemapRequest, SitemapResponse
 from .smartscraper import GetSmartScraperRequest, SmartScraperRequest
 from .schema import GenerateSchemaRequest, GetSchemaStatusRequest, SchemaGenerationResponse
 
@@ -19,6 +20,8 @@
     "MarkdownifyRequest",
     "GetSearchScraperRequest",
     "SearchScraperRequest",
+    "SitemapRequest",
+    "SitemapResponse",
     "GetSmartScraperRequest",
     "SmartScraperRequest",
     "GenerateSchemaRequest",
diff --git a/scrapegraph-py/scrapegraph_py/models/sitemap.py b/scrapegraph-py/scrapegraph_py/models/sitemap.py
new file mode 100644
index 0000000..4095cbb
--- /dev/null
+++ b/scrapegraph-py/scrapegraph_py/models/sitemap.py
@@ -0,0 +1,192 @@
+"""Models for sitemap endpoint"""
+
+from typing import Optional
+
+from pydantic import BaseModel, Field, model_validator
+
+
+class SitemapRequest(BaseModel):
+    """Request model for sitemap endpoint.
+
+    Extracts all URLs from a website's sitemap. Automatically discovers sitemap
+    from robots.txt or common sitemap locations like /sitemap.xml and sitemap
+    index files.
+
+    The sitemap endpoint is useful for:
+    - Discovering all pages on a website
+    - Building comprehensive crawling lists
+    - SEO audits and analysis
+    - Content inventory management
+
+    Attributes:
+        website_url (str): The base URL of the website to extract sitemap from.
+            Must start with http:// or https://. The API will automatically
+            discover the sitemap location.
+        mock (bool): Whether to use mock mode for the request. When True, returns
+            stubbed responses without making actual API calls. Defaults to False.
+
+    Raises:
+        ValueError: If website_url is empty, None, or doesn't start with
+            http:// or https://.
+
+    Examples:
+        Basic usage::
+
+            >>> request = SitemapRequest(website_url="https://example.com")
+            >>> print(request.website_url)
+            https://example.com
+
+        With mock mode::
+
+            >>> request = SitemapRequest(
+            ...     website_url="https://example.com",
+            ...     mock=True
+            ... )
+            >>> print(request.mock)
+            True
+
+    The API automatically discovers sitemaps from:
+    - robots.txt directives (Sitemap: https://example.com/sitemap.xml)
+    - Common locations (/sitemap.xml, /sitemap_index.xml)
+    - Sitemap index files with nested sitemaps
+
+    Note:
+        The website_url should be the base domain URL. The API will handle
+        sitemap discovery automatically.
+ """ + + website_url: str = Field( + ..., + example="https://scrapegraphai.com/", + description="The URL of the website to extract sitemap from" + ) + mock: bool = Field( + default=False, + description="Whether to use mock mode for the request" + ) + + @model_validator(mode="after") + def validate_url(self) -> "SitemapRequest": + """Validate the website URL. + + Ensures the URL is not empty and uses http:// or https:// protocol. + + Returns: + SitemapRequest: The validated instance. + + Raises: + ValueError: If URL is empty or uses invalid protocol. + """ + if self.website_url is None or not self.website_url.strip(): + raise ValueError("Website URL cannot be empty") + if not ( + self.website_url.startswith("http://") + or self.website_url.startswith("https://") + ): + raise ValueError("URL must start with http:// or https://") + return self + + def model_dump(self, *args, **kwargs) -> dict: + """Serialize the model to a dictionary. + + Automatically excludes None values from the serialized output to + produce cleaner JSON payloads for the API. + + Args: + *args: Positional arguments passed to parent model_dump. + **kwargs: Keyword arguments passed to parent model_dump. + If 'exclude_none' is not specified, it defaults to True. + + Returns: + dict: Dictionary representation of the model with None values excluded. + + Examples: + >>> request = SitemapRequest(website_url="https://example.com") + >>> data = request.model_dump() + >>> print(data) + {'website_url': 'https://example.com', 'mock': False} + """ + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) + + +class SitemapResponse(BaseModel): + """Response model for sitemap endpoint. + + Contains the complete list of URLs extracted from the website's sitemap. + The URLs are returned in the order they appear in the sitemap, which + typically reflects the website's intended structure and priority. + + This response is useful for: + - Building comprehensive URL lists for crawling + - Identifying content structure and organization + - Discovering all public pages on a website + - Planning content migration or archival + + Attributes: + urls (list[str]): Complete list of URLs extracted from the sitemap. + Each URL is a fully-qualified absolute URL string. The list may + be empty if no sitemap is found or if the sitemap contains no URLs. + URLs are deduplicated and ordered as they appear in the sitemap. + + Examples: + Basic usage:: + + >>> response = SitemapResponse(urls=[ + ... "https://example.com/", + ... "https://example.com/about" + ... ]) + >>> print(f"Found {len(response.urls)} URLs") + Found 2 URLs + + Iterating over URLs:: + + >>> response = SitemapResponse(urls=[ + ... "https://example.com/", + ... "https://example.com/products", + ... "https://example.com/contact" + ... ]) + >>> for url in response.urls: + ... print(url) + https://example.com/ + https://example.com/products + https://example.com/contact + + Filtering URLs:: + + >>> response = SitemapResponse(urls=[ + ... "https://example.com/", + ... "https://example.com/blog/post-1", + ... "https://example.com/blog/post-2", + ... "https://example.com/products" + ... ]) + >>> blog_urls = [url for url in response.urls if '/blog/' in url] + >>> print(f"Found {len(blog_urls)} blog posts") + Found 2 blog posts + + Empty sitemap:: + + >>> response = SitemapResponse(urls=[]) + >>> if not response.urls: + ... 
print("No URLs found in sitemap") + No URLs found in sitemap + + Note: + The urls list may contain various types of pages including: + - Homepage and main sections + - Blog posts and articles + - Product pages + - Category and tag pages + - Media files (images, PDFs) if included in sitemap + """ + + urls: list[str] = Field( + ..., + description="List of URLs extracted from the sitemap", + example=[ + "https://example.com/", + "https://example.com/about", + "https://example.com/products", + "https://example.com/contact" + ] + ) diff --git a/scrapegraph-py/tests/test_sitemap_models.py b/scrapegraph-py/tests/test_sitemap_models.py new file mode 100644 index 0000000..bda25b0 --- /dev/null +++ b/scrapegraph-py/tests/test_sitemap_models.py @@ -0,0 +1,210 @@ +"""Tests for Sitemap models""" + +import pytest +from pydantic import ValidationError + +from scrapegraph_py.models.sitemap import SitemapRequest, SitemapResponse + + +class TestSitemapRequest: + """Test SitemapRequest model""" + + def test_valid_sitemap_request(self): + """Test valid sitemap request""" + request = SitemapRequest(website_url="https://example.com") + assert request.website_url == "https://example.com" + assert request.mock is False + + def test_valid_sitemap_request_with_mock(self): + """Test valid sitemap request with mock mode""" + request = SitemapRequest( + website_url="https://example.com", + mock=True + ) + assert request.website_url == "https://example.com" + assert request.mock is True + + def test_valid_sitemap_request_https(self): + """Test valid sitemap request with HTTPS URL""" + request = SitemapRequest(website_url="https://secure.example.com") + assert request.website_url == "https://secure.example.com" + + def test_valid_sitemap_request_http(self): + """Test valid sitemap request with HTTP URL""" + request = SitemapRequest(website_url="http://example.com") + assert request.website_url == "http://example.com" + + def test_valid_sitemap_request_with_path(self): + """Test valid sitemap request with URL containing path""" + request = SitemapRequest(website_url="https://example.com/section") + assert request.website_url == "https://example.com/section" + + def test_valid_sitemap_request_subdomain(self): + """Test valid sitemap request with subdomain""" + request = SitemapRequest(website_url="https://blog.example.com") + assert request.website_url == "https://blog.example.com" + + def test_invalid_empty_url(self): + """Test sitemap request with empty URL""" + with pytest.raises(ValidationError) as exc_info: + SitemapRequest(website_url="") + assert "Website URL cannot be empty" in str(exc_info.value) + + def test_invalid_none_url(self): + """Test sitemap request with None URL""" + with pytest.raises(ValidationError): + SitemapRequest(website_url=None) + + def test_invalid_whitespace_url(self): + """Test sitemap request with whitespace-only URL""" + with pytest.raises(ValidationError) as exc_info: + SitemapRequest(website_url=" ") + assert "Website URL cannot be empty" in str(exc_info.value) + + def test_invalid_protocol_url(self): + """Test sitemap request with invalid protocol""" + with pytest.raises(ValidationError) as exc_info: + SitemapRequest(website_url="ftp://example.com") + assert "URL must start with http:// or https://" in str(exc_info.value) + + def test_invalid_no_protocol_url(self): + """Test sitemap request with no protocol""" + with pytest.raises(ValidationError) as exc_info: + SitemapRequest(website_url="example.com") + assert "URL must start with http:// or https://" in str(exc_info.value) + + def 
test_invalid_relative_url(self): + """Test sitemap request with relative URL""" + with pytest.raises(ValidationError) as exc_info: + SitemapRequest(website_url="/path/to/page") + assert "URL must start with http:// or https://" in str(exc_info.value) + + def test_serialization(self): + """Test sitemap request serialization""" + request = SitemapRequest(website_url="https://example.com") + data = request.model_dump() + assert data["website_url"] == "https://example.com" + # mock defaults to False and should be excluded by exclude_none + assert data.get("mock") is False + + def test_serialization_with_mock(self): + """Test sitemap request serialization with mock mode""" + request = SitemapRequest( + website_url="https://example.com", + mock=True + ) + data = request.model_dump() + assert data["website_url"] == "https://example.com" + assert data["mock"] is True + + def test_url_validation_edge_cases(self): + """Test URL validation edge cases""" + valid_urls = [ + "https://example.com", + "http://example.com", + "https://sub.example.com", + "https://example.com:8080", + "https://example.com/path", + "https://example.com/path?param=value", + "https://example.com/path#fragment", + "https://blog.example.com/posts/2024/01/article" + ] + + for url in valid_urls: + request = SitemapRequest(website_url=url) + assert request.website_url == url + + invalid_urls = [ + "ftp://example.com", + "gopher://example.com", + "example.com", + "/relative/path", + "file:///path/to/file", + "www.example.com" + ] + + for url in invalid_urls: + with pytest.raises(ValidationError): + SitemapRequest(website_url=url) + + +class TestSitemapResponse: + """Test SitemapResponse model""" + + def test_valid_sitemap_response(self): + """Test valid sitemap response""" + urls = [ + "https://example.com/", + "https://example.com/about", + "https://example.com/products" + ] + response = SitemapResponse(urls=urls) + assert response.urls == urls + assert len(response.urls) == 3 + + def test_valid_sitemap_response_empty(self): + """Test valid sitemap response with empty list""" + response = SitemapResponse(urls=[]) + assert response.urls == [] + assert len(response.urls) == 0 + + def test_valid_sitemap_response_single_url(self): + """Test valid sitemap response with single URL""" + urls = ["https://example.com/"] + response = SitemapResponse(urls=urls) + assert response.urls == urls + assert len(response.urls) == 1 + + def test_valid_sitemap_response_many_urls(self): + """Test valid sitemap response with many URLs""" + urls = [f"https://example.com/page{i}" for i in range(100)] + response = SitemapResponse(urls=urls) + assert len(response.urls) == 100 + assert response.urls[0] == "https://example.com/page0" + assert response.urls[-1] == "https://example.com/page99" + + def test_invalid_none_urls(self): + """Test sitemap response with None URLs""" + with pytest.raises(ValidationError): + SitemapResponse(urls=None) + + def test_invalid_missing_urls(self): + """Test sitemap response with missing URLs field""" + with pytest.raises(ValidationError): + SitemapResponse() + + def test_serialization(self): + """Test sitemap response serialization""" + urls = [ + "https://example.com/", + "https://example.com/about" + ] + response = SitemapResponse(urls=urls) + data = response.model_dump() + assert data["urls"] == urls + assert isinstance(data["urls"], list) + + def test_urls_immutability(self): + """Test that URLs list is properly stored""" + original_urls = [ + "https://example.com/", + "https://example.com/about" + ] + response = 
SitemapResponse(urls=original_urls) + + # Verify the response has the correct URLs + assert response.urls == original_urls + + def test_various_url_formats(self): + """Test sitemap response with various URL formats""" + urls = [ + "https://example.com/", + "https://blog.example.com/post-1", + "https://example.com/path/to/page", + "https://example.com:8080/api", + "https://example.com/page?param=value", + "https://example.com/page#section" + ] + response = SitemapResponse(urls=urls) + assert response.urls == urls + assert len(response.urls) == 6