diff --git a/.env.example b/.env.example
index 6f18cb2..44326a9 100644
--- a/.env.example
+++ b/.env.example
@@ -1,12 +1,9 @@
+# ScrapeGraphAI API Key (required for scrapegraph-py SDK)
+SGAI_API_KEY=your-scrapegraphai-api-key-here
+
 # Elasticsearch Configuration
 ELASTICSEARCH_HOST=localhost
 ELASTICSEARCH_PORT=9200
 ELASTICSEARCH_SCHEME=http
-ELASTICSEARCH_USERNAME=elastic
-ELASTICSEARCH_PASSWORD=changeme
-
-# ScrapeGraphAI Configuration
-SCRAPEGRAPHAI_API_KEY=your_api_key_here
-
-# Optional: OpenAI API Key for LLM functionality
-OPENAI_API_KEY=your_openai_api_key_here
+# ELASTICSEARCH_USERNAME=
+# ELASTICSEARCH_PASSWORD=
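For orientation before the README and code changes below, here is a minimal sketch of how the renamed variables get consumed at startup. It assumes `python-dotenv` for loading `.env` (the demo's own `Config.from_env()` reads the same names with plain `os.getenv()`); the `Client(api_key=...)` call is the one the scraper makes.

```python
import os

from dotenv import load_dotenv  # assumption: python-dotenv is installed
from scrapegraph_py import Client

load_dotenv()  # pulls SGAI_API_KEY and the ELASTICSEARCH_* values out of .env

api_key = os.getenv("SGAI_API_KEY")
if not api_key:
    # Mirrors the scraper's behavior: a missing key means mock data, not a crash
    print("SGAI_API_KEY not set; the demo will fall back to mock data")
else:
    client = Client(api_key=api_key)  # the same call MarketplaceScraper makes
```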
diff --git a/README.md b/README.md
index 3ec90e3..43f3765 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,25 @@
 # ScrapeGraphAI Elasticsearch Demo
 
-A comprehensive demo project showcasing the integration of **ScrapeGraphAI SDK** with **Elasticsearch** for intelligent marketplace product scraping, storage, and comparison.
+A comprehensive demo project showcasing the integration of the **ScrapeGraphAI API (via the scrapegraph-py SDK)** with **Elasticsearch** for intelligent marketplace product scraping, storage, and comparison.
+
+> **Note**: This demo uses the `scrapegraph-py` SDK, which provides API-based scraping through ScrapeGraphAI's cloud service. This means simpler setup, no local LLM requirements, and managed infrastructure.
 
 ## 🚀 Features
 
-- **Web Scraping with ScrapeGraphAI**: Leverage AI-powered scraping to extract structured product data from marketplace websites
+- **Web Scraping with the ScrapeGraphAI API**: Leverage cloud-based AI scraping to extract structured product data from marketplace websites
+- **Simple SDK Integration**: Use the `scrapegraph-py` SDK for easy API-based scraping
 - **Elasticsearch Integration**: Store and index product data for powerful search and analytics
 - **Multi-Marketplace Support**: Scrape and compare products across different marketplaces (Amazon, eBay, etc.)
 - **Product Comparison**: Advanced features to compare products by price, ratings, and specifications
 - **Flexible Search**: Full-text search with filters for marketplace, price range, and more
 - **Data Analytics**: Aggregations and statistics on product data
+- **No Local LLM Setup**: All AI processing happens in the cloud - just use your API key
 
 ## 📋 Prerequisites
 
 - Python 3.8 or higher
 - Docker and Docker Compose (for Elasticsearch)
-- OpenAI API key (optional, for AI-powered scraping)
+- ScrapeGraphAI API key (get one at [scrapegraphai.com](https://scrapegraphai.com))
 
 ## 🔧 Installation
 
@@ -48,11 +52,16 @@ pip install -r requirements.txt
 # Copy the example environment file
 cp .env.example .env
 
-# Edit .env and add your configuration
-# At minimum, you need to set:
-# - SCRAPEGRAPHAI_API_KEY or OPENAI_API_KEY
+# Edit .env and add your ScrapeGraphAI API key
+# Required: SGAI_API_KEY=your-api-key-here
 ```
 
+**Getting your API key:**
+1. Visit [scrapegraphai.com](https://scrapegraphai.com)
+2. Sign up or log in to your account
+3. Navigate to your API settings
+4. Copy your API key and add it to `.env` as `SGAI_API_KEY`
+
 ### 4. Start Elasticsearch
 
 ```bash
@@ -117,14 +126,14 @@ This demonstrates:
 ```python
 from src.scrapegraph_demo import Config, ElasticsearchClient, MarketplaceScraper
 
-# Load configuration
+# Load configuration (reads SGAI_API_KEY from the environment)
 config = Config.from_env()
 
 # Initialize clients
 es_client = ElasticsearchClient(config)
 scraper = MarketplaceScraper(config)
 
-# Scrape a product
+# Scrape a product using the SDK
 product = scraper.scrape_product(
     url="https://www.amazon.com/dp/PRODUCTID",
     marketplace="Amazon"
@@ -144,12 +153,16 @@ results = es_client.search_products(
 # Print results
 for product in results:
     print(f"{product.name} - ${product.price}")
+
+# Clean up
+scraper.close()
+es_client.close()
 ```
 
 ### Scraping Search Results
 
 ```python
-# Scrape multiple products from a search
+# Scrape multiple products from a search using the SDK
 products = scraper.scrape_search_results(
     search_query="wireless mouse",
     marketplace="Amazon",
@@ -159,6 +172,9 @@ products = scraper.scrape_search_results(
 # Bulk index
 success, failed = es_client.index_products(products)
 print(f"Indexed {success} products")
+
+# Don't forget to close the scraper
+scraper.close()
 ```
 
 ### Product Comparison
@@ -213,11 +229,12 @@ Manages all Elasticsearch operations:
 
 ### MarketplaceScraper
 
-Handles web scraping using ScrapeGraphAI:
-- Scrape individual product pages
-- Scrape search results
+Handles web scraping using the ScrapeGraphAI API (via the scrapegraph-py SDK):
+- Scrape individual product pages using cloud-based AI
+- Scrape search results with structured data extraction
 - Extract structured data (price, rating, specs, etc.)
 - Support for multiple marketplaces
+- Automatic fallback to mock data if the API is unavailable
 
 ### Product Model
 
@@ -234,15 +251,14 @@ Pydantic model representing a marketplace product:
 
 | Variable | Description | Required | Default |
 |----------|-------------|----------|---------|
+| `SGAI_API_KEY` | ScrapeGraphAI API key | Yes* | - |
 | `ELASTICSEARCH_HOST` | Elasticsearch host | No | `localhost` |
 | `ELASTICSEARCH_PORT` | Elasticsearch port | No | `9200` |
 | `ELASTICSEARCH_SCHEME` | HTTP or HTTPS | No | `http` |
 | `ELASTICSEARCH_USERNAME` | Elasticsearch username | No | - |
 | `ELASTICSEARCH_PASSWORD` | Elasticsearch password | No | - |
-| `SCRAPEGRAPHAI_API_KEY` | ScrapeGraphAI API key | Yes* | - |
-| `OPENAI_API_KEY` | OpenAI API key | Yes* | - |
 
-*Either `SCRAPEGRAPHAI_API_KEY` or `OPENAI_API_KEY` is required for AI-powered scraping.
+*`SGAI_API_KEY` is required for API-based scraping. Without it, the demo falls back to mock data for testing.
 
 ## 📊 Elasticsearch Index
 
@@ -278,15 +294,23 @@ Use Kibana to:
 
 ## 🧪 Testing
 
-The project includes mock data functionality for testing without actual web scraping:
+Run the test suite:
+
+```bash
+python run_tests.py
+```
+
+The project also includes mock data functionality for testing without spending API credits:
 
 ```python
-# The scraper automatically falls back to mock data if ScrapeGraphAI is unavailable
+# The scraper automatically falls back to mock data if the API key is not set
scraper = MarketplaceScraper(config)
 products = scraper.scrape_search_results("laptop", "Amazon", max_results=5)
 # Returns mock products for testing
 ```
 
+All tests use mock data and don't require an API key.
+
 ## 🤝 Contributing
 
 Contributions are welcome! Please feel free to submit a Pull Request.
 
@@ -297,9 +321,11 @@ This project is provided as-is for demonstration purposes.
 
 ## 🔗 Related Resources
 
-- [ScrapeGraphAI Documentation](https://scrapegraphai.com/docs)
+- [ScrapeGraphAI Website](https://scrapegraphai.com) - Get your API key
+- [ScrapeGraphAI SDK Documentation](https://github.com/ScrapeGraphAI/scrapegraph-sdk) - scrapegraph-py SDK reference
+- [ScrapeGraphAI API Documentation](https://scrapegraphai.com/docs) - API reference
 - [Elasticsearch Documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html)
-- [ScrapeGraphAI GitHub](https://github.com/ScrapeGraphAI/Scrapegraph-ai)
+- [ScrapeGraphAI Open Source](https://github.com/ScrapeGraphAI/Scrapegraph-ai) - The original open-source library
 
 ## 💡 Use Cases
 
@@ -313,6 +339,21 @@ This demo can be adapted for various use cases:
 
 ## 🐛 Troubleshooting
 
+### ScrapeGraphAI API Issues
+
+```bash
+# Verify your API key is set
+echo $SGAI_API_KEY
+
+# Test that the SDK is installed
+python -c "from scrapegraph_py import Client; print('SDK installed correctly')"
+```
+
+**Common issues:**
+- **"SGAI_API_KEY not set"**: Make sure you've added your API key to `.env`
+- **API credits exhausted**: Check your account at scrapegraphai.com
+- **Connection timeout**: Check your internet connection
+
 ### Elasticsearch Connection Issues
 
 ```bash
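The README examples above and the example scripts below all finish with explicit `scraper.close()` / `es_client.close()` calls. A `try`/`finally` variant of the same pattern guarantees cleanup even when scraping raises; a sketch using the demo's own classes:

```python
from src.scrapegraph_demo import Config, ElasticsearchClient, MarketplaceScraper

config = Config.from_env()
es_client = ElasticsearchClient(config)
scraper = MarketplaceScraper(config)

try:
    product = scraper.scrape_product(
        url="https://www.amazon.com/dp/PRODUCTID",
        marketplace="Amazon",
    )
    if product:
        es_client.index_products([product])  # the bulk helper from the README example
finally:
    # Runs even if scraping or indexing raises, so connections never leak
    scraper.close()
    es_client.close()
```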
diff --git a/examples/advanced_search.py b/examples/advanced_search.py
index 34c2665..2f59c00 100644
--- a/examples/advanced_search.py
+++ b/examples/advanced_search.py
@@ -124,6 +124,7 @@ def main():
         print("Product not found")
 
     # Clean up
+    scraper.close()
     es_client.close()
 
     print("\n\n=== Advanced search demo completed! ===")
diff --git a/examples/basic_usage.py b/examples/basic_usage.py
index d7caefe..4b29478 100644
--- a/examples/basic_usage.py
+++ b/examples/basic_usage.py
@@ -85,6 +85,7 @@ def main():
 
     # Clean up
     print("\n9. Closing connections...")
+    scraper.close()
     es_client.close()
 
     print("\n=== Demo completed successfully! ===")
diff --git a/examples/product_comparison.py b/examples/product_comparison.py
index d291b76..b41f026 100644
--- a/examples/product_comparison.py
+++ b/examples/product_comparison.py
@@ -119,6 +119,7 @@ def main():
         print(f"    Availability: {product.availability}")
 
     # Clean up
+    scraper.close()
     es_client.close()
 
     print("\n" + "=" * 60)
diff --git a/quickstart.py b/quickstart.py
index b52c3c8..9465fa0 100644
--- a/quickstart.py
+++ b/quickstart.py
@@ -75,7 +75,11 @@ def main():
     print_step(3, "Initializing Marketplace Scraper")
     scraper = MarketplaceScraper(config)
     print("✓ Scraper initialized")
-    print("  Using mock data for demonstration")
+    if not config.sgai_api_key:
+        print("  Note: SGAI_API_KEY not set, using mock data for demonstration")
+        print("  To use real API scraping, set SGAI_API_KEY in your .env file")
+    else:
+        print("  Using the ScrapeGraphAI SDK for scraping")
 
     wait_for_user()
 
     # Step 4: Scrape Products
@@ -220,6 +224,9 @@ def main():
     print("   - python examples/advanced_search.py")
     print()
 
+    # Clean up connections
+    scraper.close()
+
     if es_connected:
         print("   5. Access Kibana at http://localhost:5601 for data visualization")
         print()
diff --git a/requirements.txt b/requirements.txt
index 4659888..18ac372 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-# ScrapeGraphAI SDK
-scrapegraphai>=1.0.0
+# ScrapeGraphAI SDK (API-based)
+scrapegraph-py>=1.0.0
 
 # Elasticsearch
 elasticsearch>=8.0.0
diff --git a/src/scrapegraph_demo/config.py b/src/scrapegraph_demo/config.py
index 9371e8c..e74fe2b 100644
--- a/src/scrapegraph_demo/config.py
+++ b/src/scrapegraph_demo/config.py
@@ -19,11 +19,8 @@ class Config:
     elasticsearch_username: Optional[str]
     elasticsearch_password: Optional[str]
 
-    # ScrapeGraphAI settings
-    scrapegraphai_api_key: Optional[str]
-
-    # OpenAI settings (optional)
-    openai_api_key: Optional[str]
+    # ScrapeGraphAI SDK settings
+    sgai_api_key: Optional[str]
 
     @classmethod
     def from_env(cls) -> "Config":
@@ -36,8 +33,7 @@ def from_env(cls) -> "Config":
         elasticsearch_scheme=os.getenv("ELASTICSEARCH_SCHEME", "http"),
         elasticsearch_username=os.getenv("ELASTICSEARCH_USERNAME"),
         elasticsearch_password=os.getenv("ELASTICSEARCH_PASSWORD"),
-        scrapegraphai_api_key=os.getenv("SCRAPEGRAPHAI_API_KEY"),
-        openai_api_key=os.getenv("OPENAI_API_KEY"),
+        sgai_api_key=os.getenv("SGAI_API_KEY"),
     )
 
     @property
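A short sketch of the slimmed-down `Config` in use: `elasticsearch_url` is the derived property exercised in `tests/test_config.py`, and a missing `sgai_api_key` is what triggers the scraper's mock-data path.

```python
from src.scrapegraph_demo import Config

config = Config.from_env()

print(config.elasticsearch_url)  # e.g. "http://localhost:9200"
if config.sgai_api_key is None:
    print("No SGAI_API_KEY: the scraper will fall back to mock data")
```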
diff --git a/src/scrapegraph_demo/scraper.py b/src/scrapegraph_demo/scraper.py
index 776c640..7920ee5 100644
--- a/src/scrapegraph_demo/scraper.py
+++ b/src/scrapegraph_demo/scraper.py
@@ -7,35 +7,41 @@ from datetime import datetime
 
 try:
-    from scrapegraphai.graphs import SmartScraperGraph
+    from scrapegraph_py import Client
+    SCRAPEGRAPH_AVAILABLE = True
 except ImportError:
-    # Fallback if scrapegraphai is not installed
-    SmartScraperGraph = None
+    # Fallback if scrapegraph-py is not installed
+    Client = None
+    SCRAPEGRAPH_AVAILABLE = False
 
 from .config import Config
 from .models import Product
 
 
 class MarketplaceScraper:
-    """Scraper for marketplace product data using ScrapeGraphAI"""
+    """Scraper for marketplace product data using the ScrapeGraphAI SDK"""
 
     def __init__(self, config: Config):
-        """Initialize the scraper"""
+        """Initialize the scraper with an SDK client"""
         self.config = config
 
-        # Configure graph settings
-        self.graph_config = {
-            "llm": {
-                "api_key": config.openai_api_key or config.scrapegraphai_api_key,
-                "model": "gpt-3.5-turbo",
-            },
-            "verbose": True,
-            "headless": True,
-        }
+        # Initialize the SDK client
+        if SCRAPEGRAPH_AVAILABLE and config.sgai_api_key:
+            try:
+                self.client = Client(api_key=config.sgai_api_key)
+            except Exception as e:
+                print(f"Warning: Could not initialize ScrapeGraph SDK client: {e}")
+                self.client = None
+        else:
+            self.client = None
+            if not SCRAPEGRAPH_AVAILABLE:
+                print("Warning: scrapegraph-py SDK not available, using mock data")
+            elif not config.sgai_api_key:
+                print("Warning: SGAI_API_KEY not set, using mock data")
 
     def scrape_product(self, url: str, marketplace: str) -> Optional[Product]:
         """
-        Scrape a single product from a marketplace URL
+        Scrape a single product from a marketplace URL using the SDK
 
         Args:
             url: Product URL to scrape
@@ -44,45 +50,68 @@ def scrape_product(self, url: str, marketplace: str) -> Optional[Product]:
         Returns:
             Product object or None if scraping fails
         """
-        if SmartScraperGraph is None:
-            print("Warning: ScrapeGraphAI not available, using mock data")
+        if self.client is None:
+            print("Warning: ScrapeGraph SDK client not available, using mock data")
             return self._mock_scrape_product(url, marketplace)
 
         try:
             # Define the prompt for extracting product information
             prompt = """
-            Extract the following product information:
-            - Product name
-            - Price (numeric value only)
-            - Currency
-            - Product ID or SKU
-            - Description
-            - Brand
-            - Category
-            - Rating (if available)
-            - Review count (if available)
-            - Availability status
-            - Image URL
-            - Any key specifications
+            Extract the following product information from this page:
+            - name: Product name
+            - price: Price (numeric value only, without currency symbol)
+            - currency: Currency code (e.g., USD, EUR)
+            - product_id: Product ID or SKU
+            - description: Product description
+            - brand: Product brand
+            - category: Product category
+            - rating: Rating out of 5 (if available)
+            - review_count: Number of reviews (if available)
+            - availability: Availability status (e.g., "In Stock")
+            - image_url: Main product image URL
+            - specifications: Any key specifications as a dictionary
             """
 
-            # Create the scraper graph
-            smart_scraper = SmartScraperGraph(
-                prompt=prompt,
-                source=url,
-                config=self.graph_config
-            )
+            # Define output schema for structured extraction
+            output_schema = {
+                "name": "string",
+                "price": "float",
+                "currency": "string",
+                "product_id": "string",
+                "description": "string",
+                "brand": "string",
+                "category": "string",
+                "rating": "float",
+                "review_count": "integer",
+                "availability": "string",
+                "image_url": "string",
+                "specifications": "object"
+            }
 
-            # Run the scraper
-            result = smart_scraper.run()
+            # Use the SDK's smartscraper method
+            response = self.client.smartscraper(
+                website_url=url,
+                user_prompt=prompt,
+                output_schema=output_schema
+            )
 
-            # Parse and structure the result
-            product = self._parse_scraped_data(result, url, marketplace)
-            return product
+            # Parse the SDK response with validation
+            if response and isinstance(response, dict) and 'result' in response:
+                result_data = response['result']
+                if isinstance(result_data, dict):
+                    product = self._parse_scraped_data(result_data, url, marketplace)
+                    return product
+                else:
+                    print(f"Warning: Invalid result structure in SDK response for {url}")
+                    return self._mock_scrape_product(url, marketplace)
+            else:
+                print(f"Warning: No valid response from SDK for {url}")
+                return self._mock_scrape_product(url, marketplace)
 
         except Exception as e:
             print(f"Error scraping {url}: {str(e)}")
-            return None
+            print("Falling back to mock data")
+            return self._mock_scrape_product(url, marketplace)
 
     def scrape_search_results(
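The validation chain added above decides between parsing the payload and falling back to mock data. A standalone reproduction of the same rules for illustration; the `classify` helper is hypothetical and not part of the PR:

```python
# Sketch: the response shapes the new validation in scrape_product accepts.
def classify(response):
    if response and isinstance(response, dict) and 'result' in response:
        if isinstance(response['result'], dict):
            return "parse result"          # -> _parse_scraped_data(...)
        return "mock (bad result type)"    # 'result' present but not a dict
    return "mock (no result)"              # empty, wrong type, or missing key

assert classify({'result': {'name': 'Widget', 'price': 9.99}}) == "parse result"
assert classify({'result': ['not', 'a', 'dict']}) == "mock (bad result type)"
assert classify({'request_id': 'abc123'}) == "mock (no result)"
assert classify(None) == "mock (no result)"
```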
@@ -91,7 +120,7 @@ def scrape_search_results(
         self,
         search_query: str,
         marketplace: str,
         max_results: int = 10
     ) -> List[Product]:
         """
-        Scrape multiple products from search results
+        Scrape multiple products from search results using the SDK
 
         Args:
             search_query: Search query to use
@@ -101,23 +130,104 @@ def scrape_search_results(
         Returns:
             List of Product objects
         """
-        # This is a simplified implementation
-        # In a real scenario, you would:
-        # 1. Construct a search URL for the marketplace
-        # 2. Scrape the search results page to get product URLs
-        # 3. Scrape each individual product page
-
-        print(f"Scraping search results for '{search_query}' on {marketplace}")
-        print("Note: This is a simplified implementation using mock data")
+        if self.client is None:
+            print("Warning: ScrapeGraph SDK client not available, using mock data")
+            print(f"Scraping search results for '{search_query}' on {marketplace}")
+            products = []
+            for i in range(min(max_results, 5)):
+                mock_url = f"https://{marketplace.lower()}.com/product/{i}"
+                product = self._mock_scrape_product(mock_url, marketplace, search_query, i)
+                if product:
+                    products.append(product)
+            return products
 
-        # Mock implementation - in production, you would scrape actual search results
+        try:
+            # Construct search URL (simplified - would need marketplace-specific logic)
+            search_url = f"https://www.{marketplace.lower()}.com/s?k={search_query.replace(' ', '+')}"
+
+            # Define prompt for extracting multiple products
+            prompt = f"""
+            Extract information for up to {max_results} products from the search results.
+            For each product, extract:
+            - name: Product name
+            - price: Price (numeric value only)
+            - currency: Currency code
+            - product_url: Full product URL
+            - rating: Rating out of 5 (if available)
+            - review_count: Number of reviews (if available)
+            Return as a list of products.
+            """
+
+            output_schema = {
+                "products": [
+                    {
+                        "name": "string",
+                        "price": "float",
+                        "currency": "string",
+                        "product_url": "string",
+                        "rating": "float",
+                        "review_count": "integer"
+                    }
+                ]
+            }
+
+            # Use the SDK's smartscraper for search results
+            response = self.client.smartscraper(
+                website_url=search_url,
+                user_prompt=prompt,
+                output_schema=output_schema
+            )
+
+            # Parse the response with validation
+            products = []
+            if response and isinstance(response, dict) and 'result' in response:
+                result_data = response.get('result', {})
+                if isinstance(result_data, dict) and 'products' in result_data:
+                    products_list = result_data.get('products', [])
+                    if isinstance(products_list, list):
+                        for i, product_data in enumerate(products_list[:max_results]):
+                            if not isinstance(product_data, dict):
+                                continue
+                            product_url = product_data.get('product_url', f"https://{marketplace.lower()}.com/product/{i}")
+
+                            # Create Product object from search result data
+                            product = Product(
+                                product_id=self._extract_product_id(product_url),
+                                name=product_data.get('name', f"{search_query} Product {i+1}"),
+                                price=self._extract_price(str(product_data.get('price', 0))),
+                                currency=product_data.get('currency', 'USD'),
+                                url=product_url,
+                                marketplace=marketplace,
+                                rating=product_data.get('rating'),
+                                review_count=product_data.get('review_count'),
+                                scraped_at=datetime.utcnow()
+                            )
+                            products.append(product)
+
+                        return products
+
+            # If we get here, no valid results were found
+            print("Warning: No valid search results from SDK, using mock data")
+            return self._get_mock_search_results(search_query, marketplace, max_results)
+
+        except Exception as e:
+            print(f"Error scraping search results: {str(e)}")
+            print("Falling back to mock data")
+            return self._get_mock_search_results(search_query, marketplace, max_results)
+
+    def _get_mock_search_results(
+        self,
+        search_query: str,
+        marketplace: str,
+        max_results: int
+    ) -> List[Product]:
+        """Helper to get mock search results"""
         products = []
         for i in range(min(max_results, 5)):
             mock_url = f"https://{marketplace.lower()}.com/product/{i}"
             product = self._mock_scrape_product(mock_url, marketplace, search_query, i)
             if product:
                 products.append(product)
-
         return products
 
     def _parse_scraped_data(
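The hunk above hard-codes an Amazon-style `/s?k=` pattern for every marketplace, and its own comment flags that marketplace-specific logic is needed. One possible shape for that is a lookup table, sketched below; the eBay and Walmart templates are illustrative assumptions rather than part of the PR, and `quote_plus` also handles characters that the current `replace(' ', '+')` misses.

```python
# Hypothetical helper: per-marketplace search URL templates.
from urllib.parse import quote_plus

SEARCH_URL_TEMPLATES = {
    "amazon": "https://www.amazon.com/s?k={query}",
    "ebay": "https://www.ebay.com/sch/i.html?_nkw={query}",      # assumed pattern
    "walmart": "https://www.walmart.com/search?q={query}",       # assumed pattern
}

def build_search_url(marketplace: str, search_query: str) -> str:
    template = SEARCH_URL_TEMPLATES.get(
        marketplace.lower(),
        "https://www.{marketplace}.com/s?k={query}",  # the hunk's current fallback shape
    )
    return template.format(
        marketplace=marketplace.lower(),
        query=quote_plus(search_query),
    )

print(build_search_url("eBay", "wireless mouse"))
# https://www.ebay.com/sch/i.html?_nkw=wireless+mouse
```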
@@ -238,3 +348,16 @@ def _mock_scrape_product(
             },
             scraped_at=datetime.utcnow()
         )
+
+    def close(self):
+        """Close the SDK client connection"""
+        if self.client is not None:
+            try:
+                self.client.close()
+            except (AttributeError, RuntimeError) as e:
+                # AttributeError: if the close method doesn't exist
+                # RuntimeError: if the connection is already closed or other runtime errors
+                print(f"Warning: Error closing SDK client ({type(e).__name__}): {e}")
+            except Exception as e:
+                # Catch any other unexpected exceptions but log the type
+                print(f"Warning: Unexpected error closing SDK client ({type(e).__name__}): {e}")
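With `close()` defined, callers do not have to remember the explicit cleanup shown in the examples: the standard library's `contextlib.closing` will call it automatically. A minimal sketch using the demo's classes:

```python
from contextlib import closing

from src.scrapegraph_demo import Config, MarketplaceScraper

config = Config.from_env()

with closing(MarketplaceScraper(config)) as scraper:
    products = scraper.scrape_search_results("laptop", "Amazon", max_results=5)
# scraper.close() has run here, even if scrape_search_results raised
```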
diff --git a/tests/test_config.py b/tests/test_config.py
index 365e432..7414637 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -28,8 +28,7 @@ def test_elasticsearch_url():
         elasticsearch_scheme="http",
         elasticsearch_username=None,
         elasticsearch_password=None,
-        scrapegraphai_api_key=None,
-        openai_api_key=None
+        sgai_api_key=None
     )
 
     assert config.elasticsearch_url == "http://localhost:9200"
@@ -44,14 +43,12 @@ def test_config_with_credentials():
         elasticsearch_scheme="https",
         elasticsearch_username="user",
         elasticsearch_password="pass",
-        scrapegraphai_api_key="test_key",
-        openai_api_key="openai_key"
+        sgai_api_key="test_key"
     )
 
     assert config.elasticsearch_username == "user"
     assert config.elasticsearch_password == "pass"
-    assert config.scrapegraphai_api_key == "test_key"
-    assert config.openai_api_key == "openai_key"
+    assert config.sgai_api_key == "test_key"
     assert config.elasticsearch_url == "https://localhost:9200"
 
     print("✓ test_config_with_credentials passed")
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
index 4311206..0f337eb 100644
--- a/tests/test_scraper.py
+++ b/tests/test_scraper.py
@@ -15,7 +15,7 @@ def test_scraper_initialization():
     scraper = MarketplaceScraper(config)
 
     assert scraper.config is not None
-    assert scraper.graph_config is not None
+    # Client may be None if API key is not set, which is expected
 
     print("✓ test_scraper_initialization passed")
@@ -36,6 +36,9 @@ def test_mock_scrape_product():
     assert product.price > 0
     assert product.product_id is not None
     assert "test product" in product.name
+
+    # Clean up
+    scraper.close()
 
     print("✓ test_mock_scrape_product passed")
@@ -49,6 +52,9 @@ def test_scrape_search_results():
     assert len(products) == 5
     assert all(p.marketplace == "Amazon" for p in products)
     assert all("laptop" in p.name for p in products)
+
+    # Clean up
+    scraper.close()
 
     print("✓ test_scrape_search_results passed")