From 491492833977ba34ebdce573dca712f8675772fb Mon Sep 17 00:00:00 2001
From: SwapnilSonker
Date: Wed, 18 Dec 2024 09:10:46 +0530
Subject: [PATCH] #772 added functionality to change browser to firefox

---
 examples/extras/chromium_selenium.py | 14 ++++--
 scrapegraphai/docloaders/chromium.py | 75 ++++++++++++++++++++++++----
 2 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/examples/extras/chromium_selenium.py b/examples/extras/chromium_selenium.py
index 5e647bce..fba530d4 100644
--- a/examples/extras/chromium_selenium.py
+++ b/examples/extras/chromium_selenium.py
@@ -87,8 +87,11 @@ async def main():
     # Test with Playwright backend
     print("\n--- Testing Playwright Backend ---")
     try:
-        scraper_playwright = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True)
-        await test_scraper_with_analysis(scraper_playwright, urls_to_scrape)
+        scraper_playwright_chromium = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name = "chromium")
+        await test_scraper_with_analysis(scraper_playwright_chromium, urls_to_scrape)
+
+        scraper_playwright_firefox = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name = "firefox")
+        await test_scraper_with_analysis(scraper_playwright_firefox, urls_to_scrape)
     except ImportError as ie:
         print(f"❌ Playwright ImportError: {ie}")
     except Exception as e:
@@ -97,8 +100,11 @@ async def main():
     # Test with Selenium backend
     print("\n--- Testing Selenium Backend ---")
     try:
-        scraper_selenium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True)
-        await test_scraper_with_analysis(scraper_selenium, urls_to_scrape)
+        scraper_selenium_chromium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True, browser_name = "chromium")
+        await test_scraper_with_analysis(scraper_selenium_chromium, urls_to_scrape)
+
+        scraper_selenium_firefox = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True, browser_name = "firefox")
+        await test_scraper_with_analysis(scraper_selenium_firefox, urls_to_scrape)
     except ImportError as ie:
         print(f"❌ Selenium ImportError: {ie}")
     except Exception as e:
diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 94d57016..31043730 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -4,6 +4,8 @@
 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
 from typing import Union
 
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy
@@ -36,6 +38,7 @@ def __init__(
         load_state: str = "domcontentloaded",
         requires_js_support: bool = False,
         storage_state: Optional[str] = None,
+        browser_name: str = "chromium", #default chromium
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -66,6 +69,7 @@ def __init__(
         self.load_state = load_state
         self.requires_js_support = requires_js_support
         self.storage_state = storage_state
+        self.browser_name = browser_name
 
     async def scrape(self, url:str) -> str:
         if self.backend == "playwright":
@@ -95,11 +99,35 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_timeout.timeout(self.TIMEOUT):
-                    driver = uc.Chrome(headless=self.headless)
-                    driver.get(url)
-                    results = driver.page_source
-                    logger.info(f"Successfully scraped {url}")
-                    break
+                    # Handling browser selection
+                    if self.backend == "selenium":
+                        if self.browser_name == "chromium":
+                            options = ChromeOptions()
+                            options.headless = self.headless
+                            # Initialize undetected chromedriver for Selenium
+                            driver = uc.Chrome(options=options)
+                            driver.get(url)
+                            results = driver.page_source
+                            logger.info(f"Successfully scraped {url} with {self.browser_name}")
+                            break
+                        elif self.browser_name == "firefox":
+                            from selenium.webdriver.firefox.options import Options as FirefoxOptions
+                            options = FirefoxOptions()
+                            options.headless = self.headless
+                            # Initialize undetected Firefox driver (if required)
+                            driver = webdriver.Firefox(options=options)
+                            driver.get(url)
+                            results = driver.page_source
+                            logger.info(f"Successfully scraped {url} with {self.browser_name}")
+                            break
+                        else:
+                            logger.error(f"Unsupported browser {self.browser_name} for Selenium.")
+                            results = f"Error: Unsupported browser {self.browser_name}."
+                            break
+                    else:
+                        logger.error(f"Unsupported backend {self.backend}.")
+                        results = f"Error: Unsupported backend {self.backend}."
+                        break
             except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
@@ -118,7 +146,8 @@ async def ascrape_playwright_scroll(
         timeout: Union[int, None]=30,
         scroll: int=15000,
         sleep: float=2,
-        scroll_to_bottom: bool=False
+        scroll_to_bottom: bool=False,
+        browser_name: str = "chromium" #default chrome is added
     ) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling.
@@ -175,9 +204,17 @@ async def ascrape_playwright_scroll(
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_playwright() as p:
-                    browser = await p.chromium.launch(
+                    browser = None
+                    if browser_name == "chromium":
+                        browser = await p.chromium.launch(
                         headless=self.headless, proxy=self.proxy, **self.browser_config
                     )
+                    elif browser_name == "firefox":
+                        browser = await p.firefox.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    else:
+                        raise ValueError(f"Invalid browser name: {browser_name}")
                     context = await browser.new_context()
                     await Malenia.apply_stealth(context)
                     page = await context.new_page()
@@ -235,7 +272,7 @@ async def ascrape_playwright_scroll(
 
         return results
 
-    async def ascrape_playwright(self, url: str) -> str:
+    async def ascrape_playwright(self, url: str, browser_name: str = "chromium") -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
 
@@ -255,9 +292,17 @@ async def ascrape_playwright(self, url: str) -> str:
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
-                    browser = await p.chromium.launch(
+                    browser = None
+                    if browser_name == "chromium":
+                        browser = await p.chromium.launch(
                         headless=self.headless, proxy=self.proxy, **self.browser_config
                     )
+                    elif browser_name == "firefox":
+                        browser = await p.firefox.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    else:
+                        raise ValueError(f"Invalid browser name: {browser_name}")
                     context = await browser.new_context(
                         storage_state=self.storage_state
                     )
@@ -282,7 +327,7 @@ async def ascrape_playwright(self, url: str) -> str:
 
 
 
-    async def ascrape_with_js_support(self, url: str) -> str:
+    async def ascrape_with_js_support(self, url: str , browser_name:str = "chromium") -> str:
         """
         Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.
 
@@ -302,9 +347,17 @@ async def ascrape_with_js_support(self, url: str) -> str:
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
-                    browser = await p.chromium.launch(
+                    browser = None
+                    if browser_name == "chromium":
+                        browser = await p.chromium.launch(
                         headless=self.headless, proxy=self.proxy, **self.browser_config
                     )
+                    elif browser_name == "firefox":
+                        browser = await p.firefox.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    else:
+                        raise ValueError(f"Invalid browser name: {browser_name}")
                     context = await browser.new_context(
                         storage_state=self.storage_state
                     )