From 4f816f3b04974e90ca4208158f05724cfe68ffb8 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 9 Oct 2024 10:39:29 +0200
Subject: [PATCH 1/2] feat: update chromium loader

---
 scrapegraphai/docloaders/chromium.py | 72 ++++++++++++++++++----------
 1 file changed, 48 insertions(+), 24 deletions(-)

diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 3bee3c9d..555d1669 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -1,10 +1,12 @@
-"""
-Chromium module
+""""
+chromium module
 """
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
+import aiohttp
+import async_timeout
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy
 
 logger = get_logger("web-loader")
@@ -21,6 +23,9 @@ class ChromiumLoader(BaseLoader):
         urls: A list of URLs to scrape content from.
     """
 
+    RETRY_LIMIT = 3
+    TIMEOUT = 10
+
     def __init__(
         self,
         urls: List[str],
@@ -66,17 +71,29 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
 
         Returns:
             str: The scraped HTML content or an error message if an exception occurs.
-
         """
         import undetected_chromedriver as uc
 
         logger.info(f"Starting scraping with {self.backend}...")
         results = ""
-        try:
-            driver = uc.Chrome(headless=self.headless)
-            results = driver.get(url).page_content
-        except Exception as e:
-            results = f"Error: {e}"
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_timeout.timeout(self.TIMEOUT):
+                    driver = uc.Chrome(headless=self.headless)
+                    driver.get(url)
+                    results = driver.page_content
+                    logger.info(f"Successfully scraped {url}")
+                    break
+            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                driver.quit()
+
         return results
 
     async def ascrape_playwright(self, url: str) -> str:
@@ -88,28 +105,36 @@ async def ascrape_playwright(self, url: str) -> str:
 
         Returns:
             str: The scraped HTML content or an error message if an exception occurs.
-
         """
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia
 
         logger.info(f"Starting scraping with {self.backend}...")
         results = ""
-        async with async_playwright() as p:
-            browser = await p.chromium.launch(
-                headless=self.headless, proxy=self.proxy, **self.browser_config
-            )
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
             try:
-                context = await browser.new_context()
-                await Malenia.apply_stealth(context)
-                page = await context.new_page()
-                await page.goto(url, wait_until="domcontentloaded")
-                await page.wait_for_load_state(self.load_state)
-                results = await page.content()  # Simply get the HTML content
-                logger.info("Content scraped")
-            except Exception as e:
-                results = f"Error: {e}"
-            await browser.close()
+                async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+                    results = await page.content()
+                    logger.info("Content scraped")
+                    break
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                await browser.close()
+
         return results
 
     def lazy_load(self) -> Iterator[Document]:
@@ -121,7 +146,6 @@ def lazy_load(self) -> Iterator[Document]:
 
         Yields:
             Document: The scraped content encapsulated within a Document object.
-
         """
         scraping_fn = getattr(self, f"ascrape_{self.backend}")
 
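
The patch above replaces the single try/except in both scrapers with a bounded retry loop: each
attempt is wrapped in async_timeout.timeout(self.TIMEOUT), and network failures are retried up to
RETRY_LIMIT times before an error string is returned. For reference, a minimal standalone sketch of
the same pattern using the aiohttp/async_timeout pair the patch imports; the fetch_with_retries
helper below is illustrative only and is not part of ChromiumLoader:

    import asyncio

    import aiohttp
    import async_timeout

    RETRY_LIMIT = 3  # mirrors ChromiumLoader.RETRY_LIMIT
    TIMEOUT = 10     # mirrors ChromiumLoader.TIMEOUT, in seconds

    async def fetch_with_retries(url: str) -> str:
        """Retry a fetch up to RETRY_LIMIT times, bounding each attempt with TIMEOUT."""
        attempt = 0
        while attempt < RETRY_LIMIT:
            try:
                async with async_timeout.timeout(TIMEOUT):
                    async with aiohttp.ClientSession() as session:
                        async with session.get(url) as response:
                            return await response.text()
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                attempt += 1
                if attempt == RETRY_LIMIT:
                    return f"Error: Network error after {RETRY_LIMIT} attempts - {e}"
        return ""

    print(asyncio.run(fetch_with_retries("https://example.com"))[:200])
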
- """ from playwright.async_api import async_playwright from undetected_playwright import Malenia logger.info(f"Starting scraping with {self.backend}...") results = "" - async with async_playwright() as p: - browser = await p.chromium.launch( - headless=self.headless, proxy=self.proxy, **self.browser_config - ) + attempt = 0 + + while attempt < self.RETRY_LIMIT: try: - context = await browser.new_context() - await Malenia.apply_stealth(context) - page = await context.new_page() - await page.goto(url, wait_until="domcontentloaded") - await page.wait_for_load_state(self.load_state) - results = await page.content() # Simply get the HTML content - logger.info("Content scraped") - except Exception as e: - results = f"Error: {e}" - await browser.close() + async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT): + browser = await p.chromium.launch( + headless=self.headless, proxy=self.proxy, **self.browser_config + ) + context = await browser.new_context() + await Malenia.apply_stealth(context) + page = await context.new_page() + await page.goto(url, wait_until="domcontentloaded") + await page.wait_for_load_state(self.load_state) + results = await page.content() + logger.info("Content scraped") + break + except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e: + attempt += 1 + logger.error(f"Attempt {attempt} failed: {e}") + if attempt == self.RETRY_LIMIT: + results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}" + finally: + await browser.close() + return results def lazy_load(self) -> Iterator[Document]: @@ -121,7 +146,6 @@ def lazy_load(self) -> Iterator[Document]: Yields: Document: The scraped content encapsulated within a Document object. - """ scraping_fn = getattr(self, f"ascrape_{self.backend}") From 88ba2310acc2f992cdbf92a8431c0df2f67ea582 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 9 Oct 2024 10:45:47 +0200 Subject: [PATCH 2/2] feat:add dynamic rendering --- scrapegraphai/docloaders/chromium.py | 45 ++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 555d1669..48058436 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -1,5 +1,5 @@ -"""" -chromium module +""" +chromiumloader module """ import asyncio from typing import Any, AsyncIterator, Iterator, List, Optional @@ -83,7 +83,7 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str: async with async_timeout.timeout(self.TIMEOUT): driver = uc.Chrome(headless=self.headless) driver.get(url) - results = driver.page_content + results = driver.page_source logger.info(f"Successfully scraped {url}") break except (aiohttp.ClientError, asyncio.TimeoutError) as e: @@ -137,6 +137,45 @@ async def ascrape_playwright(self, url: str) -> str: return results + async def ascrape_with_js_support(self, url: str) -> str: + """ + Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright. + + Args: + url (str): The URL to scrape. + + Returns: + str: The fully rendered HTML content after JavaScript execution, + or an error message if an exception occurs. 
+ """ + from playwright.async_api import async_playwright + + logger.info(f"Starting scraping with JavaScript support for {url}...") + results = "" + attempt = 0 + + while attempt < self.RETRY_LIMIT: + try: + async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT): + browser = await p.chromium.launch( + headless=self.headless, proxy=self.proxy, **self.browser_config + ) + context = await browser.new_context() + page = await context.new_page() + await page.goto(url, wait_until="networkidle") + results = await page.content() + logger.info("Content scraped after JavaScript rendering") + break + except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e: + attempt += 1 + logger.error(f"Attempt {attempt} failed: {e}") + if attempt == self.RETRY_LIMIT: + results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}" + finally: + await browser.close() + + return results + def lazy_load(self) -> Iterator[Document]: """ Lazily load text content from the provided URLs.