diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 3eabc66f..ea84b824 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -3,12 +3,13 @@
 """
 import pandas as pd
 import json
+import requests
 from typing import List, Optional
 from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.documents import Document
 from langchain_community.document_loaders import PyPDFLoader
 from .base_node import BaseNode
-from ..utils.remover import remover
+from ..utils.cleanup_html import cleanup_html
 
 
 class FetchNode(BaseNode):
@@ -38,6 +39,8 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict] =
             "headless", True)
         self.verbose = False if node_config is None else node_config.get(
             "verbose", False)
+        self.useSoup = True if node_config is None else node_config.get(
+            "useSoup", True)
 
     def execute(self, state):
         """
@@ -94,9 +97,17 @@ def execute(self, state):
            pass
 
        elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
+            compressed_document = [Document(page_content=cleanup_html(source), metadata={
                "source": "local_dir"
            })]
+
+        elif self.useSoup:
+            response = requests.get(source)
+            if response.status_code == 200:
+                cleanedup_html = cleanup_html(response.text, source)
+                compressed_document = [Document(page_content=cleanedup_html)]
+            else:
+                print(f"Failed to retrieve contents from the webpage at url: {source}")
 
        else:
            if self.node_config is not None and self.node_config.get("endpoint") is not None:
@@ -114,7 +125,7 @@ def execute(self, state):
            document = loader.load()
 
            compressed_document = [
-                Document(page_content=remover(str(document[0].page_content)))]
+                Document(page_content=cleanup_html(str(document[0].page_content)))]
 
        state.update({self.output[0]: compressed_document})
        return state
\ No newline at end of file
diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py
index 218506f3..8662cc5c 100644
--- a/scrapegraphai/utils/__init__.py
+++ b/scrapegraphai/utils/__init__.py
@@ -6,4 +6,4 @@
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
 from .proxy_rotation import proxy_generator
-from .remover import remover
+from .cleanup_html import cleanup_html
diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/cleanup_html.py
similarity index 70%
rename from scrapegraphai/utils/remover.py
rename to scrapegraphai/utils/cleanup_html.py
index c5a0507b..bc16a99b 100644
--- a/scrapegraphai/utils/remover.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -3,9 +3,9 @@
 """
 from bs4 import BeautifulSoup
 from minify_html import minify
+from urllib.parse import urljoin
 
-
-def remover(html_content: str) -> str:
+def cleanup_html(html_content: str, base_url: str = "") -> str:
     """
     Processes HTML content by removing unnecessary tags, minifying the HTML, 
     and extracting the title and body content.
@@ -33,11 +33,19 @@ def remover(html_content: str) -> str:
     for tag in soup.find_all(['script', 'style']):
         tag.extract()
 
+    # Link extraction: resolve relative hrefs against base_url
+    links = soup.find_all('a')
+    link_urls = []
+    for link in links:
+        if 'href' in link.attrs:
+            link_urls.append(urljoin(base_url, link['href']))
+
     # Body Extraction (if it exists)
     body_content = soup.find('body')
     if body_content:
         # Minify the HTML within the body tag
         minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body
+        return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls)
+
 
-    return "Title: " + title + ", Body: No body content found"
\ No newline at end of file
+    return "Title: " + title + ", Body: No body content found" + ", Links: " + str(link_urls)
\ No newline at end of file
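For reviewers, a minimal sketch of how the renamed helper behaves after this change. The sample page and the approximate output are illustrative only, and the call relies on the `base_url` default added in the signature above:

```python
from scrapegraphai.utils import cleanup_html

# Illustrative sample page; real inputs come from requests or AsyncChromiumLoader.
sample = """
<html>
  <head><title>Example</title><script>var x = 1;</script></head>
  <body><p>Hello</p><a href="/docs">Docs</a></body>
</html>
"""

# The new base_url parameter lets relative hrefs be resolved with urljoin,
# so "/docs" comes back as "https://example.com/docs" in the link list.
print(cleanup_html(sample, base_url="https://example.com"))
# Roughly: Title: Example, Body: <minified body>, Links: ['https://example.com/docs']
```

The `useSoup` branch in `FetchNode` passes `response.text` and `source` the same way, and falls back to `AsyncChromiumLoader` when `useSoup` is disabled.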