diff --git a/examples/anthropic/custom_graph_haiku.py b/examples/anthropic/custom_graph_haiku.py
index d8b4dc19..96115d2e 100644
--- a/examples/anthropic/custom_graph_haiku.py
+++ b/examples/anthropic/custom_graph_haiku.py
@@ -40,7 +40,7 @@ fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,
diff --git a/examples/bedrock/custom_graph_bedrock.py b/examples/bedrock/custom_graph_bedrock.py
index 9002a598..d72f6999 100644
--- a/examples/bedrock/custom_graph_bedrock.py
+++ b/examples/bedrock/custom_graph_bedrock.py
@@ -55,7 +55,7 @@ fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,
diff --git a/examples/ernie/custom_graph_ernie.py b/examples/ernie/custom_graph_ernie.py
index 5dad8bac..57d422e5 100644
--- a/examples/ernie/custom_graph_ernie.py
+++ b/examples/ernie/custom_graph_ernie.py
@@ -43,7 +43,7 @@ fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,
diff --git a/examples/fireworks/custom_graph_fireworks.py b/examples/fireworks/custom_graph_fireworks.py
index d0dcd994..66784d5b 100644
--- a/examples/fireworks/custom_graph_fireworks.py
+++ b/examples/fireworks/custom_graph_fireworks.py
@@ -43,7 +43,7 @@ fetch_node = FetchNode(
    input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
    node_config={
        "verbose": True,
        "headless": True,
diff --git a/examples/groq/custom_graph_groq.py b/examples/groq/custom_graph_groq.py
index 79d2f0c6..f0d7e215 100644
--- a/examples/groq/custom_graph_groq.py
+++ b/examples/groq/custom_graph_groq.py
@@ -43,7 +43,7 @@ fetch_node = FetchNode(
    input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
    node_config={
        "verbose": True,
        "headless": True,
diff --git a/examples/huggingfacehub/custom_graph_huggingfacehub.py b/examples/huggingfacehub/custom_graph_huggingfacehub.py
index 604bfae8..cec007b7 100644
--- a/examples/huggingfacehub/custom_graph_huggingfacehub.py
+++ b/examples/huggingfacehub/custom_graph_huggingfacehub.py
@@ -55,7 +55,7 @@ fetch_node = FetchNode(
    input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
    node_config={
        "verbose": True,
        "headless": True,
diff --git a/examples/local_models/custom_graph_ollama.py b/examples/local_models/custom_graph_ollama.py
index 66dd59b6..c505d068 100644
--- a/examples/local_models/custom_graph_ollama.py
+++ b/examples/local_models/custom_graph_ollama.py
@@ -44,7 +44,7 @@ fetch_node = FetchNode(
    input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
    node_config={
        "verbose": True,
        "headless": True,
diff --git a/examples/mistral/custom_graph_mistral.py b/examples/mistral/custom_graph_mistral.py
index f02ead0c..ec2878c1 100644
--- a/examples/mistral/custom_graph_mistral.py
+++ b/examples/mistral/custom_graph_mistral.py
@@ -42,7 +42,7 @@ fetch_node = FetchNode(
    input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
    node_config={
        "verbose": True,
        "headless": True,
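Every custom-graph example makes the same one-line change: FetchNode no longer declares `link_urls` and `img_urls` outputs, because URL extraction moves downstream into ParseNode (see the omni_scraper_graph.py and parse_node.py hunks below). A minimal sketch of the updated FetchNode wiring, mirroring the examples above (the surrounding config values are illustrative):

    from scrapegraphai.nodes import FetchNode

    # FetchNode now emits only the fetched document under the "doc" key.
    fetch_node = FetchNode(
        input="url | local_dir",
        output=["doc"],
        node_config={
            "verbose": True,
            "headless": True,
        },
    )

The remaining example scripts receive the identical change: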
input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/nemotron/custom_graph_nemotron.py b/examples/nemotron/custom_graph_nemotron.py index 07702680..22c6a4a1 100644 --- a/examples/nemotron/custom_graph_nemotron.py +++ b/examples/nemotron/custom_graph_nemotron.py @@ -42,7 +42,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/oneapi/custom_graph_oneapi.py b/examples/oneapi/custom_graph_oneapi.py index be58d1d1..1e27dcf9 100644 --- a/examples/oneapi/custom_graph_oneapi.py +++ b/examples/oneapi/custom_graph_oneapi.py @@ -38,7 +38,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/examples/openai/custom_graph_openai.py b/examples/openai/custom_graph_openai.py index b1471a21..a4cf9351 100644 --- a/examples/openai/custom_graph_openai.py +++ b/examples/openai/custom_graph_openai.py @@ -43,7 +43,7 @@ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "verbose": True, "headless": True, diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py index 0b0c84f8..10aa61ae 100644 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -69,7 +69,7 @@ def _create_repeated_graph(self) -> BaseGraph: """ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"] + output=["doc"] ) parse_node = ParseNode( input="doc", diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index 288b8ee1..2137a023 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -56,7 +56,7 @@ def _create_graph(self) -> BaseGraph: fetch_node = FetchNode( input="json | json_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], ) generate_answer_node = GenerateAnswerNode( diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index 006533d9..bc9afa08 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -65,16 +65,17 @@ def _create_graph(self) -> BaseGraph: """ fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "loader_kwargs": self.config.get("loader_kwargs", {}), } ) parse_node = ParseNode( - input="doc", - output=["parsed_doc"], + input="doc & (url | local_dir)", + output=["parsed_doc", "link_urls", "img_urls"], node_config={ "chunk_size": self.model_token, + "parse_urls": True, "llm_model": self.llm_model } ) diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index 73c709eb..2f080764 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -62,7 +62,7 @@ def _create_graph(self) -> BaseGraph: fetch_node = FetchNode( input="url | local_dir", - output=["doc", "link_urls", "img_urls"], + output=["doc"], node_config={ "llm_model": self.llm_model, "loader_kwargs": self.config.get("loader_kwargs", {}), diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py index 
diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py
index 566569a8..b373f4e0 100644
--- a/scrapegraphai/graphs/search_link_graph.py
+++ b/scrapegraphai/graphs/search_link_graph.py
@@ -52,7 +52,7 @@ def _create_graph(self) -> BaseGraph:
         fetch_node = FetchNode(
             input="url| local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "force": self.config.get("force", False),
diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
index 0167103e..e7ff1195 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -61,7 +61,7 @@ def _create_graph(self) -> BaseGraph:
         """
         fetch_node = FetchNode(
             input="url| local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "force": self.config.get("force", False),
diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py
index 8c1ff278..4388bf66 100644
--- a/scrapegraphai/graphs/speech_graph.py
+++ b/scrapegraphai/graphs/speech_graph.py
@@ -62,7 +62,7 @@ def _create_graph(self) -> BaseGraph:
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc", "link_urls", "img_urls"]
+            output=["doc"]
         )
         parse_node = ParseNode(
             input="doc",
diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py
index f5806f56..29184f28 100644
--- a/scrapegraphai/graphs/xml_scraper_graph.py
+++ b/scrapegraphai/graphs/xml_scraper_graph.py
@@ -60,7 +60,7 @@ def _create_graph(self) -> BaseGraph:
         fetch_node = FetchNode(
             input="xml | xml_dir",
-            output=["doc", "link_urls", "img_urls"]
+            output=["doc"]
         )

         generate_answer_node = GenerateAnswerNode(
diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py
index 934710ef..13113aa0 100644
--- a/scrapegraphai/nodes/image_to_text_node.py
+++ b/scrapegraphai/nodes/image_to_text_node.py
@@ -1,9 +1,11 @@
 """
 ImageToTextNode Module
 """
+import traceback
 from typing import List, Optional
 from ..utils.logging import get_logger
 from .base_node import BaseNode
+from langchain_core.messages import HumanMessage

 class ImageToTextNode(BaseNode):
     """
@@ -58,16 +60,28 @@ def execute(self, state: dict) -> dict:
         if isinstance(urls, str):
             urls = [urls]
         elif len(urls) == 0:
-            return state
+            # dict.update() returns None, so update the state first, then return it
+            state.update({self.output[0]: []})
+            return state

         # Skip the image-to-text conversion
         if self.max_images < 1:
-            return state
+            state.update({self.output[0]: []})
+            return state

         img_desc = []
         for url in urls[: self.max_images]:
             try:
-                text_answer = self.llm_model.run(url)
+                message = HumanMessage(
+                    content=[
+                        {"type": "text", "text": "Describe the provided image."},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": url},
+                        },
+                    ]
+                )
+                text_answer = self.llm_model.invoke([message]).content
             except Exception as e:
                 text_answer = f"Error: incompatible image format or model failure."
             img_desc.append(text_answer)
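ImageToTextNode previously relied on the model wrapper's `.run(url)` helper; the patch switches to the LangChain chat interface, packing the text prompt and the image into a single multimodal HumanMessage. A standalone sketch of the same pattern against any image-capable chat model (the model choice and image URL here are illustrative):

    from langchain_core.messages import HumanMessage
    from langchain_openai import ChatOpenAI  # any multimodal chat model works

    llm = ChatOpenAI(model="gpt-4o-mini")  # assumed model choice

    message = HumanMessage(
        content=[
            {"type": "text", "text": "Describe the provided image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
        ]
    )
    # invoke() takes a list of messages and returns an AIMessage; .content holds the text.
    print(llm.invoke([message]).content)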
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
index 7413229f..bfb825f6 100644
--- a/scrapegraphai/nodes/parse_node.py
+++ b/scrapegraphai/nodes/parse_node.py
@@ -1,11 +1,15 @@
 """
 ParseNode Module
 """
-from typing import List, Optional
+from typing import Tuple, List, Optional
+from urllib.parse import urljoin
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
+from ..helpers import default_filters
+
+import re

 class ParseNode(BaseNode):
     """
@@ -41,6 +45,67 @@ def __init__(
             True if node_config is None else node_config.get("parse_html", True)
         )
         self.llm_model = node_config['llm_model']
+        self.parse_urls = (
+            False if node_config is None else node_config.get("parse_urls", False)
+        )
+
+    def _clean_urls(self, urls: List[str]) -> List[str]:
+        """
+        Cleans the URLs extracted from the text.
+
+        Args:
+            urls (List[str]): The list of URLs to clean.
+
+        Returns:
+            List[str]: The cleaned URLs.
+        """
+        cleaned_urls = []
+        for url in urls:
+            # Remove any leading 'thumbnail](' or similar markdown-link prefixes
+            url = re.sub(r'.*?\]\(', '', url)
+
+            # Remove any trailing parentheses or periods
+            url = url.rstrip(').')
+
+            cleaned_urls.append(url)
+
+        return cleaned_urls
+
+    def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
+        """
+        Extracts URLs from the given text.
+
+        Args:
+            text (str): The text to extract URLs from.
+            source (str): The source URL or local directory of the document.
+
+        Returns:
+            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
+        """
+        # Return empty lists if URL parsing is disabled
+        if not self.parse_urls:
+            return [], []
+
+        # Regular expression to find URLs (both links and images)
+        image_extensions = default_filters.filter_dict["img_exts"]
+        image_extension_seq = '|'.join(image_extensions).replace('.', '')
+        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
+
+        # Find all URLs in the string
+        all_urls = url_pattern.findall(text)
+        all_urls = self._clean_urls(all_urls)
+
+        if not source.startswith("http"):
+            # For local sources, keep only URLs that are already absolute
+            all_urls = [url for url in all_urls if url.startswith("http")]
+        else:
+            # Resolve relative URLs against the source URL
+            all_urls = [urljoin(source, url) for url in all_urls]
+
+        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
+        links = [url for url in all_urls if url not in images]
+
+        return links, images

     def execute(self, state: dict) -> dict:
         """
@@ -63,7 +128,9 @@ def execute(self, state: dict) -> dict:
         input_keys = self.get_input_keys(state)
         input_data = [state[key] for key in input_keys]

+        docs_transformed = input_data[0]
+        source = input_data[1] if self.parse_urls else None

         def count_tokens(text):
             from ..utils import token_count
@@ -73,12 +140,17 @@ def count_tokens(text):
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]

+            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
+
             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096)-250,
                            token_counter=count_tokens,
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]
+
+            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
+
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
@@ -94,4 +166,8 @@ def count_tokens(text):
                            memoize=False)

         state.update({self.output[0]: chunks})
+        if self.parse_urls:
+            state.update({self.output[1]: link_urls})
+            state.update({self.output[2]: img_urls})
+
         return state
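The URL handling in `extract_urls` can be exercised in isolation. A rough standalone equivalent of the added logic (the real node reads its extension list from `default_filters.filter_dict["img_exts"]`; the list below is an assumed subset for the demo):

    import re
    from urllib.parse import urljoin

    image_extensions = [".png", ".jpg", ".jpeg", ".gif", ".webp"]  # assumed subset
    ext_seq = "|".join(image_extensions).replace(".", "")
    url_pattern = re.compile(r"(https?://[^\s]+|\S+\.(?:" + ext_seq + "))")

    text = "See [thumbnail](/images/logo.png) and https://example.com/docs."
    source = "https://example.com/page"

    urls = url_pattern.findall(text)
    urls = [re.sub(r".*?\]\(", "", u).rstrip(").") for u in urls]  # same cleanup as _clean_urls
    urls = [urljoin(source, u) for u in urls]  # resolve relative URLs against the page

    images = [u for u in urls if any(u.endswith(ext) for ext in image_extensions)]
    links = [u for u in urls if u not in images]
    print(links)   # ['https://example.com/docs']
    print(images)  # ['https://example.com/images/logo.png']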
diff --git a/tests/graphs/abstract_graph_test.py b/tests/graphs/abstract_graph_test.py
index 60c8ab4c..5ac63638 100644
--- a/tests/graphs/abstract_graph_test.py
+++ b/tests/graphs/abstract_graph_test.py
@@ -22,7 +22,7 @@ def __init__(self, prompt: str, config: dict):
     def _create_graph(self) -> BaseGraph:
         fetch_node = FetchNode(
             input="url| local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "force": self.config.get("force", False),
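The updated test only pins down the FetchNode contract; the new ParseNode behavior could be checked with a small unit test along these lines (the constructor arguments and the expected extension list are assumptions based on the hunks above, not an existing test):

    from scrapegraphai.nodes import ParseNode

    # Hypothetical check: with parse_urls enabled, extract_urls should split
    # page links from image links and resolve them against the source URL.
    node = ParseNode(
        input="doc & (url | local_dir)",
        output=["parsed_doc", "link_urls", "img_urls"],
        node_config={"llm_model": None, "parse_urls": True},
    )
    links, images = node.extract_urls(
        "Intro https://example.com/about and logo.png", "https://example.com/"
    )
    assert links == ["https://example.com/about"]
    assert images == ["https://example.com/logo.png"]  # assumes ".png" is in img_exts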