From bd4b26d7d7c1a7953d1bc9d78b436007880028c9 Mon Sep 17 00:00:00 2001 From: ekinsenler Date: Wed, 4 Sep 2024 17:53:32 +0300 Subject: [PATCH 1/3] feat: ConcatNode.py added for heavy merge operations --- .../smart_scraper_multi_concat_ollama.py | 42 +++++++ scrapegraphai/graphs/__init__.py | 1 + .../smart_scraper_multi_concat_graph.py | 115 ++++++++++++++++++ scrapegraphai/nodes/__init__.py | 1 + scrapegraphai/nodes/concat_answers_node.py | 76 ++++++++++++ 5 files changed, 235 insertions(+) create mode 100644 examples/local_models/smart_scraper_multi_concat_ollama.py create mode 100644 scrapegraphai/graphs/smart_scraper_multi_concat_graph.py create mode 100644 scrapegraphai/nodes/concat_answers_node.py diff --git a/examples/local_models/smart_scraper_multi_concat_ollama.py b/examples/local_models/smart_scraper_multi_concat_ollama.py new file mode 100644 index 00000000..665b5db4 --- /dev/null +++ b/examples/local_models/smart_scraper_multi_concat_ollama.py @@ -0,0 +1,42 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/llama3.1", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + 
+result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 6dda222d..966f9978 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -25,3 +25,4 @@ from .markdown_scraper_multi_graph import MDScraperMultiGraph from .search_link_graph import SearchLinkGraph from .screenshot_scraper_graph import ScreenshotScraperGraph +from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph diff --git a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py new file mode 100644 index 00000000..a85eb5bb --- /dev/null +++ b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py @@ -0,0 +1,115 @@ +""" +SmartScraperMultiConcatGraph Module +""" + +from copy import copy, deepcopy +from typing import List, Optional +from pydantic import BaseModel + +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .smart_scraper_graph import SmartScraperGraph + +from ..nodes import ( + GraphIteratorNode, + ConcatAnswersNode +) + + +class SmartScraperMultiConcatGraph(AbstractGraph): + """ + SmartScraperMultiConcatGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. + It only requires a user prompt and a list of URLs. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + source (List[str]): The list of URLs to scrape. + config (dict): Configuration parameters for the graph. + schema (Optional[BaseModel]): The schema for the graph output. 
+ + Example: + >>> search_graph = SmartScraperMultiConcatGraph( + ... "What is Chioggia famous for?", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = search_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): + + if all(isinstance(value, str) for value in config.values()): + self.copy_config = copy(config) + else: + self.copy_config = deepcopy(config) + + self.copy_schema = deepcopy(schema) + + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping and searching. + + Returns: + BaseGraph: A graph instance representing the web scraping and searching workflow. + """ + + # ************************************************ + # Create a SmartScraperGraph instance + # ************************************************ + + smart_scraper_instance = SmartScraperGraph( + prompt="", + source="", + config=self.copy_config, + schema=self.copy_schema + ) + + # ************************************************ + # Define the graph nodes + # ************************************************ + + graph_iterator_node = GraphIteratorNode( + input="user_prompt & urls", + output=["results"], + node_config={ + "graph_instance": smart_scraper_instance, + } + ) + + concat_answers_node = ConcatAnswersNode( + input="results", + output=["answer"] + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + concat_answers_node, + ], + edges=[ + (graph_iterator_node, concat_answers_node), + ], + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the web scraping and searching process. + + Returns: + str: The answer to the prompt. 
+ """ + inputs = {"user_prompt": self.prompt, "urls": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index dd1c3fcc..f31dd1b7 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -22,3 +22,4 @@ from .merge_generated_scripts import MergeGeneratedScriptsNode from .fetch_screen_node import FetchScreenNode from .generate_answer_from_image_node import GenerateAnswerFromImageNode +from .concat_answers_node import ConcatAnswersNode \ No newline at end of file diff --git a/scrapegraphai/nodes/concat_answers_node.py b/scrapegraphai/nodes/concat_answers_node.py new file mode 100644 index 00000000..8360e18c --- /dev/null +++ b/scrapegraphai/nodes/concat_answers_node.py @@ -0,0 +1,76 @@ +""" +ConcatAnswersNode Module +""" + +from typing import List, Optional +from ..utils.logging import get_logger +from .base_node import BaseNode + +class ConcatAnswersNode(BaseNode): + """ + A node responsible for concatenating the answers from multiple graph instances into a single answer. + + Attributes: + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "ConcatAnswers". 
+ """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "ConcatAnswers", + ): + super().__init__(node_name, "node", input, output, 1, node_config) + + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + + def _merge_dict(self, items): + + return {"products": {f"item_{i+1}": item for i, item in enumerate(items)}} + + def execute(self, state: dict) -> dict: + """ + Executes the node's logic to concatenate the answers from multiple graph instances into a + single answer. + + Args: + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. + + Returns: + dict: The updated state with the output key containing the generated answer. + + Raises: + KeyError: If the input keys are not found in the state, indicating + that the necessary information for generating an answer is missing. + """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) + + # Fetching data from the state based on the input keys + input_data = [state[key] for key in input_keys] + + answers = input_data[0] + + if len(answers) > 1: + # merge the answers into a single dict + answer = self._merge_dict(answers) + + # Update the state with the generated answer + state.update({self.output[0]: answer}) + + else: + state.update({self.output[0]: answers[0]}) + return state From f83c3d1ae64aecbf58eba5282b09b0b6458cf1ee Mon Sep 17 00:00:00 2001 From: ekinsenler Date: Wed, 4 Sep 2024 20:29:04 +0300 Subject: [PATCH 2/3] add example for gemini --- .../smart_scraper_multi_concat.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 examples/google_genai/smart_scraper_multi_concat.py diff --git a/examples/google_genai/smart_scraper_multi_concat.py b/examples/google_genai/smart_scraper_multi_concat.py new file mode 
100644 index 00000000..facd74c3 --- /dev/null +++ b/examples/google_genai/smart_scraper_multi_concat.py @@ -0,0 +1,40 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_genai/gemini-pro", + }, +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiConcatGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) From c0339d9dc68898478d2626dab17154da5e0cd5f3 Mon Sep 17 00:00:00 2001 From: ekinsenler Date: Wed, 4 Sep 2024 20:31:24 +0300 Subject: [PATCH 3/3] fix file name --- ...raper_multi_concat.py => smart_scraper_multi_concat_gemini.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/google_genai/{smart_scraper_multi_concat.py => smart_scraper_multi_concat_gemini.py} (100%) diff --git a/examples/google_genai/smart_scraper_multi_concat.py b/examples/google_genai/smart_scraper_multi_concat_gemini.py similarity index 100% rename from examples/google_genai/smart_scraper_multi_concat.py rename to examples/google_genai/smart_scraper_multi_concat_gemini.py