From bd4b26d7d7c1a7953d1bc9d78b436007880028c9 Mon Sep 17 00:00:00 2001
From: ekinsenler <ekinsenler@gmail.com>
Date: Wed, 4 Sep 2024 17:53:32 +0300
Subject: [PATCH 1/3] feat: ConcatNode.py added for heavy merge operations

---
 .../smart_scraper_multi_concat_ollama.py      |  42 +++++++
 scrapegraphai/graphs/__init__.py              |   1 +
 .../smart_scraper_multi_concat_graph.py       | 115 ++++++++++++++++++
 scrapegraphai/nodes/__init__.py               |   1 +
 scrapegraphai/nodes/concat_answers_node.py    |  76 ++++++++++++
 5 files changed, 235 insertions(+)
 create mode 100644 examples/local_models/smart_scraper_multi_concat_ollama.py
 create mode 100644 scrapegraphai/graphs/smart_scraper_multi_concat_graph.py
 create mode 100644 scrapegraphai/nodes/concat_answers_node.py

diff --git a/examples/local_models/smart_scraper_multi_concat_ollama.py b/examples/local_models/smart_scraper_multi_concat_ollama.py
new file mode 100644
index 00000000..665b5db4
--- /dev/null
+++ b/examples/local_models/smart_scraper_multi_concat_ollama.py
@@ -0,0 +1,42 @@
+""" 
+Basic example of a multi-URL scraping pipeline using SmartScraperMultiConcatGraph
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperMultiConcatGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/llama3.1",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        "base_url": "http://localhost:11434", # Ollama server URL; adjust if yours differs
+    },
+    "verbose": True,
+    "headless": False
+}
+
+# *******************************************************
+# Create the SmartScraperMultiConcatGraph instance and run it
+# *******************************************************
+
+multiple_search_graph = SmartScraperMultiConcatGraph(
+    prompt="Who is Marco Perini?",
+    source= [
+        "https://perinim.github.io/",
+        "https://perinim.github.io/cv/"
+        ],
+    schema=None,
+    config=graph_config
+)
+
+result = multiple_search_graph.run()
+print(json.dumps(result, indent=4))
diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py
index 6dda222d..966f9978 100644
--- a/scrapegraphai/graphs/__init__.py
+++ b/scrapegraphai/graphs/__init__.py
@@ -25,3 +25,4 @@
 from .markdown_scraper_multi_graph import MDScraperMultiGraph
 from .search_link_graph import SearchLinkGraph
 from .screenshot_scraper_graph import ScreenshotScraperGraph
+from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
diff --git a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py
new file mode 100644
index 00000000..a85eb5bb
--- /dev/null
+++ b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py
@@ -0,0 +1,115 @@
+""" 
+SmartScraperMultiConcatGraph Module
+"""
+
+from copy import copy, deepcopy
+from typing import List, Optional
+from pydantic import BaseModel
+
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from .smart_scraper_graph import SmartScraperGraph
+
+from ..nodes import (
+    GraphIteratorNode,
+    ConcatAnswersNode
+)
+
+
+class SmartScraperMultiConcatGraph(AbstractGraph):
+    """ 
+    SmartScraperMultiConcatGraph is a scraping pipeline that scrapes a list of URLs and concatenates the per-URL answers to a given prompt.
+    It only requires a user prompt and a list of URLs.
+
+    Attributes:
+        prompt (str): The user prompt to search the internet.
+        llm_model (dict): The configuration for the language model.
+        embedder_model (dict): The configuration for the embedder model.
+        headless (bool): A flag to run the browser in headless mode.
+        verbose (bool): A flag to display the execution information.
+        model_token (int): The token limit for the language model.
+
+    Args:
+        prompt (str): The user prompt to search the internet.
+        source (List[str]): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (Optional[BaseModel]): The schema for the graph output.
+
+    Example:
+        >>> search_graph = SmartScraperMultiConcatGraph(
+        ...     "What is Chioggia famous for?", ["https://en.wikipedia.org/wiki/Chioggia"],
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = search_graph.run()
+    """
+
+    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
+
+        if all(isinstance(value, str) for value in config.values()):
+            self.copy_config = copy(config)
+        else:
+            self.copy_config = deepcopy(config)
+        
+        self.copy_schema = deepcopy(schema)
+
+        super().__init__(prompt, config, source, schema)
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes: a GraphIteratorNode wrapping a SmartScraperGraph, followed by a ConcatAnswersNode.
+
+        Returns:
+            BaseGraph: A graph instance representing the scraping and answer-concatenation workflow.
+        """
+
+        # ************************************************
+        # Create a SmartScraperGraph instance
+        # ************************************************
+
+        smart_scraper_instance = SmartScraperGraph(
+            prompt="",
+            source="",
+            config=self.copy_config,
+            schema=self.copy_schema
+        )
+
+        # ************************************************
+        # Define the graph nodes
+        # ************************************************
+
+        graph_iterator_node = GraphIteratorNode(
+            input="user_prompt & urls",
+            output=["results"],
+            node_config={
+                "graph_instance": smart_scraper_instance,
+            }
+        )
+
+        concat_answers_node = ConcatAnswersNode(
+            input="results",
+            output=["answer"]
+        )
+
+        return BaseGraph(
+            nodes=[
+                graph_iterator_node,
+                concat_answers_node,
+            ],
+            edges=[
+                (graph_iterator_node, concat_answers_node),
+            ],
+            entry_point=graph_iterator_node,
+            graph_name=self.__class__.__name__
+        )
+
+    def run(self) -> str:
+        """
+        Executes the scraping and answer-concatenation process.
+
+        Returns:
+            The value stored under "answer" in the final state, or "No answer found." if that key is missing.
+        """
+        inputs = {"user_prompt": self.prompt, "urls": self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("answer", "No answer found.")
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index dd1c3fcc..f31dd1b7 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -22,3 +22,4 @@
 from .merge_generated_scripts import MergeGeneratedScriptsNode
 from .fetch_screen_node import FetchScreenNode
 from .generate_answer_from_image_node import GenerateAnswerFromImageNode
+from .concat_answers_node import ConcatAnswersNode
\ No newline at end of file
diff --git a/scrapegraphai/nodes/concat_answers_node.py b/scrapegraphai/nodes/concat_answers_node.py
new file mode 100644
index 00000000..8360e18c
--- /dev/null
+++ b/scrapegraphai/nodes/concat_answers_node.py
@@ -0,0 +1,76 @@
+"""
+ConcatAnswersNode Module
+"""
+
+from typing import List, Optional
+from ..utils.logging import get_logger
+from .base_node import BaseNode
+
+class ConcatAnswersNode(BaseNode):
+    """
+    A node responsible for concatenating the answers from multiple graph instances into a single answer.
+
+    Attributes:
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "ConcatAnswers".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "ConcatAnswers",
+    ):
+        super().__init__(node_name, "node", input, output, 1, node_config)
+
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+
+    def _merge_dict(self, items):
+        
+        return {"products": {f"item_{i+1}": item for i, item in enumerate(items)}}
+
+    def execute(self, state: dict) -> dict:
+        """
+        Executes the node's logic to concatenate the answers from multiple graph instances into a
+        single answer.
+
+        Args:
+            state (dict): The current state of the graph. The input keys will be used
+                            to fetch the correct data from the state.
+
+        Returns:
+            dict: The updated state with the output key containing the concatenated answer.
+
+        Raises:
+            KeyError: If the input keys are not found in the state, indicating
+                      that the necessary information for generating an answer is missing.
+        """
+
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        # Interpret input keys based on the provided input expression
+        input_keys = self.get_input_keys(state)
+
+        # Fetching data from the state based on the input keys
+        input_data = [state[key] for key in input_keys]
+
+        answers = input_data[0]
+        
+        if len(answers) > 1:
+            # merge the answers into a single dict
+            answer = self._merge_dict(answers)
+
+            # Update the state with the generated answer
+            state.update({self.output[0]: answer})
+
+        else:
+            state.update({self.output[0]: answers[0]})
+        return state

From f83c3d1ae64aecbf58eba5282b09b0b6458cf1ee Mon Sep 17 00:00:00 2001
From: ekinsenler <ekinsenler@gmail.com>
Date: Wed, 4 Sep 2024 20:29:04 +0300
Subject: [PATCH 2/3] add example for gemini

---
 .../smart_scraper_multi_concat.py             | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 examples/google_genai/smart_scraper_multi_concat.py

diff --git a/examples/google_genai/smart_scraper_multi_concat.py b/examples/google_genai/smart_scraper_multi_concat.py
new file mode 100644
index 00000000..facd74c3
--- /dev/null
+++ b/examples/google_genai/smart_scraper_multi_concat.py
@@ -0,0 +1,40 @@
+""" 
+Basic example of a multi-URL scraping pipeline using SmartScraperMultiConcatGraph
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperMultiConcatGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": gemini_key,
+        "model": "google_genai/gemini-pro",
+    },
+}
+
+# *******************************************************
+# Create the SmartScraperMultiConcatGraph instance and run it
+# *******************************************************
+
+multiple_search_graph = SmartScraperMultiConcatGraph(
+    prompt="Who is Marco Perini?",
+    source= [
+        "https://perinim.github.io/",
+        "https://perinim.github.io/cv/"
+        ],
+    schema=None,
+    config=graph_config
+)
+
+result = multiple_search_graph.run()
+print(json.dumps(result, indent=4))

From c0339d9dc68898478d2626dab17154da5e0cd5f3 Mon Sep 17 00:00:00 2001
From: ekinsenler <ekinsenler@gmail.com>
Date: Wed, 4 Sep 2024 20:31:24 +0300
Subject: [PATCH 3/3] fix file name

---
 ...raper_multi_concat.py => smart_scraper_multi_concat_gemini.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/google_genai/{smart_scraper_multi_concat.py => smart_scraper_multi_concat_gemini.py} (100%)

diff --git a/examples/google_genai/smart_scraper_multi_concat.py b/examples/google_genai/smart_scraper_multi_concat_gemini.py
similarity index 100%
rename from examples/google_genai/smart_scraper_multi_concat.py
rename to examples/google_genai/smart_scraper_multi_concat_gemini.py