diff --git a/CHANGELOG.md b/CHANGELOG.md index 37f3d75e..87860fbb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,17 +1,75 @@ -## [0.5.0-beta.8](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.0-beta.7...v0.5.0-beta.8) (2024-05-02) +## [0.6.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.2...v0.6.0) (2024-05-02) ### Features +* added node and graph for CSV scraping ([4d542a8](https://github.com/VinciGit00/Scrapegraph-ai/commit/4d542a88f7d949a5ba360dcd880716c8110a5d14)) * Allow end users to pass model instances for llm and embedding model ([b86aac2](https://github.com/VinciGit00/Scrapegraph-ai/commit/b86aac2188887642564a34d13d55d0fcff220ec1)) +* modified node name ([02d1af0](https://github.com/VinciGit00/Scrapegraph-ai/commit/02d1af006cb89bf860ee4f1186f582e2049a8e3d)) + + +### CI + +* **release:** 0.5.0-beta.7 [skip ci] ([40b2a34](https://github.com/VinciGit00/Scrapegraph-ai/commit/40b2a346d57865ca21915ecaa658096c52a2cc6b)) +* **release:** 0.5.0-beta.8 [skip ci] ([c11331a](https://github.com/VinciGit00/Scrapegraph-ai/commit/c11331a26ac325dfcf489272442ceeed13225a39)) + +## [0.5.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.1...v0.5.2) (2024-05-02) + + +### Bug Fixes -## [0.5.0-beta.7](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.0-beta.6...v0.5.0-beta.7) (2024-05-01) +* bug on script_creator_graph.py ([4a3bc37](https://github.com/VinciGit00/Scrapegraph-ai/commit/4a3bc37f2fbb24953edd68f28234ff14302ac120)) + +## [0.5.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.0...v0.5.1) (2024-05-02) + + +### Bug Fixes + +* examples and graphs ([5cf4e4f](https://github.com/VinciGit00/Scrapegraph-ai/commit/5cf4e4f92f024041c44211aebd2e3bdf73351a00)) + + +### Docs + +* added venv suggestion ([ba2b24b](https://github.com/VinciGit00/Scrapegraph-ai/commit/ba2b24b4cd82d63f9235051eb0e95519c51fd639)) +* base and fetch node ([e981796](https://github.com/VinciGit00/Scrapegraph-ai/commit/e9817963c8e98e35662cc5a140b0348792d25307)) +* change contributing.md with new ci/cd workflow ([3e91a46](https://github.com/VinciGit00/Scrapegraph-ai/commit/3e91a46522ab1f6b2f733efd234b06df4687c695)) +* fixed basegraph docstring ([29427c2](https://github.com/VinciGit00/Scrapegraph-ai/commit/29427c233485816967c4ecd6c1951351be9b27ce)) +* graphs and helpers docstrings ([0631985](https://github.com/VinciGit00/Scrapegraph-ai/commit/0631985e6156bd21ec5317faff9e345c8aa7f88b)) +* refactor examples ([c11fc28](https://github.com/VinciGit00/Scrapegraph-ai/commit/c11fc288963e1a2818e451279a3bf53eb33e22be)) +* refactor models docstrings ([18c20eb](https://github.com/VinciGit00/Scrapegraph-ai/commit/18c20eb03de183a0311be5ffe21f53ec4edf1b87)) +* refactor nodes docstrings ([1409797](https://github.com/VinciGit00/Scrapegraph-ai/commit/140979747598210674131befadd786800c9fb5ec)) +* update utils docstrings ([cf038b3](https://github.com/VinciGit00/Scrapegraph-ai/commit/cf038b33eaae42f65d7d9c782b5729092b272dd0)) + +## [0.5.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.4.1...v0.5.0) (2024-04-30) ### Features -* added node and graph for CSV scraping ([4d542a8](https://github.com/VinciGit00/Scrapegraph-ai/commit/4d542a88f7d949a5ba360dcd880716c8110a5d14)) -* modified node name ([02d1af0](https://github.com/VinciGit00/Scrapegraph-ai/commit/02d1af006cb89bf860ee4f1186f582e2049a8e3d)) +* add cluade integration ([e0ffc83](https://github.com/VinciGit00/Scrapegraph-ai/commit/e0ffc838b06c0f024026a275fc7f7b4243ad5cf9)) +* add co-author 
([719a353](https://github.com/VinciGit00/Scrapegraph-ai/commit/719a353410992cc96f46ec984a5d3ec372e71ad2))
+* **fetch:** added playwright support ([42ab0aa](https://github.com/VinciGit00/Scrapegraph-ai/commit/42ab0aa1d275b5798ab6fc9feea575fe59b6e767))
+* added verbose flag to suppress print statements ([2dd7817](https://github.com/VinciGit00/Scrapegraph-ai/commit/2dd7817cfb37cfbeb7e65b3a24655ab238f48026))
+* base groq + requirements + toml update with groq ([7dd5b1a](https://github.com/VinciGit00/Scrapegraph-ai/commit/7dd5b1a03327750ffa5b2fb647eda6359edd1fc2))
+* **refactor:** changed variable names ([8fba7e5](https://github.com/VinciGit00/Scrapegraph-ai/commit/8fba7e5490f916b325588443bba3fff5c0733c17))
+* **llm:** implemented groq model ([dbbf10f](https://github.com/VinciGit00/Scrapegraph-ai/commit/dbbf10fc77b34d99d64c6cd7f74524b6d8e57fa5))
+* updated requirements.txt ([d368725](https://github.com/VinciGit00/Scrapegraph-ai/commit/d36872518a6d234eba5f8b7ddca7da93797874b2))
+
+
+### Bug Fixes
+
+* script generator and add new benchmarks ([e3d0194](https://github.com/VinciGit00/Scrapegraph-ai/commit/e3d0194dc93b20dc254fc48bba11559bf8a3a185))
+
+
+### CI
+
+* **release:** 0.4.0-beta.3 [skip ci] ([d13321b](https://github.com/VinciGit00/Scrapegraph-ai/commit/d13321b2f86d98e2a3a0c563172ca0dd29cdf5fb))
+* **release:** 0.5.0-beta.1 [skip ci] ([450291f](https://github.com/VinciGit00/Scrapegraph-ai/commit/450291f52e48cd35b2b8cc50ff66f5336326fa25))
+* **release:** 0.5.0-beta.2 [skip ci] ([ff7d12f](https://github.com/VinciGit00/Scrapegraph-ai/commit/ff7d12f1389d8eed87e9f6b2fc8b099767a904a9))
+* **release:** 0.5.0-beta.3 [skip ci] ([7e81f7c](https://github.com/VinciGit00/Scrapegraph-ai/commit/7e81f7c03f79c43219743be52affabbaf0d66387))
+* **release:** 0.5.0-beta.4 [skip ci] ([14e56f6](https://github.com/VinciGit00/Scrapegraph-ai/commit/14e56f6ab1711a08e749edbda860d349db491dae))
+* **release:** 0.5.0-beta.5 [skip ci] ([5ac97e2](https://github.com/VinciGit00/Scrapegraph-ai/commit/5ac97e2fb321be40c9787fbf8cb53fa62cf0ce06))
+* **release:** 0.5.0-beta.6 [skip ci] ([9356124](https://github.com/VinciGit00/Scrapegraph-ai/commit/9356124ce39568e88f7d2965181579c4ff0a5752))
+

 ## [0.5.0-beta.6](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.0-beta.5...v0.5.0-beta.6) (2024-04-30)


diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0c069a37..6f9f98f9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,22 +15,31 @@ Thank you for your interest in contributing to **ScrapeGraphAI**! We welcome con

 To get started with contributing, follow these steps:

-1. Fork the repository on GitHub.
+1. Fork the repository on GitHub **(from the pre/beta branch)**.
 2. Clone your forked repository to your local machine.
-3. Install the necessary dependencies.
+3. Install the necessary dependencies from requirements.txt or via pyproject.toml, whichever you prefer :).
 4. Make your changes or additions.
 5. Test your changes thoroughly.
 6. Commit your changes with descriptive commit messages.
 7. Push your changes to your forked repository.
-8. Submit a pull request to the main repository.
+8. Submit a pull request to the pre/beta branch.
+
+N.B.: All pull requests opened against the main branch will be rejected!

 ## Contributing Guidelines

 Please adhere to the following guidelines when contributing to ScrapeGraphAI:

 - Follow the code style and formatting guidelines specified in the [Code Style](#code-style) section.
-- Make sure your changes are well-documented and include any necessary updates to the project's documentation
-- Write clear and concise commit messages that describe the purpose of your changes
+- Make sure your changes are well-documented and include any necessary updates to the project's documentation and requirements if needed.
+- Write clear and concise commit messages that describe the purpose of your changes; the last commit before the pull request must follow one of these formats:
+  - `feat: Add new feature`
+  - `fix: Correct issue with existing feature`
+  - `docs: Update documentation`
+  - `style: Improve formatting and style`
+  - `refactor: Restructure code`
+  - `test: Add or update tests`
+  - `perf: Improve performance`
 - Be respectful and considerate towards other contributors and maintainers.

 ## Code Style

@@ -42,6 +51,7 @@ Please make sure to format your code accordingly before submitting a pull reques

 - [Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/)
 - [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
 - [The Hitchhiker's Guide to Python](https://docs.python-guide.org/writing/style/)
+- [Pylint tutorial on code style and documentation](https://pylint.pycqa.org/en/1.6.0/tutorial.html)

 ## Submitting a Pull Request

@@ -53,7 +63,7 @@ To submit your changes for review, please follow these steps:

 4. Select your forked repository and the branch containing your changes.
 5. Provide a descriptive title and detailed description for your pull request.
 6. Reviewers will provide feedback and discuss any necessary changes.
-7. Once your pull request is approved, it will be merged into the main repository.
+7. Once your pull request is approved, it will be merged into the pre/beta branch.

 ## Reporting Issues

diff --git a/README.md b/README.md
index 44036096..d409ee38 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,9 @@ you will also need to install Playwright for javascript-based scraping:
 ```bash
 playwright install
 ```
+
+**Note**: It is recommended to install the library in a virtual environment to avoid conflicts with other libraries 🐱
+
 ## 🔍 Demo

 Official streamlit demo:

diff --git a/examples/groq/.env.example b/examples/groq/.env.example
index e39e60fc..c934d4fa 100644
--- a/examples/groq/.env.example
+++ b/examples/groq/.env.example
@@ -1 +1,2 @@
-GROQ_APIKEY= "your groq key"
\ No newline at end of file
+GROQ_APIKEY= "your groq key"
+OPENAI_APIKEY="your openai api key"
\ No newline at end of file

diff --git a/examples/groq/smart_scraper_groq.py b/examples/groq/smart_scraper_groq_ollama.py
similarity index 100%
rename from examples/groq/smart_scraper_groq.py
rename to examples/groq/smart_scraper_groq_ollama.py

diff --git a/examples/groq/smart_scraper_groq_openai.py b/examples/groq/smart_scraper_groq_openai.py
new file mode 100644
index 00000000..19f86145
--- /dev/null
+++ b/examples/groq/smart_scraper_groq_openai.py
@@ -0,0 +1,52 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+groq_key = os.getenv("GROQ_APIKEY")
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "groq/gemma-7b-it",
+        "api_key": groq_key,
+        "temperature": 0
+    },
+    "embeddings": {
+        "api_key": openai_key,
+        "model": "gpt-3.5-turbo",
+    },
+    "headless": False
+}
+
+# 
************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mixed_models/search_graph_mixed.py b/examples/mixed_models/search_graph_groq_ollama.py similarity index 100% rename from examples/mixed_models/search_graph_mixed.py rename to examples/mixed_models/search_graph_groq_ollama.py diff --git a/examples/openai/custom_graph_openai.py b/examples/openai/custom_graph_openai.py index 175c51ab..dab82c1f 100644 --- a/examples/openai/custom_graph_openai.py +++ b/examples/openai/custom_graph_openai.py @@ -40,6 +40,7 @@ fetch_node = FetchNode( input="url | local_dir", output=["doc"], + node_config={"headless": True, "verbose": True} ) parse_node = ParseNode( input="doc", diff --git a/examples/openai/scrape_plain_text_openai.py b/examples/openai/scrape_plain_text_openai.py index 845853e1..ffbcf12f 100644 --- a/examples/openai/scrape_plain_text_openai.py +++ b/examples/openai/scrape_plain_text_openai.py @@ -53,11 +53,3 @@ graph_exec_info = smart_scraper_graph.get_execution_info() print(prettify_exec_info(graph_exec_info)) - - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/openai/script_generator_openai.py b/examples/openai/script_generator_openai.py index be597d98..e731f852 100644 --- a/examples/openai/script_generator_openai.py +++ b/examples/openai/script_generator_openai.py @@ -20,7 +20,7 @@ "api_key": openai_key, "model": "gpt-3.5-turbo", }, - "library": "beautifoulsoup" + "library": "beautifulsoup" } # ************************************************ diff --git a/examples/openai/xml_scraper_openai.py b/examples/openai/xml_scraper_openai.py index 32b79981..06600afa 100644 --- a/examples/openai/xml_scraper_openai.py +++ b/examples/openai/xml_scraper_openai.py @@ -23,13 +23,14 @@ # Define the configuration for the graph # ************************************************ -gemini_key = os.getenv("GOOGLE_APIKEY") +openai_key = os.getenv("OPENAI_APIKEY") graph_config = { "llm": { - "api_key": gemini_key, + "api_key": openai_key, "model": "gpt-3.5-turbo", }, + "verbose":False, } # ************************************************ diff --git a/examples/single_node/fetch_node.py b/examples/single_node/fetch_node.py index 90660996..d03cb495 100644 --- a/examples/single_node/fetch_node.py +++ b/examples/single_node/fetch_node.py @@ -12,6 +12,9 @@ robots_node = FetchNode( input="url | local_dir", output=["doc"], + node_config={ + "headless": False + } ) # ************************************************ diff --git a/examples/single_node/robot_node.py b/examples/single_node/robot_node.py index 8aa26446..0e446262 100644 --- a/examples/single_node/robot_node.py +++ b/examples/single_node/robot_node.py @@ -26,7 +26,9 @@ robots_node = RobotsNode( input="url", 
output=["is_scrapable"], - node_config={"llm": llm_model} + node_config={"llm": llm_model, + "headless": False + } ) # ************************************************ diff --git a/pyproject.toml b/pyproject.toml index f5d904c1..bb36dd36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,8 @@ [tool.poetry] name = "scrapegraphai" -version = "0.5.0b8" +version = "0.6.0" + description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ diff --git a/scrapegraphai/builders/graph_builder.py b/scrapegraphai/builders/graph_builder.py index 8001bc3b..7280c50b 100644 --- a/scrapegraphai/builders/graph_builder.py +++ b/scrapegraphai/builders/graph_builder.py @@ -1,5 +1,5 @@ """ -Module for making the graph building +GraphBuilder Module """ from langchain_core.prompts import ChatPromptTemplate diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 25a29ac7..79e50e44 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -1,6 +1,7 @@ """ __init__.py file for graphs folder """ + from .base_graph import BaseGraph from .smart_scraper_graph import SmartScraperGraph from .speech_graph import SpeechGraph diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index e70c9e95..7949c114 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -1,6 +1,7 @@ """ -Module having abstract class for creating all the graphs +AbstractGraph Module """ + from abc import ABC, abstractmethod from typing import Optional from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq @@ -9,13 +10,34 @@ class AbstractGraph(ABC): """ - Abstract class representing a generic graph-based tool. + Scaffolding class for creating a graph representation and executing it. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + + Args: + prompt (str): The prompt for the graph. + config (dict): Configuration parameters for the graph. + source (str, optional): The source of the graph. + + Example: + >>> class MyGraph(AbstractGraph): + ... def _create_graph(self): + ... # Implementation of graph creation here + ... return graph + ... + >>> my_graph = MyGraph("Example Graph", {"llm": {"model": "gpt-3.5-turbo"}}, "example_source") + >>> result = my_graph.run() """ def __init__(self, prompt: str, config: dict, source: Optional[str] = None): - """ - Initializes the AbstractGraph with a prompt, file source, and configuration. - """ + self.prompt = prompt self.source = source self.config = config @@ -32,6 +54,7 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None): self.final_state = None self.execution_info = None + def _set_model_token(self, llm): if 'Azure' in str(type(llm)): @@ -43,8 +66,18 @@ def _set_model_token(self, llm): def _create_llm(self, llm_config: dict, chat=False) -> object: """ - Creates an instance of the language model (OpenAI or Gemini) based on configuration. 
+ Create a large language model instance based on the configuration provided. + + Args: + llm_config (dict): Configuration parameters for the language model. + + Returns: + object: An instance of the language model client. + + Raises: + KeyError: If the model is not supported. """ + llm_defaults = { "temperature": 0, "streaming": False @@ -119,8 +152,15 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: def get_state(self, key=None) -> dict: """"" - Obtain the current state + Get the final state of the graph. + + Args: + key (str, optional): The key of the final state to retrieve. + + Returns: + dict: The final state of the graph. """ + if key is not None: return self.final_state[key] return self.final_state @@ -128,7 +168,11 @@ def get_state(self, key=None) -> dict: def get_execution_info(self): """ Returns the execution information of the graph. + + Returns: + dict: The execution information of the graph. """ + return self.execution_info @abstractmethod diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index 855085ca..5dd4cac4 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -1,6 +1,7 @@ """ -Module for creating the base graphs - """ +BaseGraph Module +""" + import time import warnings from langchain_community.callbacks import get_openai_callback @@ -16,21 +17,33 @@ class BaseGraph: key-value pair corresponds to the from-node and to-node relationship. entry_point (str): The name of the entry point node from which the graph execution begins. - Methods: - execute(initial_state): Executes the graph's nodes starting from the entry point and - traverses the graph based on the provided initial state. - Args: nodes (iterable): An iterable of node instances that will be part of the graph. edges (iterable): An iterable of tuples where each tuple represents a directed edge in the graph, defined by a pair of nodes (from_node, to_node). entry_point (BaseNode): The node instance that represents the entry point of the graph. + + Raises: + Warning: If the entry point node is not the first node in the list. + + Example: + >>> BaseGraph( + ... nodes=[ + ... fetch_node, + ... parse_node, + ... rag_node, + ... generate_answer_node, + ... ], + ... edges=[ + ... (fetch_node, parse_node), + ... (parse_node, rag_node), + ... (rag_node, generate_answer_node) + ... ], + ... entry_point=fetch_node + ... ) """ def __init__(self, nodes: list, edges: list, entry_point: str): - """ - Initializes the graph with nodes, edges, and the entry point. - """ self.nodes = nodes self.edges = self._create_edges({e for e in edges}) @@ -51,6 +64,7 @@ def _create_edges(self, edges: list) -> dict: Returns: dict: A dictionary of edges with the from-node as keys and to-node as values. """ + edge_dict = {} for from_node, to_node in edges: edge_dict[from_node.node_name] = to_node.node_name @@ -66,8 +80,9 @@ def execute(self, initial_state: dict) -> Tuple[dict, list]: initial_state (dict): The initial state to pass to the entry point node. Returns: - dict: The state after execution has completed, which may have been altered by the nodes. + Tuple[dict, list]: A tuple containing the final state and a list of execution info. 
""" + current_node_name = self.nodes[0] state = initial_state diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index 851ba8de..f7392212 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -1,6 +1,7 @@ """ -Module for creating the smart scraper +JSONScraperGraph Module """ + from .base_graph import BaseGraph from ..nodes import ( FetchNode, @@ -13,22 +14,45 @@ class JSONScraperGraph(AbstractGraph): """ - SmartScraper is a comprehensive web scraping tool that automates the process of extracting - information from web pages using a natural language model to interpret and answer prompts. + JSONScraperGraph defines a scraping pipeline for JSON files. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, + configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + + Example: + >>> json_scraper = JSONScraperGraph( + ... "List me all the attractions in Chioggia.", + ... "data/chioggia.json", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = json_scraper.run() """ def __init__(self, prompt: str, source: str, config: dict): - """ - Initializes the JsonScraperGraph with a prompt, source, and configuration. - """ super().__init__(prompt, config, source) self.input_key = "json" if source.endswith("json") else "json_dir" - def _create_graph(self): + def _create_graph(self) -> BaseGraph: """ Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. """ + fetch_node = FetchNode( input="json_dir", output=["doc"], @@ -81,7 +105,11 @@ def _create_graph(self): def run(self) -> str: """ Executes the web scraping process and returns the answer to the prompt. + + Returns: + str: The answer to the prompt. """ + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index 1a64512e..105048db 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -1,6 +1,7 @@ """ -Module for creating the smart scraper +ScriptCreatorGraph Module """ + from .base_graph import BaseGraph from ..nodes import ( FetchNode, @@ -13,47 +14,79 @@ class ScriptCreatorGraph(AbstractGraph): """ - SmartScraper is a comprehensive web scraping tool that automates the process of extracting - information from web pages using a natural language model to interpret and answer prompts. + ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + llm_model: An instance of a language model client, configured for generating answers. 
+ embedder_model: An instance of an embedding model client, + configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + model_token (int): The token limit for the language model. + library (str): The library used for web scraping. + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + + Example: + >>> script_creator = ScriptCreatorGraph( + ... "List me all the attractions in Chioggia.", + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = script_creator.run() """ def __init__(self, prompt: str, source: str, config: dict): - """ - Initializes the ScriptCreatorGraph with a prompt, source, and configuration. - """ + self.library = config['library'] super().__init__(prompt, config, source) self.input_key = "url" if source.startswith("http") else "local_dir" - def _create_graph(self): + def _create_graph(self) -> BaseGraph: """ Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. """ + fetch_node = FetchNode( input="url | local_dir", output=["doc"], node_config={ - "headless": True if self.config is None else self.config.get("headless", True)} + "headless": self.headless, + "verbose": self.verbose + } ) parse_node = ParseNode( input="doc", output=["parsed_doc"], - node_config={"chunk_size": self.model_token} + node_config={"chunk_size": self.model_token, + "verbose": self.verbose + } ) rag_node = RAGNode( input="user_prompt & (parsed_doc | doc)", output=["relevant_chunks"], node_config={ "llm": self.llm_model, - "embedder_model": self.embedder_model + "embedder_model": self.embedder_model, + "verbose": self.verbose } ) generate_scraper_node = GenerateScraperNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], - node_config={"llm": self.llm_model}, + node_config={"llm": self.llm_model, + "verbose": self.verbose}, library=self.library, website=self.source ) @@ -76,7 +109,11 @@ def _create_graph(self): def run(self) -> str: """ Executes the web scraping process and returns the answer to the prompt. + + Returns: + str: The answer to the prompt. """ + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 4cc179bb..41548a77 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -1,6 +1,7 @@ """ -Module for making the search on the intenet +SearchGraph Module """ + from .base_graph import BaseGraph from ..nodes import ( SearchInternetNode, @@ -14,13 +15,37 @@ class SearchGraph(AbstractGraph): """ - Module for searching info on the internet + SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt. + It only requires a user prompt to search the internet and generate an answer. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. 
+        model_token (int): The token limit for the language model.
+
+    Args:
+        prompt (str): The user prompt to search the internet.
+        config (dict): Configuration parameters for the graph.
+
+    Example:
+        >>> search_graph = SearchGraph(
+        ...     "What is Chioggia famous for?",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = search_graph.run()
     """

-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
         """
         Creates the graph of nodes representing the workflow for web scraping and searching.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping and searching workflow.
         """
+
         search_internet_node = SearchInternetNode(
             input="user_prompt",
             output=["url"],
@@ -83,7 +108,11 @@ def run(self) -> str:
         """
         Executes the web scraping and searching process.
+
+        Returns:
+            str: The answer to the prompt.
         """
+
         inputs = {"user_prompt": self.prompt}
         self.final_state, self.execution_info = self.graph.execute(inputs)

diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
index 77fd09ee..4d6b0e93 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -1,6 +1,7 @@
 """
-Module for creating the smart scraper
+SmartScraperGraph Module
 """
+
 from .base_graph import BaseGraph
 from ..nodes import (
     FetchNode,
@@ -13,22 +14,45 @@

 class SmartScraperGraph(AbstractGraph):
     """
-    SmartScraper is a comprehensive web scraping tool that automates the process of extracting
-    information from web pages using a natural language model to interpret and answer prompts.
+    SmartScraper is a scraping pipeline that automates the process of
+    extracting information from web pages
+    using a natural language model to interpret and answer prompts.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+
+    Example:
+        >>> smart_scraper = SmartScraperGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = smart_scraper.run()
     """

     def __init__(self, prompt: str, source: str, config: dict):
-        """
-        Initializes the SmartScraperGraph with a prompt, source, and configuration.
-        """
         super().__init__(prompt, config, source)

         self.input_key = "url" if source.startswith("http") else "local_dir"

-
-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
         """
         Creates the graph of nodes representing the workflow for web scraping.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
         """
         fetch_node = FetchNode(
             input="url | local_dir",
@@ -81,8 +105,12 @@ def run(self) -> str:
         """
-        Executes the web scraping process and returns the answer to the prompt.
+        Executes the scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt.
         """
+
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)

diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py
index 7a2524e9..3edadfd0 100644
--- a/scrapegraphai/graphs/speech_graph.py
+++ b/scrapegraphai/graphs/speech_graph.py
@@ -1,6 +1,7 @@
 """
-Module for converting text to speach
+SpeechGraph Module
 """
+
 from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes
 from ..models import OpenAITextToSpeech
 from .base_graph import BaseGraph
@@ -16,22 +17,43 @@

 class SpeechGraph(AbstractGraph):
     """
-    SpeechSummaryGraph is a tool that automates the process of extracting and summarizing
-    information from web pages, then converting that summary into spoken word via an MP3 file.
+    SpeechGraph is a scraping pipeline that scrapes the web, provides an answer to a given prompt, and generates an audio file.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+        model_token (int): The token limit for the language model.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+
+    Example:
+        >>> speech_graph = SpeechGraph(
+        ...     "List me all the attractions in Chioggia and generate an audio summary.",
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}})
     """

     def __init__(self, prompt: str, source: str, config: dict):
-        """
-        Initializes the SmartScraperGraph with a prompt, source, and configuration.
-        """
         super().__init__(prompt, config, source)

         self.input_key = "url" if source.startswith("http") else "local_dir"

-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
         """
-        Creates the graph of nodes representing the workflow for web scraping and summarization.
+        Creates the graph of nodes representing the workflow for web scraping and audio generation.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping and audio generation workflow.
         """
+
         fetch_node = FetchNode(
             input="url | local_dir",
             output=["doc"],
@@ -93,8 +115,12 @@ def run(self) -> str:
         """
-        Executes the web scraping, summarization, and text-to-speech process.
+        Executes the scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt.
""" + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) @@ -105,4 +131,4 @@ def run(self) -> str: "output_path", "output.mp3")) print(f"Audio saved to {self.config.get('output_path', 'output.mp3')}") - return self.final_state + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index 659de51c..c84e1506 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -1,6 +1,7 @@ """ -Module for creating the smart scraper +XMLScraperGraph Module """ + from .base_graph import BaseGraph from ..nodes import ( FetchNode, @@ -13,22 +14,47 @@ class XMLScraperGraph(AbstractGraph): """ - SmartScraper is a comprehensive web scraping tool that automates the process of extracting - information from web pages using a natural language model to interpret and answer prompts. + XMLScraperGraph is a scraping pipeline that extracts information from XML files using a natural + language model to interpret and answer prompts. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, + configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + + Example: + >>> xml_scraper = XMLScraperGraph( + ... "List me all the attractions in Chioggia.", + ... "data/chioggia.xml", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = xml_scraper.run() """ def __init__(self, prompt: str, source: str, config: dict): - """ - Initializes the XmlScraperGraph with a prompt, source, and configuration. - """ super().__init__(prompt, config, source) self.input_key = "xml" if source.endswith("xml") else "xml_dir" - def _create_graph(self): + def _create_graph(self) -> BaseGraph: """ Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. """ + fetch_node = FetchNode( input="xml_dir", output=["doc"], @@ -81,7 +107,11 @@ def _create_graph(self): def run(self) -> str: """ Executes the web scraping process and returns the answer to the prompt. + + Returns: + str: The answer to the prompt. 
""" + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) diff --git a/scrapegraphai/helpers/__init__.py b/scrapegraphai/helpers/__init__.py index 4565e2d9..23bc0154 100644 --- a/scrapegraphai/helpers/__init__.py +++ b/scrapegraphai/helpers/__init__.py @@ -1,7 +1,7 @@ """ __init__.py for th e helpers folder - """ + from .nodes_metadata import nodes_metadata from .schemas import graph_schema from .models_tokens import models_tokens diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 9c8abdef..73a3999f 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -1,6 +1,7 @@ """ Models token """ + models_tokens = { "openai": { "gpt-3.5-turbo-0125": 16385, diff --git a/scrapegraphai/helpers/robots.py b/scrapegraphai/helpers/robots.py index e89d203d..de49a98c 100644 --- a/scrapegraphai/helpers/robots.py +++ b/scrapegraphai/helpers/robots.py @@ -1,7 +1,7 @@ - """ Module for mapping the models in ai agents """ + robots_dictionary = { "gpt-3.5-turbo": ["GPTBot", "ChatGPT-user"], "gpt-4-turbo": ["GPTBot", "ChatGPT-user"], diff --git a/scrapegraphai/models/azure_openai.py b/scrapegraphai/models/azure_openai.py index 4a7c079b..ae47d4e6 100644 --- a/scrapegraphai/models/azure_openai.py +++ b/scrapegraphai/models/azure_openai.py @@ -1,19 +1,17 @@ """ -Azure Openai configuration wrapper +AzureOpenAI Module """ from langchain_openai import AzureChatOpenAI class AzureOpenAI(AzureChatOpenAI): - """Class for wrapping openai module""" + """ + A wrapper for the AzureChatOpenAI class that provides default configuration + and could be extended with additional methods if needed. + + Args: + llm_config (dict): Configuration parameters for the language model. + """ def __init__(self, llm_config: dict): - """ - A wrapper for the ChatOpenAI class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - # Initialize the superclass (AzureChatOpenAI) with provided config parameters super().__init__(**llm_config) diff --git a/scrapegraphai/models/gemini.py b/scrapegraphai/models/gemini.py index e35fd684..91632907 100644 --- a/scrapegraphai/models/gemini.py +++ b/scrapegraphai/models/gemini.py @@ -1,20 +1,17 @@ """ -Gemini module configuration +Gemini Module """ from langchain_google_genai import ChatGoogleGenerativeAI class Gemini(ChatGoogleGenerativeAI): - """Class for wrapping gemini module""" + """ + A wrapper for the Gemini class that provides default configuration + and could be extended with additional methods if needed. - def __init__(self, llm_config: dict): - """ - A wrapper for the Gemini class that provides default configuration - and could be extended with additional methods if needed. + Args: + llm_config (dict): Configuration parameters for the language model (e.g., model="gemini-pro") + """ - Args: - llm_config (dict): Configuration parameters for the language model. 
- such as model="gemini-pro" and api_key - """ - # Initialize the superclass (ChatOpenAI) with provided config parameters + def __init__(self, llm_config: dict): super().__init__(**llm_config) diff --git a/scrapegraphai/models/groq.py b/scrapegraphai/models/groq.py index cf2f4755..92d8f8bb 100644 --- a/scrapegraphai/models/groq.py +++ b/scrapegraphai/models/groq.py @@ -1,21 +1,18 @@ """ -Groq module configuration +Groq Module """ from langchain_groq import ChatGroq class Groq(ChatGroq): - """Class for wrapping Groq module""" + """ + A wrapper for the Groq class that provides default configuration + and could be extended with additional methods if needed. - def __init__(self, llm_config: dict): - """ - A wrapper for the Groq class that provides default configuration - and could be extended with additional methods if needed. + Args: + llm_config (dict): Configuration parameters for the language model (e.g., model="llama3-70b-8192") + """ - Args: - llm_config (dict): Configuration parameters for the language model. - such as model="llama3-70b-8192" and api_key - """ - # Initialize the superclass (ChatOpenAI) with provided config parameters + def __init__(self, llm_config: dict): super().__init__(**llm_config) \ No newline at end of file diff --git a/scrapegraphai/models/hugging_face.py b/scrapegraphai/models/hugging_face.py index d2df52d3..9696db1e 100644 --- a/scrapegraphai/models/hugging_face.py +++ b/scrapegraphai/models/hugging_face.py @@ -1,22 +1,17 @@ """ -Module for implementing the hugginface class +HuggingFace Module """ from langchain_community.chat_models.huggingface import ChatHuggingFace class HuggingFace(ChatHuggingFace): - """Provides a convenient wrapper for interacting with Hugging Face language models - designed for conversational AI applications. + """ + A wrapper for the HuggingFace class that provides default configuration + and could be extended with additional methods if needed. Args: - llm_config (dict): A configuration dictionary containing: - * api_key (str, optional): Your Hugging Face API key. - * model_name (str): The name of the Hugging Face LLM to load. - * tokenizer_name (str, optional): Name of the corresponding tokenizer. - * device (str, optional): Device for running the model ('cpu' by default). - + llm_config (dict): Configuration parameters for the language model. """ def __init__(self, llm_config: dict): - """Initializes the HuggingFace chat model wrapper""" super().__init__(**llm_config) diff --git a/scrapegraphai/models/ollama.py b/scrapegraphai/models/ollama.py index 9636a257..4bf48178 100644 --- a/scrapegraphai/models/ollama.py +++ b/scrapegraphai/models/ollama.py @@ -1,19 +1,17 @@ """ -openai configuration wrapper +Ollama Module """ from langchain_community.chat_models import ChatOllama class Ollama(ChatOllama): - """Class for wrapping ollama module""" + """ + A wrapper for the ChatOllama class that provides default configuration + and could be extended with additional methods if needed. - def __init__(self, llm_config: dict): - """ - A wrapper for the ChatOllama class that provides default configuration - and could be extended with additional methods if needed. + Args: + llm_config (dict): Configuration parameters for the language model. + """ - Args: - llm_config (dict): Configuration parameters for the language model. 
-        """
-        # Initialize the superclass (ChatOllama) with provided config parameters
+    def __init__(self, llm_config: dict):
         super().__init__(**llm_config)

diff --git a/scrapegraphai/models/openai.py b/scrapegraphai/models/openai.py
index 7c76a4b1..bfd9d74c 100644
--- a/scrapegraphai/models/openai.py
+++ b/scrapegraphai/models/openai.py
@@ -1,19 +1,17 @@
 """
-openai configuration wrapper
+OpenAI Module
 """
 from langchain_openai import ChatOpenAI


 class OpenAI(ChatOpenAI):
-    """Class for wrapping openai module"""
+    """
+    A wrapper for the ChatOpenAI class that provides default configuration
+    and could be extended with additional methods if needed.

-    def __init__(self, llm_config: dict):
-        """
-        A wrapper for the ChatOpenAI class that provides default configuration
-        and could be extended with additional methods if needed.
+    Args:
+        llm_config (dict): Configuration parameters for the language model.
+    """

-        Args:
-            llm_config (dict): Configuration parameters for the language model.
-        """
-        # Initialize the superclass (ChatOpenAI) with provided config parameters
+    def __init__(self, llm_config: dict):
         super().__init__(**llm_config)

diff --git a/scrapegraphai/models/openai_itt.py b/scrapegraphai/models/openai_itt.py
index 0ab8f4ef..5bbdf8ad 100644
--- a/scrapegraphai/models/openai_itt.py
+++ b/scrapegraphai/models/openai_itt.py
@@ -1,6 +1,5 @@
 """
-This module contains the OpenAIImageToText class,
-which is a subclass of ChatOpenAI that is specialized for converting images to text.
+OpenAIImageToText Module
 """

 from langchain_openai import ChatOpenAI
@@ -9,39 +8,27 @@

 class OpenAIImageToText(ChatOpenAI):
     """
-    A class that uses OpenAI's Chat API to convert an image to text.
+    A wrapper for the ChatOpenAI class, specialized for converting images to text,
+    that provides default configuration and could be extended with additional methods if needed.

     Args:
-        llm_config (dict): The configuration for the language model.
-
-    Attributes:
-        max_tokens (int): The maximum number of tokens to generate in the response.
-
-    Methods:
-        run(image_url): Runs the image-to-text conversion using the provided image URL.
+        llm_config (dict): Configuration parameters for the language model.
+        max_tokens (int): The maximum number of tokens to generate.
     """

     def __init__(self, llm_config: dict):
-        """
-        Initializes an instance of the OpenAIImageToText class.
-
-        Args:
-            llm_config (dict): The configuration for the language model.
-
-        """
         super().__init__(**llm_config, max_tokens=256)

-    def run(self, image_url: str):
+    def run(self, image_url: str) -> str:
         """
         Runs the image-to-text conversion using the provided image URL.

         Args:
-            image_url (str): The URL of the image to convert to text.
+            image_url (str): The URL of the image to convert.

         Returns:
-            str: The generated text description of the image.
-
+            str: The text description of the image.
         """
         message = HumanMessage(
             content=[

diff --git a/scrapegraphai/models/openai_tts.py b/scrapegraphai/models/openai_tts.py
index f2227f8c..a4432398 100644
--- a/scrapegraphai/models/openai_tts.py
+++ b/scrapegraphai/models/openai_tts.py
@@ -1,6 +1,5 @@
 """
-This module contains the OpenAITextToSpeech class, which uses OpenAI's API
-to convert text into speech.
+OpenAITextToSpeech Module
 """

 from openai import OpenAI
@@ -8,44 +7,33 @@

 class OpenAITextToSpeech:
     """
-    A class that uses OpenAI's API to convert text to speech.
-
-    Args:
-        llm_config (dict): The configuration for the language model.
+    Implements a text-to-speech model using the OpenAI API.

     Attributes:
+        client (OpenAI): The OpenAI client used to interact with the API.
         model (str): The model to use for text-to-speech conversion.
         voice (str): The voice model to use for generating speech.

-    Methods:
-        run(text): Converts the provided text to speech and returns the
-        bytes of the generated speech.
+    Args:
+        tts_config (dict): Configuration parameters for the text-to-speech model.

     """

     def __init__(self, tts_config: dict):
-        """
-        Initializes an instance of the OpenAITextToSpeech class.
-
-        Args:
-            llm_config (dict): The configuration for the language model.
-            model (str, optional): The model to use for text-to-speech conversion.
-                Defaults to "tts-1".
-            voice (str, optional): The voice model to use for generating speech.
-                Defaults to "alloy".
-        """
         # convert model_name to model
         self.client = OpenAI(api_key=tts_config.get("api_key"))
         self.model = tts_config.get("model", "tts-1")
         self.voice = tts_config.get("voice", "alloy")

-    def run(self, text):
+    def run(self, text: str) -> bytes:
         """
         Converts the provided text to speech and returns the bytes of the generated speech.

         Args:
             text (str): The text to convert to speech.

+        Returns:
+            bytes: The bytes of the generated speech audio.
         """
         response = self.client.audio.speech.create(
             model=self.model,

diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py
index f00fd869..f3329320 100644
--- a/scrapegraphai/nodes/base_node.py
+++ b/scrapegraphai/nodes/base_node.py
@@ -1,6 +1,7 @@
 """
-Module for creating the basic node
+BaseNode Module
 """
+
 from abc import ABC, abstractmethod
 from typing import Optional, List
 import re
@@ -8,50 +9,40 @@

 class BaseNode(ABC):
     """
-    An abstract base class for nodes in a graph-based workflow. Each node is
-    intended to perform a specific action when executed as part of the graph's
-    processing flow.
+    An abstract base class for nodes in a graph-based workflow, designed to perform specific actions when executed.

     Attributes:
-        node_name (str): A unique identifier for the node.
-        node_type (str): Specifies the node's type, which influences how the
-                         node interacts within the graph. Valid values are
-                         "node" for standard nodes and "conditional_node" for
-                         nodes that determine the flow based on conditions.
-
-    Methods:
-        execute(state): An abstract method that subclasses must implement. This
-                        method should contain the logic that the node executes
-                        when it is reached in the graph's flow. It takes the
-                        graph's current state as input and returns the updated
-                        state after execution.
-
+        node_name (str): The unique identifier name for the node.
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        min_input_len (int): Minimum required number of input keys.
+        node_config (Optional[dict]): Additional configuration for the node.
+
     Args:
-        node_name (str): The unique identifier name for the node. This name is
-                         used to reference the node within the graph.
-        node_type (str): The type of the node, limited to "node" or
-                         "conditional_node". This categorization helps in
-                         determining the node's role and behavior within the
-                         graph.
+        node_name (str): Name for identifying the node.
+        node_type (str): Type of the node; must be 'node' or 'conditional_node'.
+        input (str): Expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        min_input_len (int, optional): Minimum required number of input keys; defaults to 1.
+        node_config (Optional[dict], optional): Additional configuration for the node; defaults to None.

     Raises:
-        ValueError: If the provided `node_type` is not one of the allowed
-                    values ("node" or "conditional_node"), a ValueError is
-                    raised to indicate the incorrect usage.
+        ValueError: If `node_type` is not one of the allowed types.
+
+    Example:
+        >>> class MyNode(BaseNode):
+        ...     def execute(self, state):
+        ...         # Implementation of node logic here
+        ...         return state
+        ...
+        >>> my_node = MyNode("ExampleNode", "node", "input_spec", ["output_spec"])
+        >>> updated_state = my_node.execute({'key': 'value'})
+        {'key': 'value'}
     """

     def __init__(self, node_name: str, node_type: str, input: str, output: List[str],
                  min_input_len: int = 1, node_config: Optional[dict] = None):
-        """
-        Initialize the node with a unique identifier and a specified node type.
-
-        Args:
-            node_name (str): The unique identifier name for the node.
-            node_type (str): The type of the node, limited to "node" or "conditional_node".
-        Raises:
-            ValueError: If node_type is not "node" or "conditional_node".
-        """
         self.node_name = node_name
         self.input = input
         self.output = output
@@ -66,17 +57,31 @@

     @abstractmethod
     def execute(self, state: dict) -> dict:
         """
-        Execute the node's logic and return the updated state.
+        Execute the node's logic based on the current state and update it accordingly.
+
         Args:
             state (dict): The current state of the graph.
-        :return: The updated state after executing this node.
+
+        Returns:
+            dict: The updated state after executing the node's logic.
         """
+
         pass

     def get_input_keys(self, state: dict) -> List[str]:
-        """Use the _parse_input_keys method to identify which state keys are
-        needed based on the input attribute
         """
+        Determines the necessary state keys based on the input specification.
+
+        Args:
+            state (dict): The current state of the graph used to parse input keys.
+
+        Returns:
+            List[str]: A list of input keys required for node operation.
+
+        Raises:
+            ValueError: If an error occurs while parsing the input keys.
+        """
+
         try:
             input_keys = self._parse_input_keys(state, self.input)
             self._validate_input_keys(input_keys)
@@ -86,6 +91,16 @@

             f"Error parsing input keys for {self.node_name}: {str(e)}")

     def _validate_input_keys(self, input_keys):
+        """
+        Validates if the provided input keys meet the minimum length requirement.
+
+        Args:
+            input_keys (List[str]): The list of input keys to validate.
+
+        Raises:
+            ValueError: If the number of input keys is less than the minimum required.
+        """
+
         if len(input_keys) < self.min_input_len:
             raise ValueError(
                 f"""{self.node_name} requires at least {self.min_input_len} input keys,
@@ -93,8 +108,8 @@

     def _parse_input_keys(self, state: dict, expression: str) -> List[str]:
         """
-        Parses the input keys expression and identifies the corresponding keys
-        from the state that match the expression logic.
+        Parses the input keys expression to extract relevant keys from the state based on logical conditions.
+        The expression can contain AND (&), OR (|), and parentheses to group conditions.

         Args:
             state (dict): The current state of the graph.
@@ -102,7 +117,11 @@

         Returns:
             List[str]: A list of key names that match the input keys expression logic.
+
+        Raises:
+            ValueError: If the expression is invalid or if no state keys match the expression.
""" + # Check for empty expression if not expression: raise ValueError("Empty expression.") @@ -142,9 +161,12 @@ def _parse_input_keys(self, state: dict, expression: str) -> List[str]: "Missing or unbalanced parentheses in expression.") # Helper function to evaluate an expression without parentheses - def evaluate_simple_expression(exp): + def evaluate_simple_expression(exp: str) -> List[str]: + """Evaluate an expression without parentheses.""" + # Split the expression by the OR operator and process each segment for or_segment in exp.split('|'): + # Check if all elements in an AND segment are in state and_segment = or_segment.split('&') if all(elem.strip() in state for elem in and_segment): @@ -152,13 +174,17 @@ def evaluate_simple_expression(exp): return [] # Helper function to evaluate expressions with parentheses - def evaluate_expression(expression): + def evaluate_expression(expression: str) -> List[str]: + """Evaluate an expression with parentheses.""" + while '(' in expression: start = expression.rfind('(') end = expression.find(')', start) sub_exp = expression[start + 1:end] + # Replace the evaluated part with a placeholder and then evaluate it sub_result = evaluate_simple_expression(sub_exp) + # For simplicity in handling, join sub-results with OR to reprocess them later expression = expression[:start] + \ '|'.join(sub_result) + expression[end+1:] diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 7a34536e..f873654d 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -1,5 +1,5 @@ """ -Module for fetching the HTML node +FetchNode Module """ from typing import List, Optional @@ -12,38 +12,24 @@ class FetchNode(BaseNode): """ A node responsible for fetching the HTML content of a specified URL and updating - the graph's state with this content. It uses the AsyncHtmlLoader for asynchronous - document loading. + the graph's state with this content. It uses the AsyncChromiumLoader to fetch the + content asynchronously. This node acts as a starting point in many scraping workflows, preparing the state with the necessary HTML content for further processing by subsequent nodes in the graph. Attributes: - node_name (str): The unique identifier name for the node. - node_type (str): The type of the node, defaulting to "node". This categorization - helps in determining the node's role and behavior within the graph. - The "node" type is used for standard operational nodes. - + headless (bool): A flag indicating whether the browser should run in headless mode. + verbose (bool): A flag indicating whether to print verbose output during execution. + Args: - node_name (str): The unique identifier name for the node. This name is used to - reference the node within the graph. - node_type (str, optional): The type of the node, limited to "node" or - "conditional_node". Defaults to "node". - - Methods: - execute(state): Fetches the HTML content for the URL specified in the state and - updates the state with this content under the 'document' key. - The 'url' key must be present in the state for the operation - to succeed. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (Optional[dict]): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Fetch". 
""" def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"): - """ - Initializes the FetchHTMLNode with a node name and node type. - Arguments: - node_name (str): name of the node - prox_rotation (bool): if you wamt to rotate proxies - """ super().__init__(node_name, "node", input, output, 1) self.headless = True if node_config is None else node_config.get("headless", True) @@ -55,13 +41,14 @@ def execute(self, state): update the state with this content. Args: - state (dict): The current state of the graph, expected to contain a 'url' key. + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data types from the state. Returns: - dict: The updated state with a new 'document' key containing the fetched HTML content. + dict: The updated state with a new output key containing the fetched HTML content. Raises: - KeyError: If the 'url' key is not found in the state, indicating that the + KeyError: If the input key is not found in the state, indicating that the necessary information to perform the operation is missing. """ if self.verbose: diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index e4047356..e9b4dd40 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,6 +1,7 @@ """ -Module for generating the answer node +GenerateAnswerNode Module """ + # Imports from standard library from typing import List from tqdm import tqdm @@ -16,57 +17,43 @@ class GenerateAnswerNode(BaseNode): """ - A node that generates an answer using a language model (LLM) based on the user's input + A node that generates an answer using a large language model (LLM) based on the user's input and the content extracted from a webpage. It constructs a prompt from the user's input and the scraped content, feeds it to the LLM, and parses the LLM's response to produce an answer. Attributes: - llm: An instance of a language model client, configured for generating answers. - node_name (str): The unique identifier name for the node, defaulting - to "GenerateAnswerNode". - node_type (str): The type of the node, set to "node" indicating a - standard operational node. + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - llm: An instance of the language model client (e.g., ChatOpenAI) used - for generating answers. - node_name (str, optional): The unique identifier name for the node. - Defaults to "GenerateAnswerNode". - - Methods: - execute(state): Processes the input and document from the state to generate an answer, - updating the state with the generated answer under the 'answer' key. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "GenerateAnswer"): - """ - Initializes the GenerateAnswerNode with a language model client and a node name. - Args: - llm: An instance of the OpenAIImageToText class. 
- node_name (str): name of the node - """ super().__init__(node_name, "node", input, output, 2, node_config) + self.llm_model = node_config["llm"] self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ Generates an answer by constructing a prompt from the user's input and the scraped content, querying the language model, and parsing its response. - The method updates the state with the generated answer under the 'answer' key. - Args: - state (dict): The current state of the graph, expected to contain 'user_input', - and optionally 'parsed_document' or 'relevant_chunks' within 'keys'. + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. Returns: - dict: The updated state with the 'answer' key containing the generated answer. + dict: The updated state with the output key containing the generated answer. Raises: - KeyError: If 'user_input' or 'document' is not found in the state, indicating + KeyError: If the input keys are not found in the state, indicating that the necessary information for generating an answer is missing. """ diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index d60ff6db..9c80fc19 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -1,6 +1,7 @@ """ -Module for generating the answer node +GenerateScraperNode Module """ + # Imports from standard library from typing import List from tqdm import tqdm @@ -16,58 +17,46 @@ class GenerateScraperNode(BaseNode): """ - A node that generates an answer using a language model (LLM) based on the user's input - and the content extracted from a webpage. It constructs a prompt from the user's input - and the scraped content, feeds it to the LLM, and parses the LLM's response to produce - an answer. + Generates a Python script for scraping a website using the specified library. + It takes the user's prompt and the scraped content as input and generates a Python script + that extracts the information requested by the user. Attributes: - llm: An instance of a language model client, configured for generating answers. - node_name (str): The unique identifier name for the node, defaulting - to "GenerateScraperNode". - node_type (str): The type of the node, set to "node" indicating a - standard operational node. + llm_model: An instance of a language model client, configured for generating answers. + library (str): The Python library to use for scraping the website. + source (str): The website to scrape. Args: - llm: An instance of the language model client (e.g., ChatOpenAI) used - for generating answers. - node_name (str, optional): The unique identifier name for the node. - Defaults to "GenerateScraperNode". - - Methods: - execute(state): Processes the input and document from the state to generate an answer, - updating the state with the generated answer under the 'answer' key. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + library (str): The Python library to use for scraping the website. + website (str): The website to scrape. + node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer".
+ """ def __init__(self, input: str, output: List[str], node_config: dict, library: str, website: str, node_name: str = "GenerateAnswer"): - """ - Initializes the GenerateScraperNode with a language model client and a node name. - Args: - llm (OpenAIImageToText): An instance of the OpenAIImageToText class. - node_name (str): name of the node - """ super().__init__(node_name, "node", input, output, 2, node_config) + self.llm_model = node_config["llm"] self.library = library self.source = website - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Generates an answer by constructing a prompt from the user's input and the scraped - content, querying the language model, and parsing its response. - - The method updates the state with the generated answer under the 'answer' key. + Generates a Python script for scraping a website using the specified library. Args: - state (dict): The current state of the graph, expected to contain 'user_input', - and optionally 'parsed_document' or 'relevant_chunks' within 'keys'. + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. Returns: - dict: The updated state with the 'answer' key containing the generated answer. + dict: The updated state with the output key containing the generated answer. Raises: - KeyError: If 'user_input' or 'document' is not found in the state, indicating + KeyError: If input keys are not found in the state, indicating that the necessary information for generating an answer is missing. """ diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index 20688143..11977c62 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -1,6 +1,7 @@ """ -Module for proobable tags +GetProbableTagsNode Module """ + from typing import List from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate @@ -15,47 +16,36 @@ class GetProbableTagsNode(BaseNode): """ list of probable tags. Attributes: - llm: An instance of a language model client, configured for generating tag predictions. - node_name (str): The unique identifier name for the node, - defaulting to "GetProbableTagsNode". - node_type (str): The type of the node, set to "node" indicating a standard operational node. + llm_model: An instance of the language model client used for tag predictions. Args: - llm: An instance of the language model client (e.g., ChatOpenAI) used for tag predictions. - node_name (str, optional): The unique identifier name for the node. - Defaults to "GetProbableTagsNode". - - Methods: - execute(state): Processes the user's input and the URL from the state to generate a list of - probable HTML tags, updating the state with these tags under the 'tags' key. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + model_config (dict): Additional configuration for the language model. + node_name (str): The unique identifier name for the node, defaulting to "GetProbableTags". """ def __init__(self, input: str, output: List[str], model_config: dict, node_name: str = "GetProbableTags"): - """ - Initializes the GetProbableTagsNode with a language model client and a node name. - Args: - llm (OpenAIImageToText): An instance of the OpenAIImageToText class.
- node_name (str): name of the node - """ super().__init__(node_name, "node", input, output, 2, model_config) + self.llm_model = model_config["llm_model"] - def execute(self, state): + def execute(self, state: dict) -> dict: """ Generates a list of probable HTML tags based on the user's input and updates the state with this list. The method constructs a prompt for the language model, submits it, and parses the output to identify probable tags. Args: - state (dict): The current state of the graph, expected to contain 'user_input', 'url', - and optionally 'document' within 'keys'. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data types from the state. Returns: - dict: The updated state with the 'tags' key containing a list of probable HTML tags. + dict: The updated state with the output key containing a list of probable HTML tags. Raises: - KeyError: If 'user_input' or 'url' is not found in the state, indicating that the + KeyError: If input keys are not found in the state, indicating that the necessary information for generating tag predictions is missing. """ diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py index fff877df..d9d4f1cc 100644 --- a/scrapegraphai/nodes/image_to_text_node.py +++ b/scrapegraphai/nodes/image_to_text_node.py @@ -1,45 +1,44 @@ """ -Module for the ImageToTextNode class. +ImageToTextNode Module """ + from typing import List from .base_node import BaseNode class ImageToTextNode(BaseNode): """ - A class representing a node that processes an image and returns the text description. + Retrieves an image from a URL and converts it to text using an image-to-text model. Attributes: - llm_model (OpenAIImageToText): An instance of the OpenAIImageToText class. - - Methods: - execute(state, url): Execute the node's logic and return the updated state. + llm_model: An instance of the language model client used for image-to-text conversion. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "ImageToText". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "ImageToText"): - """ - Initializes an instance of the ImageToTextNode class. - - Args: - input (str): The input for the node. - output (List[str]): The output of the node. - node_config (dict): Configuration for the model. - node_name (str): Name of the node. - """ super().__init__(node_name, "node", input, output, 1, node_config) + self.llm_model = node_config["llm_model"] self.verbose = True if node_config is None else node_config.get("verbose", False) def execute(self, state: dict) -> dict: """ - Execute the node's logic and return the updated state. + Generate text from an image using an image-to-text model. The method retrieves the image + from the URL provided in the state. Args: - state (dict): The current state of the graph. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data types from the state. Returns: - dict: The updated state after executing this node. + dict: The updated state with the output key containing the text extracted from the image.
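GetProbableTagsNode feeds the model's reply through LangChain's CommaSeparatedListOutputParser (imported above). A standalone sketch of just that parsing step, with a hypothetical tag string:

from langchain.output_parsers import CommaSeparatedListOutputParser

parser = CommaSeparatedListOutputParser()
# The LLM is asked to reply with a comma-separated list of tags
print(parser.parse("h1, h2, p, span"))  # -> ['h1', 'h2', 'p', 'span']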
""" if self.verbose: diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 789ce057..b552ece4 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -1,6 +1,7 @@ """ -Module for parsing the HTML node +ParseNode Module """ + from typing import List from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.document_transformers import Html2TextTransformer @@ -10,56 +11,40 @@ class ParseNode(BaseNode): """ A node responsible for parsing HTML content from a document. - It uses BeautifulSoupTransformer for parsing, providing flexibility in extracting - specific parts of an HTML document. + The parsed content is split into chunks for further processing. This node enhances the scraping workflow by allowing for targeted extraction of content, thereby optimizing the processing of large HTML documents. Attributes: - node_name (str): The unique identifier name for the node, defaulting to "ParseHTMLNode". - node_type (str): The type of the node, set to "node" indicating a standard operational node. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - node_name (str, optional): The unique identifier name for the node. - Defaults to "ParseHTMLNode". - - Methods: - execute(state): Parses the HTML document contained within the state using - the specified tags, if provided, and updates the state with the parsed content. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "Parse"): - """ - Initializes the ParseHTMLNode with a node name. - Args: - doc_type (str): type of the input document - chunks_size (int): size of the chunks to split the document - node_name (str): name of the node - node_type (str, optional): type of the node - """ super().__init__(node_name, "node", input, output, 1, node_config) self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Executes the node's logic to parse the HTML document based on specified tags. - If tags are provided in the state, the document is parsed accordingly; otherwise, - the document remains unchanged. The method updates the state with either the original - or parsed document under the 'parsed_document' key. + Executes the node's logic to parse the HTML document content and split it into chunks. Args: - state (dict): The current state of the graph, expected to contain - 'document' within 'keys', and optionally 'tags' for targeted parsing. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data from the state. Returns: - dict: The updated state with the 'parsed_document' key containing the parsed content, - if tags were provided, or the original document otherwise. + dict: The updated state with the output key containing the parsed content chunks. Raises: - KeyError: If 'document' is not found in the state, indicating that the necessary - information for parsing is missing. + KeyError: If the input keys are not found in the state, indicating that the + necessary information for parsing the content is missing. 
""" if self.verbose: diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 3401ff23..d3842742 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -1,5 +1,5 @@ """ -Module for parsing the HTML node +RAGNode Module """ from typing import List @@ -18,46 +18,44 @@ class RAGNode(BaseNode): """ A node responsible for compressing the input tokens and storing the document - in a vector database for retrieval. + in a vector database for retrieval. Relevant chunks are stored in the state. It allows scraping of big documents without exceeding the token limit of the language model. Attributes: - node_name (str): The unique identifier name for the node, defaulting to "ParseHTMLNode". - node_type (str): The type of the node, set to "node" indicating a standard operational node. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - node_name (str, optional): The unique identifier name for the node. - Defaults to "ParseHTMLNode". - - Methods: - execute(state): Parses the HTML document contained within the state using - the specified tags, if provided, and updates the state with the parsed content. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "RAG"): - """ - Initializes the ParseHTMLNode with a node name. - """ super().__init__(node_name, "node", input, output, 2, node_config) + self.llm_model = node_config["llm"] self.embedder_model = node_config.get("embedder_model", None) self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Executes the node's logic to implement RAG (Retrieval-Augmented Generation) + Executes the node's logic to implement RAG (Retrieval-Augmented Generation). The method updates the state with relevant chunks of the document. Args: - state (dict): The state containing the 'document' key with the HTML content + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data from the state. Returns: - dict: The updated state containing the 'relevant_chunks' key with the relevant chunks. + dict: The updated state with the output key containing the relevant chunks of the document. Raises: - KeyError: If 'document' is not found in the state, indicating that the necessary - information for parsing is missing. + KeyError: If the input keys are not found in the state, indicating that the + necessary information for compressing the content is missing. 
""" if self.verbose: diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 3df9603d..001de62d 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -1,6 +1,7 @@ """ -Module for checking if a website is scrapepable or not +RobotsNode Module """ + from typing import List from urllib.parse import urlparse from langchain_community.document_loaders import AsyncHtmlLoader @@ -12,75 +13,53 @@ class RobotsNode(BaseNode): """ - A node responsible for checking if a website is scrapepable or not. - It uses the AsyncHtmlLoader for asynchronous - document loading. + A node responsible for checking if a website is scrapeable or not based on the robots.txt file. + It uses a language model to determine if the website allows scraping of the provided path. This node acts as a starting point in many scraping workflows, preparing the state with the necessary HTML content for further processing by subsequent nodes in the graph. Attributes: - This node acts as a starting point in many scraping workflows, preparing the state - with the necessary HTML content for further processing by subsequent nodes in the graph. - - Attributes: - node_name (str): The unique identifier name for the node. - node_type (str): The type of the node, defaulting to "node". This categorization - helps in determining the node's role and behavior within the graph. - The "node" type is used for standard operational nodes. + llm_model: An instance of the language model client used for checking scrapeability. + force_scraping (bool): A flag indicating whether scraping should be enforced even + if disallowed by robots.txt. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - node_name (str): The unique identifier name for the node. This name is used to - reference the node within the graph. - node_type (str, optional): The type of the node, limited to "node" or - "conditional_node". Defaults to "node". - node_config (dict): Configuration parameters for the node. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. force_scraping (bool): A flag indicating whether scraping should be enforced even - if disallowed by robots.txt. Defaults to True. - input (str): Input expression defining how to interpret the incoming data. - output (List[str]): List of output keys where the results will be stored. - - Methods: - execute(state): Fetches the HTML content for the URL specified in the state and - updates the state with this content under the 'document' key. - The 'url' key must be present in the state for the operation - to succeed. + if disallowed by robots.txt. Defaults to True. + node_name (str): The unique identifier name for the node, defaulting to "Robots". """ def __init__(self, input: str, output: List[str], node_config: dict, force_scraping=True, node_name: str = "Robots"): - """ - Initializes the RobotsNode with a node name, input/output expressions - and node configuration. - - Arguments: - input (str): Input expression defining how to interpret the incoming data. - output (List[str]): List of output keys where the results will be stored. - node_config (dict): Configuration parameters for the node. - force_scraping (bool): A flag indicating whether scraping should be enforced even - if disallowed by robots.txt. Defaults to True. 
- node_name (str, optional): The unique identifier name for the node. - Defaults to "Robots". - """ super().__init__(node_name, "node", input, output, 1) + self.llm_model = node_config["llm"] self.force_scraping = force_scraping self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Executes the node's logic to fetch HTML content from a specified URL and - update the state with this content. + Checks if a website is scrapeable based on the robots.txt file and updates the state + with the scrapeability status. The method constructs a prompt for the language model, + submits it, and parses the output to determine if scraping is allowed. Args: - state (dict): The current state of the graph, expected to contain a 'url' key. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data from the state. Returns: - dict: The updated state with a new 'document' key containing the fetched HTML content. + dict: The updated state with the output key containing the scrapeability status. Raises: - KeyError: If the 'url' key is not found in the state, indicating that the - necessary information to perform the operation is missing. + KeyError: If the input keys are not found in the state, indicating that the + necessary information for checking scrapeability is missing. + KeyError: If the large language model is not found in the robots_dictionary. + ValueError: If the website is not scrapeable based on the robots.txt file and + scraping is not enforced. """ if self.verbose: diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 91dfa427..00cf9211 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -1,6 +1,7 @@ """ -Module for generating the answer node +SearchInternetNode Module """ + from typing import List from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate @@ -10,63 +11,46 @@ class SearchInternetNode(BaseNode): """ - A node that generates an answer by querying a language model (LLM) based on the user's input - and the content extracted from a webpage. It constructs a prompt from the user's input - and the scraped content, feeds it to the LLM, and parses the LLM's response to produce - an answer. + A node that generates a search query based on the user's input and searches the internet + for relevant information. The node constructs a prompt for the language model, submits it, + and processes the output to generate a search query. It then uses the search query to find + relevant information on the internet and updates the state with the generated answer. Attributes: - node_name (str): The unique identifier name for the node. - node_type (str): The type of the node, set to "node" indicating a standard operational node. - input (str): The user input used to construct the prompt. - output (List[str]): The keys in the state dictionary - where the generated answer will be stored. - model_config (dict): Configuration parameters for the language model client. + llm_model: An instance of the language model client used for generating search queries. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - input (str): The user input used to construct the prompt. - output (List[str]): The keys in the state dictionary where the - generated answer will be stored.
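RobotsNode delegates the robots.txt judgment to a language model; for comparison, the classical check it generalizes can be written with Python's standard library alone (URL hypothetical):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser("https://example.com/robots.txt")
rp.read()  # fetch and parse robots.txt
print(rp.can_fetch("*", "https://example.com/some/page"))  # True if the path may be fetched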
- model_config (dict): Configuration parameters for the language model client. - node_name (str, optional): The unique identifier name for the node. - - Methods: - execute(state): Processes the input and document from the state to generate an answer, - updating the state with the generated answer under the 'answer' key. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "SearchInternet". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "SearchInternet"): - """ - Initializes the SearchInternetNode with input, output, model configuration, and a node name. - Args: - input (str): The user input used to construct the prompt. - output (List[str]): The keys in the state dictionary where the - generated answer will be stored. - model_config (dict): Configuration parameters for the language model client. - node_name (str): The unique identifier name for the node. - """ super().__init__(node_name, "node", input, output, 1, node_config) + self.llm_model = node_config["llm"] self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ Generates an answer by constructing a prompt from the user's input and the scraped content, querying the language model, and parsing its response. - The method updates the state with the generated answer under the 'answer' key. + The method updates the state with the generated answer. Args: - state (dict): The current state of the graph, expected to contain 'user_input', - and optionally 'parsed_document' or 'relevant_chunks' within 'keys'. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data types from the state. Returns: - dict: The updated state with the 'answer' key containing the generated answer. + dict: The updated state with the output key containing the generated answer. Raises: - KeyError: If 'user_input' or 'document' is not found in the state, indicating - that the necessary information for generating an answer is missing. + KeyError: If the input keys are not found in the state, indicating that the + necessary information for generating the answer is missing. """ if self.verbose: diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 5d7cfca9..7f766b5b 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -1,6 +1,7 @@ """ -Module for generating the answer node +SearchLinkNode Module """ + # Imports from standard library from typing import List from tqdm import tqdm @@ -18,58 +19,42 @@ class SearchLinkNode(BaseNode): """ - A node that generates an answer using a language model (LLM) based on the user's input - and the content extracted from a webpage. It constructs a prompt from the user's input - and the scraped content, feeds it to the LLM, and parses the LLM's response to produce - an answer. + A node that looks for all the links in a web page and returns them. + It first tries to extract the links using classical methods; if that fails, it uses the LLM to extract them. Attributes: - llm: An instance of a language model client, configured for generating answers.
- node_name (str): The unique identifier name for the node, defaulting - to "GenerateAnswerNode". - node_type (str): The type of the node, set to "node" indicating a - standard operational node. + llm_model: An instance of the language model client used for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - llm: An instance of the language model client (e.g., ChatOpenAI) used - for generating answers. - node_name (str, optional): The unique identifier name for the node. - Defaults to "GenerateAnswerNode". - - Methods: - execute(state): Processes the input and document from the state to generate an answer, - updating the state with the generated answer under the 'answer' key. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "GenerateLinks". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "GenerateLinks"): - """ - Initializes the GenerateAnswerNode with a language model client and a node name. - Args: - llm: An instance of the OpenAIImageToText class. - node_name (str): name of the node - """ super().__init__(node_name, "node", input, output, 1, node_config) + self.llm_model = node_config["llm"] self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Generates an answer by constructing a prompt from the user's input and the scraped - content, querying the language model, and parsing its response. - - The method updates the state with the generated answer under the 'answer' key. + Generates a list of links by extracting them from the provided HTML content. + It first tries to extract the links using classical methods; if that fails, it uses the LLM to extract them. Args: - state (dict): The current state of the graph, expected to contain 'user_input', - and optionally 'parsed_document' or 'relevant_chunks' within 'keys'. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data types from the state. Returns: - dict: The updated state with the 'answer' key containing the generated answer. + dict: The updated state with the output key containing the list of links. Raises: - KeyError: If 'user_input' or 'document' is not found in the state, indicating - that the necessary information for generating an answer is missing. + KeyError: If the input keys are not found in the state, indicating that the + necessary information for generating the answer is missing. """ if self.verbose: @@ -90,7 +75,7 @@ def execute(self, state): except Exception as e: if self.verbose: - print("error on using classical methods. Using LLM for getting the links") + print("Error extracting links using classical methods. 
Using LLM to extract links.") output_parser = JsonOutputParser() diff --git a/scrapegraphai/nodes/text_to_speech_node.py b/scrapegraphai/nodes/text_to_speech_node.py index 5a5c0b48..53da713a 100644 --- a/scrapegraphai/nodes/text_to_speech_node.py +++ b/scrapegraphai/nodes/text_to_speech_node.py @@ -1,39 +1,47 @@ - """ -Module for parsing the text to voice +TextToSpeechNode Module """ + from typing import List from .base_node import BaseNode class TextToSpeechNode(BaseNode): """ - A class representing a node that processes text and returns the voice. + Converts text to speech using the specified text-to-speech model. Attributes: - llm (OpenAITextToSpeech): An instance of the OpenAITextToSpeech class. - - Methods: - execute(state, text): Execute the node's logic and return the updated state. + tts_model: An instance of the text-to-speech model client. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "TextToSpeech". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "TextToSpeech"): - """ - Initializes an instance of the TextToSpeechNode class. - """ super().__init__(node_name, "node", input, output, 1, node_config) + self.tts_model = node_config["tts_model"] self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Execute the node's logic and return the updated state. - Args: - state (dict): The current state of the graph. - text (str): The text to convert to speech. + Converts text to speech using the specified text-to-speech model. - :return: The updated state after executing this node. + Args: + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data types from the state. + + Returns: + dict: The updated state with the output key containing the audio generated from the text. + + Raises: + KeyError: If the input keys are not found in the state, indicating that the + necessary information for generating the audio is missing. """ if self.verbose: diff --git a/scrapegraphai/utils/convert_to_csv.py b/scrapegraphai/utils/convert_to_csv.py index 9b430fff..be001d06 100644 --- a/scrapegraphai/utils/convert_to_csv.py +++ b/scrapegraphai/utils/convert_to_csv.py @@ -6,20 +6,27 @@ import pandas as pd -def convert_to_csv(data: dict, filename: str, position: str = None): +def convert_to_csv(data: dict, filename: str, position: str = None) -> None: """ - Converts a dictionary to a CSV file and saves it. + Converts a dictionary to a CSV file and saves it at a specified location. Args: - data (dict): Data to be converted to CSV. - position (str): Optional path where the file should be saved. If not provided, - the directory of the caller script will be used. + data (dict): The data to be converted into CSV format. + filename (str): The name of the output CSV file, without the '.csv' extension. + position (str, optional): The file path where the CSV should be saved. Defaults to the directory of the caller script if not provided. + Returns: + None: The function does not return anything. + Raises: - FileNotFoundError: If the specified directory does not exist. 
- PermissionError: If the program lacks write permission for the directory. - TypeError: If the input data is not a dictionary. - Exception: For other potential errors during DataFrame creation or CSV saving. + FileNotFoundError: If the specified directory does not exist. + PermissionError: If write permissions are lacking for the directory. + TypeError: If `data` is not a dictionary. + Exception: For other issues that may arise during the creation or saving of the CSV file. + + Example: + >>> convert_to_csv({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save') + Saves a CSV file named 'output.csv' at '/path/to/save'. """ if ".csv" in filename: diff --git a/scrapegraphai/utils/convert_to_json.py b/scrapegraphai/utils/convert_to_json.py index c349ad1d..7cf12c53 100644 --- a/scrapegraphai/utils/convert_to_json.py +++ b/scrapegraphai/utils/convert_to_json.py @@ -6,23 +6,33 @@ import sys -def convert_to_json(data: dict, filename: str, position: str = None): +def convert_to_json(data: dict, filename: str, position: str = None) -> None: """ - Convert data to JSON format and save it to a file. + Converts a dictionary to a JSON file and saves it at a specified location. Args: - data (dict): Data to save. - filename (str): Name of the file to save without .json extension. - position (str): Directory where the file should be saved. If None, - the directory of the caller script will be used. + data (dict): The data to be converted into JSON format. + filename (str): The name of the output JSON file, without the '.json' extension. + position (str, optional): The file path where the JSON file should be saved. Defaults to the directory of the caller script if not provided. + Returns: + None: The function does not return anything. + Raises: - ValueError: If filename contains '.json'. - FileNotFoundError: If the specified directory does not exist. - PermissionError: If the program does not have permission to write to the directory. + ValueError: If 'filename' contains '.json'. + FileNotFoundError: If the specified directory does not exist. + PermissionError: If write permissions are lacking for the directory. + + Example: + >>> convert_to_json({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save') + Saves a JSON file named 'output.json' at '/path/to/save'. + + Notes: + If the target directory does not exist, this function attempts to create it before writing the file. """ + if ".json" in filename: - filename = filename.replace(".json", "") # Remove .csv extension + filename = filename.replace(".json", "") # Remove .json extension # Get the directory of the caller script if position is None: diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py index 5c99a60f..6afc2ecb 100644 --- a/scrapegraphai/utils/parse_state_keys.py +++ b/scrapegraphai/utils/parse_state_keys.py @@ -4,12 +4,30 @@ import re -def parse_expression(expression, state: dict): - """ - Function for parsing the expressions +def parse_expression(expression, state: dict) -> list: + """ + Parses a complex boolean expression involving state keys. + Args: - state (dict): state to elaborate + expression (str): The boolean expression to parse. + state (dict): Dictionary of state keys used to evaluate the expression. + + Raises: + ValueError: If the expression is empty, contains adjacent state keys without an operator, uses an operator + invalidly, has unbalanced parentheses, or matches no state keys. 
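The convert_to_csv and convert_to_json helpers documented above are thin wrappers; stripped of path handling and error checks, the conversions they describe reduce to the following (output paths hypothetical, and the actual index handling may differ):

import json
import pandas as pd

data = {"id": [1, 2], "value": [10, 20]}
pd.DataFrame(data).to_csv("output.csv", index=False)   # the CSV path
with open("output.json", "w", encoding="utf-8") as f:  # the JSON path
    json.dump(data, f)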
+ + Returns: + list: A list of state keys that match the boolean expression, ensuring each key appears only once. + + Example: + >>> parse_expression("user_input & (relevant_chunks | parsed_document | document)", + {"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None}) + ['user_input', 'relevant_chunks', 'parsed_document', 'document'] + + This function evaluates the expression to determine the logical inclusion of state keys based on provided boolean logic. + It checks for syntax errors such as unbalanced parentheses, incorrect adjacency of operators, and empty expressions. """ + # Check for empty expression if not expression: raise ValueError("Empty expression.") diff --git a/scrapegraphai/utils/prettify_exec_info.py b/scrapegraphai/utils/prettify_exec_info.py index 21004b71..6bda73c6 100644 --- a/scrapegraphai/utils/prettify_exec_info.py +++ b/scrapegraphai/utils/prettify_exec_info.py @@ -7,13 +7,17 @@ def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame: """ - Transform the execution information of the graph into a DataFrame for better visualization. + Transforms the execution information of a graph into a DataFrame for enhanced visualization. Args: - - complete_result (list[dict]): The complete execution information of the graph. + complete_result (list[dict]): The complete execution information of the graph. Returns: - - pd.DataFrame: The execution information of the graph in a DataFrame. + pd.DataFrame: A DataFrame that organizes the execution information for better readability and analysis. + + Example: + >>> prettify_exec_info([{'node': 'A', 'status': 'success'}, {'node': 'B', 'status': 'failure'}]) + DataFrame with columns 'node' and 'status' showing execution results for each node. """ df_nodes = pd.DataFrame(complete_result) diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py index 0019b421..576a91e4 100644 --- a/scrapegraphai/utils/proxy_rotation.py +++ b/scrapegraphai/utils/proxy_rotation.py @@ -4,26 +4,29 @@ from fp.fp import FreeProxy -def proxy_generator(num_ips: int): +def proxy_generator(num_ips: int) -> list: """ - Rotates through a specified number of proxy IPs using the FreeProxy library. + Generates a specified number of proxy IP addresses using the FreeProxy library. Args: - num_ips (int): The number of proxy IPs to rotate through. + num_ips (int): The number of proxy IPs to generate and rotate through. Returns: - dict: A dictionary containing the rotated proxy IPs, indexed by their position in rotation. + list: A list of proxy IP addresses. Example: >>> proxy_generator(5) - { - 0: '192.168.1.1:8080', - 1: '103.10.63.135:8080', - 2: '176.9.75.42:8080', - 3: '37.57.216.2:8080', - 4: '113.20.31.250:8080' - } + [ + '192.168.1.1:8080', + '103.10.63.135:8080', + '176.9.75.42:8080', + '37.57.216.2:8080', + '113.20.31.250:8080' + ] + + This function fetches fresh proxies and indexes them, making it easy to manage multiple proxy configurations. """ + res = [] for i in range(0, num_ips): diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/remover.py index 60f7592b..5e203249 100644 --- a/scrapegraphai/utils/remover.py +++ b/scrapegraphai/utils/remover.py @@ -7,15 +7,20 @@ def remover(html_content: str) -> str: """ - This function processes HTML content, removes unnecessary tags - (including style tags), minifies the HTML, and retrieves the - title and body content. 
+ Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content. - Parameters: - html_content (str): The HTML content to parse + Args: + html_content (str): The HTML content to be processed. Returns: - str: The parsed title followed by the minified body content + str: A string combining the parsed title and the minified body content. If no body content is found, it indicates so. + + Example: + >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>" + >>> remover(html_content) + 'Title: Example, Body: <p>Hello World!</p>
' + + This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized. """ soup = BeautifulSoup(html_content, 'html.parser') diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 8f48adcd..398ae00a 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -8,16 +8,25 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]: - """ - Function that given a query it finds it on the intenet + """ + Searches the web for a given query using specified search engine options. + Args: - query (str): query to search on internet - search_engine (str, optional): type of browser, it could be DuckDuckGo or Google, - default: Google - max_results (int, optional): maximum number of results + query (str): The search query to find on the internet. + search_engine (str, optional): Specifies the search engine to use, options include 'Google' or 'DuckDuckGo'. Default is 'Google'. + max_results (int, optional): The maximum number of search results to return. Returns: - List[str]: List of strings of web link + List[str]: A list of URLs as strings that are the search results. + + Raises: + ValueError: If the search engine specified is neither 'Google' nor 'DuckDuckGo'. + + Example: + >>> search_on_web("example query", search_engine="Google", max_results=5) + ['http://example.com', 'http://example.org', ...] + + This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs. """ if search_engine == "Google": diff --git a/scrapegraphai/utils/save_audio_from_bytes.py b/scrapegraphai/utils/save_audio_from_bytes.py index 41c53d7b..3027e4e8 100644 --- a/scrapegraphai/utils/save_audio_from_bytes.py +++ b/scrapegraphai/utils/save_audio_from_bytes.py @@ -7,12 +7,18 @@ def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) -> None: """ - Saves the byte response as an audio file. + Saves the byte response as an audio file to the specified path. Args: - byte_response (bytes): The byte response containing the generated speech. - output_path (str or Path): The file path where the generated speech should be saved. + byte_response (bytes): The byte array containing audio data. + output_path (Union[str, Path]): The destination file path where the audio file will be saved. + + Example: + >>> save_audio_from_bytes(b'audio data', 'path/to/audio.mp3') + + This function writes the byte array containing audio data to a file, saving it as an audio file. """ + if not isinstance(output_path, Path): output_path = Path(output_path) diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py index 5b46a1b8..5b23fdf4 100644 --- a/scrapegraphai/utils/token_calculator.py +++ b/scrapegraphai/utils/token_calculator.py @@ -8,15 +8,21 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]: """ - It creates a list of strings to create max dimension tokenizable elements + Truncates text into chunks that are small enough to be processed by specified llm models. Args: - text (str): The input text to be truncated into tokenizable elements. - model (str): The name of the language model to be used. - encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING). + text (str): The input text to be truncated. + model (str): The name of the llm model to determine the maximum token limit. 
+ encoding_name (str): The encoding strategy used to encode the text before truncation. Returns: - List[str]: A list of tokenizable elements created from the input text. + List[str]: A list of text chunks, each within the token limit of the specified model. + + Example: + >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING") + ["This is a sample text", "for truncation."] + + This function ensures that each chunk of text can be tokenized by the specified model without exceeding the model's token limit. """ encoding = tiktoken.get_encoding(encoding_name) diff --git a/tests/graphs/scrape_json_ollama.py b/tests/graphs/scrape_json_ollama.py new file mode 100644 index 00000000..a1ce8875 --- /dev/null +++ b/tests/graphs/scrape_json_ollama.py @@ -0,0 +1,56 @@ +""" +Module for scraping json documents +""" +import os +import pytest +from scrapegraphai.graphs import JSONScraperGraph + + +@pytest.fixture +def sample_json(): + """ + Example of text + """ + file_name = "inputs/example.json" + curr_dir = os.path.dirname(os.path.realpath(__file__)) + file_path = os.path.join(curr_dir, file_name) + + with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + + return text + + +@pytest.fixture +def graph_config(): + """ + Configuration of the graph + """ + return { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + "format": "json", + "base_url": "http://localhost:11434", + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", + } + } + + +def test_scraping_pipeline(sample_json: str, graph_config: dict): + """ + Start of the scraping pipeline + """ + smart_scraper_graph = JSONScraperGraph( + prompt="List me all the titles", + source=sample_json, + config=graph_config + ) + + result = smart_scraper_graph.run() + + assert result is not None diff --git a/tests/graphs/scrape_xml_ollama_test.py b/tests/graphs/scrape_xml_ollama_test.py index afa7527f..04494543 100644 --- a/tests/graphs/scrape_xml_ollama_test.py +++ b/tests/graphs/scrape_xml_ollama_test.py @@ -3,7 +3,7 @@ """ import os import pytest -from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.graphs import XMLScraperGraph @pytest.fixture @@ -45,7 +45,7 @@ def test_scraping_pipeline(sample_xml: str, graph_config: dict): """ Start of the scraping pipeline """ - smart_scraper_graph = SmartScraperGraph( + smart_scraper_graph = XMLScraperGraph( prompt="List me all the authors, title and genres of the books", source=sample_xml, config=graph_config diff --git a/tests/graphs/script_generator_test.py b/tests/graphs/script_generator_test.py index 6114bac4..4982184e 100644 --- a/tests/graphs/script_generator_test.py +++ b/tests/graphs/script_generator_test.py @@ -46,6 +46,4 @@ def test_script_creator_graph(graph_config: dict): assert graph_exec_info is not None - assert isinstance(graph_exec_info, dict) - print(prettify_exec_info(graph_exec_info)) diff --git a/tests/nodes/fetch_node_test.py b/tests/nodes/fetch_node_test.py index e0552a05..811c2daf 100644 --- a/tests/nodes/fetch_node_test.py +++ b/tests/nodes/fetch_node_test.py @@ -17,6 +17,9 @@ def setup(): robots_node = FetchNode( input="url | local_dir", output=["doc"], + node_config={ + "headless": False + } ) return robots_node diff --git a/tests/nodes/robot_node_test.py b/tests/nodes/robot_node_test.py index 7808a976..cae3a895 100644 --- a/tests/nodes/robot_node_test.py +++ b/tests/nodes/robot_node_test.py @@ -32,7 +32,9 @@ def setup(): robots_node = 
RobotsNode( input="url", output=["is_scrapable"], - node_config={"llm": llm_model} + node_config={"llm": llm_model, + "headless": False + } ) return robots_node
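As a quick smoke test outside pytest, the pipeline driven by the new scrape_json_ollama.py test can be run directly; this sketch assumes a local Ollama server on port 11434 with the two models pulled, exactly as the fixtures above configure:

from scrapegraphai.graphs import JSONScraperGraph

graph_config = {
    "llm": {"model": "ollama/mistral", "temperature": 0, "format": "json",
            "base_url": "http://localhost:11434"},
    "embeddings": {"model": "ollama/nomic-embed-text", "temperature": 0,
                   "base_url": "http://localhost:11434"},
}

# Reuse the same fixture file the test reads
with open("tests/graphs/inputs/example.json", encoding="utf-8") as f:
    graph = JSONScraperGraph(prompt="List me all the titles", source=f.read(), config=graph_config)

print(graph.run())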