From 39bf4c960d703a321af64e3b1b41ca9a1a15794e Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Mon, 17 Jun 2024 13:56:13 +0200 Subject: [PATCH] docs: refactor graph section and added telemetry --- docs/source/conf.py | 5 +- docs/source/index.rst | 3 - docs/source/scrapers/graphs.rst | 229 +---------------------------- docs/source/scrapers/telemetry.rst | 72 +++++++++ docs/source/scrapers/types.rst | 225 ++++++++++++++++++++++++++++ 5 files changed, 309 insertions(+), 225 deletions(-) create mode 100644 docs/source/scrapers/telemetry.rst create mode 100644 docs/source/scrapers/types.rst diff --git a/docs/source/conf.py b/docs/source/conf.py index 43c849c4..9fc3aec7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -36,4 +36,7 @@ "source_repository": "https://github.com/VinciGit00/Scrapegraph-ai/", "source_branch": "main", "source_directory": "docs/source/", -} \ No newline at end of file + 'navigation_with_keys': True, + 'sidebar_hide_name': False, +} + diff --git a/docs/source/index.rst b/docs/source/index.rst index e49f54a9..acc0db73 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -22,9 +22,6 @@ :caption: Scrapers scrapers/graphs - scrapers/llm - scrapers/graph_config - scrapers/benchmarks .. toctree:: :maxdepth: 2 diff --git a/docs/source/scrapers/graphs.rst b/docs/source/scrapers/graphs.rst index 892a4ef1..ee5f072f 100644 --- a/docs/source/scrapers/graphs.rst +++ b/docs/source/scrapers/graphs.rst @@ -3,224 +3,11 @@ Graphs Graphs are scraping pipelines aimed at solving specific tasks. They are composed by nodes which can be configured individually to address different aspects of the task (fetching data, extracting information, etc.). -There are several types of graphs available in the library, each with its own purpose and functionality. The most common ones are: - -- **SmartScraperGraph**: one-page scraper that requires a user-defined prompt and a URL (or local file) to extract information using LLM. 
-- **SearchGraph**: multi-page scraper that only requires a user-defined prompt to extract information from a search engine using LLM. It is built on top of SmartScraperGraph. -- **SpeechGraph**: text-to-speech pipeline that generates an answer as well as a requested audio file. It is built on top of SmartScraperGraph and requires a user-defined prompt and a URL (or local file). -- **ScriptCreatorGraph**: script generator that creates a Python script to scrape a website using the specified library (e.g. BeautifulSoup). It requires a user-defined prompt and a URL (or local file). - -There are also two additional graphs that can handle multiple sources: - -- **SmartScraperMultiGraph**: similar to `SmartScraperGraph`, but with the ability to handle multiple sources. -- **ScriptCreatorMultiGraph**: similar to `ScriptCreatorGraph`, but with the ability to handle multiple sources. - -With the introduction of `GPT-4o`, two new powerful graphs have been created: - -- **OmniScraperGraph**: similar to `SmartScraperGraph`, but with the ability to scrape images and describe them. -- **OmniSearchGraph**: similar to `SearchGraph`, but with the ability to scrape images and describe them. - - -.. note:: - - They all use a graph configuration to set up LLM models and other parameters. To find out more about the configurations, check the :ref:`LLM` and :ref:`Configuration` sections. - - -.. note:: - - We can pass an optional `schema` parameter to the graph constructor to specify the output schema. If not provided or set to `None`, the schema will be generated by the LLM itself. - -OmniScraperGraph -^^^^^^^^^^^^^^^^ - -.. image:: ../../assets/omniscrapergraph.png - :align: center - :width: 90% - :alt: OmniScraperGraph -| - -First we define the graph configuration, which includes the LLM model and other parameters. Then we create an instance of the OmniScraperGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. 
-It will fetch the data from the source and extract the information based on the prompt in JSON format. - -.. code-block:: python - - from scrapegraphai.graphs import OmniScraperGraph - - graph_config = { - "llm": {...}, - } - - omni_scraper_graph = OmniScraperGraph( - prompt="List me all the projects with their titles and image links and descriptions.", - source="https://perinim.github.io/projects", - config=graph_config, - schema=schema - ) - - result = omni_scraper_graph.run() - print(result) - -OmniSearchGraph -^^^^^^^^^^^^^^^ - -.. image:: ../../assets/omnisearchgraph.png - :align: center - :width: 80% - :alt: OmniSearchGraph -| - -Similar to OmniScraperGraph, we define the graph configuration, create multiple of the OmniSearchGraph class, and run the graph. -It will create a search query, fetch the first n results from the search engine, run n OmniScraperGraph instances, and return the results in JSON format. - -.. code-block:: python - - from scrapegraphai.graphs import OmniSearchGraph - - graph_config = { - "llm": {...}, - } - - # Create the OmniSearchGraph instance - omni_search_graph = OmniSearchGraph( - prompt="List me all Chioggia's famous dishes and describe their pictures.", - config=graph_config, - schema=schema - ) - - # Run the graph - result = omni_search_graph.run() - print(result) - -SmartScraperGraph & SmartScraperMultiGraph -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. image:: ../../assets/smartscrapergraph.png - :align: center - :width: 90% - :alt: SmartScraperGraph -| - -First we define the graph configuration, which includes the LLM model and other parameters. Then we create an instance of the SmartScraperGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. -It will fetch the data from the source and extract the information based on the prompt in JSON format. - -.. 
code-block:: python - - from scrapegraphai.graphs import SmartScraperGraph - - graph_config = { - "llm": {...}, - } - - smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their descriptions", - source="https://perinim.github.io/projects", - config=graph_config, - schema=schema - ) - - result = smart_scraper_graph.run() - print(result) - -**SmartScraperMultiGraph** is similar to SmartScraperGraph, but it can handle multiple sources. We define the graph configuration, create an instance of the SmartScraperMultiGraph class, and run the graph. - -SearchGraph -^^^^^^^^^^^ - -.. image:: ../../assets/searchgraph.png - :align: center - :width: 80% - :alt: SearchGraph -| - -Similar to SmartScraperGraph, we define the graph configuration, create an instance of the SearchGraph class, and run the graph. -It will create a search query, fetch the first n results from the search engine, run n SmartScraperGraph instances, and return the results in JSON format. - - -.. code-block:: python - - from scrapegraphai.graphs import SearchGraph - - graph_config = { - "llm": {...}, - "embeddings": {...}, - } - - # Create the SearchGraph instance - search_graph = SearchGraph( - prompt="List me all the traditional recipes from Chioggia", - config=graph_config, - schema=schema - ) - - # Run the graph - result = search_graph.run() - print(result) - - -SpeechGraph -^^^^^^^^^^^ - -.. image:: ../../assets/speechgraph.png - :align: center - :width: 90% - :alt: SpeechGraph -| - -Similar to SmartScraperGraph, we define the graph configuration, create an instance of the SpeechGraph class, and run the graph. -It will fetch the data from the source, extract the information based on the prompt, and generate an audio file with the answer, as well as the answer itself, in JSON format. - -.. 
code-block:: python - - from scrapegraphai.graphs import SpeechGraph - - graph_config = { - "llm": {...}, - "tts_model": {...}, - } - - # ************************************************ - # Create the SpeechGraph instance and run it - # ************************************************ - - speech_graph = SpeechGraph( - prompt="Make a detailed audio summary of the projects.", - source="https://perinim.github.io/projects/", - config=graph_config, - schema=schema - ) - - result = speech_graph.run() - print(result) - - -ScriptCreatorGraph & ScriptCreatorMultiGraph -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. image:: ../../assets/scriptcreatorgraph.png - :align: center - :width: 90% - :alt: ScriptCreatorGraph - -First we define the graph configuration, which includes the LLM model and other parameters. -Then we create an instance of the ScriptCreatorGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. - -.. code-block:: python - - from scrapegraphai.graphs import ScriptCreatorGraph - - graph_config = { - "llm": {...}, - "library": "beautifulsoup4" - } - - script_creator_graph = ScriptCreatorGraph( - prompt="Create a Python script to scrape the projects.", - source="https://perinim.github.io/projects/", - config=graph_config, - schema=schema - ) - - result = script_creator_graph.run() - print(result) - -**ScriptCreatorMultiGraph** is similar to ScriptCreatorGraph, but it can handle multiple sources. We define the graph configuration, create an instance of the ScriptCreatorMultiGraph class, and run the graph. +.. 
toctree:: + :maxdepth: 4 + + types + llm + graph_config + benchmarks + telemetry diff --git a/docs/source/scrapers/telemetry.rst b/docs/source/scrapers/telemetry.rst new file mode 100644 index 00000000..a6598092 --- /dev/null +++ b/docs/source/scrapers/telemetry.rst @@ -0,0 +1,72 @@ +=============== +Usage Analytics +=============== + +ScrapeGraphAI collects **anonymous** usage data by default to improve the library and guide development efforts. + +**Events Captured** + +We capture events in the following scenarios: + +1. When a ``Graph`` finishes running. +2. When an exception is raised in one of the nodes. + +**Data Collected** + +The data captured is limited to: + +- Operating System and Python version +- A persistent UUID to identify the session, stored in ``~/.scrapegraphai.conf`` + +Additionally, the following properties are collected: + +.. code-block:: python + + properties = { + "graph_name": graph_name, + "llm_model": llm_model_name, + "embedder_model": embedder_model_name, + "source_type": source_type, + "execution_time": execution_time, + "error_node": error_node_name, + } + +For more details, refer to the `telemetry.py <https://github.com/VinciGit00/Scrapegraph-ai/blob/main/scrapegraphai/telemetry/telemetry.py>`_ module. + +**Opting Out** + +If you prefer not to participate in telemetry, you can opt out using any of the following methods: + +1. **Programmatically Disable Telemetry**: + + Add the following code at the beginning of your script: + + .. code-block:: python + + from scrapegraphai import telemetry + telemetry.disable_telemetry() + +2. **Configuration File**: + + Set the ``telemetry_enabled`` key to ``False`` in ``~/.scrapegraphai.conf`` under the ``[DEFAULT]`` section: + + .. code-block:: ini + + [DEFAULT] + telemetry_enabled = False + +3. **Environment Variable**: + + - **For a Shell Session**: + + .. code-block:: bash + + export SCRAPEGRAPHAI_TELEMETRY_ENABLED=false + + - **For a Single Command**: + + .. 
code-block:: bash + + SCRAPEGRAPHAI_TELEMETRY_ENABLED=false python my_script.py + +By following any of these methods, you can easily opt out of telemetry and ensure your usage data is not collected. diff --git a/docs/source/scrapers/types.rst b/docs/source/scrapers/types.rst new file mode 100644 index 00000000..42613066 --- /dev/null +++ b/docs/source/scrapers/types.rst @@ -0,0 +1,225 @@ +Types +===== + + +There are several types of graphs available in the library, each with its own purpose and functionality. The most common ones are: + +- **SmartScraperGraph**: one-page scraper that requires a user-defined prompt and a URL (or local file) to extract information using LLM. +- **SearchGraph**: multi-page scraper that only requires a user-defined prompt to extract information from a search engine using LLM. It is built on top of SmartScraperGraph. +- **SpeechGraph**: text-to-speech pipeline that generates an answer as well as a requested audio file. It is built on top of SmartScraperGraph and requires a user-defined prompt and a URL (or local file). +- **ScriptCreatorGraph**: script generator that creates a Python script to scrape a website using the specified library (e.g. BeautifulSoup). It requires a user-defined prompt and a URL (or local file). + +There are also two additional graphs that can handle multiple sources: + +- **SmartScraperMultiGraph**: similar to `SmartScraperGraph`, but with the ability to handle multiple sources. +- **ScriptCreatorMultiGraph**: similar to `ScriptCreatorGraph`, but with the ability to handle multiple sources. + +With the introduction of `GPT-4o`, two new powerful graphs have been created: + +- **OmniScraperGraph**: similar to `SmartScraperGraph`, but with the ability to scrape images and describe them. +- **OmniSearchGraph**: similar to `SearchGraph`, but with the ability to scrape images and describe them. + + +.. note:: + + They all use a graph configuration to set up LLM models and other parameters. 
To find out more about the configurations, check the :ref:`LLM` and :ref:`Configuration` sections. + + +.. note:: + + We can pass an optional `schema` parameter to the graph constructor to specify the output schema. If not provided or set to `None`, the schema will be generated by the LLM itself. + +OmniScraperGraph +^^^^^^^^^^^^^^^^ + +.. image:: ../../assets/omniscrapergraph.png + :align: center + :width: 90% + :alt: OmniScraperGraph +| + +First we define the graph configuration, which includes the LLM model and other parameters. Then we create an instance of the OmniScraperGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. +It will fetch the data from the source and extract the information based on the prompt in JSON format. + +.. code-block:: python + + from scrapegraphai.graphs import OmniScraperGraph + + graph_config = { + "llm": {...}, + } + + omni_scraper_graph = OmniScraperGraph( + prompt="List me all the projects with their titles and image links and descriptions.", + source="https://perinim.github.io/projects", + config=graph_config, + schema=schema + ) + + result = omni_scraper_graph.run() + print(result) + +OmniSearchGraph +^^^^^^^^^^^^^^^ + +.. image:: ../../assets/omnisearchgraph.png + :align: center + :width: 80% + :alt: OmniSearchGraph +| + +Similar to OmniScraperGraph, we define the graph configuration, create an instance of the OmniSearchGraph class, and run the graph. +It will create a search query, fetch the first n results from the search engine, run n OmniScraperGraph instances, and return the results in JSON format. + +.. 
code-block:: python + + from scrapegraphai.graphs import OmniSearchGraph + + graph_config = { + "llm": {...}, + } + + # Create the OmniSearchGraph instance + omni_search_graph = OmniSearchGraph( + prompt="List me all Chioggia's famous dishes and describe their pictures.", + config=graph_config, + schema=schema + ) + + # Run the graph + result = omni_search_graph.run() + print(result) + +SmartScraperGraph & SmartScraperMultiGraph +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. image:: ../../assets/smartscrapergraph.png + :align: center + :width: 90% + :alt: SmartScraperGraph +| + +First we define the graph configuration, which includes the LLM model and other parameters. Then we create an instance of the SmartScraperGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. +It will fetch the data from the source and extract the information based on the prompt in JSON format. + +.. code-block:: python + + from scrapegraphai.graphs import SmartScraperGraph + + graph_config = { + "llm": {...}, + } + + smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their descriptions", + source="https://perinim.github.io/projects", + config=graph_config, + schema=schema + ) + + result = smart_scraper_graph.run() + print(result) + +**SmartScraperMultiGraph** is similar to SmartScraperGraph, but it can handle multiple sources. We define the graph configuration, create an instance of the SmartScraperMultiGraph class, and run the graph. + +SearchGraph +^^^^^^^^^^^ + +.. image:: ../../assets/searchgraph.png + :align: center + :width: 80% + :alt: SearchGraph +| + +Similar to SmartScraperGraph, we define the graph configuration, create an instance of the SearchGraph class, and run the graph. +It will create a search query, fetch the first n results from the search engine, run n SmartScraperGraph instances, and return the results in JSON format. + + +.. 
code-block:: python + + from scrapegraphai.graphs import SearchGraph + + graph_config = { + "llm": {...}, + "embeddings": {...}, + } + + # Create the SearchGraph instance + search_graph = SearchGraph( + prompt="List me all the traditional recipes from Chioggia", + config=graph_config, + schema=schema + ) + + # Run the graph + result = search_graph.run() + print(result) + + +SpeechGraph +^^^^^^^^^^^ + +.. image:: ../../assets/speechgraph.png + :align: center + :width: 90% + :alt: SpeechGraph +| + +Similar to SmartScraperGraph, we define the graph configuration, create an instance of the SpeechGraph class, and run the graph. +It will fetch the data from the source, extract the information based on the prompt, and generate an audio file with the answer, as well as the answer itself, in JSON format. + +.. code-block:: python + + from scrapegraphai.graphs import SpeechGraph + + graph_config = { + "llm": {...}, + "tts_model": {...}, + } + + # ************************************************ + # Create the SpeechGraph instance and run it + # ************************************************ + + speech_graph = SpeechGraph( + prompt="Make a detailed audio summary of the projects.", + source="https://perinim.github.io/projects/", + config=graph_config, + schema=schema + ) + + result = speech_graph.run() + print(result) + + +ScriptCreatorGraph & ScriptCreatorMultiGraph +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. image:: ../../assets/scriptcreatorgraph.png + :align: center + :width: 90% + :alt: ScriptCreatorGraph + +First we define the graph configuration, which includes the LLM model and other parameters. +Then we create an instance of the ScriptCreatorGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. + +.. 
code-block:: python + + from scrapegraphai.graphs import ScriptCreatorGraph + + graph_config = { + "llm": {...}, + "library": "beautifulsoup4" + } + + script_creator_graph = ScriptCreatorGraph( + prompt="Create a Python script to scrape the projects.", + source="https://perinim.github.io/projects/", + config=graph_config, + schema=schema + ) + + result = script_creator_graph.run() + print(result) + +**ScriptCreatorMultiGraph** is similar to ScriptCreatorGraph, but it can handle multiple sources. We define the graph configuration, create an instance of the ScriptCreatorMultiGraph class, and run the graph.