Skip to content

Commit

Permalink
docs: graphs and helpers docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
PeriniM committed May 1, 2024
1 parent 18c20eb commit 0631985
Show file tree
Hide file tree
Showing 14 changed files with 305 additions and 81 deletions.
2 changes: 1 addition & 1 deletion scrapegraphai/builders/graph_builder.py
@@ -1,5 +1,5 @@
"""
Module for making the graph building
GraphBuilder Module
"""

from langchain_core.prompts import ChatPromptTemplate
Expand Down
1 change: 1 addition & 0 deletions scrapegraphai/graphs/__init__.py
@@ -1,6 +1,7 @@
"""
__init__.py file for graphs folder
"""

from .base_graph import BaseGraph
from .smart_scraper_graph import SmartScraperGraph
from .speech_graph import SpeechGraph
Expand Down
59 changes: 51 additions & 8 deletions scrapegraphai/graphs/abstract_graph.py
@@ -1,6 +1,7 @@
"""
Module having abstract class for creating all the graphs
AbstractGraph Module
"""

from abc import ABC, abstractmethod
from typing import Optional
from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq
Expand All @@ -9,13 +10,34 @@

class AbstractGraph(ABC):
"""
Abstract class representing a generic graph-based tool.
Scaffolding class for creating a graph representation and executing it.
Attributes:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client, configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
headless (bool): A flag indicating whether to run the graph in headless mode.
Args:
prompt (str): The prompt for the graph.
config (dict): Configuration parameters for the graph.
source (str, optional): The source of the graph.
Example:
>>> class MyGraph(AbstractGraph):
... def _create_graph(self):
... # Implementation of graph creation here
... return graph
...
>>> my_graph = MyGraph("Example Graph", {"llm": {"model": "gpt-3.5-turbo"}}, "example_source")
>>> result = my_graph.run()
"""

def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
"""
Initializes the AbstractGraph with a prompt, file source, and configuration.
"""

self.prompt = prompt
self.source = source
self.config = config
Expand All @@ -32,10 +54,20 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
self.final_state = None
self.execution_info = None

def _create_llm(self, llm_config: dict):
def _create_llm(self, llm_config: dict) -> object:
"""
Creates an instance of the language model (OpenAI or Gemini) based on configuration.
Create a large language model instance based on the configuration provided.
Args:
llm_config (dict): Configuration parameters for the language model.
Returns:
object: An instance of the language model client.
Raises:
KeyError: If the model is not supported.
"""

llm_defaults = {
"temperature": 0,
"streaming": False
Expand Down Expand Up @@ -104,16 +136,27 @@ def _create_llm(self, llm_config: dict):

def get_state(self, key=None) -> dict:
"""
Obtain the current state
Get the final state of the graph.
Args:
key (str, optional): The key of the final state to retrieve.
Returns:
dict: The final state of the graph, or only the value stored under `key` when a key is given.
"""

if key is not None:
return self.final_state[key]
return self.final_state

def get_execution_info(self):
"""
Returns the execution information of the graph.
Returns:
dict: The execution information of the graph.
"""

return self.execution_info

@abstractmethod
Expand Down
36 changes: 26 additions & 10 deletions scrapegraphai/graphs/base_graph.py
@@ -1,6 +1,7 @@
"""
Module for creating the base graphs
"""
BaseGraph Module
"""

import time
import warnings
from langchain_community.callbacks import get_openai_callback
Expand All @@ -16,21 +17,33 @@ class BaseGraph:
key-value pair corresponds to the from-node and to-node relationship.
entry_point (str): The name of the entry point node from which the graph execution begins.
Methods:
execute(initial_state): Executes the graph's nodes starting from the entry point and
traverses the graph based on the provided initial state.
Args:
nodes (iterable): An iterable of node instances that will be part of the graph.
edges (iterable): An iterable of tuples where each tuple represents a directed edge
in the graph, defined by a pair of nodes (from_node, to_node).
entry_point (BaseNode): The node instance that represents the entry point of the graph.
Raises:
Warning: If the entry point node is not the first node in the list.
Example:
>>> BaseGraph(
... nodes=[
... fetch_node,
... parse_node,
... rag_node,
... generate_answer_node,
... ],
... edges=[
... (fetch_node, parse_node),
... (parse_node, rag_node),
... (rag_node, generate_answer_node)
... ],
... entry_point=fetch_node
... )
"""

def __init__(self, nodes: list, edges: list, entry_point: str):
"""
Initializes the graph with nodes, edges, and the entry point.
"""

self.nodes = nodes
self.edges = self._create_edges({e for e in edges})
Expand All @@ -51,6 +64,7 @@ def _create_edges(self, edges: list) -> dict:
Returns:
dict: A dictionary of edges with the from-node as keys and to-node as values.
"""

edge_dict = {}
for from_node, to_node in edges:
edge_dict[from_node.node_name] = to_node.node_name
Expand All @@ -66,8 +80,10 @@ def execute(self, initial_state: dict) -> Tuple[dict, list]:
initial_state (dict): The initial state to pass to the entry point node.
Returns:
dict: The state after execution has completed, which may have been altered by the nodes.
Tuple[dict, list]: A tuple containing the final state of the execution and a list
of execution information for each node.
"""

current_node_name = self.nodes[0]
state = initial_state

Expand Down
41 changes: 34 additions & 7 deletions scrapegraphai/graphs/json_scraper_graph.py
@@ -1,6 +1,7 @@
"""
Module for creating the smart scraper
JSONScraperGraph Module
"""

from .base_graph import BaseGraph
from ..nodes import (
FetchNode,
Expand All @@ -13,22 +14,44 @@

class JSONScraperGraph(AbstractGraph):
"""
SmartScraper is a comprehensive web scraping tool that automates the process of extracting
information from web pages using a natural language model to interpret and answer prompts.
JSONScraperGraph defines a scraping pipeline for JSON files.
Attributes:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client, configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
headless (bool): A flag indicating whether to run the graph in headless mode.
Args:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
Example:
>>> json_scraper = JSONScraperGraph(
... "List me all the attractions in Chioggia.",
... "data/chioggia.json",
... {"llm": {"model": "gpt-3.5-turbo"}}
... )
>>> result = json_scraper.run()
"""

def __init__(self, prompt: str, source: str, config: dict):
"""
Initializes the JSONScraperGraph with a prompt, source, and configuration.
"""
super().__init__(prompt, config, source)

self.input_key = "json" if source.endswith("json") else "json_dir"

def _create_graph(self):
def _create_graph(self) -> BaseGraph:
"""
Creates the graph of nodes representing the workflow for web scraping.
Returns:
BaseGraph: A graph instance representing the web scraping workflow.
"""

fetch_node = FetchNode(
input="json_dir",
output=["doc"],
Expand Down Expand Up @@ -81,7 +104,11 @@ def _create_graph(self):
def run(self) -> str:
"""
Executes the web scraping process and returns the answer to the prompt.
Returns:
str: The answer to the prompt.
"""

inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

Expand Down
46 changes: 37 additions & 9 deletions scrapegraphai/graphs/script_creator_graph.py
@@ -1,6 +1,7 @@
"""
Module for creating the smart scraper
ScriptCreatorGraph Module
"""

from .base_graph import BaseGraph
from ..nodes import (
FetchNode,
Expand All @@ -13,24 +14,47 @@

class ScriptCreatorGraph(AbstractGraph):
"""
SmartScraper is a comprehensive web scraping tool that automates the process of extracting
information from web pages using a natural language model to interpret and answer prompts.
ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts.
Attributes:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client, configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
headless (bool): A flag indicating whether to run the graph in headless mode.
model_token (int): The token limit for the language model.
library (str): The library used for web scraping.
Args:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
Example:
>>> script_creator = ScriptCreatorGraph(
... "List me all the attractions in Chioggia.",
... "https://en.wikipedia.org/wiki/Chioggia",
... {"llm": {"model": "gpt-3.5-turbo"}}
... )
>>> result = script_creator.run()
"""

def __init__(self, prompt: str, source: str, config: dict):
"""
Initializes the ScriptCreatorGraph with a prompt, source, and configuration.
"""
self.library = config['library']

super().__init__(prompt, config, source)

self.input_key = "url" if source.startswith("http") else "local_dir"
self.library = config['library']

def _create_graph(self):
def _create_graph(self) -> BaseGraph:
"""
Creates the graph of nodes representing the workflow for web scraping.
Returns:
BaseGraph: A graph instance representing the web scraping workflow.
"""

fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
Expand Down Expand Up @@ -76,7 +100,11 @@ def _create_graph(self):
def run(self) -> str:
"""
Executes the web scraping process and returns the answer to the prompt.
Returns:
str: The answer to the prompt.
"""

inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

Expand Down
35 changes: 32 additions & 3 deletions scrapegraphai/graphs/search_graph.py
@@ -1,6 +1,7 @@
"""
Module for making the search on the internet
SearchGraph Module
"""

from .base_graph import BaseGraph
from ..nodes import (
SearchInternetNode,
Expand All @@ -14,13 +15,37 @@

class SearchGraph(AbstractGraph):
"""
Module for searching info on the internet
SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt.
It only requires a user prompt to search the internet and generate an answer.
Attributes:
prompt (str): The user prompt to search the internet.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client, configured for generating embeddings.
headless (bool): A flag to run the browser in headless mode.
verbose (bool): A flag to display the execution information.
model_token (int): The token limit for the language model.
Args:
prompt (str): The user prompt to search the internet.
config (dict): Configuration parameters for the graph.
Example:
>>> search_graph = SearchGraph(
... "What is Chioggia famous for?",
... {"llm": {"model": "gpt-3.5-turbo"}}
... )
>>> result = search_graph.run()
"""

def _create_graph(self):
def _create_graph(self) -> BaseGraph:
"""
Creates the graph of nodes representing the workflow for web scraping and searching.
Returns:
BaseGraph: A graph instance representing the web scraping and searching workflow.
"""

search_internet_node = SearchInternetNode(
input="user_prompt",
output=["url"],
Expand Down Expand Up @@ -83,7 +108,11 @@ def _create_graph(self):
def run(self) -> str:
"""
Executes the web scraping and searching process.
Returns:
str: The answer to the prompt.
"""

inputs = {"user_prompt": self.prompt}
self.final_state, self.execution_info = self.graph.execute(inputs)

Expand Down

0 comments on commit 0631985

Please sign in to comment.