From cf038b33eaae42f65d7d9c782b5729092b272dd0 Mon Sep 17 00:00:00 2001
From: Perinim
Date: Wed, 1 May 2024 12:35:12 +0200
Subject: [PATCH] docs: update utils docstrings

---
 scrapegraphai/utils/convert_to_csv.py        | 25 ++++++++++------
 scrapegraphai/utils/convert_to_json.py       | 30 +++++++++++++-------
 scrapegraphai/utils/parse_state_keys.py      | 26 ++++++++++++++---
 scrapegraphai/utils/prettify_exec_info.py    | 10 +++++--
 scrapegraphai/utils/proxy_rotation.py        | 25 +++++++++-------
 scrapegraphai/utils/remover.py               | 17 +++++++----
 scrapegraphai/utils/research_web.py          | 23 ++++++++++-----
 scrapegraphai/utils/save_audio_from_bytes.py | 12 ++++++--
 scrapegraphai/utils/token_calculator.py      | 16 +++++++----
 9 files changed, 126 insertions(+), 58 deletions(-)

diff --git a/scrapegraphai/utils/convert_to_csv.py b/scrapegraphai/utils/convert_to_csv.py
index 9b430fff..be001d06 100644
--- a/scrapegraphai/utils/convert_to_csv.py
+++ b/scrapegraphai/utils/convert_to_csv.py
@@ -6,20 +6,27 @@
 import pandas as pd
 
 
-def convert_to_csv(data: dict, filename: str, position: str = None):
+def convert_to_csv(data: dict, filename: str, position: str = None) -> None:
     """
-    Converts a dictionary to a CSV file and saves it.
+    Converts a dictionary to a CSV file and saves it at a specified location.
 
     Args:
-        data (dict): Data to be converted to CSV.
-        position (str): Optional path where the file should be saved. If not provided,
-        the directory of the caller script will be used.
+        data (dict): The data to be converted into CSV format.
+        filename (str): The name of the output CSV file, without the '.csv' extension.
+        position (str, optional): The file path where the CSV should be saved. Defaults to the directory of the caller script if not provided.
+
+    Returns:
+        None: The function does not return anything.
+
     Raises:
-        FileNotFoundError: If the specified directory does not exist.
-        PermissionError: If the program lacks write permission for the directory.
-        TypeError: If the input data is not a dictionary.
-        Exception: For other potential errors during DataFrame creation or CSV saving.
+        FileNotFoundError: If the specified directory does not exist.
+        PermissionError: If write permissions are lacking for the directory.
+        TypeError: If `data` is not a dictionary.
+        Exception: For other issues that may arise during the creation or saving of the CSV file.
+
+    Example:
+        >>> convert_to_csv({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save')
+        Saves a CSV file named 'output.csv' at '/path/to/save'.
     """
 
     if ".csv" in filename:
diff --git a/scrapegraphai/utils/convert_to_json.py b/scrapegraphai/utils/convert_to_json.py
index c349ad1d..7cf12c53 100644
--- a/scrapegraphai/utils/convert_to_json.py
+++ b/scrapegraphai/utils/convert_to_json.py
@@ -6,23 +6,33 @@
 import sys
 
 
-def convert_to_json(data: dict, filename: str, position: str = None):
+def convert_to_json(data: dict, filename: str, position: str = None) -> None:
     """
-    Convert data to JSON format and save it to a file.
+    Converts a dictionary to a JSON file and saves it at a specified location.
 
     Args:
-        data (dict): Data to save.
-        filename (str): Name of the file to save without .json extension.
-        position (str): Directory where the file should be saved. If None,
-        the directory of the caller script will be used.
+        data (dict): The data to be converted into JSON format.
+        filename (str): The name of the output JSON file, without the '.json' extension.
+        position (str, optional): The file path where the JSON file should be saved. Defaults to the directory of the caller script if not provided.
+
+    Returns:
+        None: The function does not return anything.
+
     Raises:
-        ValueError: If filename contains '.json'.
-        FileNotFoundError: If the specified directory does not exist.
-        PermissionError: If the program does not have permission to write to the directory.
+        ValueError: If 'filename' contains '.json'.
+        FileNotFoundError: If the specified directory does not exist.
+        PermissionError: If write permissions are lacking for the directory.
+
+    Example:
+        >>> convert_to_json({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save')
+        Saves a JSON file named 'output.json' at '/path/to/save'.
+
+    Notes:
+        If the target directory does not exist, the function attempts to create it before writing the file.
     """
+
     if ".json" in filename:
-        filename = filename.replace(".json", "")  # Remove .csv extension
+        filename = filename.replace(".json", "")  # Remove .json extension
 
     # Get the directory of the caller script
     if position is None:
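
A minimal usage sketch for the two converters above (the sample data and the
/tmp target directory are illustrative; the imports follow the module paths
shown in this patch):

    from scrapegraphai.utils.convert_to_csv import convert_to_csv
    from scrapegraphai.utils.convert_to_json import convert_to_json

    data = {"id": [1, 2], "value": [10, 20]}

    # Both helpers take the filename without its extension plus an optional
    # target directory; they return None and write the file as a side effect.
    convert_to_csv(data, "output", "/tmp")   # writes /tmp/output.csv
    convert_to_json(data, "output", "/tmp")  # writes /tmp/output.json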
diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py
index 5c99a60f..6afc2ecb 100644
--- a/scrapegraphai/utils/parse_state_keys.py
+++ b/scrapegraphai/utils/parse_state_keys.py
@@ -4,12 +4,30 @@
 import re
 
 
-def parse_expression(expression, state: dict):
-    """
-    Function for parsing the expressions
+def parse_expression(expression, state: dict) -> list:
+    """
+    Parses a complex boolean expression involving state keys.
 
     Args:
-        state (dict): state to elaborate
+        expression (str): The boolean expression to parse.
+        state (dict): Dictionary of state keys used to evaluate the expression.
+
+    Raises:
+        ValueError: If the expression is empty, contains adjacent state keys without an operator, uses operators invalidly, has unbalanced parentheses, or matches no state keys.
+
+    Returns:
+        list: A list of state keys that match the boolean expression, ensuring each key appears only once.
+
+    Example:
+        >>> parse_expression("user_input & (relevant_chunks | parsed_document | document)",
+                {"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None})
+        ['user_input', 'relevant_chunks', 'parsed_document', 'document']
+
+    This function evaluates the expression to determine the logical inclusion of state keys based on the provided boolean logic.
+    It checks for syntax errors such as unbalanced parentheses, incorrect operator adjacency, and empty expressions.
     """
+
     # Check for empty expression
     if not expression:
         raise ValueError("Empty expression.")
diff --git a/scrapegraphai/utils/prettify_exec_info.py b/scrapegraphai/utils/prettify_exec_info.py
index 21004b71..6bda73c6 100644
--- a/scrapegraphai/utils/prettify_exec_info.py
+++ b/scrapegraphai/utils/prettify_exec_info.py
@@ -7,13 +7,17 @@
 
 def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame:
     """
-    Transform the execution information of the graph into a DataFrame for better visualization.
+    Transforms the execution information of a graph into a DataFrame for enhanced visualization.
 
     Args:
-    - complete_result (list[dict]): The complete execution information of the graph.
+        complete_result (list[dict]): The complete execution information of the graph.
 
     Returns:
-    - pd.DataFrame: The execution information of the graph in a DataFrame.
+        pd.DataFrame: A DataFrame that organizes the execution information for better readability and analysis.
+
+    Example:
+        >>> prettify_exec_info([{'node': 'A', 'status': 'success'}, {'node': 'B', 'status': 'failure'}])
+        DataFrame with columns 'node' and 'status' showing execution results for each node.
     """
 
     df_nodes = pd.DataFrame(complete_result)
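
A short sketch exercising the two helpers above, mirroring their docstring
examples (printed output is indicative):

    from scrapegraphai.utils.parse_state_keys import parse_expression
    from scrapegraphai.utils.prettify_exec_info import prettify_exec_info

    # Resolve which state keys a boolean expression selects.
    state = {"user_input": None, "document": None,
             "parsed_document": None, "relevant_chunks": None}
    keys = parse_expression(
        "user_input & (relevant_chunks | parsed_document | document)", state)
    print(keys)  # ['user_input', 'relevant_chunks', 'parsed_document', 'document']

    # Turn raw execution info into a DataFrame for inspection.
    df = prettify_exec_info([{"node": "A", "status": "success"},
                             {"node": "B", "status": "failure"}])
    print(df)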
diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py
index 0019b421..576a91e4 100644
--- a/scrapegraphai/utils/proxy_rotation.py
+++ b/scrapegraphai/utils/proxy_rotation.py
@@ -4,26 +4,29 @@
 from fp.fp import FreeProxy
 
 
-def proxy_generator(num_ips: int):
+def proxy_generator(num_ips: int) -> list:
     """
-    Rotates through a specified number of proxy IPs using the FreeProxy library.
+    Generates a specified number of proxy IP addresses using the FreeProxy library.
 
     Args:
-        num_ips (int): The number of proxy IPs to rotate through.
+        num_ips (int): The number of proxy IPs to generate and rotate through.
 
     Returns:
-        dict: A dictionary containing the rotated proxy IPs, indexed by their position in rotation.
+        list: A list of proxy IP addresses.
 
     Example:
         >>> proxy_generator(5)
-        {
-            0: '192.168.1.1:8080',
-            1: '103.10.63.135:8080',
-            2: '176.9.75.42:8080',
-            3: '37.57.216.2:8080',
-            4: '113.20.31.250:8080'
-        }
+        [
+            '192.168.1.1:8080',
+            '103.10.63.135:8080',
+            '176.9.75.42:8080',
+            '37.57.216.2:8080',
+            '113.20.31.250:8080'
+        ]
+
+    This function fetches fresh proxies, making it easy to rotate through multiple proxy configurations.
     """
+
     res = []
 
     for i in range(0, num_ips):
diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/remover.py
index 60f7592b..5e203249 100644
--- a/scrapegraphai/utils/remover.py
+++ b/scrapegraphai/utils/remover.py
@@ -7,15 +7,20 @@
 
 def remover(html_content: str) -> str:
     """
-    This function processes HTML content, removes unnecessary tags
-    (including style tags), minifies the HTML, and retrieves the
-    title and body content.
+    Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
 
-    Parameters:
-    html_content (str): The HTML content to parse
+    Args:
+        html_content (str): The HTML content to be processed.
 
     Returns:
-    str: The parsed title followed by the minified body content
+        str: A string combining the parsed title and the minified body content. If no body content is found, the returned string indicates this.
+
+    Example:
+        >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
+        >>> remover(html_content)
+        'Title: Example, Body: <body><p>Hello World!</p></body>'
+
+    This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
     """
 
     soup = BeautifulSoup(html_content, 'html.parser')
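
A sketch of the two helpers above. proxy_generator performs live lookups
through FreeProxy, so it can be slow and its output differs between runs;
the HTML string mirrors the docstring example:

    from scrapegraphai.utils.proxy_rotation import proxy_generator
    from scrapegraphai.utils.remover import remover

    # Fetch three working proxies (live network call).
    proxies = proxy_generator(3)
    print(proxies)  # e.g. ['103.10.63.135:8080', ...]

    # Reduce a page to its title plus minified body.
    html = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
    print(remover(html))  # 'Title: Example, Body: <body><p>Hello World!</p></body>'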
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
index 8f48adcd..398ae00a 100644
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -8,16 +8,25 @@
 
 def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
-    """
-    Function that given a query it finds it on the intenet
+    """
+    Searches the web for a given query using the specified search engine.
 
     Args:
-        query (str): query to search on internet
-        search_engine (str, optional): type of browser, it could be DuckDuckGo or Google,
-        default: Google
-        max_results (int, optional): maximum number of results
+        query (str): The search query to find on the internet.
+        search_engine (str, optional): The search engine to use, either 'Google' or 'DuckDuckGo'. Default is 'Google'.
+        max_results (int, optional): The maximum number of search results to return.
 
     Returns:
-        List[str]: List of strings of web link
+        List[str]: A list of URLs as strings that are the search results.
+
+    Raises:
+        ValueError: If the search engine specified is neither 'Google' nor 'DuckDuckGo'.
+
+    Example:
+        >>> search_on_web("example query", search_engine="Google", max_results=5)
+        ['http://example.com', 'http://example.org', ...]
+
+    This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs.
     """
 
     if search_engine == "Google":
diff --git a/scrapegraphai/utils/save_audio_from_bytes.py b/scrapegraphai/utils/save_audio_from_bytes.py
index 41c53d7b..3027e4e8 100644
--- a/scrapegraphai/utils/save_audio_from_bytes.py
+++ b/scrapegraphai/utils/save_audio_from_bytes.py
@@ -7,12 +7,18 @@
 
 def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) -> None:
     """
-    Saves the byte response as an audio file.
+    Saves the byte response as an audio file to the specified path.
 
     Args:
-        byte_response (bytes): The byte response containing the generated speech.
-        output_path (str or Path): The file path where the generated speech should be saved.
+        byte_response (bytes): The byte array containing audio data.
+        output_path (Union[str, Path]): The destination file path where the audio file will be saved.
+
+    Example:
+        >>> save_audio_from_bytes(b'audio data', 'path/to/audio.mp3')
+
+    This function writes the given byte array to disk as an audio file at the specified path.
     """
+
     if not isinstance(output_path, Path):
         output_path = Path(output_path)
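
A sketch of the two helpers above; the query and the byte payload are
placeholders, and the search issues live HTTP requests:

    from scrapegraphai.utils.research_web import search_on_web
    from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes

    # Live search; raises ValueError for engines other than Google/DuckDuckGo.
    links = search_on_web("scrapegraphai docs", search_engine="DuckDuckGo", max_results=5)
    print(links)

    # Persist raw audio bytes (e.g. a text-to-speech response) to disk.
    save_audio_from_bytes(b"...raw mp3 bytes...", "speech.mp3")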
diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py
index 5b46a1b8..5b23fdf4 100644
--- a/scrapegraphai/utils/token_calculator.py
+++ b/scrapegraphai/utils/token_calculator.py
@@ -8,15 +8,21 @@
 
 def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
     """
-    It creates a list of strings to create max dimension tokenizable elements
+    Truncates text into chunks that are small enough to be processed by the specified LLM model.
 
     Args:
-        text (str): The input text to be truncated into tokenizable elements.
-        model (str): The name of the language model to be used.
-        encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
+        text (str): The input text to be truncated.
+        model (str): The name of the LLM model, used to determine the maximum token limit.
+        encoding_name (str): The encoding strategy used to encode the text before truncation.
 
     Returns:
-        List[str]: A list of tokenizable elements created from the input text.
+        List[str]: A list of text chunks, each within the token limit of the specified model.
+
+    Example:
+        >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING")
+        ["This is a sample text", "for truncation."]
+
+    This function ensures that each chunk of text can be tokenized by the specified model without exceeding the model's token limit.
     """
 
     encoding = tiktoken.get_encoding(encoding_name)
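
A usage sketch for the helper above. The docstring example's "GPT-3" and
"EMBEDDING_ENCODING" are symbolic: tiktoken.get_encoding() needs a concrete
encoding name such as 'cl100k_base', and the model must be one the library
knows a token limit for; both values below are assumptions:

    from scrapegraphai.utils.token_calculator import truncate_text_tokens

    # Split a long text into chunks that fit the model's context window.
    chunks = truncate_text_tokens(
        "This is a sample text for truncation. " * 200,
        model="gpt-3.5-turbo",        # assumed to be a known model key
        encoding_name="cl100k_base",  # a real tiktoken encoding name
    )
    print(len(chunks), chunks[0][:40])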