
docs: update utils docstrings
PeriniM committed May 1, 2024
1 parent 96975b2 commit cf038b3
Showing 9 changed files with 126 additions and 58 deletions.
25 changes: 16 additions & 9 deletions scrapegraphai/utils/convert_to_csv.py
@@ -6,20 +6,27 @@
 import pandas as pd


-def convert_to_csv(data: dict, filename: str, position: str = None):
+def convert_to_csv(data: dict, filename: str, position: str = None) -> None:
     """
-    Converts a dictionary to a CSV file and saves it.
+    Converts a dictionary to a CSV file and saves it at a specified location.
     Args:
-        data (dict): Data to be converted to CSV.
-        position (str): Optional path where the file should be saved. If not provided,
-        the directory of the caller script will be used.
+        data (dict): The data to be converted into CSV format.
+        filename (str): The name of the output CSV file, without the '.csv' extension.
+        position (str, optional): The file path where the CSV should be saved. Defaults to the directory of the caller script if not provided.
+    Returns:
+        None: The function does not return anything.
     Raises:
-        FileNotFoundError: If the specified directory does not exist.
-        PermissionError: If the program lacks write permission for the directory.
-        TypeError: If the input data is not a dictionary.
-        Exception: For other potential errors during DataFrame creation or CSV saving.
+        FileNotFoundError: If the specified directory does not exist.
+        PermissionError: If write permissions are lacking for the directory.
+        TypeError: If `data` is not a dictionary.
+        Exception: For other issues that may arise during the creation or saving of the CSV file.
+    Example:
+        >>> convert_to_csv({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save')
+        Saves a CSV file named 'output.csv' at '/path/to/save'.
     """

     if ".csv" in filename:
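
A minimal usage sketch of the updated function (an illustration, not part of the commit; the '/tmp/demo' path and sample data are hypothetical, and the import path assumes the module layout shown in this diff):

    from scrapegraphai.utils.convert_to_csv import convert_to_csv

    # Each key becomes a CSV column; values are equal-length lists of rows.
    data = {"id": [1, 2], "value": [10, 20]}
    convert_to_csv(data, "output", "/tmp/demo")  # writes /tmp/demo/output.csv (directory must exist)
    convert_to_csv(data, "report")  # no position: saved next to the caller script
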
30 changes: 20 additions & 10 deletions scrapegraphai/utils/convert_to_json.py
@@ -6,23 +6,33 @@
 import sys


-def convert_to_json(data: dict, filename: str, position: str = None):
+def convert_to_json(data: dict, filename: str, position: str = None) -> None:
     """
-    Convert data to JSON format and save it to a file.
+    Converts a dictionary to a JSON file and saves it at a specified location.
     Args:
-        data (dict): Data to save.
-        filename (str): Name of the file to save without .json extension.
-        position (str): Directory where the file should be saved. If None,
-        the directory of the caller script will be used.
+        data (dict): The data to be converted into JSON format.
+        filename (str): The name of the output JSON file, without the '.json' extension.
+        position (str, optional): The file path where the JSON file should be saved. Defaults to the directory of the caller script if not provided.
+    Returns:
+        None: The function does not return anything.
     Raises:
-        ValueError: If filename contains '.json'.
-        FileNotFoundError: If the specified directory does not exist.
-        PermissionError: If the program does not have permission to write to the directory.
+        ValueError: If 'filename' contains '.json'.
+        FileNotFoundError: If the specified directory does not exist.
+        PermissionError: If write permissions are lacking for the directory.
+    Example:
+        >>> convert_to_json({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save')
+        Saves a JSON file named 'output.json' at '/path/to/save'.
+    Notes:
+        This function automatically ensures the directory exists before attempting to write the file. If the directory does not exist, it will attempt to create it.
     """

     if ".json" in filename:
-        filename = filename.replace(".json", "")  # Remove .csv extension
+        filename = filename.replace(".json", "")  # Remove .json extension

     # Get the directory of the caller script
     if position is None:
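
A similar sketch for convert_to_json (hypothetical path and data; per the new Notes section, a missing directory is created before writing):

    from scrapegraphai.utils.convert_to_json import convert_to_json

    data = {"id": [1, 2], "value": [10, 20]}
    convert_to_json(data, "output", "/tmp/demo")  # writes /tmp/demo/output.json
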
26 changes: 22 additions & 4 deletions scrapegraphai/utils/parse_state_keys.py
@@ -4,12 +4,30 @@
 import re


-def parse_expression(expression, state: dict):
-    """
-    Function for parsing the expressions
+def parse_expression(expression, state: dict) -> list:
+    """
+    Parses a complex boolean expression involving state keys.
     Args:
-        state (dict): state to elaborate
+        expression (str): The boolean expression to parse.
+        state (dict): Dictionary of state keys used to evaluate the expression.
+    Raises:
+        ValueError: If the expression is empty, has adjacent state keys without operators, invalid operator usage,
+        unbalanced parentheses, or if no state keys match the expression.
+    Returns:
+        list: A list of state keys that match the boolean expression, ensuring each key appears only once.
+    Example:
+        >>> parse_expression("user_input & (relevant_chunks | parsed_document | document)",
+        {"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None})
+        ['user_input', 'relevant_chunks', 'parsed_document', 'document']
+    This function evaluates the expression to determine the logical inclusion of state keys based on provided boolean logic.
+    It checks for syntax errors such as unbalanced parentheses, incorrect adjacency of operators, and empty expressions.
     """

     # Check for empty expression
     if not expression:
         raise ValueError("Empty expression.")
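
A runnable sketch mirroring the doctest above (the None values are placeholders; only the dictionary keys matter when parsing):

    from scrapegraphai.utils.parse_state_keys import parse_expression

    state = {"user_input": None, "document": None,
             "parsed_document": None, "relevant_chunks": None}
    keys = parse_expression(
        "user_input & (relevant_chunks | parsed_document | document)", state)
    print(keys)  # ['user_input', 'relevant_chunks', 'parsed_document', 'document']
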
10 changes: 7 additions & 3 deletions scrapegraphai/utils/prettify_exec_info.py
@@ -7,13 +7,17 @@

 def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame:
     """
-    Transform the execution information of the graph into a DataFrame for better visualization.
+    Transforms the execution information of a graph into a DataFrame for enhanced visualization.
     Args:
-        - complete_result (list[dict]): The complete execution information of the graph.
+        complete_result (list[dict]): The complete execution information of the graph.
     Returns:
-        - pd.DataFrame: The execution information of the graph in a DataFrame.
+        pd.DataFrame: A DataFrame that organizes the execution information for better readability and analysis.
+    Example:
+        >>> prettify_exec_info([{'node': 'A', 'status': 'success'}, {'node': 'B', 'status': 'failure'}])
+        DataFrame with columns 'node' and 'status' showing execution results for each node.
     """

     df_nodes = pd.DataFrame(complete_result)
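
A small sketch matching the doctest (the 'node'/'status' records are invented; real execution info may carry more fields, each becoming a DataFrame column):

    from scrapegraphai.utils.prettify_exec_info import prettify_exec_info

    exec_info = [{"node": "A", "status": "success"},
                 {"node": "B", "status": "failure"}]
    df = prettify_exec_info(exec_info)
    print(df)  # one row per dict, with 'node' and 'status' columns
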
25 changes: 14 additions & 11 deletions scrapegraphai/utils/proxy_rotation.py
@@ -4,26 +4,29 @@
 from fp.fp import FreeProxy


-def proxy_generator(num_ips: int):
+def proxy_generator(num_ips: int) -> list:
     """
-    Rotates through a specified number of proxy IPs using the FreeProxy library.
+    Generates a specified number of proxy IP addresses using the FreeProxy library.
     Args:
-        num_ips (int): The number of proxy IPs to rotate through.
+        num_ips (int): The number of proxy IPs to generate and rotate through.
     Returns:
-        dict: A dictionary containing the rotated proxy IPs, indexed by their position in rotation.
+        list: A list of proxy IP addresses.
     Example:
         >>> proxy_generator(5)
-        {
-            0: '192.168.1.1:8080',
-            1: '103.10.63.135:8080',
-            2: '176.9.75.42:8080',
-            3: '37.57.216.2:8080',
-            4: '113.20.31.250:8080'
-        }
+        [
+            '192.168.1.1:8080',
+            '103.10.63.135:8080',
+            '176.9.75.42:8080',
+            '37.57.216.2:8080',
+            '113.20.31.250:8080'
+        ]
+    This function fetches fresh proxies, making it easy to rotate through multiple proxy configurations.
     """

     res = []

     for i in range(0, num_ips):
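
A usage sketch reflecting the new list return type (requires network access; actual addresses vary per run since FreeProxy fetches live proxies):

    from scrapegraphai.utils.proxy_rotation import proxy_generator

    proxies = proxy_generator(3)
    for proxy in proxies:
        print(proxy)  # e.g. '103.10.63.135:8080'
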
17 changes: 11 additions & 6 deletions scrapegraphai/utils/remover.py
@@ -7,15 +7,20 @@

 def remover(html_content: str) -> str:
     """
-    This function processes HTML content, removes unnecessary tags
-    (including style tags), minifies the HTML, and retrieves the
-    title and body content.
+    Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
-    Parameters:
-        html_content (str): The HTML content to parse
+    Args:
+        html_content (str): The HTML content to be processed.
     Returns:
-        str: The parsed title followed by the minified body content
+        str: A string combining the parsed title and the minified body content. If no body content is found, it indicates so.
+    Example:
+        >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
+        >>> remover(html_content)
+        'Title: Example, Body: <body><p>Hello World!</p></body>'
+    This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
     """

     soup = BeautifulSoup(html_content, 'html.parser')
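
A sketch reproducing the docstring example:

    from scrapegraphai.utils.remover import remover

    html = ("<html><head><title>Example</title></head>"
            "<body><p>Hello World!</p></body></html>")
    print(remover(html))
    # 'Title: Example, Body: <body><p>Hello World!</p></body>'
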
23 changes: 16 additions & 7 deletions scrapegraphai/utils/research_web.py
@@ -8,16 +8,25 @@


 def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
     """
-    Function that given a query it finds it on the intenet
+    Searches the web for a given query using specified search engine options.
     Args:
-        query (str): query to search on internet
-        search_engine (str, optional): type of browser, it could be DuckDuckGo or Google,
-        default: Google
-        max_results (int, optional): maximum number of results
+        query (str): The search query to find on the internet.
+        search_engine (str, optional): Specifies the search engine to use, options include 'Google' or 'DuckDuckGo'. Default is 'Google'.
+        max_results (int, optional): The maximum number of search results to return.
     Returns:
-        List[str]: List of strings of web link
+        List[str]: A list of URLs as strings that are the search results.
+    Raises:
+        ValueError: If the search engine specified is neither 'Google' nor 'DuckDuckGo'.
+    Example:
+        >>> search_on_web("example query", search_engine="Google", max_results=5)
+        ['http://example.com', 'http://example.org', ...]
+    This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs.
     """

     if search_engine == "Google":
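
A sketch of both the happy path and the documented ValueError (live network calls; the returned URLs will differ from the doctest placeholders):

    from scrapegraphai.utils.research_web import search_on_web

    links = search_on_web("example query", search_engine="DuckDuckGo", max_results=5)
    for url in links:
        print(url)

    search_on_web("example query", search_engine="Bing")  # raises ValueError
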
12 changes: 9 additions & 3 deletions scrapegraphai/utils/save_audio_from_bytes.py
@@ -7,12 +7,18 @@

 def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) -> None:
     """
-    Saves the byte response as an audio file.
+    Saves the byte response as an audio file to the specified path.
     Args:
-        byte_response (bytes): The byte response containing the generated speech.
-        output_path (str or Path): The file path where the generated speech should be saved.
+        byte_response (bytes): The byte array containing audio data.
+        output_path (Union[str, Path]): The destination file path where the audio file will be saved.
+    Example:
+        >>> save_audio_from_bytes(b'audio data', 'path/to/audio.mp3')
+    This function writes the byte array containing audio data to a file, saving it as an audio file.
     """

     if not isinstance(output_path, Path):
         output_path = Path(output_path)
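
A sketch with placeholder bytes (in practice byte_response would come from a text-to-speech API; the output path is hypothetical and the 'audio' directory is assumed to exist):

    from pathlib import Path
    from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes

    byte_response = b"..."  # placeholder for real audio bytes
    save_audio_from_bytes(byte_response, Path("audio") / "speech.mp3")
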
16 changes: 11 additions & 5 deletions scrapegraphai/utils/token_calculator.py
@@ -8,15 +8,21 @@

 def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
     """
-    It creates a list of strings to create max dimension tokenizable elements
+    Truncates text into chunks that are small enough to be processed by the specified LLM model.
     Args:
-        text (str): The input text to be truncated into tokenizable elements.
-        model (str): The name of the language model to be used.
-        encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
+        text (str): The input text to be truncated.
+        model (str): The name of the LLM model used to determine the maximum token limit.
+        encoding_name (str): The encoding strategy used to encode the text before truncation.
     Returns:
-        List[str]: A list of tokenizable elements created from the input text.
+        List[str]: A list of text chunks, each within the token limit of the specified model.
+    Example:
+        >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING")
+        ["This is a sample text", "for truncation."]
+    This function ensures that each chunk of text can be tokenized by the specified model without exceeding the model's token limit.
     """

     encoding = tiktoken.get_encoding(encoding_name)
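
A hedged sketch: 'cl100k_base' is a standard tiktoken encoding name, but the set of valid 'model' values depends on the library's internal token tables, so both arguments here are assumptions:

    from scrapegraphai.utils.token_calculator import truncate_text_tokens

    chunks = truncate_text_tokens("This is a sample text for truncation.",
                                  "gpt-3.5-turbo", "cl100k_base")
    print(chunks)  # list of chunks, each within the model's token limit
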
