From df271b6451c1d588c4b803ef9dbebf76fa1fd9ca Mon Sep 17 00:00:00 2001 From: mayurdb Date: Sat, 11 May 2024 16:39:55 +0530 Subject: [PATCH 1/3] Add search link node that can find out relevant links in the webpage --- examples/openai/deep_scraper_openai.py | 46 ++++++++ scrapegraphai/graphs/__init__.py | 1 + scrapegraphai/graphs/deep_scraper_graph.py | 116 ++++++++++++++++++ scrapegraphai/nodes/search_link_node.py | 130 +++++++-------------- 4 files changed, 205 insertions(+), 88 deletions(-) create mode 100644 examples/openai/deep_scraper_openai.py create mode 100644 scrapegraphai/graphs/deep_scraper_graph.py diff --git a/examples/openai/deep_scraper_openai.py b/examples/openai/deep_scraper_openai.py new file mode 100644 index 00000000..df0e480f --- /dev/null +++ b/examples/openai/deep_scraper_openai.py @@ -0,0 +1,46 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DeepScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-4", + }, + "verbose": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +deep_scraper_graph = DeepScraperGraph( + prompt="List me all the job titles and detailed job description.", + # also accepts a string with the already downloaded HTML code + source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", + config=graph_config +) + +result = deep_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = deep_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 64b8241c..9afaf7ed 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -5,6 +5,7 @@ from .abstract_graph import AbstractGraph from .base_graph import BaseGraph from .smart_scraper_graph import SmartScraperGraph +from .deep_scraper_graph import DeepScraperGraph from .speech_graph import SpeechGraph from .search_graph import SearchGraph from .script_creator_graph import ScriptCreatorGraph diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py new file mode 100644 index 00000000..fe225b9a --- /dev/null +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -0,0 +1,116 @@ +""" +DeepScraperGraph Module +""" + +from .base_graph import BaseGraph +from ..nodes import ( + FetchNode, + SearchLinkNode, + ParseNode, + RAGNode, + GenerateAnswerNode +) +from .abstract_graph import AbstractGraph + + +class DeepScraperGraph(AbstractGraph): + """ + [WIP] + + DeepScraper is a scraping pipeline that automates the process of + extracting information from web pages + using a natural language model to interpret and answer prompts. + + Unlike SmartScraper, DeepScraper can navigate to the links within the input webpage, + to fuflfil the task within the prompt. + + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, + configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + Example: + >>> smart_scraper = DeepScraperGraph( + ... "List me all the job titles and detailed job description.", + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = smart_scraper.run() + ) + """ + + def __init__(self, prompt: str, source: str, config: dict): + super().__init__(prompt, config, source) + + self.input_key = "url" if source.startswith("http") else "local_dir" + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping. + Returns: + BaseGraph: A graph instance representing the web scraping workflow. + """ + fetch_node = FetchNode( + input="url | local_dir", + output=["doc"] + ) + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "chunk_size": self.model_token + } + ) + rag_node = RAGNode( + input="user_prompt & (parsed_doc | doc)", + output=["relevant_chunks"], + node_config={ + "llm_model": self.llm_model, + "embedder_model": self.embedder_model + } + ) + search_node = SearchLinkNode( + input="user_prompt & relevant_chunks", + output=["relevant_links"], + node_config={ + "llm_model": self.llm_model, + "embedder_model": self.embedder_model + } + ) + + return BaseGraph( + nodes=[ + fetch_node, + parse_node, + rag_node, + search_node + ], + edges=[ + (fetch_node, parse_node), + (parse_node, rag_node), + (rag_node, search_node) + + ], + entry_point=fetch_node + ) + + def run(self) -> str: + """ + Executes the scraping process and returns the answer to the prompt. + Returns: + str: The answer to the prompt. + """ + + inputs = {"user_prompt": self.prompt, self.input_key: self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 002c9d21..f7253e21 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -19,8 +19,15 @@ class SearchLinkNode(BaseNode): """ - A node that look for all the links in a web page and returns them. - It initially tries to extract the links using classical methods, if it fails it uses the LLM to extract the links. + A node that can filter out the relevant links in the webpage content. + Node expects the aleready scrapped information and hence it is expected + that this node be used after the FetchNode. + + For the links which are not incomplete and hence in-navigable. the node will complete + the url and return, + + For example: link /projects/rotary-pendulum-rl/ on https://perinim.github.io/projects/, + would be augmented to return https://perinim.github.io/projects/rotary-pendulum-rl/ Attributes: llm_model: An instance of the language model client used for generating answers. @@ -43,8 +50,8 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict] = def execute(self, state: dict) -> dict: """ - Generates a list of links by extracting them from the provided HTML content. - First, it tries to extract the links using classical methods, if it fails it uses the LLM to extract the links. + Filter out relevant links from the webpage that are relavant to prompt. Out of the filtered links, also + ensure that all links are navigable. Args: state (dict): The current state of the graph. The input keys will be used to fetch the @@ -64,89 +71,36 @@ def execute(self, state: dict) -> dict: # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys - doc = [state[key] for key in input_keys] - - try: - links = [] - for elem in doc: - soup = BeautifulSoup(elem.content, 'html.parser') - links.append(soup.find_all("a")) - state.update({self.output[0]: {elem for elem in links}}) - - except Exception: - if self.verbose: - print( - "Error extracting links using classical methods. Using LLM to extract links.") - - output_parser = JsonOutputParser() - - template_chunks = """ - You are a website scraper and you have just scraped the - following content from a website. - You are now asked to find all the links inside this page.\n - The website is big so I am giving you one chunk at the time to be merged later with the other chunks.\n - Ignore all the context sentences that ask you not to extract information from the html code.\n - Content of {chunk_id}: {context}. \n + user_prompt = state[input_keys[0]] + parsed_content_chunks = state[input_keys[1]] + output_parser = JsonOutputParser() + + prompt_relevant_links = """ + You are a website scraper and you have just scraped the following content from a website. + Content: {content} + You are now asked to find all relevant links from the extracted webpage content related + to prompt {user_prompt}. Only pick links which are valid and relevant + Output only a list of relevant links in the format: + [ + "link1", + "link2", + "link3", + . + . + . + ] """ - - template_no_chunks = """ - You are a website scraper and you have just scraped the - following content from a website. - You are now asked to find all the links inside this page.\n - Ignore all the context sentences that ask you not to extract information from the html code.\n - Website content: {context}\n - """ - - template_merge = """ - You are a website scraper and you have just scraped the - all these links. \n - You have scraped many chunks since the website is big and now you are asked to merge them into a single answer without repetitions (if there are any).\n - Links: {context}\n - """ - - chains_dict = {} - - # Use tqdm to add progress bar - for i, chunk in enumerate(tqdm(doc, desc="Processing chunks")): - if len(doc) == 1: - prompt = PromptTemplate( - template=template_no_chunks, - input_variables=["question"], - partial_variables={"context": chunk.page_content, - }, - ) - else: - prompt = PromptTemplate( - template=template_chunks, - input_variables=["question"], - partial_variables={"context": chunk.page_content, - "chunk_id": i + 1, - }, - ) - - # Dynamically name the chains based on their index - chain_name = f"chunk{i+1}" - chains_dict[chain_name] = prompt | self.llm_model | output_parser - - if len(chains_dict) > 1: - # Use dictionary unpacking to pass the dynamically named chains to RunnableParallel - map_chain = RunnableParallel(**chains_dict) - # Chain - answer = map_chain.invoke() - # Merge the answers from the chunks - merge_prompt = PromptTemplate( - template=template_merge, - input_variables=["context", "question"], - ) - merge_chain = merge_prompt | self.llm_model | output_parser - answer = merge_chain.invoke( - {"context": answer}) - else: - # Chain - single_chain = list(chains_dict.values())[0] - answer = single_chain.invoke() - - # Update the state with the generated answer - state.update({self.output[0]: answer}) + relevant_links = [] + + for i, chunk in enumerate(tqdm(parsed_content_chunks, desc="Processing chunks", disable=not self.verbose)): + merge_prompt = PromptTemplate( + template=prompt_relevant_links, + input_variables=["content", "user_prompt"], + ) + merge_chain = merge_prompt | self.llm_model | output_parser + # merge_chain = merge_prompt | self.llm_model + answer = merge_chain.invoke( + {"content": chunk.page_content, "user_prompt": user_prompt}) + relevant_links += answer + state.update({self.output[0]: relevant_links}) return state From 8f1fbe7e10dc9ef7cc58437f1e1fcb9e082b5c8f Mon Sep 17 00:00:00 2001 From: mayurdb Date: Sat, 11 May 2024 16:52:36 +0530 Subject: [PATCH 2/3] minor changes --- examples/openai/deep_scraper_openai.py | 3 ++- scrapegraphai/graphs/deep_scraper_graph.py | 6 +++--- scrapegraphai/utils/cleanup_html.py | 2 -- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/openai/deep_scraper_openai.py b/examples/openai/deep_scraper_openai.py index df0e480f..f87d7cb5 100644 --- a/examples/openai/deep_scraper_openai.py +++ b/examples/openai/deep_scraper_openai.py @@ -43,4 +43,5 @@ # ************************************************ graph_exec_info = deep_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) +print(deep_scraper_graph.get_state("relevant_links")) +print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py index fe225b9a..4d6d4d4b 100644 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -39,12 +39,12 @@ class DeepScraperGraph(AbstractGraph): source (str): The source of the graph. config (dict): Configuration parameters for the graph. Example: - >>> smart_scraper = DeepScraperGraph( + >>> deep_scraper = DeepScraperGraph( ... "List me all the job titles and detailed job description.", - ... "https://en.wikipedia.org/wiki/Chioggia", + ... "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", ... {"llm": {"model": "gpt-3.5-turbo"}} ... ) - >>> result = smart_scraper.run() + >>> result = deep_scraper.run() ) """ diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index bc16a99b..226e4a8b 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -45,9 +45,7 @@ def cleanup_html(html_content: str, base_url: str) -> str: if body_content: # Minify the HTML within the body tag minimized_body = minify(str(body_content)) - print("Came here") return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) - print("No Came here") return "Title: " + title + ", Body: No body content found" + ", Links: " + str(link_urls) \ No newline at end of file From 9a67a26cd3aad0ea53ffdf0a90804e51a43cf4e9 Mon Sep 17 00:00:00 2001 From: mayurdb Date: Sat, 11 May 2024 16:57:22 +0530 Subject: [PATCH 3/3] Update documentation --- scrapegraphai/nodes/search_link_node.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index f7253e21..bf64b5d9 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -5,7 +5,6 @@ # Imports from standard library from typing import List, Optional from tqdm import tqdm -from bs4 import BeautifulSoup # Imports from Langchain @@ -19,15 +18,9 @@ class SearchLinkNode(BaseNode): """ - A node that can filter out the relevant links in the webpage content. - Node expects the aleready scrapped information and hence it is expected + A node that can filter out the relevant links in the webpage content for the user prompt. + Node expects the aleready scrapped links on the webpage and hence it is expected that this node be used after the FetchNode. - - For the links which are not incomplete and hence in-navigable. the node will complete - the url and return, - - For example: link /projects/rotary-pendulum-rl/ on https://perinim.github.io/projects/, - would be augmented to return https://perinim.github.io/projects/rotary-pendulum-rl/ Attributes: llm_model: An instance of the language model client used for generating answers.