From e6c7940a57929c2ed8c9fda1a6e375cc87a2b7f4 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 12 Jun 2024 12:29:14 +0200 Subject: [PATCH 1/6] feat: add Parse_Node --- scrapegraphai/graphs/pdf_scraper_graph.py | 14 +++++++++++++- scrapegraphai/graphs/smart_scraper_graph.py | 3 ++- scrapegraphai/nodes/parse_node.py | 20 ++++++++++++++------ 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index ca79df41..c476e629 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -11,6 +11,7 @@ from ..nodes import ( FetchNode, + ParseNode, RAGNode, GenerateAnswerPDFNode ) @@ -66,6 +67,15 @@ def _create_graph(self) -> BaseGraph: output=["doc"], ) + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "parse_html": False, + "chunk_size": self.model_token + } + ) + rag_node = RAGNode( input="user_prompt & (parsed_doc | doc)", output=["relevant_chunks"], @@ -86,11 +96,13 @@ def _create_graph(self) -> BaseGraph: return BaseGraph( nodes=[ fetch_node, + parse_node, rag_node, generate_answer_node_pdf, ], edges=[ - (fetch_node, rag_node), + (fetch_node, parse_node), + (parse_node, rag_node), (rag_node, generate_answer_node_pdf) ], entry_point=fetch_node diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 85b292c3..35ff3df4 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -3,8 +3,8 @@ """ from typing import Optional +import logging from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph @@ -70,6 +70,7 @@ def _create_graph(self) -> BaseGraph: } ) logging.info("FetchNode configured with headless: %s", self.config.get("headless", True)) + parse_node = ParseNode( input="doc", output=["parsed_doc"], diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 3e77b3e9..5585ae80 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -70,12 +70,20 @@ def execute(self, state: dict) -> dict: docs_transformed = input_data[0] if self.parse_html: docs_transformed = Html2TextTransformer().transform_documents(input_data[0]) - docs_transformed = docs_transformed[0] - - chunks = chunk(text=docs_transformed.page_content, - chunk_size= self.node_config.get("chunk_size", 4096), - token_counter=lambda x: len(x.split()), - memoize=False) + docs_transformed = docs_transformed[0] + + chunks = chunk(text=docs_transformed.page_content, + chunk_size= self.node_config.get("chunk_size", 4096), + token_counter=lambda x: len(x.split()), + memoize=False) + else: + docs_transformed = docs_transformed[0] + + chunks = chunk(text=docs_transformed, + chunk_size= self.node_config.get("chunk_size", 4096), + token_counter=lambda x: len(x.split()), + memoize=False) + state.update({self.output[0]: chunks}) return state From e45f159a31f5dca98659d56c31aa68a0f4503499 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 12 Jun 2024 14:59:10 +0200 Subject: [PATCH 2/6] enhanced performance and readibility --- examples/local_models/pdf_scraper_ollama.py | 2 +- scrapegraphai/nodes/generate_answer_node.py | 11 ++++------- scrapegraphai/nodes/generate_answer_pdf_node.py | 7 +++---- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/examples/local_models/pdf_scraper_ollama.py b/examples/local_models/pdf_scraper_ollama.py index 
819fabca..d79afb3a 100644 --- a/examples/local_models/pdf_scraper_ollama.py +++ b/examples/local_models/pdf_scraper_ollama.py @@ -5,7 +5,7 @@ graph_config = { "llm": { - "model": "ollama/llama3", + "model": "ollama/mistral", "temperature": 0, "format": "json", # Ollama needs the format to be specified explicitly "model_tokens": 4000, diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index c6b8c388..62955ea9 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -99,7 +99,8 @@ def execute(self, state: dict) -> dict: input_variables=["question"], partial_variables={"context": chunk.page_content, "format_instructions": format_instructions}) - + chain = prompt | self.llm_model | output_parser + answer = chain.invoke({"question": user_prompt}) else: prompt = PromptTemplate( template=template_chunks, @@ -125,11 +126,7 @@ def execute(self, state: dict) -> dict: ) merge_chain = merge_prompt | self.llm_model | output_parser answer = merge_chain.invoke({"context": answer, "question": user_prompt}) - else: - # Chain - single_chain = list(chains_dict.values())[0] - answer = single_chain.invoke({"question": user_prompt}) - # Update the state with the generated answer state.update({self.output[0]: answer}) - return state \ No newline at end of file + return state + \ No newline at end of file diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 897e1c56..bf003411 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -117,6 +117,9 @@ def execute(self, state): "format_instructions": format_instructions, }, ) + + chain = prompt | self.llm_model | output_parser + answer = chain.invoke({"question": user_prompt}) else: prompt = PromptTemplate( template=template_chunks_pdf, @@ -145,10 +148,6 @@ def execute(self, state): ) merge_chain = merge_prompt | self.llm_model | output_parser answer = merge_chain.invoke({"context": answer, "question": user_prompt}) - else: - # Chain - single_chain = list(chains_dict.values())[0] - answer = single_chain.invoke({"question": user_prompt}) # Update the state with the generated answer state.update({self.output[0]: answer}) From dc1340e302117a6bb5e5b12e6f51d097ff79cb47 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 12 Jun 2024 15:47:04 +0200 Subject: [PATCH 3/6] Update generate_answer_pdf_node.py --- scrapegraphai/nodes/generate_answer_pdf_node.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index bf003411..e58ae35e 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -113,7 +113,7 @@ def execute(self, state): template=template_no_chunks_pdf, input_variables=["question"], partial_variables={ - "context":chunk, + "context":chunk.page_content, "format_instructions": format_instructions, }, ) @@ -150,5 +150,5 @@ def execute(self, state): answer = merge_chain.invoke({"context": answer, "question": user_prompt}) # Update the state with the generated answer - state.update({self.output[0]: answer}) + state.update({self.output[0]: answer.get("Response", {})}) return state From 1705046cc7dc74911517833698a7e7c4ad31fa7a Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 12 Jun 2024 18:00:12 +0200 Subject: [PATCH 4/6] Update pdf_scraper_graph.py --- 
scrapegraphai/graphs/pdf_scraper_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index c476e629..6980daf2 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -119,4 +119,4 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") + return self.final_state.get("answer", "No answer found.")[0] From 91c5b5af43134671f4d5c801ee315f935b4fed4f Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Fri, 14 Jun 2024 14:59:12 +0200 Subject: [PATCH 5/6] fix(multi): updated multi pdf scraper with schema --- examples/openai/pdf_scraper_graph_openai.py | 2 +- examples/openai/pdf_scraper_multi_openai.py | 61 +++++++++---------- scrapegraphai/graphs/pdf_scraper_graph.py | 2 +- scrapegraphai/graphs/pdf_scraper_multi.py | 6 +- .../nodes/generate_answer_pdf_node.py | 2 +- 5 files changed, 36 insertions(+), 37 deletions(-) diff --git a/examples/openai/pdf_scraper_graph_openai.py b/examples/openai/pdf_scraper_graph_openai.py index e07a7ab5..59f36a9d 100644 --- a/examples/openai/pdf_scraper_graph_openai.py +++ b/examples/openai/pdf_scraper_graph_openai.py @@ -32,7 +32,7 @@ pdf_scraper_graph = PDFScraperGraph( prompt="Summarize the text and find the main topics", - source=source, + source="a.pdf", config=graph_config, ) result = pdf_scraper_graph.run() diff --git a/examples/openai/pdf_scraper_multi_openai.py b/examples/openai/pdf_scraper_multi_openai.py index 8b6c57a1..9e699e58 100644 --- a/examples/openai/pdf_scraper_multi_openai.py +++ b/examples/openai/pdf_scraper_multi_openai.py @@ -6,55 +6,50 @@ from dotenv import load_dotenv from scrapegraphai.graphs import PdfScraperMultiGraph +from pydantic import BaseModel, Field +from typing import List + load_dotenv() openai_key = os.getenv("OPENAI_APIKEY") +# ************************************************ +# Define the configuration for the graph +# ************************************************ + graph_config = { "llm": { "api_key": openai_key, "model": "gpt-3.5-turbo", }, + "verbose": True, } -# Covert to list -sources = [ - "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", - "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. 
We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", - "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", - "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", -] - -prompt = """ -You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements: - -Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables. -Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable. -Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV. 
-Response Format: For each abstract, present your response in the following structured format: +# ************************************************ +# Define the output schema for the graph +# ************************************************ -Independent Variable (IV): -Dependent Variable (DV): -Exogenous Shock: +class Article(BaseModel): + independent_variable: str = Field(description="(IV): The variable that is manipulated or considered as the primary cause affecting other variables.") + dependent_variable: str = Field(description="(DV) The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable.") + exogenous_shock: str = Field(description="Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV.") -Example Queries and Responses: +class Articles(BaseModel): + articles: List[Article] -Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking. +# ************************************************ +# Define the sources for the graph +# ************************************************ -Response: - -Independent Variable (IV): Employee happiness. -Dependent Variable (DV): Overall firm productivity. -Exogenous Shock: Sudden company-wide increase in bonus payments. - -Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons. 
- -Response: +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons." +] -Independent Variable (IV): Exposure to social media. -Dependent Variable (DV): Mental health outcomes. -Exogenous Shock: staggered introduction of Facebook across U.S. colleges. +prompt = """ +Analyze the abstracts provided from an academic journal article to extract and clearly identify the Independent Variable (IV), Dependent Variable (DV), and Exogenous Shock. 
""" + # ******************************************************* # Create the SmartScraperMultiGraph instance and run it # ******************************************************* @@ -62,7 +57,7 @@ multiple_search_graph = PdfScraperMultiGraph( prompt=prompt, source= sources, - schema=None, + schema=Articles, config=graph_config ) diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 6980daf2..c476e629 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -119,4 +119,4 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.")[0] + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/pdf_scraper_multi.py b/scrapegraphai/graphs/pdf_scraper_multi.py index 125d70a0..60e81bf7 100644 --- a/scrapegraphai/graphs/pdf_scraper_multi.py +++ b/scrapegraphai/graphs/pdf_scraper_multi.py @@ -4,6 +4,7 @@ from copy import copy, deepcopy from typing import List, Optional +from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph @@ -43,7 +44,7 @@ class PdfScraperMultiGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None): + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) @@ -52,6 +53,8 @@ def __init__(self, prompt: str, source: List[str], config: dict, schema: Optiona else: self.copy_config = deepcopy(config) + self.copy_schema = deepcopy(schema) + super().__init__(prompt, config, source, schema) def _create_graph(self) -> BaseGraph: @@ -70,6 +73,7 @@ def _create_graph(self) -> BaseGraph: prompt="", source="", config=self.copy_config, + schema=self.copy_schema ) # ************************************************ diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index e58ae35e..065f3b94 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -150,5 +150,5 @@ def execute(self, state): answer = merge_chain.invoke({"context": answer, "question": user_prompt}) # Update the state with the generated answer - state.update({self.output[0]: answer.get("Response", {})}) + state.update({self.output[0]: answer}) return state From 203de834051ea1d6443841921f3aa3e6adbd9174 Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Fri, 14 Jun 2024 15:20:30 +0200 Subject: [PATCH 6/6] fix(pdf): correctly read .pdf files --- ...ni_search_graph_openai.py => omni_search_openai.py} | 0 ...f_scraper_graph_openai.py => pdf_scraper_openai.py} | 2 +- scrapegraphai/nodes/fetch_node.py | 4 +++- scrapegraphai/nodes/parse_node.py | 10 +++++++++- 4 files changed, 13 insertions(+), 3 deletions(-) rename examples/openai/{omni_search_graph_openai.py => omni_search_openai.py} (100%) rename examples/openai/{pdf_scraper_graph_openai.py => pdf_scraper_openai.py} (97%) diff --git a/examples/openai/omni_search_graph_openai.py b/examples/openai/omni_search_openai.py similarity index 100% rename from examples/openai/omni_search_graph_openai.py rename to examples/openai/omni_search_openai.py diff --git a/examples/openai/pdf_scraper_graph_openai.py b/examples/openai/pdf_scraper_openai.py similarity 
index 97% rename from examples/openai/pdf_scraper_graph_openai.py rename to examples/openai/pdf_scraper_openai.py index 59f36a9d..6267baea 100644 --- a/examples/openai/pdf_scraper_graph_openai.py +++ b/examples/openai/pdf_scraper_openai.py @@ -32,7 +32,7 @@ pdf_scraper_graph = PDFScraperGraph( prompt="Summarize the text and find the main topics", - source="a.pdf", + source="Laureaconanniaccademici.pdf", config=graph_config, ) result = pdf_scraper_graph.run() diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index dbdd9925..df12a26f 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -95,8 +95,10 @@ def execute(self, state): state.update({self.output[0]: compressed_document}) return state - # handling for pdf + # handling pdf elif input_keys[0] == "pdf": + + # TODO: fix bytes content issue loader = PyPDFLoader(source) compressed_document = loader.load() state.update({self.output[0]: compressed_document}) diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 5585ae80..9c24edb6 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -5,6 +5,7 @@ from typing import List, Optional from semchunk import chunk from langchain_community.document_transformers import Html2TextTransformer +from langchain_core.documents import Document from ..utils.logging import get_logger from .base_node import BaseNode @@ -79,10 +80,17 @@ def execute(self, state: dict) -> dict: else: docs_transformed = docs_transformed[0] - chunks = chunk(text=docs_transformed, + if type(docs_transformed) == Document: + chunks = chunk(text=docs_transformed.page_content, chunk_size= self.node_config.get("chunk_size", 4096), token_counter=lambda x: len(x.split()), memoize=False) + else: + + chunks = chunk(text=docs_transformed, + chunk_size= self.node_config.get("chunk_size", 4096), + token_counter=lambda x: len(x.split()), + memoize=False) state.update({self.output[0]: chunks})
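
For reference, a minimal standalone sketch of the chunking branch that PATCH 6/6 adds to ParseNode.execute. The helper name split_into_chunks is illustrative and not part of the scrapegraphai API; the chunk() call mirrors the semchunk usage shown in the diff above. isinstance() is used here in place of the patch's type(docs_transformed) == Document comparison, which is the more idiomatic check and also covers Document subclasses.

from semchunk import chunk
from langchain_core.documents import Document

def split_into_chunks(doc, chunk_size: int = 4096) -> list:
    """Chunk either a langchain Document (e.g. loaded by PyPDFLoader) or a plain string."""
    # PDF sources arrive as Document objects; plain-text sources arrive as str.
    text = doc.page_content if isinstance(doc, Document) else doc
    return chunk(
        text=text,
        chunk_size=chunk_size,
        token_counter=lambda x: len(x.split()),  # whitespace token count, as in the diff
        memoize=False,
    )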
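
Similarly, a sketch of the single-pass prompt | llm_model | output_parser pipeline that PATCH 2/6 adds to the generate-answer nodes for the no-chunking case. The concrete model and parser below are stand-in assumptions (the real nodes receive llm_model and output_parser from their configuration); only the composition pattern is taken from the diff, and running it requires an OPENAI_API_KEY.

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI  # stand-in model; the nodes use whatever LLM is configured

output_parser = JsonOutputParser()
prompt = PromptTemplate(
    template=("Answer the question using only this context:\n{context}\n"
              "{format_instructions}\nQuestion: {question}"),
    input_variables=["question"],
    partial_variables={
        "context": "example page content",
        "format_instructions": output_parser.get_format_instructions(),
    },
)

# LCEL composition: the prompt output feeds the model, whose output feeds the parser.
chain = prompt | ChatOpenAI(model="gpt-3.5-turbo") | output_parser
answer = chain.invoke({"question": "Summarize the text and find the main topics"})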