Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions examples/gemini/script_generator_gemini.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
Basic example of scraping pipeline using ScriptCreatorGraph
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()


# ************************************************
# Define the configuration for the graph
# ************************************************

gemini_key = os.getenv("GOOGLE_APIKEY")

graph_config = {
"llm": {
"api_key": gemini_key,
"model": "gpt-3.5-turbo",
},
}

# ************************************************
# Create the ScriptCreatorGraph instance and run it
# ************************************************

smart_scraper_graph = ScriptCreatorGraph(
prompt="List me all the news with their description.",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects",
config=graph_config
)

result = smart_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
43 changes: 43 additions & 0 deletions examples/local_models/Docker/script_generator_docker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
Basic example of scraping pipeline using ScriptCreatorGraph
"""
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json",
# "model_tokens": 2000, # set context length arbitrarily,
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
}
}

# ************************************************
# Create the ScriptCreatorGraph instance and run it
# ************************************************

smart_scraper_graph = ScriptCreatorGraph(
prompt="List me all the news with their description.",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects",
config=graph_config
)

result = smart_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
44 changes: 44 additions & 0 deletions examples/local_models/Ollama/script_generator_ollama.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
Basic example of scraping pipeline using ScriptCreatorGraph
"""
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
}
}

# ************************************************
# Create the ScriptCreatorGraph instance and run it
# ************************************************

smart_scraper_graph = ScriptCreatorGraph(
prompt="List me all the news with their description.",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects",
config=graph_config
)

result = smart_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
44 changes: 44 additions & 0 deletions examples/openai/script_generator_openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
Basic example of scraping pipeline using ScriptCreatorGraph
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
"llm": {
"api_key": openai_key,
"model": "gpt-3.5-turbo",
},
}

# ************************************************
# Create the ScriptCreatorGraph instance and run it
# ************************************************

smart_scraper_graph = ScriptCreatorGraph(
prompt="List me all the news with their description.",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects",
config=graph_config
)

result = smart_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
1 change: 1 addition & 0 deletions examples/openai/smart_scraper_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()


Expand Down
1 change: 1 addition & 0 deletions scrapegraphai/graphs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from .smart_scraper_graph import SmartScraperGraph
from .speech_graph import SpeechGraph
from .search_graph import SearchGraph
from .script_creator_graph import ScriptCreatorGraph
77 changes: 77 additions & 0 deletions scrapegraphai/graphs/script_creator_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
Module for creating the smart scraper
"""
from .base_graph import BaseGraph
from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateScraperNode
)
from .abstract_graph import AbstractGraph


class ScriptCreatorGraph(AbstractGraph):
"""
SmartScraper is a comprehensive web scraping tool that automates the process of extracting
information from web pages using a natural language model to interpret and answer prompts.
"""

def __init__(self, prompt: str, source: str, config: dict):
"""
Initializes the ScriptCreatorGraph with a prompt, source, and configuration.
"""
super().__init__(prompt, config, source)

self.input_key = "url" if source.startswith("http") else "local_dir"

def _create_graph(self):
"""
Creates the graph of nodes representing the workflow for web scraping.
"""
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
)
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
node_config={"chunk_size": self.model_token}
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
node_config={
"llm": self.llm_model,
"embedder_model": self.embedder_model
}
)
generate_scraper_node = GenerateScraperNode(
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
node_config={"llm": self.llm_model},
)

return BaseGraph(
nodes={
fetch_node,
parse_node,
rag_node,
generate_scraper_node,
},
edges={
(fetch_node, parse_node),
(parse_node, rag_node),
(rag_node, generate_scraper_node)
},
entry_point=fetch_node
)

def run(self) -> str:
"""
Executes the web scraping process and returns the answer to the prompt.
"""
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

return self.final_state.get("answer", "No answer found.")
1 change: 1 addition & 0 deletions scrapegraphai/nodes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@
from .text_to_speech_node import TextToSpeechNode
from .image_to_text_node import ImageToTextNode
from .search_internet_node import SearchInternetNode
from .generate_scraper_node import GenerateScraperNode
Loading