diff --git a/README.md b/README.md
index b017bcd2..0add9705 100644
--- a/README.md
+++ b/README.md
@@ -8,13 +8,18 @@
[](https://discord.gg/gkxQDAjfeX)
-ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites, documents and XML files.
+ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, etc.).
+
Just say which information you want to extract and the library will do it for you!
+[](https://discord.gg/gkxQDAjfeX)
+[](https://www.linkedin.com/company/scrapegraphai/)
+[](https://twitter.com/scrapegraphai)
+
## π Quick install
@@ -48,11 +53,16 @@ The documentation for ScrapeGraphAI can be found [here](https://scrapegraph-ai.r
Check out also the docusaurus [documentation](https://scrapegraph-doc.onrender.com/).
## π» Usage
-You can use the `SmartScraper` class to extract information from a website using a prompt.
+There are three main scraping pipelines that can be used to extract information from a website (or local file):
+- `SmartScraperGraph`: single-page scraper that only needs a user prompt and an input source;
+- `SearchGraph`: multi-page scraper that extracts information from the top n search results of a search engine;
+- `SpeechGraph`: single-page scraper that extracts information from a website and generates an audio file.
+
+It is possible to use different LLMs through APIs, such as **OpenAI**, **Groq**, **Azure** and **Gemini**, or local models using **Ollama**.
-The `SmartScraper` class is a direct graph implementation that uses the most common nodes present in a web scraping pipeline. For more information, please see the [documentation](https://scrapegraph-ai.readthedocs.io/en/latest/).
-### Case 1: Extracting information using Ollama
-Remember to download the model on Ollama separately!
+### Case 1: SmartScraper using Local Models
+
+Remember to have [Ollama](https://ollama.com/) installed and download the models using the **ollama pull** command.
```python
from scrapegraphai.graphs import SmartScraperGraph
@@ -67,11 +77,12 @@ graph_config = {
"embeddings": {
"model": "ollama/nomic-embed-text",
"base_url": "http://localhost:11434", # set Ollama URL
- }
+ },
+ "verbose": True,
}
smart_scraper_graph = SmartScraperGraph(
- prompt="List me all the articles",
+ prompt="List me all the projects with their descriptions",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects",
config=graph_config
@@ -82,170 +93,92 @@ print(result)
```
-### Case 2: Extracting information using Docker
+The output will be a list of projects with their descriptions like the following:
-Note: before using the local model remember to create the docker container!
-```text
- docker-compose up -d
- docker exec -it ollama ollama pull stablelm-zephyr
-```
-You can use which models available on Ollama or your own model instead of stablelm-zephyr
```python
-from scrapegraphai.graphs import SmartScraperGraph
-
-graph_config = {
- "llm": {
- "model": "ollama/mistral",
- "temperature": 0,
- "format": "json", # Ollama needs the format to be specified explicitly
- # "model_tokens": 2000, # set context length arbitrarily
- },
-}
-
-smart_scraper_graph = SmartScraperGraph(
- prompt="List me all the articles",
- # also accepts a string with the already downloaded HTML code
- source="https://perinim.github.io/projects",
- config=graph_config
-)
-
-result = smart_scraper_graph.run()
-print(result)
+{'projects': [{'title': 'Rotary Pendulum RL', 'description': 'Open Source project aimed at controlling a real life rotary pendulum using RL algorithms'}, {'title': 'DQN Implementation from scratch', 'description': 'Developed a Deep Q-Network algorithm to train a simple and double pendulum'}, ...]}
```
+### Case 2: SearchGraph using Mixed Models
-### Case 3: Extracting information using Openai model
-```python
-from scrapegraphai.graphs import SmartScraperGraph
-OPENAI_API_KEY = "YOUR_API_KEY"
+We use **Groq** for the LLM and **Ollama** for the embeddings.
-graph_config = {
- "llm": {
- "api_key": OPENAI_API_KEY,
- "model": "gpt-3.5-turbo",
- },
-}
-
-smart_scraper_graph = SmartScraperGraph(
- prompt="List me all the articles",
- # also accepts a string with the already downloaded HTML code
- source="https://perinim.github.io/projects",
- config=graph_config
-)
-
-result = smart_scraper_graph.run()
-print(result)
-```
-
-### Case 4: Extracting information using Groq
```python
-from scrapegraphai.graphs import SmartScraperGraph
-from scrapegraphai.utils import prettify_exec_info
-
-groq_key = os.getenv("GROQ_APIKEY")
+from scrapegraphai.graphs import SearchGraph
+# Define the configuration for the graph
graph_config = {
"llm": {
"model": "groq/gemma-7b-it",
- "api_key": groq_key,
+ "api_key": "GROQ_API_KEY",
"temperature": 0
},
"embeddings": {
"model": "ollama/nomic-embed-text",
- "temperature": 0,
- "base_url": "http://localhost:11434",
+ "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
- "headless": False
+ "max_results": 5,
}
-smart_scraper_graph = SmartScraperGraph(
- prompt="List me all the projects with their description and the author.",
- source="https://perinim.github.io/projects",
+# Create the SearchGraph instance
+search_graph = SearchGraph(
+ prompt="List me all the traditional recipes from Chioggia",
config=graph_config
)
-result = smart_scraper_graph.run()
+# Run the graph
+result = search_graph.run()
print(result)
```
+The output will be a list of recipes like the following:
-### Case 5: Extracting information using Azure
```python
-from langchain_openai import AzureChatOpenAI
-from langchain_openai import AzureOpenAIEmbeddings
-
-lm_model_instance = AzureChatOpenAI(
- openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
- azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
-)
-
-embedder_model_instance = AzureOpenAIEmbeddings(
- azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
- openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-)
-graph_config = {
- "llm": {"model_instance": llm_model_instance},
- "embeddings": {"model_instance": embedder_model_instance}
-}
-
-smart_scraper_graph = SmartScraperGraph(
- prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
- event_end_date, event_end_time, location, event_mode, event_category,
- third_party_redirect, no_of_days,
- time_in_hours, hosted_or_attending, refreshments_type,
- registration_available, registration_link""",
- source="https://www.hmhco.com/event",
- config=graph_config
-)
+{'recipes': [{'name': 'Sarde in SaΓ²re'}, {'name': 'Bigoli in salsa'}, {'name': 'Seppie in umido'}, {'name': 'Moleche frite'}, {'name': 'Risotto alla pescatora'}, {'name': 'Broeto'}, {'name': 'Bibarasse in Cassopipa'}, {'name': 'Risi e bisi'}, {'name': 'Smegiassa Ciosota'}]}
```
+### Case 3: SpeechGraph using OpenAI
+
+You just need to pass the OpenAI API key and the model name.
-### Case 6: Extracting information using Gemini
```python
-from scrapegraphai.graphs import SmartScraperGraph
-GOOGLE_APIKEY = "YOUR_API_KEY"
+from scrapegraphai.graphs import SpeechGraph
-# Define the configuration for the graph
graph_config = {
"llm": {
- "api_key": GOOGLE_APIKEY,
- "model": "gemini-pro",
+ "api_key": "OPENAI_API_KEY",
+ "model": "gpt-3.5-turbo",
},
+ "tts_model": {
+ "api_key": "OPENAI_API_KEY",
+ "model": "tts-1",
+ "voice": "alloy"
+ },
+ "output_path": "audio_summary.mp3",
}
-# Create the SmartScraperGraph instance
-smart_scraper_graph = SmartScraperGraph(
- prompt="List me all the articles",
- source="https://perinim.github.io/projects",
- config=graph_config
+# ************************************************
+# Create the SpeechGraph instance and run it
+# ************************************************
+
+speech_graph = SpeechGraph(
+ prompt="Make a detailed audio summary of the projects.",
+ source="https://perinim.github.io/projects/",
+ config=graph_config,
)
-result = smart_scraper_graph.run()
+result = speech_graph.run()
print(result)
-```
-The output for all 3 the cases will be a dictionary with the extracted information, for example:
-
-```bash
-{
- 'titles': [
- 'Rotary Pendulum RL'
- ],
- 'descriptions': [
- 'Open Source project aimed at controlling a real life rotary pendulum using RL algorithms'
- ]
-}
```
+The output will be an audio file with the summary of the projects on the page.
+
## π€ Contributing
Feel free to contribute and join our Discord server to discuss with us improvements and give us suggestions!
Please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/CONTRIBUTING.md).
-[](https://discord.gg/gkxQDAjfeX)
-[](https://www.linkedin.com/company/scrapegraphai/)
-[](https://twitter.com/scrapegraphai)
-
## π Roadmap
Check out the project roadmap [here](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/README.md)! π
@@ -253,6 +186,10 @@ Wanna visualize the roadmap in a more interactive way? Check out the [markmap](h
## β€οΈ Contributors
[](https://github.com/VinciGit00/Scrapegraph-ai/graphs/contributors)
+## Sponsors
+
+
+
## π Citations
If you have used our library for research purposes please quote us with the following reference:
@@ -269,7 +206,7 @@ If you have used our library for research purposes please quote us with the foll
## Authors
-
+
| | Contact Info |
diff --git a/docs/assets/serp_api_logo.png b/docs/assets/serp_api_logo.png
new file mode 100644
index 00000000..ff2f1b01
Binary files /dev/null and b/docs/assets/serp_api_logo.png differ
diff --git a/examples/groq/search_graph_groq_openai.py b/examples/groq/search_graph_groq_openai.py
new file mode 100644
index 00000000..3d581063
--- /dev/null
+++ b/examples/groq/search_graph_groq_openai.py
@@ -0,0 +1,46 @@
+"""
+Basic example of scraping pipeline using SearchGraph
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+groq_key = os.getenv("GROQ_APIKEY")
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+ "llm": {
+ "model": "groq/gemma-7b-it",
+ "api_key": groq_key,
+ "temperature": 0
+ },
+ "embeddings": {
+ "api_key": openai_key,
+ "model": "openai",
+ },
+ "headless": False
+}
+
+search_graph = SearchGraph(
+    prompt="List me the best excursions near Trento",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
diff --git a/examples/local_models/README.md b/examples/local_models/README.md
deleted file mode 100644
index e69de29b..00000000
diff --git a/examples/local_models/Ollama/csv_scraper_ollama.py b/examples/local_models/csv_scraper_ollama.py
similarity index 100%
rename from examples/local_models/Ollama/csv_scraper_ollama.py
rename to examples/local_models/csv_scraper_ollama.py
diff --git a/examples/local_models/Ollama/inputs/books.xml b/examples/local_models/inputs/books.xml
similarity index 100%
rename from examples/local_models/Ollama/inputs/books.xml
rename to examples/local_models/inputs/books.xml
diff --git a/examples/local_models/Ollama/inputs/example.json b/examples/local_models/inputs/example.json
similarity index 100%
rename from examples/local_models/Ollama/inputs/example.json
rename to examples/local_models/inputs/example.json
diff --git a/examples/local_models/Ollama/inputs/plain_html_example.txt b/examples/local_models/inputs/plain_html_example.txt
similarity index 100%
rename from examples/local_models/Ollama/inputs/plain_html_example.txt
rename to examples/local_models/inputs/plain_html_example.txt
diff --git a/examples/local_models/Ollama/inputs/username.csv b/examples/local_models/inputs/username.csv
similarity index 100%
rename from examples/local_models/Ollama/inputs/username.csv
rename to examples/local_models/inputs/username.csv
diff --git a/examples/local_models/Ollama/json_scraper_ollama.py b/examples/local_models/json_scraper_ollama.py
similarity index 100%
rename from examples/local_models/Ollama/json_scraper_ollama.py
rename to examples/local_models/json_scraper_ollama.py
diff --git a/examples/local_models/Ollama/scrape_plain_text_ollama.py b/examples/local_models/scrape_plain_text_ollama.py
similarity index 100%
rename from examples/local_models/Ollama/scrape_plain_text_ollama.py
rename to examples/local_models/scrape_plain_text_ollama.py
diff --git a/examples/local_models/Ollama/scrape_xml_ollama.py b/examples/local_models/scrape_xml_ollama.py
similarity index 100%
rename from examples/local_models/Ollama/scrape_xml_ollama.py
rename to examples/local_models/scrape_xml_ollama.py
diff --git a/examples/local_models/Ollama/script_generator_ollama.py b/examples/local_models/script_generator_ollama.py
similarity index 100%
rename from examples/local_models/Ollama/script_generator_ollama.py
rename to examples/local_models/script_generator_ollama.py
diff --git a/examples/local_models/Ollama/search_graph_ollama.py b/examples/local_models/search_graph_ollama.py
similarity index 100%
rename from examples/local_models/Ollama/search_graph_ollama.py
rename to examples/local_models/search_graph_ollama.py
diff --git a/examples/local_models/Ollama/smart_scraper_ollama.py b/examples/local_models/smart_scraper_ollama.py
similarity index 100%
rename from examples/local_models/Ollama/smart_scraper_ollama.py
rename to examples/local_models/smart_scraper_ollama.py
diff --git a/examples/local_models/Ollama/xml_scraper_ollama.py b/examples/local_models/xml_scraper_ollama.py
similarity index 100%
rename from examples/local_models/Ollama/xml_scraper_ollama.py
rename to examples/local_models/xml_scraper_ollama.py
diff --git a/pyproject.toml b/pyproject.toml
index af78d90c..f9d280fe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,24 +43,24 @@ classifiers = [
[tool.poetry.dependencies]
python = ">=3.9, <3.12"
langchain = "0.1.15"
-langchain-openai = "^0.1.6"
-langchain-google-genai = "^1.0.3"
-langchain-groq = "^0.1.3"
-langchain-aws = "^0.1.3"
-langchain-anthropic = "^0.1.11"
-html2text = "^2024.2.26"
-faiss-cpu = "^1.8.0"
-beautifulsoup4 = "^4.12.3"
-pandas = "^2.2.2"
-python-dotenv = "^1.0.1"
-tiktoken = "^0.6.0"
-tqdm = "^4.66.4"
-graphviz = "^0.20.3"
-minify-html = "^0.15.0"
-free-proxy = "^1.1.1"
-playwright = "^1.43.0"
-google = "^3.0.0"
-yahoo-search-py = "^0.3"
+langchain-openai = "0.1.6"
+langchain-google-genai = "1.0.3"
+langchain-groq = "0.1.3"
+langchain-aws = "0.1.3"
+langchain-anthropic = "0.1.11"
+html2text = "2024.2.26"
+faiss-cpu = "1.8.0"
+beautifulsoup4 = "4.12.3"
+pandas = "2.2.2"
+python-dotenv = "1.0.1"
+tiktoken = "0.6.0"
+tqdm = "4.66.4"
+graphviz = "0.20.3"
+minify-html = "0.15.0"
+free-proxy = "1.1.1"
+playwright = "1.43.0"
+google = "3.0.0"
+yahoo-search-py = "0.3"
[tool.poetry.dev-dependencies]
pytest = "8.0.0"
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
index 04dbf9d7..4614df80 100644
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@@ -3,6 +3,7 @@
"""
from abc import ABC, abstractmethod
from typing import Optional
+from langchain_aws import BedrockEmbeddings
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings, BedrockEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
@@ -148,12 +149,12 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
except KeyError as exc:
raise KeyError("Model not supported") from exc
return Gemini(llm_params)
- elif "claude" in llm_params["model"]:
+ elif llm_params["model"].startswith("claude"):
try:
self.model_token = models_tokens["claude"][llm_params["model"]]
except KeyError as exc:
raise KeyError("Model not supported") from exc
- return Claude(llm_params)
+ return Anthropic(llm_params)
elif "ollama" in llm_params["model"]:
llm_params["model"] = llm_params["model"].split("/")[-1]
diff --git a/tests/nodes/fetch_node_test.py b/tests/nodes/fetch_node_test.py
index 811c2daf..a67f3dbb 100644
--- a/tests/nodes/fetch_node_test.py
+++ b/tests/nodes/fetch_node_test.py
@@ -1,5 +1,5 @@
"""
-Module for testinh robot_node
+Module for testing fetch_node
"""
import pytest
from scrapegraphai.nodes import FetchNode
@@ -14,7 +14,7 @@ def setup():
# Define the node
# ************************************************
- robots_node = FetchNode(
+ fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={
@@ -22,14 +22,14 @@ def setup():
}
)
- return robots_node
+ return fetch_node
# ************************************************
# Test the node
# ************************************************
-def test_robots_node(setup):
+def test_fetch_node(setup):
"""
Run the tests
"""
@@ -40,8 +40,3 @@ def test_robots_node(setup):
result = setup.execute(state)
assert result is not None
-
-
-# If you need to run this script directly
-if __name__ == "__main__":
- pytest.main()
diff --git a/tests/nodes/robot_node_test.py b/tests/nodes/robot_node_test.py
index cae3a895..084522c4 100644
--- a/tests/nodes/robot_node_test.py
+++ b/tests/nodes/robot_node_test.py
@@ -55,8 +55,3 @@ def test_robots_node(setup):
result = setup.execute(state)
assert result is not None
-
-
-# If you need to run this script directly
-if __name__ == "__main__":
- pytest.main()