diff --git a/README.md b/README.md index b017bcd2..0add9705 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,18 @@ [![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX) -ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites, documents and XML files. +ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, etc.). + Just say which information you want to extract and the library will do it for you!

Scrapegraph-ai Logo

+[![My Skills](https://skillicons.dev/icons?i=discord)](https://discord.gg/gkxQDAjfeX) +[![My Skills](https://skillicons.dev/icons?i=linkedin)](https://www.linkedin.com/company/scrapegraphai/) +[![My Skills](https://skillicons.dev/icons?i=twitter)](https://twitter.com/scrapegraphai) + ## πŸš€ Quick install @@ -48,11 +53,16 @@ The documentation for ScrapeGraphAI can be found [here](https://scrapegraph-ai.r Check out also the docusaurus [documentation](https://scrapegraph-doc.onrender.com/). ## πŸ’» Usage -You can use the `SmartScraper` class to extract information from a website using a prompt. +There are three main scraping pipelines that can be used to extract information from a website (or local file): +- `SmartScraperGraph`: single-page scraper that only needs a user prompt and an input source; +- `SearchGraph`: multi-page scraper that extracts information from the top n search results of a search engine; +- `SpeechGraph`: single-page scraper that extracts information from a website and generates an audio file. + +It is possible to use different LLMs through APIs, such as **OpenAI**, **Groq**, **Azure** and **Gemini**, or local models using **Ollama**. -The `SmartScraper` class is a direct graph implementation that uses the most common nodes present in a web scraping pipeline. For more information, please see the [documentation](https://scrapegraph-ai.readthedocs.io/en/latest/). -### Case 1: Extracting information using Ollama -Remember to download the model on Ollama separately! +### Case 1: SmartScraper using Local Models + +Remember to have [Ollama](https://ollama.com/) installed and download the models using the **ollama pull** command. 
```python from scrapegraphai.graphs import SmartScraperGraph @@ -67,11 +77,12 @@ graph_config = { "embeddings": { "model": "ollama/nomic-embed-text", "base_url": "http://localhost:11434", # set Ollama URL - } + }, + "verbose": True, } smart_scraper_graph = SmartScraperGraph( - prompt="List me all the articles", + prompt="List me all the projects with their descriptions", # also accepts a string with the already downloaded HTML code source="https://perinim.github.io/projects", config=graph_config @@ -82,170 +93,92 @@ print(result) ``` -### Case 2: Extracting information using Docker +The output will be a list of projects with their descriptions like the following: -Note: before using the local model remember to create the docker container! -```text - docker-compose up -d - docker exec -it ollama ollama pull stablelm-zephyr -``` -You can use which models available on Ollama or your own model instead of stablelm-zephyr ```python -from scrapegraphai.graphs import SmartScraperGraph - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - }, -} - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the articles", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) +{'projects': [{'title': 'Rotary Pendulum RL', 'description': 'Open Source project aimed at controlling a real life rotary pendulum using RL algorithms'}, {'title': 'DQN Implementation from scratch', 'description': 'Developed a Deep Q-Network algorithm to train a simple and double pendulum'}, ...]} ``` +### Case 2: SearchGraph using Mixed Models -### Case 3: Extracting information using Openai model -```python -from scrapegraphai.graphs import SmartScraperGraph -OPENAI_API_KEY = "YOUR_API_KEY" +We use **Groq** 
for the LLM and **Ollama** for the embeddings. -graph_config = { - "llm": { - "api_key": OPENAI_API_KEY, - "model": "gpt-3.5-turbo", - }, -} - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the articles", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) -``` - -### Case 4: Extracting information using Groq ```python -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -groq_key = os.getenv("GROQ_APIKEY") +from scrapegraphai.graphs import SearchGraph +# Define the configuration for the graph graph_config = { "llm": { "model": "groq/gemma-7b-it", - "api_key": groq_key, + "api_key": "GROQ_API_KEY", "temperature": 0 }, "embeddings": { "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", + "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "headless": False + "max_results": 5, } -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description and the author.", - source="https://perinim.github.io/projects", +# Create the SearchGraph instance +search_graph = SearchGraph( + prompt="List me all the traditional recipes from Chioggia", config=graph_config ) -result = smart_scraper_graph.run() +# Run the graph +result = search_graph.run() print(result) ``` +The output will be a list of recipes like the following: -### Case 5: Extracting information using Azure ```python -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings - -lm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - 
openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) -graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} -} - -smart_scraper_graph = SmartScraperGraph( - prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, - event_end_date, event_end_time, location, event_mode, event_category, - third_party_redirect, no_of_days, - time_in_hours, hosted_or_attending, refreshments_type, - registration_available, registration_link""", - source="https://www.hmhco.com/event", - config=graph_config -) +{'recipes': [{'name': 'Sarde in SaΓ²re'}, {'name': 'Bigoli in salsa'}, {'name': 'Seppie in umido'}, {'name': 'Moleche frite'}, {'name': 'Risotto alla pescatora'}, {'name': 'Broeto'}, {'name': 'Bibarasse in Cassopipa'}, {'name': 'Risi e bisi'}, {'name': 'Smegiassa Ciosota'}]} ``` +### Case 3: SpeechGraph using OpenAI + +You just need to pass the OpenAI API key and the model name. 
-### Case 6: Extracting information using Gemini ```python -from scrapegraphai.graphs import SmartScraperGraph -GOOGLE_APIKEY = "YOUR_API_KEY" +from scrapegraphai.graphs import SpeechGraph -# Define the configuration for the graph graph_config = { "llm": { - "api_key": GOOGLE_APIKEY, - "model": "gemini-pro", + "api_key": "OPENAI_API_KEY", + "model": "gpt-3.5-turbo", }, + "tts_model": { + "api_key": "OPENAI_API_KEY", + "model": "tts-1", + "voice": "alloy" + }, + "output_path": "audio_summary.mp3", } -# Create the SmartScraperGraph instance -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the articles", - source="https://perinim.github.io/projects", - config=graph_config +# ************************************************ +# Create the SpeechGraph instance and run it +# ************************************************ + +speech_graph = SpeechGraph( + prompt="Make a detailed audio summary of the projects.", + source="https://perinim.github.io/projects/", + config=graph_config, ) -result = smart_scraper_graph.run() +result = speech_graph.run() print(result) -``` -The output for all 3 the cases will be a dictionary with the extracted information, for example: - -```bash -{ - 'titles': [ - 'Rotary Pendulum RL' - ], - 'descriptions': [ - 'Open Source project aimed at controlling a real life rotary pendulum using RL algorithms' - ] -} ``` +The output will be an audio file with the summary of the projects on the page. + ## 🀝 Contributing Feel free to contribute and join our Discord server to discuss with us improvements and give us suggestions! Please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/CONTRIBUTING.md). 
-[![My Skills](https://skillicons.dev/icons?i=discord)](https://discord.gg/gkxQDAjfeX) -[![My Skills](https://skillicons.dev/icons?i=linkedin)](https://www.linkedin.com/company/scrapegraphai/) -[![My Skills](https://skillicons.dev/icons?i=twitter)](https://twitter.com/scrapegraphai) - ## πŸ“ˆ Roadmap Check out the project roadmap [here](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/README.md)! πŸš€ @@ -253,6 +186,10 @@ Wanna visualize the roadmap in a more interactive way? Check out the [markmap](h ## ❀️ Contributors [![Contributors](https://contrib.rocks/image?repo=VinciGit00/Scrapegraph-ai)](https://github.com/VinciGit00/Scrapegraph-ai/graphs/contributors) +## Sponsors +

+ SerpAPI +

## πŸŽ“ Citations If you have used our library for research purposes please quote us with the following reference: @@ -269,7 +206,7 @@ If you have used our library for research purposes please quote us with the foll ## Authors

- Authors Logos + Authors_logos

| | Contact Info | diff --git a/docs/assets/serp_api_logo.png b/docs/assets/serp_api_logo.png new file mode 100644 index 00000000..ff2f1b01 Binary files /dev/null and b/docs/assets/serp_api_logo.png differ diff --git a/examples/groq/search_graph_groq_openai.py b/examples/groq/search_graph_groq_openai.py new file mode 100644 index 00000000..3d581063 --- /dev/null +++ b/examples/groq/search_graph_groq_openai.py @@ -0,0 +1,46 @@ +""" +Basic example of scraping pipeline using SearchGraph +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +groq_key = os.getenv("GROQ_APIKEY") +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "embeddings": { + "api_key": openai_key, + "model": "openai", + }, + "headless": False +} + +search_graph = SearchGraph( + prompt="List me the best excursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/README.md b/examples/local_models/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/local_models/Ollama/csv_scraper_ollama.py b/examples/local_models/csv_scraper_ollama.py similarity index 100% rename from examples/local_models/Ollama/csv_scraper_ollama.py rename to examples/local_models/csv_scraper_ollama.py diff --git a/examples/local_models/Ollama/inputs/books.xml b/examples/local_models/inputs/books.xml similarity index 100% rename from 
examples/local_models/Ollama/inputs/books.xml rename to examples/local_models/inputs/books.xml diff --git a/examples/local_models/Ollama/inputs/example.json b/examples/local_models/inputs/example.json similarity index 100% rename from examples/local_models/Ollama/inputs/example.json rename to examples/local_models/inputs/example.json diff --git a/examples/local_models/Ollama/inputs/plain_html_example.txt b/examples/local_models/inputs/plain_html_example.txt similarity index 100% rename from examples/local_models/Ollama/inputs/plain_html_example.txt rename to examples/local_models/inputs/plain_html_example.txt diff --git a/examples/local_models/Ollama/inputs/username.csv b/examples/local_models/inputs/username.csv similarity index 100% rename from examples/local_models/Ollama/inputs/username.csv rename to examples/local_models/inputs/username.csv diff --git a/examples/local_models/Ollama/json_scraper_ollama.py b/examples/local_models/json_scraper_ollama.py similarity index 100% rename from examples/local_models/Ollama/json_scraper_ollama.py rename to examples/local_models/json_scraper_ollama.py diff --git a/examples/local_models/Ollama/scrape_plain_text_ollama.py b/examples/local_models/scrape_plain_text_ollama.py similarity index 100% rename from examples/local_models/Ollama/scrape_plain_text_ollama.py rename to examples/local_models/scrape_plain_text_ollama.py diff --git a/examples/local_models/Ollama/scrape_xml_ollama.py b/examples/local_models/scrape_xml_ollama.py similarity index 100% rename from examples/local_models/Ollama/scrape_xml_ollama.py rename to examples/local_models/scrape_xml_ollama.py diff --git a/examples/local_models/Ollama/script_generator_ollama.py b/examples/local_models/script_generator_ollama.py similarity index 100% rename from examples/local_models/Ollama/script_generator_ollama.py rename to examples/local_models/script_generator_ollama.py diff --git a/examples/local_models/Ollama/search_graph_ollama.py 
b/examples/local_models/search_graph_ollama.py similarity index 100% rename from examples/local_models/Ollama/search_graph_ollama.py rename to examples/local_models/search_graph_ollama.py diff --git a/examples/local_models/Ollama/smart_scraper_ollama.py b/examples/local_models/smart_scraper_ollama.py similarity index 100% rename from examples/local_models/Ollama/smart_scraper_ollama.py rename to examples/local_models/smart_scraper_ollama.py diff --git a/examples/local_models/Ollama/xml_scraper_ollama.py b/examples/local_models/xml_scraper_ollama.py similarity index 100% rename from examples/local_models/Ollama/xml_scraper_ollama.py rename to examples/local_models/xml_scraper_ollama.py diff --git a/pyproject.toml b/pyproject.toml index af78d90c..f9d280fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,24 +43,24 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.9, <3.12" langchain = "0.1.15" -langchain-openai = "^0.1.6" -langchain-google-genai = "^1.0.3" -langchain-groq = "^0.1.3" -langchain-aws = "^0.1.3" -langchain-anthropic = "^0.1.11" -html2text = "^2024.2.26" -faiss-cpu = "^1.8.0" -beautifulsoup4 = "^4.12.3" -pandas = "^2.2.2" -python-dotenv = "^1.0.1" -tiktoken = "^0.6.0" -tqdm = "^4.66.4" -graphviz = "^0.20.3" -minify-html = "^0.15.0" -free-proxy = "^1.1.1" -playwright = "^1.43.0" -google = "^3.0.0" -yahoo-search-py = "^0.3" +langchain-openai = "0.1.6" +langchain-google-genai = "1.0.3" +langchain-groq = "0.1.3" +langchain-aws = "0.1.3" +langchain-anthropic = "0.1.11" +html2text = "2024.2.26" +faiss-cpu = "1.8.0" +beautifulsoup4 = "4.12.3" +pandas = "2.2.2" +python-dotenv = "1.0.1" +tiktoken = "0.6.0" +tqdm = "4.66.4" +graphviz = "0.20.3" +minify-html = "0.15.0" +free-proxy = "1.1.1" +playwright = "1.43.0" +google = "3.0.0" +yahoo-search-py = "0.3" [tool.poetry.dev-dependencies] pytest = "8.0.0" diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 04dbf9d7..4614df80 100644 --- 
a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -3,6 +3,7 @@ """ from abc import ABC, abstractmethod from typing import Optional +from langchain_aws import BedrockEmbeddings from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings -from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings, BedrockEmbeddings +from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings from langchain_google_genai import GoogleGenerativeAIEmbeddings @@ -148,12 +149,12 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: except KeyError as exc: raise KeyError("Model not supported") from exc return Gemini(llm_params) - elif "claude" in llm_params["model"]: + elif llm_params["model"].startswith("claude"): try: self.model_token = models_tokens["claude"][llm_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc - return Claude(llm_params) + return Anthropic(llm_params) elif "ollama" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] diff --git a/tests/nodes/fetch_node_test.py b/tests/nodes/fetch_node_test.py index 811c2daf..a67f3dbb 100644 --- a/tests/nodes/fetch_node_test.py +++ b/tests/nodes/fetch_node_test.py @@ -1,5 +1,5 @@ """ -Module for testinh robot_node +Module for testing fetch_node """ import pytest from scrapegraphai.nodes import FetchNode @@ -14,7 +14,7 @@ def setup(): # Define the node # ************************************************ - robots_node = FetchNode( + fetch_node = FetchNode( input="url | local_dir", output=["doc"], node_config={ @@ -22,14 +22,14 @@ } ) - return robots_node + return fetch_node # ************************************************ # Test the node # ************************************************ -def test_robots_node(setup): +def test_fetch_node(setup): """ Run the tests """ @@ -40,8 +40,3 @@ def test_robots_node(setup): result = setup.execute(state) assert result is not None - - -# If you need to run this script directly if __name__ == 
"__main__": - pytest.main() diff --git a/tests/nodes/robot_node_test.py b/tests/nodes/robot_node_test.py index cae3a895..084522c4 100644 --- a/tests/nodes/robot_node_test.py +++ b/tests/nodes/robot_node_test.py @@ -55,8 +55,3 @@ def test_robots_node(setup): result = setup.execute(state) assert result is not None - - -# If you need to run this script directly -if __name__ == "__main__": - pytest.main()