diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6a87a873..1890eb64 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,10 +1,90 @@
+## [1.11.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.9...v1.11.0-beta.10) (2024-08-02)
+
+
+### Bug Fixes
+
+* **AbstractGraph:** instantiation of Azure GPT models ([ade28fc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ade28fca2c3fdf40f28a80854e3b8435a52a6930)), closes [#498](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/498)
+
+## [1.11.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.8...v1.11.0-beta.9) (2024-08-02)
+
+
+### Features
+
+* refactoring of the code ([9355507](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9355507a2dc73342f325b6649e871df48ae13567))
+
+## [1.11.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.7...v1.11.0-beta.8) (2024-08-01)
+
+
+### Features
+
+* add integration in the abstract graph ([5ecdbe7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5ecdbe715f4bb223fa1be834fda07ccea2a51cb9))
+
+
+### Bug Fixes
+
+* fixed bug on fetch_node ([968c69e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/968c69e217d9c180b9b8c2aa52ca59b9a1733525))
+
+## [1.11.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.6...v1.11.0-beta.7) (2024-08-01)
 
 ## [1.10.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.0-beta.6...v1.10.0-beta.7) (2024-07-23)
 
 ## [1.11.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.2...v1.11.3) (2024-07-25)
+
 ### Bug Fixes
+
+* abstract_graph and removed unused embeddings ([0b4cfd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0b4cfd6522dcad0eb418f0badd0f7824a1efd534))
+
+
+### Refactor
+
+* move embeddings code from AbstractGraph to RAGNode ([a94ebcd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a94ebcde0078d66d33e67f7e0a87850efb92d408))
+* reuse code for common interface models ([bb73d91](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bb73d916a1a7b378438038ec928eeda6d8f6ac9d))
+
+## [1.11.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.5...v1.11.0-beta.6) (2024-07-31)
+
+
+### Features
+
+* integration of firebase ([4caed54](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4caed545e5030460b2d5e46f9ad90546ce36f0ee))
+
+## [1.11.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.4...v1.11.0-beta.5) (2024-07-30)
+
+
+### Features
+
+* fix tests ([1db164e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1db164e9e682eefbc1414989a043fefa2e9009c2))
+
+
+### Chore
+
+* remove unused import ([88710f1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/88710f1a7c7d50f57108456112da30d1a12a1ba1))
+
+
+### Refactor
+
+* **Ollama:** integrate new LangChain chat init ([d177afb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d177afb68be036465ede1f567d2562b145d77d36))
+* **OpenAI:** integrate new LangChain chat init ([5e3eb6e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5e3eb6e43df4bd4c452d34b49f254235e9ff1b22))
+* remove LangChain wrappers ([2c5f934](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2c5f934f101e319ec4e61009d4c464ca4626c1ff))
+* remove LangChain wrappers for Ollama ([25066b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/25066b2bc51517e50058231664230b8edef365b9))
+* remove redundant LangChain wrappers ([9275486](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/927548624034b3c30eca60011d216720102d1815))
+* remove redundant wrappers for Ernie and Nvidia ([bc2c996](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bc2c9967d2f13ade6eeb7b23e9b423f6e79aa890))
+
+## [1.11.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.3...v1.11.0-beta.4) (2024-07-25)
+
+
+### Features
+
+* add generate_answer node parallelization ([0c4b290](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0c4b2908d98efbb2b0a6faf68618a801d726bb5f))
+
+
+### Chore
+
+* rebuild requirements ([2edad66](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2edad66788cbd92f197e3b37db13c44bfa39e36a))
+
+## [1.11.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.2...v1.11.0-beta.3) (2024-07-25)
 
 * add llama 3.1 ([f872bdd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f872bdd24f9874660eea04f9ade570c96b6e7e93))
 
@@ -18,6 +98,16 @@
 
 ### Bug Fixes
 
+* add llama 3.1 ([f336c95](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f336c95c2d1833d1f829d61ae7fa415ac2caf250))
+
+## [1.11.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.1...v1.11.0-beta.2) (2024-07-24)
+
+
+### Features
+
+* update models_tokens.py ([377d679](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/377d679eecd62611c0c9a3cba8202c6f0719ed31))
+
+## [1.11.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.4...v1.11.0-beta.1) (2024-07-23)
 
 * md conversion ([1d41f6e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1d41f6eafe8ed0e191bb6a258d54c6388ff283c6))
 
 ## [1.11.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0...v1.11.1) (2024-07-23)
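Two refactors drive every example change below: embeddings configuration moved out of `AbstractGraph` into `RAGNode`, and the per-provider LangChain wrappers were replaced by LangChain's newer generic chat-model init. A minimal sketch of that init pattern, as a point of reference; the model/provider pair here is illustrative and this is not ScrapeGraphAI's exact internal call:

```python
# Sketch of LangChain's generic chat-model constructor, which the
# "integrate new LangChain chat init" refactors above point to.
from langchain.chat_models import init_chat_model

# Hypothetical model choice; any supported provider/model pair works the same way.
llm = init_chat_model("gpt-4o-mini", model_provider="openai", temperature=0)
print(llm.invoke("Say hello").content)
```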
diff --git a/examples/bedrock/csv_scraper_bedrock.py b/examples/bedrock/csv_scraper_bedrock.py
index f015f77b..a69417c0 100644
--- a/examples/bedrock/csv_scraper_bedrock.py
+++ b/examples/bedrock/csv_scraper_bedrock.py
@@ -33,9 +33,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
 # ************************************************
diff --git a/examples/bedrock/csv_scraper_graph_multi_bedrock.py b/examples/bedrock/csv_scraper_graph_multi_bedrock.py
index c776c508..b9dd7f6f 100644
--- a/examples/bedrock/csv_scraper_graph_multi_bedrock.py
+++ b/examples/bedrock/csv_scraper_graph_multi_bedrock.py
@@ -28,9 +28,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
diff --git a/examples/bedrock/custom_graph_bedrock.py b/examples/bedrock/custom_graph_bedrock.py
index 45358555..9002a598 100644
--- a/examples/bedrock/custom_graph_bedrock.py
+++ b/examples/bedrock/custom_graph_bedrock.py
@@ -28,9 +28,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
diff --git a/examples/bedrock/json_scraper_bedrock.py b/examples/bedrock/json_scraper_bedrock.py
index 0729adfe..dc1bf769 100644
--- a/examples/bedrock/json_scraper_bedrock.py
+++ b/examples/bedrock/json_scraper_bedrock.py
@@ -32,9 +32,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
diff --git a/examples/bedrock/json_scraper_multi_bedrock.py b/examples/bedrock/json_scraper_multi_bedrock.py
index 5dc666b8..5848ef17 100644
--- a/examples/bedrock/json_scraper_multi_bedrock.py
+++ b/examples/bedrock/json_scraper_multi_bedrock.py
@@ -10,9 +10,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
 FILE_NAME = "inputs/example.json"
diff --git a/examples/bedrock/pdf_scraper_graph_bedrock.py b/examples/bedrock/pdf_scraper_graph_bedrock.py
index 6ee4b753..dcef848e 100644
--- a/examples/bedrock/pdf_scraper_graph_bedrock.py
+++ b/examples/bedrock/pdf_scraper_graph_bedrock.py
@@ -18,9 +18,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
diff --git a/examples/bedrock/pdf_scraper_graph_multi_bedrock.py b/examples/bedrock/pdf_scraper_graph_multi_bedrock.py
index 7102c406..37e61c42 100644
--- a/examples/bedrock/pdf_scraper_graph_multi_bedrock.py
+++ b/examples/bedrock/pdf_scraper_graph_multi_bedrock.py
@@ -11,9 +11,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
 # ***************
diff --git a/examples/bedrock/scrape_plain_text_bedrock.py b/examples/bedrock/scrape_plain_text_bedrock.py
index 01bec609..0214a1e3 100644
--- a/examples/bedrock/scrape_plain_text_bedrock.py
+++ b/examples/bedrock/scrape_plain_text_bedrock.py
@@ -33,9 +33,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
diff --git a/examples/bedrock/script_generator_bedrock.py b/examples/bedrock/script_generator_bedrock.py
index 0d3f7d07..26863193 100644
--- a/examples/bedrock/script_generator_bedrock.py
+++ b/examples/bedrock/script_generator_bedrock.py
@@ -19,10 +19,7 @@
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
     },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
-    },
-    "library": "beautifulsoup"
+    "library": "beautifulsoup"
 }
 # ************************************************
diff --git a/examples/bedrock/script_multi_generator_bedrock.py b/examples/bedrock/script_multi_generator_bedrock.py
index 2f892546..ecef966d 100644
--- a/examples/bedrock/script_multi_generator_bedrock.py
+++ b/examples/bedrock/script_multi_generator_bedrock.py
@@ -15,10 +15,7 @@
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
     },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
-    },
-    "library": "beautifulsoup"
+    "library": "beautifulsoup"
 }
 # ************************************************
diff --git a/examples/bedrock/search_graph_bedrock.py b/examples/bedrock/search_graph_bedrock.py
index 9b32d3db..b27f6e5d 100644
--- a/examples/bedrock/search_graph_bedrock.py
+++ b/examples/bedrock/search_graph_bedrock.py
@@ -16,9 +16,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
 # ************************************************
diff --git a/examples/bedrock/search_graph_schema_bedrock.py b/examples/bedrock/search_graph_schema_bedrock.py
index 90539155..a49ba730 100644
--- a/examples/bedrock/search_graph_schema_bedrock.py
+++ b/examples/bedrock/search_graph_schema_bedrock.py
@@ -27,9 +27,6 @@ class Dishes(BaseModel):
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
diff --git a/examples/bedrock/search_link_graph_bedrock.py b/examples/bedrock/search_link_graph_bedrock.py
index 116dea01..fc1e6233 100644
--- a/examples/bedrock/search_link_graph_bedrock.py
+++ b/examples/bedrock/search_link_graph_bedrock.py
@@ -15,9 +15,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
diff --git a/examples/bedrock/smart_scraper_bedrock.py b/examples/bedrock/smart_scraper_bedrock.py
index 03394434..9c747c00 100644
--- a/examples/bedrock/smart_scraper_bedrock.py
+++ b/examples/bedrock/smart_scraper_bedrock.py
@@ -19,9 +19,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
diff --git a/examples/bedrock/smart_scraper_multi_bedrock.py b/examples/bedrock/smart_scraper_multi_bedrock.py
index 7aeb71cd..b363d6ab 100644
--- a/examples/bedrock/smart_scraper_multi_bedrock.py
+++ b/examples/bedrock/smart_scraper_multi_bedrock.py
@@ -17,9 +17,6 @@
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
diff --git a/examples/bedrock/smart_scraper_schema_bedrock.py b/examples/bedrock/smart_scraper_schema_bedrock.py
index 6213ea1f..2829efec 100644
--- a/examples/bedrock/smart_scraper_schema_bedrock.py
+++ b/examples/bedrock/smart_scraper_schema_bedrock.py
@@ -26,9 +26,6 @@ class Projects(BaseModel):
         "client": "client_name",
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
-    },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
     }
 }
diff --git a/examples/bedrock/xml_scraper_bedrock.py b/examples/bedrock/xml_scraper_bedrock.py
index 018a8387..5f81fbf6 100644
--- a/examples/bedrock/xml_scraper_bedrock.py
+++ b/examples/bedrock/xml_scraper_bedrock.py
@@ -32,9 +32,6 @@
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
     },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
-    }
 }
 # ************************************************
diff --git a/examples/bedrock/xml_scraper_graph_multi_bedrock.py b/examples/bedrock/xml_scraper_graph_multi_bedrock.py
index a0ed3560..638ce280 100644
--- a/examples/bedrock/xml_scraper_graph_multi_bedrock.py
+++ b/examples/bedrock/xml_scraper_graph_multi_bedrock.py
@@ -29,9 +29,6 @@
         "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
         "temperature": 0.0
     },
-    "embeddings": {
-        "model": "bedrock/cohere.embed-multilingual-v3"
-    }
 }
 # ************************************************
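Every Bedrock example above shrinks to the same shape: the `llm` block is now the entire model configuration. A minimal sketch of the trimmed config, reusing the literals from these hunks; the prompt and source URL are borrowed from the other examples in this PR, and AWS credentials are assumed to come from the environment as in the originals:

```python
# Sketch: a Bedrock example config after the embeddings block removal.
# "client_name" is the placeholder used by the examples above; in practice
# this may be a boto3 bedrock-runtime client.
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "client": "client_name",
        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        "temperature": 0.0
    }
}

smart_scraper_graph = SmartScraperGraph(
    prompt="List me the projects with their description",
    source="https://perinim.github.io/projects/",
    config=graph_config,
)
print(smart_scraper_graph.run())
```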
diff --git a/examples/deepseek/csv_scraper_deepseek.py b/examples/deepseek/csv_scraper_deepseek.py
index fd55469d..b734b543 100644
--- a/examples/deepseek/csv_scraper_deepseek.py
+++ b/examples/deepseek/csv_scraper_deepseek.py
@@ -30,11 +30,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/csv_scraper_graph_multi_deepseek.py b/examples/deepseek/csv_scraper_graph_multi_deepseek.py
index d665bc31..ea5e9154 100644
--- a/examples/deepseek/csv_scraper_graph_multi_deepseek.py
+++ b/examples/deepseek/csv_scraper_graph_multi_deepseek.py
@@ -30,11 +30,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/custom_graph_deepseek.py b/examples/deepseek/custom_graph_deepseek.py
deleted file mode 100644
index a265db95..00000000
--- a/examples/deepseek/custom_graph_deepseek.py
+++ /dev/null
@@ -1,89 +0,0 @@
-"""
-Example of custom graph using Gemini Google model
-"""
-
-import os
-from dotenv import load_dotenv
-from scrapegraphai.models import Gemini
-from scrapegraphai.graphs import BaseGraph
-from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
-load_dotenv()
-
-# ************************************************
-# Define the configuration for the graph
-# ************************************************
-
-deepseek_key = os.getenv("DEEPSEEK_APIKEY")
-
-graph_config = {
-    "llm": {
-        "model": "deepseek-chat",
-        "openai_api_key": deepseek_key,
-        "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
-    },
-    "verbose": True,
-}
-
-# ************************************************
-# Define the graph nodes
-# ************************************************
-
-llm_model = Gemini(graph_config["llm"])
-
-# define the nodes for the graph
-fetch_node = FetchNode(
-    input="url | local_dir",
-    output=["doc"],
-)
-parse_node = ParseNode(
-    input="doc",
-    output=["parsed_doc"],
-    node_config={"chunk_size": 4096}
-)
-rag_node = RAGNode(
-    input="user_prompt & (parsed_doc | doc)",
-    output=["relevant_chunks"],
-    node_config={"llm": llm_model},
-)
-generate_answer_node = GenerateAnswerNode(
-    input="user_prompt & (relevant_chunks | parsed_doc | doc)",
-    output=["answer"],
-    node_config={"llm": llm_model},
-)
-
-# ************************************************
-# Create the graph by defining the connections
-# ************************************************
-
-graph = BaseGraph(
-    nodes={
-        fetch_node,
-        parse_node,
-        rag_node,
-        generate_answer_node,
-    },
-    edges={
-        (fetch_node, parse_node),
-        (parse_node, rag_node),
-        (rag_node, generate_answer_node)
-    },
-    entry_point=fetch_node
-)
-
-# ************************************************
-# Execute the graph
-# ************************************************
-
-result, execution_info = graph.execute({
-    "user_prompt": "List me the projects with their description",
-    "url": "https://perinim.github.io/projects/"
-})
-
-# get the answer from the result
-result = result.get("answer", "No answer found.")
-print(result)
diff --git a/examples/deepseek/json_scraper_deepseek.py b/examples/deepseek/json_scraper_deepseek.py
index 696a08d9..dfe6f489 100644
--- a/examples/deepseek/json_scraper_deepseek.py
+++ b/examples/deepseek/json_scraper_deepseek.py
@@ -29,11 +29,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/json_scraper_multi_deepseek.py b/examples/deepseek/json_scraper_multi_deepseek.py
index 17660ddb..b957dde0 100644
--- a/examples/deepseek/json_scraper_multi_deepseek.py
+++ b/examples/deepseek/json_scraper_multi_deepseek.py
@@ -15,11 +15,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/pdf_scraper_graph_deepseek.py b/examples/deepseek/pdf_scraper_graph_deepseek.py
index fe6f2658..d66bbef5 100644
--- a/examples/deepseek/pdf_scraper_graph_deepseek.py
+++ b/examples/deepseek/pdf_scraper_graph_deepseek.py
@@ -20,11 +20,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/pdf_scraper_multi_deepseek.py b/examples/deepseek/pdf_scraper_multi_deepseek.py
index c884b798..211e4635 100644
--- a/examples/deepseek/pdf_scraper_multi_deepseek.py
+++ b/examples/deepseek/pdf_scraper_multi_deepseek.py
@@ -15,11 +15,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/scrape_plain_text_deepseek.py b/examples/deepseek/scrape_plain_text_deepseek.py
index 7076dd39..d7a070d7 100644
--- a/examples/deepseek/scrape_plain_text_deepseek.py
+++ b/examples/deepseek/scrape_plain_text_deepseek.py
@@ -31,11 +31,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/script_generator_deepseek.py b/examples/deepseek/script_generator_deepseek.py
index 09db0876..fd5fd4dd 100644
--- a/examples/deepseek/script_generator_deepseek.py
+++ b/examples/deepseek/script_generator_deepseek.py
@@ -20,11 +20,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "library": "beautifulsoup"
 }
diff --git a/examples/deepseek/script_multi_generator_deepseek.py b/examples/deepseek/script_multi_generator_deepseek.py
index 41e363b5..2ebfd90a 100644
--- a/examples/deepseek/script_multi_generator_deepseek.py
+++ b/examples/deepseek/script_multi_generator_deepseek.py
@@ -20,11 +20,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "library": "beautifulsoup"
 }
diff --git a/examples/deepseek/search_graph_deepseek.py b/examples/deepseek/search_graph_deepseek.py
index d607e1b1..176d6107 100644
--- a/examples/deepseek/search_graph_deepseek.py
+++ b/examples/deepseek/search_graph_deepseek.py
@@ -18,11 +18,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "max_results": 2,
     "verbose": True,
diff --git a/examples/deepseek/search_graph_schema_deepseek.py b/examples/deepseek/search_graph_schema_deepseek.py
index 8debee2f..f5db278e 100644
--- a/examples/deepseek/search_graph_schema_deepseek.py
+++ b/examples/deepseek/search_graph_schema_deepseek.py
@@ -34,11 +34,6 @@ class Dishes(BaseModel):
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/search_link_graph_deepseek.py b/examples/deepseek/search_link_graph_deepseek.py
index 30e4a9b3..6a35f177 100644
--- a/examples/deepseek/search_link_graph_deepseek.py
+++ b/examples/deepseek/search_link_graph_deepseek.py
@@ -19,11 +19,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/smart_scraper_deepseek.py b/examples/deepseek/smart_scraper_deepseek.py
index 9fe00a2a..ed291b02 100644
--- a/examples/deepseek/smart_scraper_deepseek.py
+++ b/examples/deepseek/smart_scraper_deepseek.py
@@ -21,11 +21,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/smart_scraper_multi_deepseek.py b/examples/deepseek/smart_scraper_multi_deepseek.py
index c88ab525..fafe7261 100644
--- a/examples/deepseek/smart_scraper_multi_deepseek.py
+++ b/examples/deepseek/smart_scraper_multi_deepseek.py
@@ -19,11 +19,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/smart_scraper_schema_deepseek.py b/examples/deepseek/smart_scraper_schema_deepseek.py
index a16ae575..5cbbb702 100644
--- a/examples/deepseek/smart_scraper_schema_deepseek.py
+++ b/examples/deepseek/smart_scraper_schema_deepseek.py
@@ -33,11 +33,6 @@ class Projects(BaseModel):
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/xml_scraper_deepseek.py b/examples/deepseek/xml_scraper_deepseek.py
index 3b2af61b..ba401b91 100644
--- a/examples/deepseek/xml_scraper_deepseek.py
+++ b/examples/deepseek/xml_scraper_deepseek.py
@@ -31,11 +31,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/deepseek/xml_scraper_graph_multi_deepseek.py b/examples/deepseek/xml_scraper_graph_multi_deepseek.py
index 5d3c29d5..0f53a6b2 100644
--- a/examples/deepseek/xml_scraper_graph_multi_deepseek.py
+++ b/examples/deepseek/xml_scraper_graph_multi_deepseek.py
@@ -30,11 +30,6 @@
         "model": "deepseek-chat",
         "openai_api_key": deepseek_key,
         "openai_api_base": 'https://api.deepseek.com/v1',
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/extras/browser_base_integration.py b/examples/extras/browser_base_integration.py
new file mode 100644
index 00000000..97529879
--- /dev/null
+++ b/examples/extras/browser_base_integration.py
@@ -0,0 +1,49 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("OPENAI_API_KEY"),
+        "model": "gpt-3.5-turbo",
+    },
+    "browser_base": {
+        "api_key": os.getenv("BROWSER_BASE_API_KEY"),
+        "project_id": os.getenv("BROWSER_BASE_PROJECT_ID"),
+    },
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me what does the company do, the name and a contact email.",
+    source="https://scrapegraphai.com/",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
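The `browser_base` section of the new example reads two distinct credentials: the API key authenticates, while the project ID selects the Browserbase workspace, so they map to separate environment variables. A quick preflight check using the same names (a hypothetical helper, not part of the example file):

```python
import os
from dotenv import load_dotenv

load_dotenv()

# The project ID is a separate credential from the API key; don't reuse one for the other.
for var in ("OPENAI_API_KEY", "BROWSER_BASE_API_KEY", "BROWSER_BASE_PROJECT_ID"):
    if not os.getenv(var):
        raise EnvironmentError(f"Missing required environment variable: {var}")
```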
diff --git a/examples/fireworks/csv_scraper_fireworks.py b/examples/fireworks/csv_scraper_fireworks.py
index b1d7526d..f588c4c5 100644
--- a/examples/fireworks/csv_scraper_fireworks.py
+++ b/examples/fireworks/csv_scraper_fireworks.py
@@ -29,12 +29,6 @@
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
     },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
-    },
-    "verbose": True,
     "headless": False,
 }
diff --git a/examples/fireworks/csv_scraper_graph_multi_fireworks.py b/examples/fireworks/csv_scraper_graph_multi_fireworks.py
index 81393d60..ebc46e61 100644
--- a/examples/fireworks/csv_scraper_graph_multi_fireworks.py
+++ b/examples/fireworks/csv_scraper_graph_multi_fireworks.py
@@ -28,11 +28,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False,
diff --git a/examples/fireworks/custom_graph_fireworks.py b/examples/fireworks/custom_graph_fireworks.py
index a02b774e..d0dcd994 100644
--- a/examples/fireworks/custom_graph_fireworks.py
+++ b/examples/fireworks/custom_graph_fireworks.py
@@ -4,9 +4,7 @@
 
 import os
 from dotenv import load_dotenv
-
-from langchain_openai import OpenAIEmbeddings
-from scrapegraphai.models import OpenAI
+from langchain_openai import ChatOpenAI
 from scrapegraphai.graphs import BaseGraph
 from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode
 load_dotenv()
@@ -21,11 +19,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False,
@@ -35,8 +28,7 @@
 # Define the graph nodes
 # ************************************************
 
-llm_model = OpenAI(graph_config["llm"])
-embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key)
+llm_model = ChatOpenAI(**graph_config["llm"])
 
 # define the nodes for the graph
 robot_node = RobotsNode(
@@ -65,15 +57,7 @@
         "verbose": True,
     }
 )
-rag_node = RAGNode(
-    input="user_prompt & (parsed_doc | doc)",
-    output=["relevant_chunks"],
-    node_config={
-        "llm_model": llm_model,
-        "embedder_model": embedder,
-        "verbose": True,
-    }
-)
+
 generate_answer_node = GenerateAnswerNode(
     input="user_prompt & (relevant_chunks | parsed_doc | doc)",
     output=["answer"],
@@ -92,14 +76,12 @@
         robot_node,
         fetch_node,
         parse_node,
-        rag_node,
         generate_answer_node,
     ],
     edges=[
         (robot_node, fetch_node),
         (fetch_node, parse_node),
-        (parse_node, rag_node),
-        (rag_node, generate_answer_node)
+        (parse_node, generate_answer_node)
     ],
     entry_point=robot_node
 )
diff --git a/examples/fireworks/deep_scraper_fireworks.py b/examples/fireworks/deep_scraper_fireworks.py
index 67a80868..86fb1717 100644
--- a/examples/fireworks/deep_scraper_fireworks.py
+++ b/examples/fireworks/deep_scraper_fireworks.py
@@ -19,11 +19,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "max_depth": 1
@@ -49,4 +44,4 @@
 
 graph_exec_info = deep_scraper_graph.get_execution_info()
 print(deep_scraper_graph.get_state("relevant_links"))
-print(prettify_exec_info(graph_exec_info))
\ No newline at end of file
+print(prettify_exec_info(graph_exec_info))
diff --git a/examples/fireworks/json_scraper_fireworkspy.py b/examples/fireworks/json_scraper_fireworkspy.py
index 0dd188fb..a76a89c5 100644
--- a/examples/fireworks/json_scraper_fireworkspy.py
+++ b/examples/fireworks/json_scraper_fireworkspy.py
@@ -29,11 +29,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False,
diff --git a/examples/fireworks/json_scraper_multi_fireworks.py b/examples/fireworks/json_scraper_multi_fireworks.py
index b4cf4fc7..cd16c525 100644
--- a/examples/fireworks/json_scraper_multi_fireworks.py
+++ b/examples/fireworks/json_scraper_multi_fireworks.py
@@ -14,11 +14,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False,
diff --git a/examples/fireworks/pdf_scraper_fireworks.py b/examples/fireworks/pdf_scraper_fireworks.py
index 20db556b..3bb3f3d4 100644
--- a/examples/fireworks/pdf_scraper_fireworks.py
+++ b/examples/fireworks/pdf_scraper_fireworks.py
@@ -15,11 +15,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/fireworks/pdf_scraper_multi_fireworks.py b/examples/fireworks/pdf_scraper_multi_fireworks.py
index 891a4454..c1077061 100644
--- a/examples/fireworks/pdf_scraper_multi_fireworks.py
+++ b/examples/fireworks/pdf_scraper_multi_fireworks.py
@@ -20,11 +20,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/fireworks/scrape_plain_text_fireworks.py b/examples/fireworks/scrape_plain_text_fireworks.py
index a45b2691..331f05e2 100644
--- a/examples/fireworks/scrape_plain_text_fireworks.py
+++ b/examples/fireworks/scrape_plain_text_fireworks.py
@@ -32,11 +32,6 @@
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
     },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
-    },
 }
diff --git a/examples/fireworks/script_generator_fireworks.py b/examples/fireworks/script_generator_fireworks.py
index dea59e12..2ee3294c 100644
--- a/examples/fireworks/script_generator_fireworks.py
+++ b/examples/fireworks/script_generator_fireworks.py
@@ -19,11 +19,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False,
diff --git a/examples/fireworks/script_generator_schema_fireworks.py b/examples/fireworks/script_generator_schema_fireworks.py
index f7aa4c83..6355a4e8 100644
--- a/examples/fireworks/script_generator_schema_fireworks.py
+++ b/examples/fireworks/script_generator_schema_fireworks.py
@@ -32,11 +32,6 @@ class Projects(BaseModel):
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "library": "beautifulsoup",
diff --git a/examples/fireworks/script_multi_generator_fireworks.py b/examples/fireworks/script_multi_generator_fireworks.py
index 42aff923..98671768 100644
--- a/examples/fireworks/script_multi_generator_fireworks.py
+++ b/examples/fireworks/script_multi_generator_fireworks.py
@@ -19,11 +19,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "library": "beautifulsoup",
diff --git a/examples/fireworks/search_graph_fireworks.py b/examples/fireworks/search_graph_fireworks.py
index 4d4d33cb..a091190c 100644
--- a/examples/fireworks/search_graph_fireworks.py
+++ b/examples/fireworks/search_graph_fireworks.py
@@ -18,11 +18,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "max_results": 2,
     "verbose": True,
diff --git a/examples/fireworks/search_graph_schema_fireworks.py b/examples/fireworks/search_graph_schema_fireworks.py
index 9180522b..d88d991e 100644
--- a/examples/fireworks/search_graph_schema_fireworks.py
+++ b/examples/fireworks/search_graph_schema_fireworks.py
@@ -33,11 +33,6 @@ class Dishes(BaseModel):
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "max_results": 2,
     "verbose": True,
diff --git a/examples/fireworks/search_link_graph_fireworks.py b/examples/fireworks/search_link_graph_fireworks.py
index a1d3a979..e71e2a4f 100644
--- a/examples/fireworks/search_link_graph_fireworks.py
+++ b/examples/fireworks/search_link_graph_fireworks.py
@@ -18,11 +18,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "max_results": 2,
     "verbose": True,
diff --git a/examples/fireworks/smart_scraper_fireworks.py b/examples/fireworks/smart_scraper_fireworks.py
index 40071d8f..cff9aedb 100644
--- a/examples/fireworks/smart_scraper_fireworks.py
+++ b/examples/fireworks/smart_scraper_fireworks.py
@@ -20,11 +20,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False,
diff --git a/examples/fireworks/smart_scraper_multi_fireworks.py b/examples/fireworks/smart_scraper_multi_fireworks.py
index 68e28055..09e2c811 100644
--- a/examples/fireworks/smart_scraper_multi_fireworks.py
+++ b/examples/fireworks/smart_scraper_multi_fireworks.py
@@ -19,11 +19,7 @@
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
     },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
-    },
+
     "verbose": True,
     "headless": False,
 }
diff --git a/examples/fireworks/smart_scraper_schema_fireworks.py b/examples/fireworks/smart_scraper_schema_fireworks.py
index b8685c3e..d71593f3 100644
--- a/examples/fireworks/smart_scraper_schema_fireworks.py
+++ b/examples/fireworks/smart_scraper_schema_fireworks.py
@@ -31,11 +31,6 @@ class Projects(BaseModel):
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False,
diff --git a/examples/fireworks/xml_scraper_fireworks.py b/examples/fireworks/xml_scraper_fireworks.py
index efc98bd8..59d9e6a3 100644
--- a/examples/fireworks/xml_scraper_fireworks.py
+++ b/examples/fireworks/xml_scraper_fireworks.py
@@ -28,11 +28,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False,
diff --git a/examples/fireworks/xml_scraper_graph_multi_fireworks.py b/examples/fireworks/xml_scraper_graph_multi_fireworks.py
index d14b8db0..690836a4 100644
--- a/examples/fireworks/xml_scraper_graph_multi_fireworks.py
+++ b/examples/fireworks/xml_scraper_graph_multi_fireworks.py
@@ -29,11 +29,6 @@
     "llm": {
         "api_key": fireworks_api_key,
         "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False,
diff --git a/examples/groq/csv_scraper_graph_multi_groq.py b/examples/groq/csv_scraper_graph_multi_groq.py
index 87e3279c..475b8cac 100644
--- a/examples/groq/csv_scraper_graph_multi_groq.py
+++ b/examples/groq/csv_scraper_graph_multi_groq.py
@@ -30,11 +30,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "headless": False
 }
diff --git a/examples/groq/csv_scraper_groq.py b/examples/groq/csv_scraper_groq.py
index 20839a75..805ce5fc 100644
--- a/examples/groq/csv_scraper_groq.py
+++ b/examples/groq/csv_scraper_groq.py
@@ -31,11 +31,6 @@
         "api_key": groq_key,
         "temperature": 0
     },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
-    },
 }
 # ************************************************
 # Create the CSVScraperGraph instance and run it
diff --git a/examples/groq/custom_graph_groq.py b/examples/groq/custom_graph_groq.py
index d0384ffd..79d2f0c6 100644
--- a/examples/groq/custom_graph_groq.py
+++ b/examples/groq/custom_graph_groq.py
@@ -4,7 +4,7 @@
 
 import os
 from dotenv import load_dotenv
-from scrapegraphai.models import OpenAI
+from langchain_openai import ChatOpenAI
 from scrapegraphai.graphs import BaseGraph
 from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode
 load_dotenv()
@@ -19,11 +19,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False
@@ -33,7 +28,7 @@
 # Define the graph nodes
 # ************************************************
 
-llm_model = OpenAI(graph_config["llm"])
+llm_model = ChatOpenAI(**graph_config["llm"])
 
 # define the nodes for the graph
 robot_node = RobotsNode(
@@ -62,14 +57,7 @@
         "verbose": True,
     }
 )
-rag_node = RAGNode(
-    input="user_prompt & (parsed_doc | doc)",
-    output=["relevant_chunks"],
-    node_config={
-        "llm_model": llm_model,
-        "verbose": True,
-    }
-)
+
 generate_answer_node = GenerateAnswerNode(
     input="user_prompt & (relevant_chunks | parsed_doc | doc)",
     output=["answer"],
@@ -88,14 +76,12 @@
         robot_node,
         fetch_node,
         parse_node,
-        rag_node,
         generate_answer_node,
     ],
     edges=[
         (robot_node, fetch_node),
         (fetch_node, parse_node),
-        (parse_node, rag_node),
-        (rag_node, generate_answer_node)
+        (parse_node, generate_answer_node)
     ],
     entry_point=robot_node
 )
diff --git a/examples/groq/json_scraper_groq.py b/examples/groq/json_scraper_groq.py
index 3faddae8..a9099069 100644
--- a/examples/groq/json_scraper_groq.py
+++ b/examples/groq/json_scraper_groq.py
@@ -30,11 +30,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False
diff --git a/examples/groq/json_scraper_multi_groq.py b/examples/groq/json_scraper_multi_groq.py
index 13b49be6..df3b9276 100644
--- a/examples/groq/json_scraper_multi_groq.py
+++ b/examples/groq/json_scraper_multi_groq.py
@@ -15,11 +15,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "library": "beautifulsoup"
 }
diff --git a/examples/groq/pdf_scraper_graph_groq.py b/examples/groq/pdf_scraper_graph_groq.py
index a9ca57ee..2560c11e 100644
--- a/examples/groq/pdf_scraper_graph_groq.py
+++ b/examples/groq/pdf_scraper_graph_groq.py
@@ -18,11 +18,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
 }
diff --git a/examples/groq/pdf_scraper_multi_groq.py b/examples/groq/pdf_scraper_multi_groq.py
index f1afc058..c43a7087 100644
--- a/examples/groq/pdf_scraper_multi_groq.py
+++ b/examples/groq/pdf_scraper_multi_groq.py
@@ -14,11 +14,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "library": "beautifulsoup"
 }
diff --git a/examples/groq/scrape_plain_text_groq.py b/examples/groq/scrape_plain_text_groq.py
index 73cda250..329df51f 100644
--- a/examples/groq/scrape_plain_text_groq.py
+++ b/examples/groq/scrape_plain_text_groq.py
@@ -32,11 +32,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False
diff --git a/examples/groq/script_generator_groq.py b/examples/groq/script_generator_groq.py
index a370eb3c..9e280e2b 100644
--- a/examples/groq/script_generator_groq.py
+++ b/examples/groq/script_generator_groq.py
@@ -19,11 +19,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "library": "beautifulsoup"
 }
diff --git a/examples/groq/script_multi_generator_groq.py b/examples/groq/script_multi_generator_groq.py
index 1757a3de..31f4041e 100644
--- a/examples/groq/script_multi_generator_groq.py
+++ b/examples/groq/script_multi_generator_groq.py
@@ -20,11 +20,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "library": "beautifulsoup"
 }
diff --git a/examples/groq/search_graph_groq.py b/examples/groq/search_graph_groq.py
index e82ffb7c..e3044c0e 100644
--- a/examples/groq/search_graph_groq.py
+++ b/examples/groq/search_graph_groq.py
@@ -21,11 +21,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "headless": False
 }
diff --git a/examples/groq/search_graph_schema_groq.py b/examples/groq/search_graph_schema_groq.py
index 41f03dc4..4cc2209d 100644
--- a/examples/groq/search_graph_schema_groq.py
+++ b/examples/groq/search_graph_schema_groq.py
@@ -34,11 +34,6 @@ class Dishes(BaseModel):
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "headless": False
 }
diff --git a/examples/groq/search_link_graph_groq.py b/examples/groq/search_link_graph_groq.py
index f940c2a4..5d82f37f 100644
--- a/examples/groq/search_link_graph_groq.py
+++ b/examples/groq/search_link_graph_groq.py
@@ -19,11 +19,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "headless": False
 }
diff --git a/examples/groq/smart_scraper_groq.py b/examples/groq/smart_scraper_groq.py
index f828cdec..ab38edc0 100644
--- a/examples/groq/smart_scraper_groq.py
+++ b/examples/groq/smart_scraper_groq.py
@@ -20,11 +20,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "headless": False
 }
diff --git a/examples/groq/smart_scraper_multi_groq.py b/examples/groq/smart_scraper_multi_groq.py
index 18ba3992..6ead098c 100644
--- a/examples/groq/smart_scraper_multi_groq.py
+++ b/examples/groq/smart_scraper_multi_groq.py
@@ -19,11 +19,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False
diff --git a/examples/groq/smart_scraper_schema_groq.py b/examples/groq/smart_scraper_schema_groq.py
index e0c51c98..f9c1a40b 100644
--- a/examples/groq/smart_scraper_schema_groq.py
+++ b/examples/groq/smart_scraper_schema_groq.py
@@ -33,11 +33,6 @@ class Projects(BaseModel):
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "headless": False
 }
diff --git a/examples/groq/xml_scraper_graph_multi_groq.py b/examples/groq/xml_scraper_graph_multi_groq.py
index 7b102c0f..62540671 100644
--- a/examples/groq/xml_scraper_graph_multi_groq.py
+++ b/examples/groq/xml_scraper_graph_multi_groq.py
@@ -30,11 +30,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "headless": False
 }
diff --git a/examples/groq/xml_scraper_groq.py b/examples/groq/xml_scraper_groq.py
index 1c086175..2172ea77 100644
--- a/examples/groq/xml_scraper_groq.py
+++ b/examples/groq/xml_scraper_groq.py
@@ -30,11 +30,6 @@
         "model": "groq/gemma-7b-it",
         "api_key": groq_key,
         "temperature": 0
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
     "verbose": True,
     "headless": False
diff --git a/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py b/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py
index 4517bbe9..48b04dab 100644
--- a/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py
+++ b/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py
@@ -40,7 +40,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
diff --git a/examples/huggingfacehub/csv_scraper_huggingfacehub.py b/examples/huggingfacehub/csv_scraper_huggingfacehub.py
index 9d1dbe0b..18ce1194 100644
--- a/examples/huggingfacehub/csv_scraper_huggingfacehub.py
+++ b/examples/huggingfacehub/csv_scraper_huggingfacehub.py
@@ -43,7 +43,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 # ************************************************
diff --git a/examples/huggingfacehub/custom_graph_huggingfacehub.py b/examples/huggingfacehub/custom_graph_huggingfacehub.py
index ad903b5d..0c392cc1 100644
--- a/examples/huggingfacehub/custom_graph_huggingfacehub.py
+++ b/examples/huggingfacehub/custom_graph_huggingfacehub.py
@@ -33,7 +33,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 # ************************************************
diff --git a/examples/huggingfacehub/json_scraper_huggingfacehub.py b/examples/huggingfacehub/json_scraper_huggingfacehub.py
index 3a9a163d..d709cc0d 100644
--- a/examples/huggingfacehub/json_scraper_huggingfacehub.py
+++ b/examples/huggingfacehub/json_scraper_huggingfacehub.py
@@ -43,7 +43,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 # ************************************************
diff --git a/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py b/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py
index 8ca3ba51..c029431e 100644
--- a/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py
+++ b/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py
@@ -24,7 +24,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 FILE_NAME = "inputs/example.json"
 curr_dir = os.path.dirname(os.path.realpath(__file__))
diff --git a/examples/huggingfacehub/pdf_scraper_graph_huggingfacehub.py b/examples/huggingfacehub/pdf_scraper_graph_huggingfacehub.py
index bb2724fe..eb0b1895 100644
--- a/examples/huggingfacehub/pdf_scraper_graph_huggingfacehub.py
+++ b/examples/huggingfacehub/pdf_scraper_graph_huggingfacehub.py
@@ -25,7 +25,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 source = """
diff --git a/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py b/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py
index d24d522c..4db809b2 100644
--- a/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py
+++ b/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py
@@ -23,7 +23,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 # Covert to list
diff --git a/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py b/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py
index f07e5666..76d32cda 100644
--- a/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py
+++ b/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py
@@ -45,7 +45,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 # ************************************************
diff --git a/examples/huggingfacehub/script_generator_huggingfacehub.py b/examples/huggingfacehub/script_generator_huggingfacehub.py
index 4804db93..a3fcaaf4 100644
--- a/examples/huggingfacehub/script_generator_huggingfacehub.py
+++ b/examples/huggingfacehub/script_generator_huggingfacehub.py
@@ -36,7 +36,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 # ************************************************
 # Create the ScriptCreatorGraph instance and run it
diff --git a/examples/huggingfacehub/script_multi_generator_huggingfacehub.py b/examples/huggingfacehub/script_multi_generator_huggingfacehub.py
index 5afeff0d..0ee89189 100644
--- a/examples/huggingfacehub/script_multi_generator_huggingfacehub.py
+++ b/examples/huggingfacehub/script_multi_generator_huggingfacehub.py
@@ -33,7 +33,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 # ************************************************
diff --git a/examples/huggingfacehub/search_graph_huggingfacehub.py b/examples/huggingfacehub/search_graph_huggingfacehub.py
index b3c58ce5..7c4a0c43 100644
--- a/examples/huggingfacehub/search_graph_huggingfacehub.py
+++ b/examples/huggingfacehub/search_graph_huggingfacehub.py
@@ -29,7 +29,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 # ************************************************
diff --git a/examples/huggingfacehub/search_link_graph_huggingfacehub.py b/examples/huggingfacehub/search_link_graph_huggingfacehub.py
index a49fb3b9..75b41282 100644
--- a/examples/huggingfacehub/search_link_graph_huggingfacehub.py
+++ b/examples/huggingfacehub/search_link_graph_huggingfacehub.py
@@ -26,7 +26,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
diff --git a/examples/huggingfacehub/smart_scraper_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_huggingfacehub.py
index bd415d41..6f9a863f 100644
--- a/examples/huggingfacehub/smart_scraper_huggingfacehub.py
+++ b/examples/huggingfacehub/smart_scraper_huggingfacehub.py
@@ -38,7 +38,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 smart_scraper_graph = SmartScraperGraph(
diff --git a/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py
index e1a332f9..046883a2 100644
--- a/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py
+++ b/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py
@@ -28,7 +28,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 # *******************************************************
diff --git a/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py
index 784079e4..31719697 100644
--- a/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py
+++ b/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py
@@ -48,7 +48,6 @@ class Projects(BaseModel):
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 smart_scraper_graph = SmartScraperGraph(
diff --git a/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py b/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py
index 24d6babd..1a244b86 100644
--- a/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py
+++ b/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py
@@ -40,7 +40,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 # ************************************************
diff --git a/examples/huggingfacehub/xml_scraper_huggingfacehub.py b/examples/huggingfacehub/xml_scraper_huggingfacehub.py
index cc8a4425..ddd73b5f 100644
--- a/examples/huggingfacehub/xml_scraper_huggingfacehub.py
+++ b/examples/huggingfacehub/xml_scraper_huggingfacehub.py
@@ -40,7 +40,6 @@
 
 graph_config = {
     "llm": {"model_instance": llm_model_instance},
-    "embeddings": {"model_instance": embedder_model_instance}
 }
 
 # ************************************************
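The Hugging Face examples never used a plain config dict for the model; they pass a prebuilt instance, and after this change only the LLM instance survives in `graph_config`. A sketch of the remaining pattern; the endpoint class, repo, and parameters are assumptions about the untouched parts of these files, which the hunks above don't show:

```python
import os
from langchain_community.llms import HuggingFaceEndpoint

# Assumed LLM setup; repo and parameters are illustrative.
llm_model_instance = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    max_new_tokens=512,
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
)

# The embedder instance entry is gone; only the LLM is configured.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
}
```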
diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/local_models/json_scraper_multi_ollama.py index 91f4fab4..6e9c3da3 100644 --- a/examples/local_models/json_scraper_multi_ollama.py +++ b/examples/local_models/json_scraper_multi_ollama.py @@ -12,10 +12,6 @@ "format": "json", # Ollama needs the format to be specified explicitly "model_tokens": 4000, }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - }, "verbose": True, "headless": False, } diff --git a/examples/local_models/json_scraper_ollama.py b/examples/local_models/json_scraper_ollama.py index 2dd072ac..ca4eb32e 100644 --- a/examples/local_models/json_scraper_ollama.py +++ b/examples/local_models/json_scraper_ollama.py @@ -31,11 +31,6 @@ # "model_tokens": 2000, # set context length arbitrarily "base_url": "http://localhost:11434", }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - }, "verbose": True, } diff --git a/examples/local_models/pdf_scraper_multi_ollama.py b/examples/local_models/pdf_scraper_multi_ollama.py index c0b65a63..ce258bf6 100644 --- a/examples/local_models/pdf_scraper_multi_ollama.py +++ b/examples/local_models/pdf_scraper_multi_ollama.py @@ -11,10 +11,6 @@ "format": "json", # Ollama needs the format to be specified explicitly "model_tokens": 4000, }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - }, "verbose": True, } diff --git a/examples/local_models/pdf_scraper_ollama.py b/examples/local_models/pdf_scraper_ollama.py index d79afb3a..84eb40f9 100644 --- a/examples/local_models/pdf_scraper_ollama.py +++ b/examples/local_models/pdf_scraper_ollama.py @@ -10,10 +10,6 @@ "format": "json", # Ollama needs the format to be specified explicitly "model_tokens": 4000, }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - }, "verbose": True, "headless": False, } diff --git a/examples/local_models/scrape_plain_text_ollama.py b/examples/local_models/scrape_plain_text_ollama.py index 9700d713..fe24c2a9 100644 --- a/examples/local_models/scrape_plain_text_ollama.py +++ b/examples/local_models/scrape_plain_text_ollama.py @@ -30,11 +30,6 @@ # "model_tokens": 2000, # set context length arbitrarily "base_url": "http://localhost:11434", }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - }, "verbose": True, } diff --git a/examples/local_models/script_multi_generator_ollama.py b/examples/local_models/script_multi_generator_ollama.py index dc34c910..d94faba6 100644 --- a/examples/local_models/script_multi_generator_ollama.py +++ b/examples/local_models/script_multi_generator_ollama.py @@ -20,11 +20,6 @@ # "model_tokens": 2000, # set context length arbitrarily, "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, "library": "beautifulsoup", "verbose": True, } diff --git a/examples/local_models/search_graph_ollama.py b/examples/local_models/search_graph_ollama.py index 8ecb60c1..039ca00e 100644 --- a/examples/local_models/search_graph_ollama.py +++ b/examples/local_models/search_graph_ollama.py @@ -16,11 +16,6 @@ # "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - #
"base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, "max_results": 5, "verbose": True, } diff --git a/examples/local_models/search_graph_schema_ollama.py b/examples/local_models/search_graph_schema_ollama.py index ae7c0632..fb87954f 100644 --- a/examples/local_models/search_graph_schema_ollama.py +++ b/examples/local_models/search_graph_schema_ollama.py @@ -29,11 +29,6 @@ class Dishes(BaseModel): "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, "verbose": True, "headless": False } diff --git a/examples/local_models/search_link_graph_ollama.py b/examples/local_models/search_link_graph_ollama.py index 5c594270..a05067dd 100644 --- a/examples/local_models/search_link_graph_ollama.py +++ b/examples/local_models/search_link_graph_ollama.py @@ -14,11 +14,7 @@ "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, + "verbose": True, "headless": False } diff --git a/examples/local_models/smart_scraper_ollama.py b/examples/local_models/smart_scraper_ollama.py index 8ac579e5..5a33dabc 100644 --- a/examples/local_models/smart_scraper_ollama.py +++ b/examples/local_models/smart_scraper_ollama.py @@ -14,11 +14,7 @@ "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, + "verbose": True, "headless": False } diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py index 7168d513..5fcff433 100644 --- a/examples/local_models/smart_scraper_schema_ollama.py +++ b/examples/local_models/smart_scraper_schema_ollama.py @@ -24,11 +24,7 @@ class Projects(BaseModel): "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, + "verbose": True, "headless": False } diff --git a/examples/local_models/xml_scraper_graph_multi_ollama.py b/examples/local_models/xml_scraper_graph_multi_ollama.py index d84c6c9f..0494ff2c 100644 --- a/examples/local_models/xml_scraper_graph_multi_ollama.py +++ b/examples/local_models/xml_scraper_graph_multi_ollama.py @@ -29,11 +29,7 @@ # "model_tokens": 2000, # set context length arbitrarily "base_url": "http://localhost:11434", }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - }, + "verbose": True, } diff --git a/examples/local_models/xml_scraper_ollama.py b/examples/local_models/xml_scraper_ollama.py index cc8c3ad9..50c4f8e7 100644 --- a/examples/local_models/xml_scraper_ollama.py +++ b/examples/local_models/xml_scraper_ollama.py @@ -30,11 +30,6 @@ # "model_tokens": 2000, # set context length arbitrarily "base_url": "http://localhost:11434", }, - "embeddings": { - "model": 
"ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - }, "verbose": True, } diff --git a/examples/openai/csv_scraper_graph_multi_openai.py b/examples/openai/csv_scraper_graph_multi_openai.py index 771ad679..7b91c896 100644 --- a/examples/openai/csv_scraper_graph_multi_openai.py +++ b/examples/openai/csv_scraper_graph_multi_openai.py @@ -27,7 +27,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/csv_scraper_openai.py b/examples/openai/csv_scraper_openai.py index 211f14f9..744fc7a4 100644 --- a/examples/openai/csv_scraper_openai.py +++ b/examples/openai/csv_scraper_openai.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/custom_graph_openai.py b/examples/openai/custom_graph_openai.py index 905473e0..6687e0ef 100644 --- a/examples/openai/custom_graph_openai.py +++ b/examples/openai/custom_graph_openai.py @@ -19,7 +19,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/deep_scraper_openai.py b/examples/openai/deep_scraper_openai.py index 4860a31f..5b7202d4 100644 --- a/examples/openai/deep_scraper_openai.py +++ b/examples/openai/deep_scraper_openai.py @@ -18,7 +18,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-4", + "model": "gpt-4o", }, "verbose": True, "max_depth": 1 diff --git a/examples/openai/json_scraper_multi_openai.py b/examples/openai/json_scraper_multi_openai.py index 021cd6e1..b27e5050 100644 --- a/examples/openai/json_scraper_multi_openai.py +++ b/examples/openai/json_scraper_multi_openai.py @@ -13,7 +13,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", } } diff --git a/examples/openai/json_scraper_openai.py b/examples/openai/json_scraper_openai.py index 25fc85af..eb5d1e7e 100644 --- a/examples/openai/json_scraper_openai.py +++ b/examples/openai/json_scraper_openai.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/md_scraper_openai.py b/examples/openai/md_scraper_openai.py index e6f74107..2c264ab9 100644 --- a/examples/openai/md_scraper_openai.py +++ b/examples/openai/md_scraper_openai.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/pdf_scraper_multi_openai.py b/examples/openai/pdf_scraper_multi_openai.py index 9e699e58..49a9c7fa 100644 --- a/examples/openai/pdf_scraper_multi_openai.py +++ b/examples/openai/pdf_scraper_multi_openai.py @@ -3,11 +3,10 @@ """ import os import json +from typing import List from dotenv import load_dotenv -from scrapegraphai.graphs import PdfScraperMultiGraph - from pydantic import BaseModel, Field -from typing import List +from scrapegraphai.graphs import PdfScraperMultiGraph load_dotenv() @@ -20,7 +19,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "verbose": True, } diff --git a/examples/openai/pdf_scraper_openai.py b/examples/openai/pdf_scraper_openai.py index e07a7ab5..2b0e19f3 100644 --- a/examples/openai/pdf_scraper_openai.py +++ b/examples/openai/pdf_scraper_openai.py @@ -14,7 +14,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "verbose": True, } 
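All of the OpenAI examples above move from `gpt-3.5-turbo` to `gpt-4o`; the shared shape of their configuration after this change looks roughly like the following sketch (the prompt and URL are placeholders):

```python
# Sketch of the updated OpenAI example configuration (gpt-3.5-turbo -> gpt-4o).
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_APIKEY"),  # env var name used by the examples
        "model": "gpt-4o",
    },
    "verbose": True,
}

result = SmartScraperGraph(
    prompt="Summarize the page",    # placeholder prompt
    source="https://example.com",   # placeholder URL
    config=graph_config,
).run()
print(result)
```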
diff --git a/examples/openai/scrape_plain_text_openai.py b/examples/openai/scrape_plain_text_openai.py index ffe0054a..7f390cff 100644 --- a/examples/openai/scrape_plain_text_openai.py +++ b/examples/openai/scrape_plain_text_openai.py @@ -30,7 +30,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/script_generator_openai.py b/examples/openai/script_generator_openai.py index 14c00ab4..046a25ec 100644 --- a/examples/openai/script_generator_openai.py +++ b/examples/openai/script_generator_openai.py @@ -18,7 +18,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "library": "beautifulsoup" } diff --git a/examples/openai/search_graph_schema_openai.py b/examples/openai/search_graph_schema_openai.py index e5131461..ecbcc644 100644 --- a/examples/openai/search_graph_schema_openai.py +++ b/examples/openai/search_graph_schema_openai.py @@ -3,14 +3,13 @@ """ import os +from typing import List from dotenv import load_dotenv -load_dotenv() - +from pydantic import BaseModel, Field from scrapegraphai.graphs import SearchGraph from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -from pydantic import BaseModel, Field -from typing import List +load_dotenv() # ************************************************ # Define the output schema for the graph diff --git a/examples/openai/search_link_graph_openai.py b/examples/openai/search_link_graph_openai.py index 10d10d4c..818f9434 100644 --- a/examples/openai/search_link_graph_openai.py +++ b/examples/openai/search_link_graph_openai.py @@ -1,16 +1,23 @@ """ Basic example of scraping pipeline using SmartScraper """ + +import os +from dotenv import load_dotenv from scrapegraphai.graphs import SearchLinkGraph from scrapegraphai.utils import prettify_exec_info + +load_dotenv() # ************************************************ # Define the configuration for the graph # ************************************************ +openai_key = os.getenv("OPENAI_APIKEY") + graph_config = { "llm": { - "api_key": "s", - "model": "gpt-3.5-turbo", + "api_key": openai_key, + "model": "gpt-4o", }, "verbose": True, "headless": False, diff --git a/examples/openai/smart_scraper_schema_openai.py b/examples/openai/smart_scraper_schema_openai.py index 076f1327..d9e1bd1c 100644 --- a/examples/openai/smart_scraper_schema_openai.py +++ b/examples/openai/smart_scraper_schema_openai.py @@ -30,7 +30,7 @@ class Projects(BaseModel): graph_config = { "llm": { "api_key":openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "verbose": True, "headless": False, diff --git a/examples/openai/speech_graph_openai.py b/examples/openai/speech_graph_openai.py index 15cc2cfb..603ce51c 100644 --- a/examples/openai/speech_graph_openai.py +++ b/examples/openai/speech_graph_openai.py @@ -25,7 +25,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", "temperature": 0.7, }, "tts_model": { diff --git a/examples/openai/xml_scraper_graph_multi_openai.py b/examples/openai/xml_scraper_graph_multi_openai.py index 46633bba..ef46b877 100644 --- a/examples/openai/xml_scraper_graph_multi_openai.py +++ b/examples/openai/xml_scraper_graph_multi_openai.py @@ -29,7 +29,7 @@ graph_config = { "llm": { "api_key":openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "verbose": True, "headless": False, diff --git a/examples/openai/xml_scraper_openai.py b/examples/openai/xml_scraper_openai.py index 
5be5716e..b2b5075e 100644 --- a/examples/openai/xml_scraper_openai.py +++ b/examples/openai/xml_scraper_openai.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "verbose":False, } diff --git a/examples/single_node/robot_node.py b/examples/single_node/robot_node.py index f51f8649..c2bcbbd1 100644 --- a/examples/single_node/robot_node.py +++ b/examples/single_node/robot_node.py @@ -2,7 +2,7 @@ Example of custom graph using existing nodes """ -from scrapegraphai.models import Ollama +from langchain_community.chat_models import ChatOllama from scrapegraphai.nodes import RobotsNode # ************************************************ @@ -26,7 +26,7 @@ # Define the node # ************************************************ -llm_model = Ollama(graph_config["llm"]) +llm_model = ChatOllama(**graph_config["llm"]) robots_node = RobotsNode( input="url", diff --git a/pyproject.toml b/pyproject.toml index 168f79cc..768e9415 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,8 @@ [project] name = "scrapegraphai" -version = "1.12.1b1" + +version = "1.11.0b10" + description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ @@ -9,10 +11,11 @@ authors = [ { name = "Lorenzo Padoan", email = "lorenzo.padoan977@gmail.com" } ] -dependencies = [ "langchain>=0.2.10", + "langchain-fireworks>=0.1.3", "langchain_community>=0.2.9", + "langchain-google-genai>=1.0.7", "langchain-google-vertexai>=1.0.7", "langchain-openai>=0.1.17", @@ -34,6 +37,10 @@ dependencies = [ "google>=3.0.0", "undetected-playwright>=0.3.0", "semchunk>=1.0.1", + "langchain-fireworks>=0.1.3", + "langchain-community>=0.2.9", + "langchain-huggingface>=0.0.3", + "browserbase==0.3.0" ] license = "MIT" diff --git a/requirements-dev.lock b/requirements-dev.lock index 3925a66a..cd2e2612 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -54,6 +54,8 @@ boto3==1.34.146 botocore==1.34.146 # via boto3 # via s3transfer +browserbase==0.3.0 + # via scrapegraphai burr==0.22.1 # via scrapegraphai cachetools==5.4.0 @@ -106,6 +108,8 @@ fastapi-pagination==0.12.26 # via burr filelock==3.15.4 # via huggingface-hub + # via torch + # via transformers fireworks-ai==0.14.0 # via langchain-fireworks fonttools==4.53.1 @@ -117,6 +121,7 @@ frozenlist==1.4.1 # via aiosignal fsspec==2024.6.1 # via huggingface-hub + # via torch furo==2024.5.6 # via scrapegraphai gitdb==4.0.11 @@ -206,14 +211,20 @@ httptools==0.6.1 # via uvicorn httpx==0.27.0 # via anthropic + # via browserbase # via fastapi # via fireworks-ai # via groq # via openai httpx-sse==0.4.0 # via fireworks-ai +huggingface-hub==0.24.0 + # via langchain-huggingface + # via sentence-transformers huggingface-hub==0.24.1 + # via tokenizers + # via transformers idna==3.7 # via anyio # via email-validator @@ -236,11 +247,14 @@ jinja2==3.1.4 # via fastapi # via pydeck # via sphinx + # via torch jiter==0.5.0 # via anthropic jmespath==1.0.1 # via boto3 # via botocore +joblib==1.4.2 + # via scikit-learn jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 @@ -269,6 +283,7 @@ langchain-core==0.2.23 # via langchain-google-genai # via langchain-google-vertexai # via langchain-groq + # via langchain-huggingface # via langchain-nvidia-ai-endpoints # via langchain-openai # via langchain-text-splitters @@ -280,6 +295,10 @@ langchain-google-vertexai==1.0.7 # via scrapegraphai langchain-groq==0.1.6 # via scrapegraphai +langchain-huggingface==0.0.3 + # via scrapegraphai
+langchain-nvidia-ai-endpoints==0.1.6 + langchain-nvidia-ai-endpoints==0.1.7 # via scrapegraphai langchain-openai==0.1.17 @@ -310,6 +329,8 @@ minify-html==0.15.0 # via scrapegraphai mpire==2.10.2 # via semchunk +mpmath==1.3.0 + # via sympy multidict==6.0.5 # via aiohttp # via yarl @@ -317,6 +338,8 @@ multiprocess==0.70.16 # via mpire mypy-extensions==1.0.0 # via typing-inspect +networkx==3.2.1 + # via torch numpy==1.26.4 # via altair # via contourpy @@ -328,9 +351,13 @@ numpy==1.26.4 # via pandas # via pyarrow # via pydeck + # via scikit-learn + # via scipy + # via sentence-transformers # via sf-hamilton # via shapely # via streamlit + # via transformers openai==1.37.0 # via burr # via langchain-fireworks @@ -349,6 +376,7 @@ packaging==24.1 # via pytest # via sphinx # via streamlit + # via transformers pandas==2.2.2 # via altair # via scrapegraphai @@ -358,9 +386,11 @@ pillow==10.4.0 # via fireworks-ai # via langchain-nvidia-ai-endpoints # via matplotlib + # via sentence-transformers # via streamlit platformdirs==4.2.2 # via pylint + playwright==1.45.1 # via scrapegraphai # via undetected-playwright @@ -391,6 +421,7 @@ pyasn1-modules==0.4.0 # via google-auth pydantic==2.8.2 # via anthropic + # via browserbase # via burr # via fastapi # via fastapi-pagination @@ -437,12 +468,14 @@ pyyaml==6.0.1 # via langchain # via langchain-community # via langchain-core + # via transformers # via uvicorn referencing==0.35.1 # via jsonschema # via jsonschema-specifications regex==2024.5.15 # via tiktoken + # via transformers requests==2.32.3 # via burr # via free-proxy @@ -457,6 +490,7 @@ requests==2.32.3 # via sphinx # via streamlit # via tiktoken + # via transformers rich==13.7.1 # via streamlit # via typer @@ -467,8 +501,17 @@ rsa==4.9 # via google-auth s3transfer==0.10.2 # via boto3 +safetensors==0.4.3 + # via transformers +scikit-learn==1.5.1 + # via sentence-transformers +scipy==1.13.1 + # via scikit-learn + # via sentence-transformers semchunk==2.2.0 # via scrapegraphai +sentence-transformers==3.0.1 + # via langchain-huggingface sf-hamilton==1.72.1 # via burr shapely==2.0.5 @@ -514,16 +557,22 @@ starlette==0.37.2 # via fastapi streamlit==1.36.0 # via burr +sympy==1.13.1 + # via torch tenacity==8.5.0 # via langchain # via langchain-community # via langchain-core # via streamlit +threadpoolctl==3.5.0 + # via scikit-learn tiktoken==0.7.0 # via langchain-openai # via scrapegraphai tokenizers==0.19.1 # via anthropic + # via langchain-huggingface + # via transformers toml==0.10.2 # via streamlit tomli==2.0.1 @@ -533,6 +582,8 @@ tomlkit==0.13.0 # via pylint toolz==0.12.1 # via altair +torch==2.2.2 + # via sentence-transformers tornado==6.4.1 # via streamlit tqdm==4.66.4 @@ -542,6 +593,11 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk + # via sentence-transformers + # via transformers +transformers==4.43.3 + # via langchain-huggingface + # via sentence-transformers typer==0.12.3 # via fastapi-cli typing-extensions==4.12.2 @@ -563,6 +619,7 @@ typing-extensions==4.12.2 # via sqlalchemy # via starlette # via streamlit + # via torch # via typer # via typing-inspect # via uvicorn diff --git a/requirements.lock b/requirements.lock index 5b34025c..1c5b514e 100644 --- a/requirements.lock +++ b/requirements.lock @@ -37,6 +37,8 @@ boto3==1.34.146 botocore==1.34.146 # via boto3 # via s3transfer +browserbase==0.3.0 + # via scrapegraphai cachetools==5.4.0 # via google-auth certifi==2024.7.4 @@ -63,6 +65,8 @@ faiss-cpu==1.8.0.post1 # via scrapegraphai filelock==3.15.4 # via huggingface-hub + # via torch 
+ # via transformers fireworks-ai==0.14.0 # via langchain-fireworks free-proxy==1.1.1 @@ -72,6 +76,7 @@ frozenlist==1.4.1 # via aiosignal fsspec==2024.6.1 # via huggingface-hub + # via torch google==3.0.0 # via scrapegraphai google-ai-generativelanguage==0.6.6 @@ -151,6 +156,7 @@ httplib2==0.22.0 # via google-auth-httplib2 httpx==0.27.0 # via anthropic + # via browserbase # via fireworks-ai # via groq # via openai @@ -158,16 +164,21 @@ httpx-sse==0.4.0 # via fireworks-ai huggingface-hub==0.24.1 # via tokenizers + # via transformers idna==3.7 # via anyio # via httpx # via requests # via yarl +jinja2==3.1.4 + # via torch jiter==0.5.0 # via anthropic jmespath==1.0.1 # via boto3 # via botocore +joblib==1.4.2 + # via scikit-learn jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 @@ -190,6 +201,7 @@ langchain-core==0.2.23 # via langchain-google-genai # via langchain-google-vertexai # via langchain-groq + # via langchain-huggingface # via langchain-nvidia-ai-endpoints # via langchain-openai # via langchain-text-splitters @@ -201,7 +213,10 @@ langchain-google-vertexai==1.0.7 # via scrapegraphai langchain-groq==0.1.6 # via scrapegraphai +langchain-huggingface==0.0.3 + # via scrapegraphai langchain-nvidia-ai-endpoints==0.1.7 + # via scrapegraphai langchain-openai==0.1.17 # via scrapegraphai @@ -213,12 +228,16 @@ langsmith==0.1.93 # via langchain-core lxml==5.2.2 # via free-proxy +markupsafe==2.1.5 + # via jinja2 marshmallow==3.21.3 # via dataclasses-json minify-html==0.15.0 # via scrapegraphai mpire==2.10.2 # via semchunk +mpmath==1.3.0 + # via sympy multidict==6.0.5 # via aiohttp # via yarl @@ -226,13 +245,19 @@ multiprocess==0.70.16 # via mpire mypy-extensions==1.0.0 # via typing-inspect +networkx==3.2.1 + # via torch numpy==1.26.4 # via faiss-cpu # via langchain # via langchain-aws # via langchain-community # via pandas + # via scikit-learn + # via scipy + # via sentence-transformers # via shapely + # via transformers openai==1.37.0 # via langchain-fireworks # via langchain-openai @@ -245,11 +270,13 @@ packaging==24.1 # via huggingface-hub # via langchain-core # via marshmallow + pandas==2.2.2 # via scrapegraphai pillow==10.4.0 # via fireworks-ai # via langchain-nvidia-ai-endpoints + playwright==1.45.1 # via scrapegraphai # via undetected-playwright @@ -275,6 +302,7 @@ pyasn1-modules==0.4.0 # via google-auth pydantic==2.8.2 # via anthropic + # via browserbase # via fireworks-ai # via google-cloud-aiplatform # via google-generativeai @@ -304,8 +332,10 @@ pyyaml==6.0.1 # via langchain # via langchain-community # via langchain-core + # via transformers regex==2024.5.15 # via tiktoken + # via transformers requests==2.32.3 # via free-proxy # via google-api-core @@ -317,12 +347,22 @@ requests==2.32.3 # via langchain-fireworks # via langsmith # via tiktoken + # via transformers rsa==4.9 # via google-auth s3transfer==0.10.2 # via boto3 +safetensors==0.4.3 + # via transformers +scikit-learn==1.5.1 + # via sentence-transformers +scipy==1.13.1 + # via scikit-learn + # via sentence-transformers semchunk==2.2.0 # via scrapegraphai +sentence-transformers==3.0.1 + # via langchain-huggingface shapely==2.0.5 # via google-cloud-aiplatform six==1.16.0 @@ -338,15 +378,23 @@ soupsieve==2.5 sqlalchemy==2.0.31 # via langchain # via langchain-community +sympy==1.13.1 + # via torch tenacity==8.5.0 # via langchain # via langchain-community # via langchain-core +threadpoolctl==3.5.0 + # via scikit-learn tiktoken==0.7.0 # via langchain-openai # via scrapegraphai tokenizers==0.19.1 # via anthropic + # via 
langchain-huggingface + # via transformers +torch==2.2.2 + # via sentence-transformers tqdm==4.66.4 # via google-generativeai # via huggingface-hub @@ -354,6 +402,11 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk + # via sentence-transformers + # via transformers +transformers==4.43.3 + # via langchain-huggingface + # via sentence-transformers typing-extensions==4.12.2 # via anthropic # via anyio @@ -365,6 +418,7 @@ typing-extensions==4.12.2 # via pydantic-core # via pyee # via sqlalchemy + # via typing-inspect typing-inspect==0.9.0 # via dataclasses-json diff --git a/requirements.txt b/requirements.txt index af7ee2ed..0116a466 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,12 @@ langchain>=0.2.10 +langchain-google-genai>=1.0.7 +langchain-google-vertexai + langchain-fireworks>=0.1.3 langchain_community>=0.2.9 langchain-google-genai>=1.0.7 langchain-google-vertexai>=1.0.7 + langchain-openai>=0.1.17 langchain-groq>=0.1.3 langchain-aws>=0.1.3 @@ -22,3 +26,8 @@ playwright>=1.43.0 google>=3.0.0 undetected-playwright>=0.3.0 semchunk>=1.0.1 +langchain-fireworks>=0.1.3 +langchain-community>=0.2.9 +langchain-huggingface>=0.0.3 +browserbase==0.3.0 + diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index a9e45407..45a3783d 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -1,3 +1,4 @@ """__init__.py file for docloaders folder""" from .chromium import ChromiumLoader +from .browser_base import browser_base_fetch diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py new file mode 100644 index 00000000..77628bc5 --- /dev/null +++ b/scrapegraphai/docloaders/browser_base.py @@ -0,0 +1,48 @@ +""" +browserbase integration module +""" +from typing import List +from browserbase import Browserbase + +def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]: + """ + BrowserBase Fetch + + This module provides an interface to the BrowserBase API. + + The `browser_base_fetch` function takes three arguments: + - `api_key`: The API key provided by BrowserBase. + - `project_id`: The ID of the project on BrowserBase where you want to fetch data from. + - `link`: The URL or link that you want to fetch data from. + + It initializes a Browserbase object with the given API key and project ID, + then uses this object to load the specified link. + It returns the result of the loading operation. + + Example usage: + + ``` + from browser_base_fetch import browser_base_fetch + + result = browser_base_fetch(api_key="your_api_key", + project_id="your_project_id", link="https://example.com") + print(result) + ``` + + Please note that you need to replace "your_api_key" and "your_project_id" + with your actual BrowserBase API key and project ID. + + Args: + api_key (str): The API key provided by BrowserBase. + project_id (str): The ID of the project on BrowserBase where you want to fetch data from. + link (str): The URL or link that you want to fetch data from. + + Returns: + object: The result of the loading operation. 
+ """ + + browserbase = Browserbase(api_key=api_key, project_id=project_id) + + result = browserbase.load(link) + + return result diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 474c22de..cb0cfd9a 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -1,3 +1,6 @@ +""" +Chromium module +""" import asyncio from typing import Any, AsyncIterator, Iterator, List, Optional diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 91396ae0..f07bcb10 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -3,38 +3,21 @@ """ from abc import ABC, abstractmethod -from typing import Optional, Union +from typing import Optional import uuid from pydantic import BaseModel -from langchain_aws import BedrockEmbeddings -from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings -from langchain_google_genai import GoogleGenerativeAIEmbeddings -from langchain_google_vertexai import VertexAIEmbeddings -from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings -from langchain_fireworks import FireworksEmbeddings -from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings -from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings +from langchain_community.chat_models import ErnieBotChat +from langchain_nvidia_ai_endpoints import ChatNVIDIA +from langchain.chat_models import init_chat_model + from ..helpers import models_tokens from ..models import ( - Anthropic, - AzureOpenAI, - Bedrock, - Gemini, - Groq, - HuggingFace, - Ollama, - OpenAI, OneApi, - Fireworks, - VertexAI, - Nvidia + DeepSeek ) -from ..models.ernie import Ernie -from ..utils.logging import set_verbosity_debug, set_verbosity_warning, set_verbosity_info +from ..utils.logging import set_verbosity_warning, set_verbosity_info -from ..helpers import models_tokens -from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek class AbstractGraph(ABC): @@ -46,8 +29,6 @@ class AbstractGraph(ABC): config (dict): Configuration parameters for the graph. schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, - configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. headless (bool): A flag indicating whether to run the graph in headless mode.
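The new `browser_base_fetch` docloader defined above is switched on from the top-level graph config: `AbstractGraph.__init__` reads a `browser_base` key in the next hunk, and `FetchNode` consumes it further below. A hedged sketch of how a graph would opt in (the credential values are placeholders):

```python
# Sketch: routing page fetching through Browserbase instead of ChromiumLoader.
# The "browser_base" key mirrors what AbstractGraph/FetchNode read in this diff;
# the credentials are placeholders.
graph_config = {
    "llm": {
        "api_key": "sk-...",  # placeholder OpenAI key
        "model": "gpt-4o",
    },
    "browser_base": {
        "api_key": "your_browserbase_api_key",
        "project_id": "your_project_id",
    },
}
```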
@@ -68,22 +49,21 @@ class AbstractGraph(ABC): >>> result = my_graph.run() """ - def __init__(self, prompt: str, config: dict, + def __init__(self, prompt: str, config: dict, source: Optional[str] = None, schema: Optional[BaseModel] = None): self.prompt = prompt self.source = source self.config = config self.schema = schema - self.llm_model = self._create_llm(config["llm"], chat=True) - self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder( - config["embeddings"]) + self.llm_model = self._create_llm(config["llm"]) self.verbose = False if config is None else config.get( "verbose", False) - self.headless = True if config is None else config.get( + self.headless = True if self.config is None else config.get( "headless", True) - self.loader_kwargs = config.get("loader_kwargs", {}) - self.cache_path = config.get("cache_path", False) + self.loader_kwargs = self.config.get("loader_kwargs", {}) + self.cache_path = self.config.get("cache_path", False) + self.browser_base = self.config.get("browser_base") # Create the graph self.graph = self._create_graph() @@ -104,7 +84,6 @@ def __init__(self, prompt: str, config: dict, "verbose": self.verbose, "loader_kwargs": self.loader_kwargs, "llm_model": self.llm_model, - "embedder_model": self.embedder_model, "cache_path": self.cache_path, } @@ -131,7 +110,7 @@ def set_common_params(self, params: dict, overwrite=False): for node in self.graph.nodes: node.update_config(params, overwrite) - def _create_llm(self, llm_config: dict, chat=False) -> object: + def _create_llm(self, llm_config: dict) -> object: """ Create a large language model instance based on the configuration provided. @@ -151,256 +130,102 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: # If model instance is passed directly instead of the model details if "model_instance" in llm_params: try: - self.model_token = llm_params["model_tokens"] + self.model_token = llm_params["model_tokens"] except KeyError as exc: - raise KeyError("model_tokens not specified") from exc + raise KeyError("model_tokens not specified") from exc return llm_params["model_instance"] - # Instantiate the language model based on the model name - if "gpt-" in llm_params["model"]: - try: - self.model_token = models_tokens["openai"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return OpenAI(llm_params) - elif "oneapi" in llm_params["model"]: - # take the model after the last dash - llm_params["model"] = llm_params["model"].split("/")[-1] - try: - self.model_token = models_tokens["oneapi"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return OneApi(llm_params) - elif "fireworks" in llm_params["model"]: - try: - self.model_token = models_tokens["fireworks"][llm_params["model"].split("/")[-1]] - llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) - except KeyError as exc: - raise KeyError("Model not supported") from exc - return Fireworks(llm_params) - elif "azure" in llm_params["model"]: - # take the model after the last dash - llm_params["model"] = llm_params["model"].split("/")[-1] - try: - self.model_token = models_tokens["azure"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return AzureOpenAI(llm_params) - elif "nvidia" in llm_params["model"]: - try: - self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]] - llm_params["model"] = 
"/".join(llm_params["model"].split("/")[1:]) - except KeyError as exc: - raise KeyError("Model not supported") from exc - return Nvidia(llm_params) - elif "gemini" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("/")[-1] + # Instantiate the language model based on the model name (models that use the common interface) + def handle_model(model_name, provider, token_key, default_token=8192): try: - self.model_token = models_tokens["gemini"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return Gemini(llm_params) - elif llm_params["model"].startswith("claude"): - llm_params["model"] = llm_params["model"].split("/")[-1] - try: - self.model_token = models_tokens["claude"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return Anthropic(llm_params) - elif llm_params["model"].startswith("vertexai"): - try: - self.model_token = models_tokens["vertexai"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return VertexAI(llm_params) - elif "ollama" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("ollama/")[-1] + self.model_token = models_tokens[provider][token_key] + except KeyError: + print(f"Model not found, using default token size ({default_token})") + self.model_token = default_token + llm_params["model_provider"] = provider + llm_params["model"] = model_name + return init_chat_model(**llm_params) - # allow user to set model_tokens in config - try: - if "model_tokens" in llm_params: - self.model_token = llm_params["model_tokens"] - elif llm_params["model"] in models_tokens["ollama"]: - try: - self.model_token = models_tokens["ollama"][llm_params["model"]] - except KeyError as exc: - print("model not found, using default token size (8192)") - self.model_token = 8192 - else: - self.model_token = 8192 - except AttributeError: - self.model_token = 8192 + if "azure" in llm_params["model"]: + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "azure_openai", model_name) - return Ollama(llm_params) - elif "hugging_face" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("/")[-1] - try: - self.model_token = models_tokens["hugging_face"][llm_params["model"]] - except KeyError: - print("model not found, using default token size (8192)") - self.model_token = 8192 - return HuggingFace(llm_params) - elif "groq" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("/")[-1] + if "gpt-" in llm_params["model"]: + return handle_model(llm_params["model"], "openai", llm_params["model"]) - try: - self.model_token = models_tokens["groq"][llm_params["model"]] - except KeyError: - print("model not found, using default token size (8192)") - self.model_token = 8192 - return Groq(llm_params) - elif "bedrock" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("/")[-1] - model_id = llm_params["model"] - client = llm_params.get("client", None) - try: - self.model_token = models_tokens["bedrock"][llm_params["model"]] - except KeyError: - print("model not found, using default token size (8192)") - self.model_token = 8192 - return Bedrock( - { - "client": client, - "model_id": model_id, - "model_kwargs": { - "temperature": llm_params["temperature"], - }, - } - ) - elif "claude-3-" in llm_params["model"]: - try: - self.model_token = models_tokens["claude"]["claude3"] - except KeyError: - print("model not found, using default token size 
(8192)") - self.model_token = 8192 - return Anthropic(llm_params) - elif "deepseek" in llm_params["model"]: + if "fireworks" in llm_params["model"]: + model_name = "/".join(llm_params["model"].split("/")[1:]) + token_key = llm_params["model"].split("/")[-1] + return handle_model(model_name, "fireworks", token_key) + + if "gemini" in llm_params["model"]: + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "google_genai", model_name) + + if llm_params["model"].startswith("claude"): + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "anthropic", model_name) + + if llm_params["model"].startswith("vertexai"): + return handle_model(llm_params["model"], "google_vertexai", llm_params["model"]) + + if "ollama" in llm_params["model"]: + model_name = llm_params["model"].split("ollama/")[-1] + token_key = model_name if "model_tokens" not in llm_params else llm_params["model_tokens"] + return handle_model(model_name, "ollama", token_key) + + if "hugging_face" in llm_params["model"]: + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "hugging_face", model_name) + + if "groq" in llm_params["model"]: + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "groq", model_name) + + if "bedrock" in llm_params["model"]: + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "bedrock", model_name) + + if "claude-3-" in llm_params["model"]: + return handle_model(llm_params["model"], "anthropic", "claude3") + + # Instantiate the language model based on the model name (models that do not use the common interface) + if "deepseek" in llm_params["model"]: try: self.model_token = models_tokens["deepseek"][llm_params["model"]] except KeyError: print("model not found, using default token size (8192)") self.model_token = 8192 return DeepSeek(llm_params) - elif "ernie" in llm_params["model"]: + + if "ernie" in llm_params["model"]: try: self.model_token = models_tokens["ernie"][llm_params["model"]] except KeyError: print("model not found, using default token size (8192)") self.model_token = 8192 - return Ernie(llm_params) - else: - raise ValueError("Model provided by the configuration not supported") - - def _create_default_embedder(self, llm_config=None) -> object: - """ - Create an embedding model instance based on the chosen llm model. - - Returns: - object: An instance of the embedding model client. - - Raises: - ValueError: If the model is not supported. 
- """ - if isinstance(self.llm_model, Gemini): - return GoogleGenerativeAIEmbeddings( - google_api_key=llm_config["api_key"], model="models/embedding-001" - ) - if isinstance(self.llm_model, OpenAI): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, - base_url=self.llm_model.openai_api_base) - elif isinstance(self.llm_model, DeepSeek): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) - elif isinstance(self.llm_model, VertexAI): - return VertexAIEmbeddings() - elif isinstance(self.llm_model, AzureOpenAIEmbeddings): - return self.llm_model - elif isinstance(self.llm_model, AzureOpenAI): - return AzureOpenAIEmbeddings() - elif isinstance(self.llm_model, Fireworks): - return FireworksEmbeddings(model=self.llm_model.model_name) - elif isinstance(self.llm_model, Nvidia): - return NVIDIAEmbeddings(model=self.llm_model.model_name) - elif isinstance(self.llm_model, Ollama): - # unwrap the kwargs from the model whihc is a dict - params = self.llm_model._lc_kwargs - # remove streaming and temperature - params.pop("streaming", None) - params.pop("temperature", None) - - return OllamaEmbeddings(**params) - elif isinstance(self.llm_model, HuggingFace): - return HuggingFaceHubEmbeddings(model=self.llm_model.model) - elif isinstance(self.llm_model, Bedrock): - return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id) - else: - raise ValueError("Embedding Model missing or not supported") - - def _create_embedder(self, embedder_config: dict) -> object: - """ - Create an embedding model instance based on the configuration provided. - - Args: - embedder_config (dict): Configuration parameters for the embedding model. - - Returns: - object: An instance of the embedding model client. - - Raises: - KeyError: If the model is not supported. 
- """ - embedder_params = {**embedder_config} - if "model_instance" in embedder_config: - return embedder_params["model_instance"] - # Instantiate the embedding model based on the model name - if "openai" in embedder_params["model"]: - return OpenAIEmbeddings(api_key=embedder_params["api_key"]) - elif "azure" in embedder_params["model"]: - return AzureOpenAIEmbeddings() - if "nvidia" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["nvidia"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return NVIDIAEmbeddings(model=embedder_params["model"], - nvidia_api_key=embedder_params["api_key"]) - elif "ollama" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["ollama"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return OllamaEmbeddings(**embedder_params) - elif "hugging_face" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["hugging_face"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return HuggingFaceHubEmbeddings(model=embedder_params["model"]) - elif "fireworks" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["fireworks"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return FireworksEmbeddings(model=embedder_params["model"]) - elif "gemini" in embedder_params["model"]: + return ErnieBotChat(**llm_params) + + if "oneapi" in llm_params["model"]: + # take the model after the last dash + llm_params["model"] = llm_params["model"].split("/")[-1] try: - models_tokens["gemini"][embedder_params["model"]] + self.model_token = models_tokens["oneapi"][llm_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc - return GoogleGenerativeAIEmbeddings(model=embedder_params["model"]) - elif "bedrock" in embedder_params["model"]: - embedder_params["model"] = embedder_params["model"].split("/")[-1] - client = embedder_params.get("client", None) + return OneApi(llm_params) + + if "nvidia" in llm_params["model"]: try: - models_tokens["bedrock"][embedder_params["model"]] + self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]] + llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) except KeyError as exc: raise KeyError("Model not supported") from exc - return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) - else: - raise ValueError("Model provided by the configuration not supported") + return ChatNVIDIA(**llm_params) + + # Raise an error if the model did not match any of the previous cases + raise ValueError("Model provided by the configuration not supported") + def get_state(self, key=None) -> dict: """ "" @@ -442,11 +267,9 @@ def _create_graph(self): """ Abstract method to create a graph representation. """ - pass @abstractmethod def run(self) -> str: """ Abstract method to execute the graph and return the result.
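For every provider that now goes through the common interface, the `handle_model` helper above collapses the old per-provider wrapper classes into one call to LangChain's `init_chat_model`: it resolves the context window from `models_tokens`, then defers construction. Roughly, for an OpenAI model the helper reduces to the following sketch (the API key is a placeholder):

```python
# Approximately what handle_model("gpt-4o", "openai", "gpt-4o") performs:
# look up the context window, then let init_chat_model build the client.
from langchain.chat_models import init_chat_model

llm_params = {"api_key": "sk-...", "temperature": 0}  # placeholder key
llm = init_chat_model(model="gpt-4o", model_provider="openai", **llm_params)
```

Only DeepSeek, Ernie, OneApi, and Nvidia keep dedicated branches, as the non-common-interface cases above show.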
""" - pass diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index f4efd1fb..d7ec186e 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -36,7 +36,7 @@ def _create_graph(self): input="csv | csv_dir", output=["doc"], ) - + generate_answer_node = GenerateAnswerCSVNode( input="user_prompt & (relevant_chunks | doc)", output=["answer"], diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 787e537a..0c0f1104 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -53,7 +53,6 @@ def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None self.copy_config = copy(config) else: self.copy_config = deepcopy(config) - self.copy_schema = deepcopy(schema) self.considered_urls = [] # New attribute to store URLs diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 8fc532cd..d1d6f94b 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -11,7 +11,6 @@ from ..nodes import ( FetchNode, ParseNode, - RAGNode, GenerateAnswerNode, TextToSpeechNode, ) @@ -72,13 +71,6 @@ def _create_graph(self) -> BaseGraph: "chunk_size": self.model_token } ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model } - ) generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -100,14 +92,12 @@ def _create_graph(self) -> BaseGraph: nodes=[ fetch_node, parse_node, - rag_node, generate_answer_node, text_to_speech_node ], edges=[ (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node), + (parse_node, generate_answer_node), (generate_answer_node, text_to_speech_node) ], entry_point=fetch_node, @@ -121,7 +111,7 @@ def run(self) -> str: Returns: str: The answer to the prompt. 
""" - + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) @@ -132,4 +122,4 @@ def run(self) -> str: "output_path", "output.mp3")) print(f"Audio saved to {self.config.get('output_path', 'output.mp3')}") - return self.final_state.get("answer", "No answer found.") \ No newline at end of file + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 0724ee95..cb00435d 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -196,6 +196,11 @@ "fireworks": { "llama-v2-7b": 4096, "mixtral-8x7b-instruct": 4096, - "nomic-ai/nomic-embed-text-v1.5": 8192 + "nomic-ai/nomic-embed-text-v1.5": 8192, + "llama-3.1-405B-instruct": 131072, + "llama-3.1-70B-instruct": 131072, + "llama-3.1-8B-instruct": 131072, + "mixtral-moe-8x22B-instruct": 65536, + "mixtral-moe-8x7B-instruct": 65536 }, } diff --git a/scrapegraphai/models/__init__.py b/scrapegraphai/models/__init__.py index bfcb84d6..ce798ad8 100644 --- a/scrapegraphai/models/__init__.py +++ b/scrapegraphai/models/__init__.py @@ -1,19 +1,7 @@ """ __init__.py file for models folder """ - -from .openai import OpenAI -from .azure_openai import AzureOpenAI from .openai_itt import OpenAIImageToText from .openai_tts import OpenAITextToSpeech -from .gemini import Gemini -from .ollama import Ollama -from .hugging_face import HuggingFace -from .groq import Groq -from .bedrock import Bedrock -from .anthropic import Anthropic from .deepseek import DeepSeek from .oneapi import OneApi -from .fireworks import Fireworks -from .vertex import VertexAI -from .nvidia import Nvidia diff --git a/scrapegraphai/models/anthropic.py b/scrapegraphai/models/anthropic.py deleted file mode 100644 index 3a7480d0..00000000 --- a/scrapegraphai/models/anthropic.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -Anthropic Module -""" -from langchain_anthropic import ChatAnthropic - - -class Anthropic(ChatAnthropic): - """ - A wrapper for the ChatAnthropic class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) \ No newline at end of file diff --git a/scrapegraphai/models/azure_openai.py b/scrapegraphai/models/azure_openai.py deleted file mode 100644 index ae47d4e6..00000000 --- a/scrapegraphai/models/azure_openai.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -AzureOpenAI Module -""" -from langchain_openai import AzureChatOpenAI - - -class AzureOpenAI(AzureChatOpenAI): - """ - A wrapper for the AzureChatOpenAI class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) diff --git a/scrapegraphai/models/bedrock.py b/scrapegraphai/models/bedrock.py deleted file mode 100644 index 06299075..00000000 --- a/scrapegraphai/models/bedrock.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -Bedrock Module -""" -from langchain_aws import ChatBedrock - - -class Bedrock(ChatBedrock): - """Class for wrapping bedrock module""" - - def __init__(self, llm_config: dict): - """ - A wrapper for the ChatBedrock class that provides default configuration - and could be extended with additional methods if needed. 
- - Args: - llm_config (dict): Configuration parameters for the language model. - """ - # Initialize the superclass (ChatBedrock) with provided config parameters - super().__init__(**llm_config) diff --git a/scrapegraphai/models/ernie.py b/scrapegraphai/models/ernie.py deleted file mode 100644 index 75e2a261..00000000 --- a/scrapegraphai/models/ernie.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -Ernie Module -""" -from langchain_community.chat_models import ErnieBotChat - - -class Ernie(ErnieBotChat): - """ - A wrapper for the ErnieBotChat class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) diff --git a/scrapegraphai/models/fireworks.py b/scrapegraphai/models/fireworks.py deleted file mode 100644 index 445c4846..00000000 --- a/scrapegraphai/models/fireworks.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Fireworks Module -""" -from langchain_fireworks import ChatFireworks - - -class Fireworks(ChatFireworks): - """ - Initializes the Fireworks class. - - Args: - llm_config (dict): A dictionary containing configuration parameters for the LLM (required). - The specific keys and values will depend on the LLM implementation - used by the underlying `ChatFireworks` class. Consult its documentation - for details. - - Raises: - ValueError: If required keys are missing from the llm_config dictionary. - """ - - def __init__(self, llm_config: dict): - """ - Initializes the Fireworks class. - - Args: - llm_config (dict): A dictionary containing configuration parameters for the LLM. - The specific keys and values will depend on the LLM implementation. - - Raises: - ValueError: If required keys are missing from the llm_config dictionary. - """ - - super().__init__(**llm_config) diff --git a/scrapegraphai/models/gemini.py b/scrapegraphai/models/gemini.py deleted file mode 100644 index 1c939c6c..00000000 --- a/scrapegraphai/models/gemini.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -Gemini Module -""" -from langchain_google_genai import ChatGoogleGenerativeAI - - -class Gemini(ChatGoogleGenerativeAI): - """ - A wrapper for the Gemini class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model - (e.g., model="gemini-pro") - """ - - def __init__(self, llm_config: dict): - # replace "api_key" to "google_api_key" - llm_config["google_api_key"] = llm_config.pop("api_key", None) - super().__init__(**llm_config) diff --git a/scrapegraphai/models/groq.py b/scrapegraphai/models/groq.py deleted file mode 100644 index 755f50aa..00000000 --- a/scrapegraphai/models/groq.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -Groq Module -""" - -from langchain_groq import ChatGroq - -class Groq(ChatGroq): - """ - A wrapper for the Groq class that provides default configuration - and could be extended with additional methods if needed. 
- - Args: - llm_config (dict): Configuration parameters for the language model (e.g., model="llama3-70b-8192") - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) \ No newline at end of file diff --git a/scrapegraphai/models/hugging_face.py b/scrapegraphai/models/hugging_face.py deleted file mode 100644 index 9696db1e..00000000 --- a/scrapegraphai/models/hugging_face.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -HuggingFace Module -""" -from langchain_community.chat_models.huggingface import ChatHuggingFace - - -class HuggingFace(ChatHuggingFace): - """ - A wrapper for the HuggingFace class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) diff --git a/scrapegraphai/models/nvidia.py b/scrapegraphai/models/nvidia.py deleted file mode 100644 index 48ce3c0f..00000000 --- a/scrapegraphai/models/nvidia.py +++ /dev/null @@ -1,25 +0,0 @@ -""" -This is a Python wrapper class for ChatNVIDIA. -It provides default configuration and could be extended with additional methods if needed. -The purpose of this wrapper is to simplify the creation of instances of ChatNVIDIA by providing -default configurations for certain parameters, -allowing users to focus on specifying other important parameters without having -to understand all the details of the underlying class's constructor. -It inherits from the base class ChatNVIDIA and overrides -its init method to provide a more user-friendly interface. -The constructor takes one argument: llm_config, which is used to initialize the superclass -with default configuration. -""" - -from langchain_nvidia_ai_endpoints import ChatNVIDIA - -class Nvidia(ChatNVIDIA): - """ A wrapper for the Nvidia class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) diff --git a/scrapegraphai/models/ollama.py b/scrapegraphai/models/ollama.py deleted file mode 100644 index 4bf48178..00000000 --- a/scrapegraphai/models/ollama.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -Ollama Module -""" -from langchain_community.chat_models import ChatOllama - - -class Ollama(ChatOllama): - """ - A wrapper for the ChatOllama class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) diff --git a/scrapegraphai/models/openai.py b/scrapegraphai/models/openai.py deleted file mode 100644 index bfd9d74c..00000000 --- a/scrapegraphai/models/openai.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -OpenAI Module -""" -from langchain_openai import ChatOpenAI - - -class OpenAI(ChatOpenAI): - """ - A wrapper for the ChatOpenAI class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. 
- """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) diff --git a/scrapegraphai/models/vertex.py b/scrapegraphai/models/vertex.py deleted file mode 100644 index eb4676fc..00000000 --- a/scrapegraphai/models/vertex.py +++ /dev/null @@ -1,16 +0,0 @@ -""" -VertexAI Module -""" -from langchain_google_vertexai import ChatVertexAI - -class VertexAI(ChatVertexAI): - """ - A wrapper for the ChatVertexAI class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index d1b59500..26fc44c4 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -86,7 +86,8 @@ def update_config(self, params: dict, overwrite: bool = False): Args: param (dict): The dictionary to update node_config with. - overwrite (bool): Flag indicating if the values of node_config should be overwritten if their value is not None. + overwrite (bool): Flag indicating if the values of node_config + should be overwritten if their value is not None. """ for key, val in params.items(): if hasattr(self, key) and not overwrite: @@ -133,7 +134,8 @@ def _validate_input_keys(self, input_keys): def _parse_input_keys(self, state: dict, expression: str) -> List[str]: """ - Parses the input keys expression to extract relevant keys from the state based on logical conditions. + Parses the input keys expression to extract + relevant keys from the state based on logical conditions. The expression can contain AND (&), OR (|), and parentheses to group conditions. Args: diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 5f55b6e6..aa9496eb 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -4,17 +4,17 @@ import json from typing import List, Optional - +from langchain_openai import ChatOpenAI import pandas as pd import requests from langchain_community.document_loaders import PyPDFLoader from langchain_core.documents import Document from ..utils.cleanup_html import cleanup_html from ..docloaders import ChromiumLoader +from ..docloaders.browser_base import browser_base_fetch from ..utils.convert_to_md import convert_to_md from ..utils.logging import get_logger from .base_node import BaseNode -from ..models import OpenAI class FetchNode(BaseNode): @@ -75,6 +75,8 @@ def __init__( False if node_config is None else node_config.get("cut", True) ) + self.browser_base = node_config.get("browser_base") + def execute(self, state): """ Executes the node's logic to fetch HTML content from a specified URL and @@ -131,7 +133,7 @@ def execute(self, state): state.update({self.output[0]: compressed_document}) return state elif input_keys[0] == "json": - f = open(source) + f = open(source, encoding="utf-8") compressed_document = [ Document(page_content=str(json.load(f)), metadata={"source": "json"}) ] @@ -163,7 +165,10 @@ def execute(self, state): if not source.strip(): raise ValueError("No HTML body content found in the local source.") - if (not self.script_creator) or (self.force and not self.script_creator): + parsed_content = source + + if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator: + parsed_content = convert_to_md(source) else: parsed_content = source @@ -178,14 +183,14 @@ def execute(self, 
diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py
index d1b59500..26fc44c4 100644
--- a/scrapegraphai/nodes/base_node.py
+++ b/scrapegraphai/nodes/base_node.py
@@ -86,7 +86,8 @@ def update_config(self, params: dict, overwrite: bool = False):
 
         Args:
             param (dict): The dictionary to update node_config with.
-            overwrite (bool): Flag indicating if the values of node_config should be overwritten if their value is not None.
+            overwrite (bool): Flag indicating if the values of node_config
+            should be overwritten if their value is not None.
         """
         for key, val in params.items():
             if hasattr(self, key) and not overwrite:
@@ -133,7 +134,8 @@ def _validate_input_keys(self, input_keys):
 
     def _parse_input_keys(self, state: dict, expression: str) -> List[str]:
         """
-        Parses the input keys expression to extract relevant keys from the state based on logical conditions.
+        Parses the input keys expression to extract
+        relevant keys from the state based on logical conditions.
         The expression can contain AND (&), OR (|), and parentheses to group conditions.
 
         Args:
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 5f55b6e6..aa9496eb 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -4,17 +4,17 @@
 
 import json
 from typing import List, Optional
-
+from langchain_openai import ChatOpenAI
 import pandas as pd
 import requests
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_core.documents import Document
 from ..utils.cleanup_html import cleanup_html
 from ..docloaders import ChromiumLoader
+from ..docloaders.browser_base import browser_base_fetch
 from ..utils.convert_to_md import convert_to_md
 from ..utils.logging import get_logger
 from .base_node import BaseNode
-from ..models import OpenAI
 
 
 class FetchNode(BaseNode):
@@ -75,6 +75,8 @@ def __init__(
             False if node_config is None else node_config.get("cut", True)
         )
 
+        self.browser_base = None if node_config is None else node_config.get("browser_base")
+
     def execute(self, state):
         """
         Executes the node's logic to fetch HTML content from a specified URL and
@@ -131,7 +133,7 @@ def execute(self, state):
             state.update({self.output[0]: compressed_document})
             return state
         elif input_keys[0] == "json":
-            f = open(source)
+            f = open(source, encoding="utf-8")
             compressed_document = [
                 Document(page_content=str(json.load(f)), metadata={"source": "json"})
             ]
@@ -163,7 +165,10 @@ def execute(self, state):
             if not source.strip():
                 raise ValueError("No HTML body content found in the local source.")
 
-            if (not self.script_creator) or (self.force and not self.script_creator):
+            parsed_content = source
+
+            if (isinstance(self.llm_model, ChatOpenAI) and not self.script_creator) or (self.force and not self.script_creator):
+                parsed_content = convert_to_md(source)
             else:
                 parsed_content = source
 
@@ -178,14 +183,14 @@ def execute(self, state):
             if response.status_code == 200:
                 if not response.text.strip():
                     raise ValueError("No HTML body content found in the response.")
-
-                parsed_content = response
-
+
+                parsed_content = response.text
+
                 if not self.cut:
                     parsed_content = cleanup_html(response, source)
 
-                if (not self.script_creator) or (self.force and not self.script_creator):
-                    parsed_content = convert_to_md(parsed_content, source)
+                if (isinstance(self.llm_model, ChatOpenAI)
+                    and not self.script_creator) or (self.force and not self.script_creator):
+                    parsed_content = convert_to_md(parsed_content, input_data[0])
+
                 compressed_document = [Document(page_content=parsed_content)]
             else:
                 self.logger.warning(
@@ -199,16 +204,22 @@ def execute(self, state):
         if self.node_config is not None:
             loader_kwargs = self.node_config.get("loader_kwargs", {})
 
-        loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
-        document = loader.load()
+        if self.browser_base is not None:
+            data = browser_base_fetch(self.browser_base.get("api_key"),
+                                      self.browser_base.get("project_id"), [source])
+
+            document = [Document(page_content=content,
+                                 metadata={"source": source}) for content in data]
+        else:
+            loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
+            document = loader.load()
 
         if not document or not document[0].page_content.strip():
             raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
 
         parsed_content = document[0].page_content
 
-        if (not self.script_creator) or (self.force and not self.script_creator and not self.openai_md_enabled):
-
-            parsed_content = convert_to_md(document[0].page_content, source)
+        if (isinstance(self.llm_model, ChatOpenAI) and not self.script_creator) or (self.force and not self.script_creator and not self.openai_md_enabled):
+            parsed_content = convert_to_md(document[0].page_content, input_data[0])
 
         compressed_document = [
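The new browser_base branch above fetches pages through browser_base_fetch instead of ChromiumLoader whenever a "browser_base" entry is present in the node configuration. A minimal sketch of a config that would enable it, assuming the graph forwards this key into node_config (credential values are placeholders; the two keys mirror the self.browser_base.get(...) lookups above):

graph_config = {
    "llm": {
        "model": "gpt-4o-mini",  # illustrative model name
        "api_key": "sk-...",
    },
    # FetchNode reads exactly these two keys
    "browser_base": {
        "api_key": "YOUR_BROWSERBASE_API_KEY",
        "project_id": "YOUR_BROWSERBASE_PROJECT_ID",
    },
}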
node_config["llm_model"] - if isinstance(node_config["llm_model"], Ollama): + if isinstance(node_config["llm_model"], ChatOllama): self.llm_model.format="json" self.verbose = ( @@ -93,7 +93,7 @@ def execute(self, state: dict) -> dict: format_instructions = output_parser.get_format_instructions() - if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper: + if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper: template_no_chunks_prompt = template_no_chunks_md template_chunks_prompt = template_chunks_md template_merge_prompt = template_merge_md @@ -107,44 +107,42 @@ def execute(self, state: dict) -> dict: template_chunks_prompt = self.additional_info + template_chunks_prompt template_merge_prompt = self.additional_info + template_merge_prompt - chains_dict = {} + if len(doc) == 1: + prompt = PromptTemplate( + template=template_no_chunks_prompt, + input_variables=["question"], + partial_variables={"context": doc, + "format_instructions": format_instructions}) + chain = prompt | self.llm_model | output_parser + answer = chain.invoke({"question": user_prompt}) + + state.update({self.output[0]: answer}) + return state - # Use tqdm to add progress bar + chains_dict = {} for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)): - if len(doc) == 1: - prompt = PromptTemplate( - template=template_no_chunks_prompt, - input_variables=["question"], - partial_variables={"context": chunk, - "format_instructions": format_instructions}) - chain = prompt | self.llm_model | output_parser - answer = chain.invoke({"question": user_prompt}) - break prompt = PromptTemplate( - template=template_chunks_prompt, - input_variables=["question"], - partial_variables={"context": chunk, - "chunk_id": i + 1, - "format_instructions": format_instructions}) - # Dynamically name the chains based on their index + template=template_chunks, + input_variables=["question"], + partial_variables={"context": chunk, + "chunk_id": i + 1, + "format_instructions": format_instructions}) chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model | output_parser - if len(chains_dict) > 1: - # Use dictionary unpacking to pass the dynamically named chains to RunnableParallel - map_chain = RunnableParallel(**chains_dict) - # Chain - answer = map_chain.invoke({"question": user_prompt}) - # Merge the answers from the chunks - merge_prompt = PromptTemplate( + async_runner = RunnableParallel(**chains_dict) + + batch_results = async_runner.invoke({"question": user_prompt}) + + merge_prompt = PromptTemplate( template = template_merge_prompt, input_variables=["context", "question"], partial_variables={"format_instructions": format_instructions}, ) - merge_chain = merge_prompt | self.llm_model | output_parser - answer = merge_chain.invoke({"context": answer, "question": user_prompt}) - # Update the state with the generated answer + merge_chain = merge_prompt | self.llm_model | output_parser + answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) + state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index c2f2b65d..93e96f4e 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -10,7 +10,7 @@ from langchain_core.output_parsers import JsonOutputParser 
from langchain_core.runnables import RunnableParallel from tqdm import tqdm -from ..models import Ollama +from langchain_community.chat_models import ChatOllama # Imports from the library from .base_node import BaseNode from ..helpers.generate_answer_node_omni_prompts import template_no_chunk_omni, template_chunks_omni, template_merge_omni @@ -44,7 +44,7 @@ def __init__( super().__init__(node_name, "node", input, output, 3, node_config) self.llm_model = node_config["llm_model"] - if isinstance(node_config["llm_model"], Ollama): + if isinstance(node_config["llm_model"], ChatOllama): self.llm_model.format="json" self.verbose = ( @@ -113,7 +113,7 @@ def execute(self, state: dict) -> dict: chain = prompt | self.llm_model | output_parser answer = chain.invoke({"question": user_prompt}) - + state.update({self.output[0]: answer}) return state @@ -148,4 +148,4 @@ def execute(self, state: dict) -> dict: answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) state.update({self.output[0]: answer}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 7add7948..4cef7ae9 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -2,18 +2,13 @@ Module for generating the answer node """ -# Imports from standard library from typing import List, Optional - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm -from ..models import Ollama +from langchain_community.chat_models import ChatOllama from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode from ..helpers.generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf @@ -59,7 +54,7 @@ def __init__( super().__init__(node_name, "node", input, output, 2, node_config) self.llm_model = node_config["llm_model"] - if isinstance(node_config["llm_model"], Ollama): + if isinstance(node_config["llm_model"], ChatOllama): self.llm_model.format="json" self.verbose = ( diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index 393f5e90..733898bd 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -83,7 +83,6 @@ def execute(self, state: dict) -> dict: user_prompt = input_data[0] doc = input_data[1] - # schema to be used for output parsing if self.node_config.get("schema", None) is not None: output_schema = JsonOutputParser(pydantic_object=self.node_config["schema"]) else: @@ -130,7 +129,6 @@ def execute(self, state: dict) -> dict: ) map_chain = prompt | self.llm_model | StrOutputParser() - # Chain answer = map_chain.invoke({"question": user_prompt}) state.update({self.output[0]: answer}) diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index f31633c0..38c2ba15 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -1,7 +1,6 @@ """ GetProbableTagsNode Module """ - from typing import List, Optional from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate diff --git a/scrapegraphai/nodes/graph_iterator_node.py 
b/scrapegraphai/nodes/graph_iterator_node.py
index 061be77a..6ce4bdaf 100644
--- a/scrapegraphai/nodes/graph_iterator_node.py
+++ b/scrapegraphai/nodes/graph_iterator_node.py
@@ -5,13 +5,11 @@
 import asyncio
 import copy
 from typing import List, Optional
-
 from tqdm.asyncio import tqdm
-
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 
-_default_batchsize = 16
+DEFAULT_BATCHSIZE = 16
 
 
 class GraphIteratorNode(BaseNode):
@@ -51,13 +49,15 @@ def execute(self, state: dict) -> dict:
             the correct data from the state.
 
         Returns:
-            dict: The updated state with the output key containing the results of the graph instances.
+            dict: The updated state with the output key
+            containing the results of the graph instances.
 
         Raises:
-            KeyError: If the input keys are not found in the state, indicating that the
-            necessary information for running the graph instances is missing.
+            KeyError: If the input keys are not found in the state,
+            indicating that the necessary information for running
+            the graph instances is missing.
         """
-        batchsize = self.node_config.get("batchsize", _default_batchsize)
+        batchsize = self.node_config.get("batchsize", DEFAULT_BATCHSIZE)
 
         self.logger.info(
             f"--- Executing {self.node_name} Node with batchsize {batchsize} ---"
diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py
index 7e7507a9..c1a69390 100644
--- a/scrapegraphai/nodes/image_to_text_node.py
+++ b/scrapegraphai/nodes/image_to_text_node.py
@@ -3,14 +3,14 @@
 """
 
 from typing import List, Optional
-
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 
 
 class ImageToTextNode(BaseNode):
     """
-    Retrieve images from a list of URLs and return a description of the images using an image-to-text model.
+    Retrieve images from a list of URLs and return a description of
+    the images using an image-to-text model.
 
     Attributes:
         llm_model: An instance of the language model client used for image-to-text conversion.
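GraphIteratorNode runs one sub-graph per input and caps concurrency at batchsize (default DEFAULT_BATCHSIZE = 16). A minimal sketch of the semaphore pattern such a node can rely on; graph.run_async is a hypothetical entry point used here for illustration, not the library's actual API:

import asyncio

DEFAULT_BATCHSIZE = 16

async def run_all(graphs, prompt, batchsize=DEFAULT_BATCHSIZE):
    # Bound how many sub-graphs execute at the same time.
    semaphore = asyncio.Semaphore(batchsize)

    async def run_one(graph):
        async with semaphore:
            return await graph.run_async(prompt)  # hypothetical async entry point

    return await asyncio.gather(*(run_one(g) for g in graphs))

# Usage sketch:
# results = asyncio.run(run_all(graph_instances, "Extract the page titles"))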
diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index 0efd8ec8..548b7c04 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -2,18 +2,10 @@ MergeAnswersNode Module """ -# Imports from standard library from typing import List, Optional -from tqdm import tqdm - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser -from tqdm import tqdm - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode diff --git a/scrapegraphai/nodes/merge_generated_scripts.py b/scrapegraphai/nodes/merge_generated_scripts.py index cfda3960..8c8eaecd 100644 --- a/scrapegraphai/nodes/merge_generated_scripts.py +++ b/scrapegraphai/nodes/merge_generated_scripts.py @@ -5,15 +5,9 @@ # Imports from standard library from typing import List, Optional from tqdm import tqdm - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser, StrOutputParser -from tqdm import tqdm - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index b5418717..d1bb87bd 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -75,23 +75,23 @@ def execute(self, state: dict) -> dict: chunks = chunk(text=docs_transformed.page_content, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x), + token_counter= lambda x: len(x), memoize=False) else: docs_transformed = docs_transformed[0] - if type(docs_transformed) == Document: + if isinstance(docs_transformed, Document): chunks = chunk(text=docs_transformed.page_content, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x), + token_counter= lambda x: len(x), memoize=False) else: - + chunks = chunk(text=docs_transformed, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x), + token_counter= lambda x: len(x), memoize=False) - + state.update({self.output[0]: chunks}) return state diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index a4f58191..952daa6c 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -14,8 +14,20 @@ from langchain_community.document_transformers import EmbeddingsRedundantFilter from langchain_community.vectorstores import FAISS +from langchain_community.chat_models import ChatOllama +from langchain_aws import BedrockEmbeddings, ChatBedrock +from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings +from langchain_community.embeddings import OllamaEmbeddings +from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI +from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings +from langchain_fireworks import FireworksEmbeddings, ChatFireworks +from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI +from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA + from ..utils.logging import get_logger from .base_node import BaseNode +from ..helpers import models_tokens +from ..models import DeepSeek class RAGNode(BaseNode): @@ -95,10 +107,21 @@ def execute(self, state: dict) -> dict: self.logger.info("--- (updated chunks metadata) ---") # check if 
embedder_model is provided, if not use llm_model
-        self.embedder_model = (
-            self.embedder_model if self.embedder_model else self.llm_model
-        )
-        embeddings = self.embedder_model
+        if self.embedder_model is not None:
+            embeddings = self.embedder_model
+        elif "embedder_config" in self.node_config:
+            try:
+                embeddings = self._create_embedder(self.node_config["embedder_config"])
+            except Exception:
+                try:
+                    embeddings = self._create_default_embedder()
+                    self.embedder_model = embeddings
+                except ValueError:
+                    embeddings = self.llm_model
+                    self.embedder_model = self.llm_model
+        else:
+            embeddings = self.llm_model
+            self.embedder_model = self.llm_model
 
         folder_name = self.node_config.get("cache_path", "cache")
 
@@ -141,3 +164,116 @@ def execute(self, state: dict) -> dict:
 
         state.update({self.output[0]: compressed_docs})
         return state
+
+
+    def _create_default_embedder(self, llm_config=None) -> object:
+        """
+        Create an embedding model instance based on the chosen llm model.
+
+        Returns:
+            object: An instance of the embedding model client.
+
+        Raises:
+            ValueError: If the model is not supported.
+        """
+        if isinstance(self.llm_model, ChatGoogleGenerativeAI):
+            return GoogleGenerativeAIEmbeddings(
+                google_api_key=llm_config["api_key"], model="models/embedding-001"
+            )
+        if isinstance(self.llm_model, ChatOpenAI):
+            return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key,
+                                    base_url=self.llm_model.openai_api_base)
+        elif isinstance(self.llm_model, DeepSeek):
+            return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
+        elif isinstance(self.llm_model, ChatVertexAI):
+            return VertexAIEmbeddings()
+        elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
+            return self.llm_model
+        elif isinstance(self.llm_model, AzureChatOpenAI):
+            return AzureOpenAIEmbeddings()
+        elif isinstance(self.llm_model, ChatFireworks):
+            return FireworksEmbeddings(model=self.llm_model.model_name)
+        elif isinstance(self.llm_model, ChatNVIDIA):
+            return NVIDIAEmbeddings(model=self.llm_model.model_name)
+        elif isinstance(self.llm_model, ChatOllama):
+            # unwrap the kwargs from the model, which is a dict
+            params = self.llm_model._lc_kwargs
+            # remove streaming and temperature
+            params.pop("streaming", None)
+            params.pop("temperature", None)
+
+            return OllamaEmbeddings(**params)
+        elif isinstance(self.llm_model, ChatHuggingFace):
+            return HuggingFaceEmbeddings(model=self.llm_model.model)
+        elif isinstance(self.llm_model, ChatBedrock):
+            return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id)
+        else:
+            raise ValueError("Embedding Model missing or not supported")
+
+
+    def _create_embedder(self, embedder_config: dict) -> object:
+        """
+        Create an embedding model instance based on the configuration provided.
+
+        Args:
+            embedder_config (dict): Configuration parameters for the embedding model.
+
+        Returns:
+            object: An instance of the embedding model client.
+
+        Raises:
+            KeyError: If the model is not supported.
+ """ + embedder_params = {**embedder_config} + if "model_instance" in embedder_config: + return embedder_params["model_instance"] + # Instantiate the embedding model based on the model name + if "openai" in embedder_params["model"]: + return OpenAIEmbeddings(api_key=embedder_params["api_key"]) + if "azure" in embedder_params["model"]: + return AzureOpenAIEmbeddings() + if "nvidia" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["nvidia"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return NVIDIAEmbeddings(model=embedder_params["model"], + nvidia_api_key=embedder_params["api_key"]) + if "ollama" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["ollama"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return OllamaEmbeddings(**embedder_params) + if "hugging_face" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["hugging_face"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return HuggingFaceEmbeddings(model=embedder_params["model"]) + if "fireworks" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["fireworks"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return FireworksEmbeddings(model=embedder_params["model"]) + if "gemini" in embedder_params["model"]: + try: + models_tokens["gemini"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return GoogleGenerativeAIEmbeddings(model=embedder_params["model"]) + if "bedrock" in embedder_params["model"]: + embedder_params["model"] = embedder_params["model"].split("/")[-1] + client = embedder_params.get("client", None) + try: + models_tokens["bedrock"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) + + raise ValueError("Model provided by the configuration not supported") diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 66231600..7fa2fe6b 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -4,15 +4,9 @@ from typing import List, Optional from urllib.parse import urlparse - from langchain_community.document_loaders import AsyncChromiumLoader from langchain.prompts import PromptTemplate from langchain.output_parsers import CommaSeparatedListOutputParser - -from langchain.output_parsers import CommaSeparatedListOutputParser -from langchain.prompts import PromptTemplate -from langchain_community.document_loaders import AsyncChromiumLoader - from ..helpers import robots_dictionary from ..utils.logging import get_logger from .base_node import BaseNode @@ -146,4 +140,4 @@ def execute(self, state: dict) -> dict: self.logger.warning("\033[32m(Scraping this website is allowed)\033[0m") state.update({self.output[0]: is_scrapable}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 97fed67b..61b11995 100644 --- 
a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -1,17 +1,14 @@ """ SearchInternetNode Module """ - from typing import List, Optional - from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate - +from langchain_community.chat_models import ChatOllama from ..utils.logging import get_logger from ..utils.research_web import search_on_web from .base_node import BaseNode - class SearchInternetNode(BaseNode): """ A node that generates a search query based on the user's input and searches the internet @@ -94,7 +91,14 @@ def execute(self, state: dict) -> dict: # Execute the chain to get the search query search_answer = search_prompt | self.llm_model | output_parser - search_query = search_answer.invoke({"user_prompt": user_prompt})[0] + + # Ollama: Use no json format when creating the search query + if isinstance(self.llm_model, ChatOllama) and self.llm_model.format == 'json': + self.llm_model.format = None + search_query = search_answer.invoke({"user_prompt": user_prompt})[0] + self.llm_model.format = 'json' + else: + search_query = search_answer.invoke({"user_prompt": user_prompt})[0] self.logger.info(f"Search Query: {search_query}") diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index b3d289d9..6fbe51dd 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -2,19 +2,13 @@ SearchLinkNode Module """ -# Imports from standard library from typing import List, Optional import re from tqdm import tqdm - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode diff --git a/scrapegraphai/nodes/search_node_with_context.py b/scrapegraphai/nodes/search_node_with_context.py index 62de184a..678e44ae 100644 --- a/scrapegraphai/nodes/search_node_with_context.py +++ b/scrapegraphai/nodes/search_node_with_context.py @@ -67,7 +67,6 @@ def execute(self, state: dict) -> dict: # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - user_prompt = input_data[0] doc = input_data[1] output_parser = CommaSeparatedListOutputParser() diff --git a/scrapegraphai/nodes/text_to_speech_node.py b/scrapegraphai/nodes/text_to_speech_node.py index 59e3fb8b..e8e43cb5 100644 --- a/scrapegraphai/nodes/text_to_speech_node.py +++ b/scrapegraphai/nodes/text_to_speech_node.py @@ -1,13 +1,10 @@ """ TextToSpeechNode Module """ - from typing import List, Optional - from ..utils.logging import get_logger from .base_node import BaseNode - class TextToSpeechNode(BaseNode): """ Converts text to speech using the specified text-to-speech model. diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py index ece41c60..4c22d35b 100644 --- a/scrapegraphai/utils/convert_to_md.py +++ b/scrapegraphai/utils/convert_to_md.py @@ -1,8 +1,8 @@ """ convert_to_md modul """ -import html2text from urllib.parse import urlparse +import html2text def convert_to_md(html: str, url: str = None) -> str: """ Convert HTML to Markdown. 
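Since convert_to_md now drives the Markdown path for ChatOpenAI-backed graphs, a quick usage sketch (the input HTML and URL are illustrative; the printed output is an approximate html2text rendering):

from scrapegraphai.utils.convert_to_md import convert_to_md

html = "<html><body><h1>Title</h1><p>Some <b>bold</b> text.</p></body></html>"
# The optional url argument gives the converter the page's domain,
# presumably so relative links can be resolved (see the urlparse import above).
markdown = convert_to_md(html, url="https://example.com/page")
print(markdown)  # e.g. "# Title\n\nSome **bold** text."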
diff --git a/scrapegraphai/utils/logging.py b/scrapegraphai/utils/logging.py
index 2684d0b1..afb63c52 100644
--- a/scrapegraphai/utils/logging.py
+++ b/scrapegraphai/utils/logging.py
@@ -12,7 +12,7 @@
 
 _library_name = __name__.split(".", maxsplit=1)[0]
 
-_default_handler = None
+DEFAULT_HANDLER = None
 _default_logging_level = logging.WARNING
 
 _semaphore = threading.Lock()
@@ -23,22 +23,22 @@ def _get_library_root_logger() -> logging.Logger:
 
 
 def _set_library_root_logger() -> None:
-    global _default_handler
+    global DEFAULT_HANDLER
 
     with _semaphore:
-        if _default_handler:
+        if DEFAULT_HANDLER:
             return
 
-        _default_handler = logging.StreamHandler()  # sys.stderr as stream
+        DEFAULT_HANDLER = logging.StreamHandler()  # sys.stderr as stream
 
         # https://github.com/pyinstaller/pyinstaller/issues/7334#issuecomment-1357447176
         if sys.stderr is None:
-            sys.stderr = open(os.devnull, "w")
+            sys.stderr = open(os.devnull, "w", encoding="utf-8")
 
-        _default_handler.flush = sys.stderr.flush
+        DEFAULT_HANDLER.flush = sys.stderr.flush
 
         library_root_logger = _get_library_root_logger()
-        library_root_logger.addHandler(_default_handler)
+        library_root_logger.addHandler(DEFAULT_HANDLER)
         library_root_logger.setLevel(_default_logging_level)
         library_root_logger.propagate = False
 
@@ -86,8 +86,8 @@ def set_handler(handler: logging.Handler) -> None:
     _get_library_root_logger().addHandler(handler)
 
 
 def set_default_handler() -> None:
-    set_handler(_default_handler)
+    set_handler(DEFAULT_HANDLER)
 
 
 def unset_handler(handler: logging.Handler) -> None:
@@ -98,8 +98,8 @@ def unset_handler(handler: logging.Handler) -> None:
     _get_library_root_logger().removeHandler(handler)
 
 
 def unset_default_handler() -> None:
-    unset_handler(_default_handler)
+    unset_handler(DEFAULT_HANDLER)
 
 
 def set_propagation() -> None:
diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py
index 85712ef6..107397e9 100644
--- a/scrapegraphai/utils/parse_state_keys.py
+++ b/scrapegraphai/utils/parse_state_keys.py
@@ -13,19 +13,22 @@ def parse_expression(expression, state: dict) -> list:
         state (dict): Dictionary of state keys used to evaluate the expression.
 
     Raises:
-        ValueError: If the expression is empty, has adjacent state keys without operators, invalid operator usage,
-        unbalanced parentheses, or if no state keys match the expression.
+        ValueError: If the expression is empty, has adjacent state keys without operators,
+        invalid operator usage, unbalanced parentheses, or if no state keys match the expression.
 
     Returns:
-        list: A list of state keys that match the boolean expression, ensuring each key appears only once.
+        list: A list of state keys that match the boolean expression,
+        ensuring each key appears only once.
 
     Example:
         >>> parse_expression("user_input & (relevant_chunks | parsed_document | document)",
                 {"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None})
         ['user_input', 'relevant_chunks', 'parsed_document', 'document']
 
-    This function evaluates the expression to determine the logical inclusion of state keys based on provided boolean logic.
-    It checks for syntax errors such as unbalanced parentheses, incorrect adjacency of operators, and empty expressions.
+    This function evaluates the expression to determine the
+    logical inclusion of state keys based on provided boolean logic.
+    It checks for syntax errors such as unbalanced parentheses,
+    incorrect adjacency of operators, and empty expressions.
""" # Check for empty expression diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py index 07e04d0f..6f6019e9 100644 --- a/scrapegraphai/utils/proxy_rotation.py +++ b/scrapegraphai/utils/proxy_rotation.py @@ -6,7 +6,6 @@ import random import re from typing import List, Optional, Set, TypedDict - import requests from fp.errors import FreeProxyException from fp.fp import FreeProxy diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 101693e4..fe7902d3 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,3 +1,6 @@ +""" +Research_web module +""" import re from typing import List from langchain_community.tools import DuckDuckGoSearchResults @@ -5,13 +8,15 @@ import requests from bs4 import BeautifulSoup -def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080) -> List[str]: +def search_on_web(query: str, search_engine: str = "Google", + max_results: int = 10, port: int = 8080) -> List[str]: """ Searches the web for a given query using specified search engine options. Args: query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. + search_engine (str, optional): Specifies the search engine to use, + options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. max_results (int, optional): The maximum number of search results to return. port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. @@ -25,19 +30,19 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = >>> search_on_web("example query", search_engine="Google", max_results=5) ['http://example.com', 'http://example.org', ...] 
""" - + if search_engine.lower() == "google": res = [] for url in google_search(query, stop=max_results): res.append(url) return res - + elif search_engine.lower() == "duckduckgo": research = DuckDuckGoSearchResults(max_results=max_results) res = research.run(query) links = re.findall(r'https?://[^\s,\]]+', res) return links - + elif search_engine.lower() == "bing": headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" @@ -46,24 +51,24 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = response = requests.get(search_url, headers=headers) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - + search_results = [] for result in soup.find_all('li', class_='b_algo', limit=max_results): link = result.find('a')['href'] search_results.append(link) return search_results - + elif search_engine.lower() == "searxng": url = f"http://localhost:{port}" params = {"q": query, "format": "json"} - + # Send the GET request to the server response = requests.get(url, params=params) - + # Parse the response and limit to the specified max_results data = response.json() limited_results = data["results"][:max_results] return limited_results - + else: raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG") diff --git a/scrapegraphai/utils/sys_dynamic_import.py b/scrapegraphai/utils/sys_dynamic_import.py index 30f75d15..19b0d29a 100644 --- a/scrapegraphai/utils/sys_dynamic_import.py +++ b/scrapegraphai/utils/sys_dynamic_import.py @@ -5,7 +5,7 @@ import sys import typing - +import importlib.util # noqa: F401 if typing.TYPE_CHECKING: import types @@ -24,9 +24,6 @@ def srcfile_import(modpath: str, modname: str) -> "types.ModuleType": Raises: ImportError: If the module cannot be imported from the srcfile """ - import importlib.util # noqa: F401 - - # spec = importlib.util.spec_from_file_location(modname, modpath) if spec is None: diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py index 5b23fdf4..c5263efe 100644 --- a/scrapegraphai/utils/token_calculator.py +++ b/scrapegraphai/utils/token_calculator.py @@ -22,7 +22,8 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str] >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING") ["This is a sample text", "for truncation."] - This function ensures that each chunk of text can be tokenized by the specified model without exceeding the model's token limit. + This function ensures that each chunk of text can be tokenized + by the specified model without exceeding the model's token limit. 
""" encoding = tiktoken.get_encoding(encoding_name) diff --git a/tests/nodes/robot_node_test.py b/tests/nodes/robot_node_test.py index 00a45b05..62527dda 100644 --- a/tests/nodes/robot_node_test.py +++ b/tests/nodes/robot_node_test.py @@ -1,7 +1,6 @@ import pytest from unittest.mock import MagicMock - -from scrapegraphai.models import Ollama +from langchain_community.chat_models import ChatOllama from scrapegraphai.nodes import RobotsNode @pytest.fixture diff --git a/tests/nodes/search_internet_node_test.py b/tests/nodes/search_internet_node_test.py index db2cbdee..8e198448 100644 --- a/tests/nodes/search_internet_node_test.py +++ b/tests/nodes/search_internet_node_test.py @@ -1,5 +1,5 @@ import unittest -from scrapegraphai.models import Ollama +from langchain_community.chat_models import ChatOllama from scrapegraphai.nodes import SearchInternetNode class TestSearchInternetNode(unittest.TestCase): @@ -18,7 +18,7 @@ def setUp(self): } # Define the model - self.llm_model = Ollama(self.graph_config["llm"]) + self.llm_model = ChatOllama(self.graph_config["llm"]) # Initialize the SearchInternetNode self.search_node = SearchInternetNode( diff --git a/tests/nodes/search_link_node_test.py b/tests/nodes/search_link_node_test.py index 648db4ee..1f8c5a58 100644 --- a/tests/nodes/search_link_node_test.py +++ b/tests/nodes/search_link_node_test.py @@ -1,5 +1,5 @@ import pytest -from scrapegraphai.models import Ollama +from langchain_community.chat_models import ChatOllama from scrapegraphai.nodes import SearchLinkNode from unittest.mock import patch, MagicMock @@ -18,7 +18,7 @@ def setup(): } # Instantiate the LLM model with the configuration - llm_model = Ollama(graph_config["llm"]) + llm_model = ChatOllama(graph_config["llm"]) # Define the SearchLinkNode with necessary configurations search_link_node = SearchLinkNode(