From 7599234ab9563ca4ee9b7f5b2d0267daac621ecf Mon Sep 17 00:00:00 2001 From: Shubham Kamboj Date: Sat, 4 May 2024 21:42:57 +0530 Subject: [PATCH 1/2] feat: Enable end users to pass model instances of HuggingFaceHub --- .../smart_scraper_huggingfacehub.py | 63 +++++++++++++++++++ scrapegraphai/graphs/abstract_graph.py | 7 +++ scrapegraphai/helpers/models_tokens.py | 3 + scrapegraphai/nodes/rag_node.py | 4 ++ 4 files changed, 77 insertions(+) create mode 100644 examples/huggingfacehub/smart_scraper_huggingfacehub.py diff --git a/examples/huggingfacehub/smart_scraper_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_huggingfacehub.py new file mode 100644 index 00000000..082ce59c --- /dev/null +++ b/examples/huggingfacehub/smart_scraper_huggingfacehub.py @@ -0,0 +1,63 @@ +""" +Basic example of a scraping pipeline using SmartScraperGraph with HuggingFace Hub model instances +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings + + + + +## required environment variable in .env +#HUGGINGFACEHUB_API_TOKEN +load_dotenv() + +HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') +# ************************************************ +# Initialize the model instances +# ************************************************ + +repo_id = "mistralai/Mistral-7B-Instruct-v0.2" + +llm_model_instance = HuggingFaceEndpoint( + repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN +) + + + + +embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" +) + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +graph_config = { + "llm": 
{"model_instance": llm_model_instance}, + "embeddings": {"model_instance": embedder_model_instance} +} + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, event_end_date, event_end_time, location, event_mode, event_category, third_party_redirect, no_of_days, time_in_hours, hosted_or_attending, refreshments_type, registration_available, registration_link", + # also accepts a string with the already downloaded HTML code + source="https://www.hmhco.com/event", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + + diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index b8a9efe9..a6e84789 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -64,6 +64,13 @@ def _set_model_token(self, llm): self.model_token = models_tokens["azure"][llm.model_name] except KeyError: raise KeyError("Model not supported") + + elif 'HuggingFaceEndpoint' in str(type(llm)): + if 'mistral' in llm.repo_id: + try: + self.model_token = models_tokens['mistral'][llm.repo_id] + except KeyError: + raise KeyError("Model not supported") def _create_llm(self, llm_config: dict, chat=False) -> object: diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index a9bab3fc..5bc9a7f8 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -65,5 +65,8 @@ "mistral.mistral-large-2402-v1:0": 32768, "cohere.embed-english-v3": 512, "cohere.embed-multilingual-v3": 512 + }, + "mistral": { + "mistralai/Mistral-7B-Instruct-v0.2": 32000 } } diff --git 
a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 92e7011f..50b43b6a 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -12,6 +12,7 @@ from langchain_community.vectorstores import FAISS from langchain_community.embeddings import OllamaEmbeddings from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings +from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings from ..models import OpenAI, Ollama, AzureOpenAI, HuggingFace, Bedrock from .base_node import BaseNode @@ -95,6 +96,9 @@ def execute(self, state: dict) -> dict: api_key=embedding_model.openai_api_key) elif isinstance(embedding_model, AzureOpenAIEmbeddings): embeddings = embedding_model + elif isinstance(embedding_model, HuggingFaceInferenceAPIEmbeddings): + embeddings = embedding_model + elif isinstance(embedding_model, AzureOpenAI): embeddings = AzureOpenAIEmbeddings() elif isinstance(embedding_model, Ollama): From fd59f282a8031aeec43da4dbdc1cb551eb7b1cde Mon Sep 17 00:00:00 2001 From: Shubham Kamboj Date: Sat, 4 May 2024 22:18:54 +0530 Subject: [PATCH 2/2] Bug fixes --- scrapegraphai/graphs/abstract_graph.py | 4 +++- scrapegraphai/nodes/rag_node.py | 31 +------------------------- 2 files changed, 4 insertions(+), 31 deletions(-) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index aa4a0cbe..83b5b712 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -188,7 +188,6 @@ def _create_default_embedder(self) -> object: Raises: ValueError: If the model is not supported. """ - if isinstance(self.llm_model, OpenAI): return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) elif isinstance(self.llm_model, AzureOpenAIEmbeddings): @@ -223,6 +222,9 @@ def _create_embedder(self, embedder_config: dict) -> object: Raises: KeyError: If the model is not supported. 
""" + + if 'model_instance' in embedder_config: + return embedder_config['model_instance'] # Instantiate the embedding model based on the model name if "openai" in embedder_config["model"]: diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 86de7d7b..b883845a 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -8,9 +8,6 @@ from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline from langchain_community.document_transformers import EmbeddingsRedundantFilter from langchain_community.vectorstores import FAISS -from langchain_community.embeddings import OllamaEmbeddings -from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings -from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings from .base_node import BaseNode @@ -86,33 +83,7 @@ def execute(self, state: dict) -> dict: print("--- (updated chunks metadata) ---") # check if embedder_model is provided, if not use llm_model - embedding_model = self.embedder_model if self.embedder_model else self.llm_model - - if isinstance(embedding_model, OpenAI): - embeddings = OpenAIEmbeddings( - api_key=embedding_model.openai_api_key) - elif isinstance(embedding_model, AzureOpenAIEmbeddings): - embeddings = embedding_model - elif isinstance(embedding_model, HuggingFaceInferenceAPIEmbeddings): - embeddings = embedding_model - - elif isinstance(embedding_model, AzureOpenAI): - embeddings = AzureOpenAIEmbeddings() - elif isinstance(embedding_model, Ollama): - # unwrap the kwargs from the model whihc is a dict - params = embedding_model._lc_kwargs - # remove streaming and temperature - params.pop("streaming", None) - params.pop("temperature", None) - - embeddings = OllamaEmbeddings(**params) - elif isinstance(embedding_model, HuggingFace): - embeddings = HuggingFaceHubEmbeddings(model=embedding_model.model) - elif isinstance(embedding_model, Bedrock): - embeddings = 
BedrockEmbeddings( - client=None, model_id=embedding_model.model_id) - else: - raise ValueError("Embedding Model missing or not supported") + self.embedder_model = self.embedder_model if self.embedder_model else self.llm_model embeddings = self.embedder_model retriever = FAISS.from_documents(