From 7599234ab9563ca4ee9b7f5b2d0267daac621ecf Mon Sep 17 00:00:00 2001
From: Shubham Kamboj <shubham.kamboj@shorthills.ai>
Date: Sat, 4 May 2024 21:42:57 +0530
Subject: [PATCH 1/2] feat: Enable end users to pass model instances of
 HuggingFaceHub

---
 .../smart_scraper_huggingfacehub.py           | 63 +++++++++++++++++++
 scrapegraphai/graphs/abstract_graph.py        |  7 +++
 scrapegraphai/helpers/models_tokens.py        |  3 +
 scrapegraphai/nodes/rag_node.py               |  4 ++
 4 files changed, 77 insertions(+)
 create mode 100644 examples/huggingfacehub/smart_scraper_huggingfacehub.py

diff --git a/examples/huggingfacehub/smart_scraper_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_huggingfacehub.py
new file mode 100644
index 00000000..082ce59c
--- /dev/null
+++ b/examples/huggingfacehub/smart_scraper_huggingfacehub.py
@@ -0,0 +1,63 @@
+""" 
+Basic example of a scraping pipeline using SmartScraperGraph with HuggingFace Hub model instances
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+
+
+
+
+# Required environment variable in .env:
+#   HUGGINGFACEHUB_API_TOKEN
+load_dotenv()
+
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+# ************************************************
+# Initialize the model instances
+# ************************************************
+
+repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+
+llm_model_instance = HuggingFaceEndpoint(
+    repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
+)
+
+
+
+
+embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
+)
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, event_end_date, event_end_time, location, event_mode, event_category, third_party_redirect, no_of_days, time_in_hours, hosted_or_attending, refreshments_type,  registration_available, registration_link",
+    # also accepts a string with the already downloaded HTML code
+    source="https://www.hmhco.com/event",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
index b8a9efe9..a6e84789 100644
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@@ -64,6 +64,13 @@ def _set_model_token(self, llm):
                 self.model_token = models_tokens["azure"][llm.model_name]
             except KeyError:
                 raise KeyError("Model not supported")
+            
+        elif 'HuggingFaceEndpoint' in str(type(llm)):
+            if 'mistral' in llm.repo_id:
+                try:
+                    self.model_token = models_tokens['mistral'][llm.repo_id]
+                except KeyError:
+                    raise KeyError("Model not supported")
 
 
     def _create_llm(self, llm_config: dict, chat=False) -> object:
diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
index a9bab3fc..5bc9a7f8 100644
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@@ -65,5 +65,8 @@
         "mistral.mistral-large-2402-v1:0": 32768,
         "cohere.embed-english-v3": 512,
         "cohere.embed-multilingual-v3": 512
+    },
+    "mistral": {
+        "mistralai/Mistral-7B-Instruct-v0.2": 32000
     }
 }
diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py
index 92e7011f..50b43b6a 100644
--- a/scrapegraphai/nodes/rag_node.py
+++ b/scrapegraphai/nodes/rag_node.py
@@ -12,6 +12,7 @@
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import OllamaEmbeddings
 from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings
+from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
 
 from ..models import OpenAI, Ollama, AzureOpenAI, HuggingFace, Bedrock
 from .base_node import BaseNode
@@ -95,6 +96,9 @@ def execute(self, state: dict) -> dict:
                 api_key=embedding_model.openai_api_key)
         elif isinstance(embedding_model, AzureOpenAIEmbeddings):
             embeddings = embedding_model
+        elif isinstance(embedding_model, HuggingFaceInferenceAPIEmbeddings):
+            embeddings = embedding_model
+
         elif isinstance(embedding_model, AzureOpenAI):
             embeddings = AzureOpenAIEmbeddings()
         elif isinstance(embedding_model, Ollama):

From fd59f282a8031aeec43da4dbdc1cb551eb7b1cde Mon Sep 17 00:00:00 2001
From: Shubham Kamboj <shubham.kamboj@shorthills.ai>
Date: Sat, 4 May 2024 22:18:54 +0530
Subject: [PATCH 2/2] fix: honor embedder model_instance; drop embedding dispatch from RAGNode

---
 scrapegraphai/graphs/abstract_graph.py |  4 +++-
 scrapegraphai/nodes/rag_node.py        | 31 +-------------------------
 2 files changed, 4 insertions(+), 31 deletions(-)

diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
index aa4a0cbe..83b5b712 100644
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@@ -188,7 +188,6 @@ def _create_default_embedder(self) -> object:
         Raises:
             ValueError: If the model is not supported.
         """
-
         if isinstance(self.llm_model, OpenAI):
             return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
         elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
@@ -223,6 +222,9 @@ def _create_embedder(self, embedder_config: dict) -> object:
         Raises:
             KeyError: If the model is not supported.
         """
+
+        if 'model_instance' in embedder_config:
+            return embedder_config['model_instance']
         
         # Instantiate the embedding model based on the model name
         if "openai" in embedder_config["model"]:
diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py
index 86de7d7b..b883845a 100644
--- a/scrapegraphai/nodes/rag_node.py
+++ b/scrapegraphai/nodes/rag_node.py
@@ -8,9 +8,6 @@
 from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline
 from langchain_community.document_transformers import EmbeddingsRedundantFilter
 from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import OllamaEmbeddings
-from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings
-from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
 
 from .base_node import BaseNode
 
@@ -86,33 +83,7 @@ def execute(self, state: dict) -> dict:
             print("--- (updated chunks metadata) ---")
 
         # check if embedder_model is provided, if not use llm_model
-        embedding_model = self.embedder_model if self.embedder_model else self.llm_model
-
-        if isinstance(embedding_model, OpenAI):
-            embeddings = OpenAIEmbeddings(
-                api_key=embedding_model.openai_api_key)
-        elif isinstance(embedding_model, AzureOpenAIEmbeddings):
-            embeddings = embedding_model
-        elif isinstance(embedding_model, HuggingFaceInferenceAPIEmbeddings):
-            embeddings = embedding_model
-
-        elif isinstance(embedding_model, AzureOpenAI):
-            embeddings = AzureOpenAIEmbeddings()
-        elif isinstance(embedding_model, Ollama):
-            # unwrap the kwargs from the model whihc is a dict
-            params = embedding_model._lc_kwargs
-            # remove streaming and temperature
-            params.pop("streaming", None)
-            params.pop("temperature", None)
-
-            embeddings = OllamaEmbeddings(**params)
-        elif isinstance(embedding_model, HuggingFace):
-            embeddings = HuggingFaceHubEmbeddings(model=embedding_model.model)
-        elif isinstance(embedding_model, Bedrock):
-            embeddings = BedrockEmbeddings(
-                client=None, model_id=embedding_model.model_id)
-        else:
-            raise ValueError("Embedding Model missing or not supported")
+        self.embedder_model = self.embedder_model if self.embedder_model else self.llm_model
         embeddings = self.embedder_model
 
         retriever = FAISS.from_documents(