Fix the maximum context length issue by chunking #3222

Merged · 20 commits · May 1, 2023
Showing changes from 7 commits
15 changes: 15 additions & 0 deletions autogpt/config/config.py
@@ -33,6 +33,9 @@
         self.smart_llm_model = os.getenv("SMART_LLM_MODEL", "gpt-4")
         self.fast_token_limit = int(os.getenv("FAST_TOKEN_LIMIT", 4000))
         self.smart_token_limit = int(os.getenv("SMART_TOKEN_LIMIT", 8000))
+        self.embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-ada-002")
+        self.embedding_token_limit = int(os.getenv("EMBEDDING_TOKEN_LIMIT", 8191))
+        self.embedding_encoding = os.getenv("EMBEDDING_ENCODING", "cl100k_base")
         self.browse_chunk_max_length = int(os.getenv("BROWSE_CHUNK_MAX_LENGTH", 3000))
         self.browse_spacy_language_model = os.getenv(
             "BROWSE_SPACY_LANGUAGE_MODEL", "en_core_web_sm"
@@ -214,6 +217,18 @@
"""Set the smart token limit value."""
self.smart_token_limit = value

def set_embedding_model(self, value: str) -> None:
"""Set the embedding model value."""
self.embedding_model = value

Check warning on line 222 in autogpt/config/config.py

View check run for this annotation

Codecov / codecov/patch

autogpt/config/config.py#L222

Added line #L222 was not covered by tests

def set_embedding_token_limit(self, value: int) -> None:
"""Set the embedding token limit value."""
self.embedding_token_limit = value

Check warning on line 226 in autogpt/config/config.py

View check run for this annotation

Codecov / codecov/patch

autogpt/config/config.py#L226

Added line #L226 was not covered by tests

def set_embedding_encoding(self, value: str) -> None:
"""Set the embedding encoding value."""
self.embedding_encoding = value

Check warning on line 230 in autogpt/config/config.py

View check run for this annotation

Codecov / codecov/patch

autogpt/config/config.py#L230

Added line #L230 was not covered by tests

def set_browse_chunk_max_length(self, value: int) -> None:
"""Set the browse_website command chunk max length value."""
self.browse_chunk_max_length = value
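The three new settings are read from environment variables with the defaults shown above. A hypothetical .env snippet for overriding them (variable names and defaults are taken straight from the diff; 8191 tokens and cl100k_base are the documented input limit and encoding for text-embedding-ada-002):

# Optional overrides for the new embedding settings (defaults shown)
EMBEDDING_MODEL=text-embedding-ada-002
EMBEDDING_TOKEN_LIMIT=8191
EMBEDDING_ENCODING=cl100k_base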
63 changes: 50 additions & 13 deletions autogpt/llm_utils.py
@@ -2,9 +2,12 @@
 
 import functools
 import time
+from itertools import islice
 from typing import List, Optional
 
+import numpy as np
 import openai
+import tiktoken
 from colorama import Fore, Style
 from openai.error import APIError, RateLimitError, Timeout
 
@@ -210,6 +213,23 @@
     return resp
 
 
+def batched(iterable, n):
+    """Batch data into tuples of length n. The last batch may be shorter."""
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if n < 1:
+        raise ValueError("n must be at least one")
[Codecov / codecov/patch: added line 220 in autogpt/llm_utils.py (the raise above) is not covered by tests.]
+    it = iter(iterable)
+    while batch := tuple(islice(it, n)):
+        yield batch
+
+
+def chunked_tokens(text, encoding_name, chunk_length):
+    encoding = tiktoken.get_encoding(encoding_name)
+    tokens = encoding.encode(text)
+    chunks_iterator = batched(tokens, chunk_length)
+    yield from chunks_iterator
 
 
 def get_ada_embedding(text: str) -> List[float]:
     """Get an embedding from the ada model.
@@ -220,7 +240,7 @@
         List[float]: The embedding.
     """
     cfg = Config()
-    model = "text-embedding-ada-002"
+    model = cfg.embedding_model
     text = text.replace("\n", " ")
 
     if cfg.use_azure:
@@ -229,13 +249,7 @@
         kwargs = {"model": model}
 
     embedding = create_embedding(text, **kwargs)
-    api_manager = ApiManager()
-    api_manager.update_cost(
-        prompt_tokens=embedding.usage.prompt_tokens,
-        completion_tokens=0,
-        model=model,
-    )
-    return embedding["data"][0]["embedding"]
+    return embedding
 
 
 @retry_openai_api()
@@ -254,8 +268,31 @@
         openai.Embedding: The embedding object.
     """
     cfg = Config()
-    return openai.Embedding.create(
-        input=[text],
-        api_key=cfg.openai_api_key,
-        **kwargs,
-    )
+    chunk_embeddings = []
+    chunk_lens = []
+    for chunk in chunked_tokens(
+        text,
+        encoding_name=cfg.embedding_encoding,
+        chunk_length=cfg.embedding_token_limit,
+    ):
+        embedding = openai.Embedding.create(
+            input=[chunk],
+            api_key=cfg.openai_api_key,
+            **kwargs,
+        )
+        api_manager = ApiManager()
+        api_manager.update_cost(
+            prompt_tokens=embedding.usage.prompt_tokens,
+            completion_tokens=0,
+            model=cfg.embedding_model,
+        )
+        chunk_embeddings.append(embedding["data"][0]["embedding"])
+        chunk_lens.append(len(chunk))
+
+    # do weighted avg
+    chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
+    chunk_embeddings = chunk_embeddings / np.linalg.norm(
+        chunk_embeddings
+    )  # normalize the length to one
+    chunk_embeddings = chunk_embeddings.tolist()
+    return chunk_embeddings
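To summarize the new create_embedding flow: each token chunk is embedded separately, the per-chunk vectors are averaged with weights equal to the chunks' token counts (so longer chunks contribute proportionally more), and the result is rescaled to unit length so that downstream cosine-similarity comparisons behave the same as a single-call embedding. A standalone sketch of that combining step with toy 2-D vectors (illustrative values only, not part of the diff):

# Toy illustration of the weighted-average-and-normalize step above.
import numpy as np

chunk_embeddings = [[1.0, 0.0], [0.0, 1.0]]  # one vector per chunk (toy 2-D data)
chunk_lens = [3, 1]                          # token count of each chunk

# Weighted mean: (3 * [1, 0] + 1 * [0, 1]) / 4 = [0.75, 0.25]
avg = np.average(chunk_embeddings, axis=0, weights=chunk_lens)

# Rescale to unit length so similarity scores are unaffected by chunk count.
unit = avg / np.linalg.norm(avg)
print(unit.tolist())  # approximately [0.9487, 0.3162]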
40 changes: 31 additions & 9 deletions tests/test_llm_utils.py
@@ -1,7 +1,7 @@
 import pytest
 from openai.error import APIError, RateLimitError
 
-from autogpt.llm_utils import get_ada_embedding, retry_openai_api
+from autogpt.llm_utils import chunked_tokens, get_ada_embedding, retry_openai_api
 from autogpt.modelsinfo import COSTS
 
 
@@ -15,11 +15,8 @@ def error(request):
 
 @pytest.fixture
 def mock_create_embedding(mocker):
-    mock_response = mocker.MagicMock()
-    mock_response.usage.prompt_tokens = 5
-    mock_response.__getitem__.side_effect = lambda key: [{"embedding": [0.1, 0.2, 0.3]}]
     return mocker.patch(
-        "autogpt.llm_utils.create_embedding", return_value=mock_response
+        "autogpt.llm_utils.create_embedding", return_value=[0.1, 0.2, 0.3]
     )
@@ -123,7 +120,32 @@ def test_get_ada_embedding(mock_create_embedding, api_manager):
 
     assert embedding == [0.1, 0.2, 0.3]
 
-    cost = COSTS[model]["prompt"]
-    assert api_manager.get_total_prompt_tokens() == 5
-    assert api_manager.get_total_completion_tokens() == 0
-    assert api_manager.get_total_cost() == (5 * cost) / 1000
+
+def test_chunked_tokens():
+    text = "Auto-GPT is an experimental open-source application showcasing the capabilities of the GPT-4 language model"
+    expected_output = [
+        (
+            13556,
+            12279,
+            2898,
+            374,
+            459,
+            22772,
+            1825,
+            31874,
+            3851,
+            67908,
+            279,
+            17357,
+            315,
+            279,
+            480,
+            2898,
+            12,
+            19,
+            4221,
+            1646,
+        )
+    ]
+    output = list(chunked_tokens(text, "cl100k_base", 8191))
+    assert output == expected_output
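The expected output above is a single 20-token chunk, since the sample text is far below the 8191-token limit. A possible companion test, not part of this PR, sketching the multi-chunk path (assumes tiktoken is importable in the test environment):

import tiktoken

def test_chunked_tokens_splits_long_input():
    # Hypothetical extra test: with a small chunk_length, the text is split
    # into several chunks whose concatenation equals the full token sequence.
    text = "Auto-GPT is an experimental open-source application"
    chunks = list(chunked_tokens(text, "cl100k_base", 4))
    assert all(len(chunk) <= 4 for chunk in chunks)
    flat = [token for chunk in chunks for token in chunk]
    assert flat == tiktoken.get_encoding("cl100k_base").encode(text)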