Fix the maximum context length issue by chunking
Add basic unit test for the new chunked func
kinance committed Apr 26, 2023
1 parent ae31dd4 commit c744134
Showing 3 changed files with 120 additions and 6 deletions.
15 changes: 15 additions & 0 deletions autogpt/config/config.py
@@ -36,6 +36,9 @@ def __init__(self) -> None:
self.smart_llm_model = os.getenv("SMART_LLM_MODEL", "gpt-4")
self.fast_token_limit = int(os.getenv("FAST_TOKEN_LIMIT", 4000))
self.smart_token_limit = int(os.getenv("SMART_TOKEN_LIMIT", 8000))
self.embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-ada-002")
self.embedding_token_limit = int(os.getenv("EMBEDDING_TOKEN_LIMIT", 8191))
self.embedding_encoding = os.getenv("EMBEDDING_ENCODING", "cl100k_base")
self.browse_chunk_max_length = int(os.getenv("BROWSE_CHUNK_MAX_LENGTH", 3000))
self.browse_spacy_language_model = os.getenv(
"BROWSE_SPACY_LANGUAGE_MODEL", "en_core_web_sm"
@@ -217,6 +220,18 @@ def set_smart_token_limit(self, value: int) -> None:
"""Set the smart token limit value."""
self.smart_token_limit = value

def set_embedding_model(self, value: str) -> None:
"""Set the embedding model value."""
self.embedding_model = value

def set_embedding_token_limit(self, value: int) -> None:
"""Set the embedding token limit value."""
self.embedding_token_limit = value

def set_embedding_encoding(self, value: str) -> None:
"""Set the embedding encoding value."""
self.embedding_encoding = value

def set_browse_chunk_max_length(self, value: int) -> None:
"""Set the browse_website command chunk max length value."""
self.browse_chunk_max_length = value
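As a rough illustration of how the new settings are picked up (a hypothetical usage sketch, not part of this commit; the import path and variable names are assumed from the diff above), the values can be overridden through environment variables before Config is constructed:

import os

# Hypothetical overrides; the variable names mirror the os.getenv calls above
# and the values are arbitrary examples, not recommendations.
os.environ["EMBEDDING_MODEL"] = "text-embedding-ada-002"
os.environ["EMBEDDING_TOKEN_LIMIT"] = "4000"
os.environ["EMBEDDING_ENCODING"] = "cl100k_base"

from autogpt.config import Config  # import path assumed

cfg = Config()
print(cfg.embedding_model, cfg.embedding_token_limit, cfg.embedding_encoding)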
49 changes: 43 additions & 6 deletions autogpt/llm_utils.py
@@ -1,9 +1,12 @@
from __future__ import annotations

import time
from itertools import islice
from typing import List, Optional

import numpy as np
import openai
import tiktoken
from colorama import Fore, Style
from openai.error import APIError, RateLimitError, Timeout

@@ -156,9 +159,24 @@ def create_chat_completion(

def get_ada_embedding(text):
text = text.replace("\n", " ")
return api_manager.embedding_create(
text_list=[text], model="text-embedding-ada-002"
)
return api_manager.embedding_create(text_list=[text], model=CFG.embedding_model)


def batched(iterable, n):
"""Batch data into tuples of length n. The last batch may be shorter."""
# batched('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError("n must be at least one")
it = iter(iterable)
while batch := tuple(islice(it, n)):
yield batch


def chunked_tokens(text, encoding_name, chunk_length):
encoding = tiktoken.get_encoding(encoding_name)
tokens = encoding.encode(text)
chunks_iterator = batched(tokens, chunk_length)
yield from chunks_iterator


def create_embedding_with_ada(text) -> list:
@@ -167,9 +185,28 @@ def create_embedding_with_ada(text) -> list:
for attempt in range(num_retries):
backoff = 2 ** (attempt + 2)
try:
return api_manager.embedding_create(
text_list=[text], model="text-embedding-ada-002"
)
chunk_embeddings = []
chunk_lens = []
for chunk in chunked_tokens(
text,
encoding_name=CFG.embedding_encoding,
chunk_length=CFG.embedding_token_limit,
):
chunk_embeddings.append(
api_manager.embedding_create(
text_list=[chunk], model=CFG.embedding_model
)
)
chunk_lens.append(len(chunk))

# do weighted avg
chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
chunk_embeddings = chunk_embeddings / np.linalg.norm(
chunk_embeddings
) # normalize the length to one
chunk_embeddings = chunk_embeddings.tolist()
return chunk_embeddings

except RateLimitError:
pass
except (APIError, Timeout) as e:
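Read together, the new helpers implement a common recipe for embedding text longer than the model's context window: tokenize, split the tokens into chunks no larger than the embedding token limit, embed each chunk, then combine the chunk embeddings with a token-count-weighted average and renormalise. A minimal self-contained sketch of that recipe follows; embed_fn is a hypothetical stand-in for api_manager.embedding_create, and the helper names here are illustrative only:

from itertools import islice

import numpy as np
import tiktoken


def batched(iterable, n):
    """Yield tuples of length n from iterable; the last tuple may be shorter."""
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch


def embed_long_text(text, embed_fn, encoding_name="cl100k_base", chunk_length=8191):
    """Embed text of any length by embedding token chunks and averaging them."""
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text.replace("\n", " "))
    chunk_embeddings, chunk_lens = [], []
    for chunk in batched(tokens, chunk_length):
        chunk_embeddings.append(embed_fn(list(chunk)))  # one embedding call per chunk
        chunk_lens.append(len(chunk))
    # Weight each chunk's embedding by its token count, then rescale to unit length.
    avg = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
    return (avg / np.linalg.norm(avg)).tolist()

For a text that fits in a single chunk, the weighted average is just that chunk's embedding (renormalised), so short inputs behave essentially as before.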
62 changes: 62 additions & 0 deletions tests/test_llm_utils.py
@@ -0,0 +1,62 @@
# Generated by CodiumAI
import pytest

from autogpt.llm_utils import chunked_tokens

"""
Code Analysis
Objective:
The objective of the 'chunked_tokens' function is to encode a given text with a specified encoding and split the resulting tokens into chunks of a specified length, yielding the chunks as an iterator.
Inputs:
- 'text': a string representing the text to be chunked and encoded
- 'encoding_name': a string representing the name of the encoding to be used for encoding the text
- 'chunk_length': an integer representing the desired length of each chunk
Flow:
1. Get the encoding corresponding to the specified encoding name using the 'get_encoding' function from the 'tiktoken' module.
2. Encode the input text using the obtained encoding to get a list of tokens.
3. Use the 'batched' function to split the list of tokens into smaller batches of length 'chunk_length'.
4. Yield each batch of tokens as an iterator using the 'yield from' statement.
Outputs:
- An iterator yielding batches of encoded tokens, where each batch has a length of 'chunk_length' except for the last batch, which may have a shorter length.
Additional aspects:
- The 'batched' function is used as a helper function to split the list of tokens into smaller batches.
- The 'batched' function raises a 'ValueError' if the specified batch length is less than 1.
- The 'batched' function uses the 'islice' function from the 'itertools' module to slice the input iterable into batches.
"""


class TestChunkedTokens:
# Tests that text can be chunked.
def test_chunked_tokens_equal_chunks(self):
text = "Auto-GPT is an experimental open-source application showcasing the capabilities of the GPT-4 language model"
expected_output = [
(
13556,
12279,
2898,
374,
459,
22772,
1825,
31874,
3851,
67908,
279,
17357,
315,
279,
480,
2898,
12,
19,
4221,
1646,
)
]
output = list(chunked_tokens(text, "cl100k_base", 8191))
assert output == expected_output
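
For reference, the expected token IDs above can be reproduced directly with tiktoken (assuming the same tiktoken version and the cl100k_base vocabulary); a quick sketch:

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = (
    "Auto-GPT is an experimental open-source application showcasing "
    "the capabilities of the GPT-4 language model"
)
print(enc.encode(text))  # the IDs should match the single tuple in expected_output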
