Fix the maximum context length issue by chunking
Add basic unit test for the new chunked func
kinance committed Apr 26, 2023
1 parent ae31dd4 commit c744134
Showing 3 changed files with 120 additions and 6 deletions.
15 changes: 15 additions & 0 deletions autogpt/config/config.py
@@ -36,6 +36,9 @@ def __init__(self) -> None:
self.smart_llm_model = os.getenv("SMART_LLM_MODEL", "gpt-4")
self.fast_token_limit = int(os.getenv("FAST_TOKEN_LIMIT", 4000))
self.smart_token_limit = int(os.getenv("SMART_TOKEN_LIMIT", 8000))
self.embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-ada-002")
self.embedding_token_limit = int(os.getenv("EMBEDDING_TOKEN_LIMIT", 8191))
self.embedding_encoding = os.getenv("EMBEDDING_ENCODING", "cl100k_base")
self.browse_chunk_max_length = int(os.getenv("BROWSE_CHUNK_MAX_LENGTH", 3000))
self.browse_spacy_language_model = os.getenv(
"BROWSE_SPACY_LANGUAGE_MODEL", "en_core_web_sm"
@@ -217,6 +220,18 @@ def set_smart_token_limit(self, value: int) -> None:
"""Set the smart token limit value."""
self.smart_token_limit = value

def set_embedding_model(self, value: str) -> None:
"""Set the embedding model value."""
self.embedding_model = value

def set_embedding_token_limit(self, value: int) -> None:
"""Set the embedding token limit value."""
self.embedding_token_limit = value

def set_embedding_encoding(self, value: str) -> None:
"""Set the embedding encoding value."""
self.embedding_encoding = value

def set_browse_chunk_max_length(self, value: int) -> None:
"""Set the browse_website command chunk max length value."""
self.browse_chunk_max_length = value
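As a rough illustration of how the new settings are picked up (a hypothetical usage sketch, not part of this commit; the import path and variable names are assumed from the diff above), the values can be overridden through environment variables before Config is constructed:

import os

# Hypothetical overrides; the variable names mirror the os.getenv calls above
# and the values are arbitrary examples, not recommendations.
os.environ["EMBEDDING_MODEL"] = "text-embedding-ada-002"
os.environ["EMBEDDING_TOKEN_LIMIT"] = "4000"
os.environ["EMBEDDING_ENCODING"] = "cl100k_base"

from autogpt.config import Config  # import path assumed

cfg = Config()
print(cfg.embedding_model, cfg.embedding_token_limit, cfg.embedding_encoding)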
49 changes: 43 additions & 6 deletions autogpt/llm_utils.py
@@ -1,9 +1,12 @@
from __future__ import annotations

import time
from itertools import islice
from typing import List, Optional

import numpy as np
import openai
import tiktoken
from colorama import Fore, Style
from openai.error import APIError, RateLimitError, Timeout

@@ -156,9 +159,24 @@ def create_chat_completion(

def get_ada_embedding(text):
text = text.replace("\n", " ")
return api_manager.embedding_create(
text_list=[text], model="text-embedding-ada-002"
)
return api_manager.embedding_create(text_list=[text], model=CFG.embedding_model)


def batched(iterable, n):
"""Batch data into tuples of length n. The last batch may be shorter."""
# batched('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError("n must be at least one")
it = iter(iterable)
while batch := tuple(islice(it, n)):
yield batch


def chunked_tokens(text, encoding_name, chunk_length):
encoding = tiktoken.get_encoding(encoding_name)
tokens = encoding.encode(text)
chunks_iterator = batched(tokens, chunk_length)
yield from chunks_iterator


def create_embedding_with_ada(text) -> list:
@@ -167,9 +185,28 @@ def create_embedding_with_ada(text) -> list:
for attempt in range(num_retries):
backoff = 2 ** (attempt + 2)
try:
return api_manager.embedding_create(
text_list=[text], model="text-embedding-ada-002"
)
chunk_embeddings = []
chunk_lens = []
for chunk in chunked_tokens(
text,
encoding_name=CFG.embedding_encoding,
chunk_length=CFG.embedding_token_limit,
):
chunk_embeddings.append(
api_manager.embedding_create(
text_list=[chunk], model=CFG.embedding_model
)
)
chunk_lens.append(len(chunk))

# do weighted avg
chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
chunk_embeddings = chunk_embeddings / np.linalg.norm(
chunk_embeddings
) # normalize the length to one
chunk_embeddings = chunk_embeddings.tolist()
return chunk_embeddings

except RateLimitError:
pass
except (APIError, Timeout) as e:
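Read together, the new helpers implement a common recipe for embedding text longer than the model's context window: tokenize, split the tokens into chunks no larger than the embedding token limit, embed each chunk, then combine the chunk embeddings with a token-count-weighted average and renormalise. A minimal self-contained sketch of that recipe follows; embed_fn is a hypothetical stand-in for api_manager.embedding_create, and the helper names here are illustrative only:

from itertools import islice

import numpy as np
import tiktoken


def batched(iterable, n):
    """Yield tuples of length n from iterable; the last tuple may be shorter."""
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch


def embed_long_text(text, embed_fn, encoding_name="cl100k_base", chunk_length=8191):
    """Embed text of any length by embedding token chunks and averaging them."""
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text.replace("\n", " "))
    chunk_embeddings, chunk_lens = [], []
    for chunk in batched(tokens, chunk_length):
        chunk_embeddings.append(embed_fn(list(chunk)))  # one embedding call per chunk
        chunk_lens.append(len(chunk))
    # Weight each chunk's embedding by its token count, then rescale to unit length.
    avg = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
    return (avg / np.linalg.norm(avg)).tolist()

For a text that fits in a single chunk, the weighted average is just that chunk's embedding (renormalised), so short inputs behave essentially as before.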
62 changes: 62 additions & 0 deletions tests/test_llm_utils.py
@@ -0,0 +1,62 @@
# Generated by CodiumAI
import pytest

from autogpt.llm_utils import chunked_tokens

"""
Code Analysis
Objective:
The objective of the 'chunked_tokens' function is to encode a given text with a specified encoding and split the resulting tokens into chunks of a specified length, yielding the chunks as an iterator.
Inputs:
- 'text': a string representing the text to be chunked and encoded
- 'encoding_name': a string representing the name of the encoding to be used for encoding the text
- 'chunk_length': an integer representing the desired length of each chunk
Flow:
1. Get the encoding corresponding to the specified encoding name using the 'get_encoding' function from the 'tiktoken' module.
2. Encode the input text using the obtained encoding to get a list of tokens.
3. Use the 'batched' function to split the list of tokens into smaller batches of length 'chunk_length'.
4. Yield each batch of tokens as an iterator using the 'yield from' statement.
Outputs:
- An iterator yielding batches of encoded tokens, where each batch has a length of 'chunk_length' except for the last batch, which may have a shorter length.
Additional aspects:
- The 'batched' function is used as a helper function to split the list of tokens into smaller batches.
- The 'batched' function raises a 'ValueError' if the specified batch length is less than 1.
- The 'batched' function uses the 'islice' function from the 'itertools' module to slice the input iterable into batches.
"""


class TestChunkedTokens:
# Tests that text can be chunked.
def test_chunked_tokens_equal_chunks(self):
text = "Auto-GPT is an experimental open-source application showcasing the capabilities of the GPT-4 language model"
expected_output = [
(
13556,
12279,
2898,
374,
459,
22772,
1825,
31874,
3851,
67908,
279,
17357,
315,
279,
480,
2898,
12,
19,
4221,
1646,
)
]
output = list(chunked_tokens(text, "cl100k_base", 8191))
assert output == expected_output
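
For reference, the expected token IDs above can be reproduced directly with tiktoken (assuming the same tiktoken version and the cl100k_base vocabulary); a quick sketch:

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = (
    "Auto-GPT is an experimental open-source application showcasing "
    "the capabilities of the GPT-4 language model"
)
print(enc.encode(text))  # the IDs should match the single tuple in expected_output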
