Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix split_text chunking bug #2088

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 31 additions & 20 deletions autogpt/processing/text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Text processing functions"""
import textwrap
from typing import Dict, Generator, Optional

Pwuts marked this conversation as resolved.
Show resolved Hide resolved
from selenium.webdriver.remote.webdriver import WebDriver
Expand All @@ -8,37 +9,46 @@
from autogpt.memory import get_memory

CFG = Config()
MEMORY = get_memory(CFG)


def get_memory_instance():
    """Return the object that ``get_memory`` produces for the module-level ``CFG``."""
    memory = get_memory(CFG)
    return memory


def split_text(text: str, max_length: int = 8192) -> Generator[str, None, None]:
    """Split text into chunks of a maximum length.

    The text is split on newline characters and every non-blank line is
    wrapped with ``textwrap.wrap`` so that no yielded chunk is longer than
    ``max_length`` characters. Blank (whitespace-only) lines are skipped.
    Short lines are not merged together: each chunk comes from exactly one
    input line. Because ``textwrap.wrap`` is used with its default options,
    whitespace inside a line is normalised and words longer than
    ``max_length`` are broken.

    Args:
        text (str): The text to split.
        max_length (int, optional): The maximum length of each chunk.
            Defaults to 8192.

    Returns:
        Generator[str, None, None]: A generator yielding the chunks in order.

    Raises:
        ValueError: If ``max_length`` is less than or equal to 0. The check
            runs when ``split_text`` is called, not on first iteration.
    """
    # Validate here rather than inside the generator body: a generator
    # function would defer the check until the first chunk is requested,
    # hiding the error from callers that build the generator early.
    if max_length <= 0:
        raise ValueError("max_length should be greater than 0")

    return _wrap_non_blank_lines(text, max_length)


def _wrap_non_blank_lines(text: str, max_length: int) -> Generator[str, None, None]:
    """Yield the wrapped pieces of every non-blank line of ``text``."""
    if not text:
        return

    for line in text.split("\n"):
        if not line.strip():
            continue

        # A line with non-whitespace content always wraps to at least one
        # piece, so no fallback for an empty result is needed.
        yield from textwrap.wrap(line, width=max_length)


def summarize_text(
Expand All @@ -57,6 +67,7 @@ def summarize_text(
"""
if not text:
return "Error: No text to summarize"
MEMORY = get_memory_instance()
Pwuts marked this conversation as resolved.
Show resolved Hide resolved

text_length = len(text)
print(f"Text length: {text_length} characters")
Expand Down
91 changes: 91 additions & 0 deletions tests/unit/test_split_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import unittest

from autogpt.processing.text import split_text


class TestSplitText(unittest.TestCase):
    """Unit tests for ``split_text``."""

    def test_empty_string(self):
        """An empty string produces no chunks."""
        chunks = list(split_text(""))
        self.assertEqual(chunks, [])

    def test_no_split_required(self):
        """A single line passed with max_length=100 comes back as one chunk."""
        source = "This is a short text that doesn't require any splitting based on the given maximum length."
        self.assertEqual(list(split_text(source, max_length=100)), [source])

    def test_split_required(self):
        """Two long lines are each wrapped at max_length=50."""
        source = (
            "This is a longer piece of text that requires splitting based on the given maximum length.\n"
            "It contains multiple lines that will need to be divided into smaller chunks."
        )
        expected = [
            "This is a longer piece of text that requires",
            "splitting based on the given maximum length.",
            "It contains multiple lines that will need to be",
            "divided into smaller chunks.",
        ]
        chunks = list(split_text(source, max_length=50))
        self.assertEqual(chunks, expected)

    def test_long_paragraph_split(self):
        """One long line made of a repeated sentence is wrapped at max_length=50."""
        sentence = (
            "This is a very long paragraph that needs to be split into smaller chunks. "
        )
        source = sentence * 10
        chunks = list(split_text(source, max_length=50))
        expected = [
            "This is a very long paragraph that needs to be",
            "split into smaller chunks. This is a very long",
            "paragraph that needs to be split into smaller",
            "chunks. This is a very long paragraph that needs",
            "to be split into smaller chunks. This is a very",
            "long paragraph that needs to be split into smaller",
            "chunks. This is a very long paragraph that needs",
            "to be split into smaller chunks. This is a very",
            "long paragraph that needs to be split into smaller",
            "chunks. This is a very long paragraph that needs",
            "to be split into smaller chunks. This is a very",
            "long paragraph that needs to be split into smaller",
            "chunks. This is a very long paragraph that needs",
            "to be split into smaller chunks. This is a very",
            "long paragraph that needs to be split into smaller",
            "chunks.",
        ]
        self.assertEqual(chunks, expected)

    def test_split_with_whitespace(self):
        """The blank line between two lines does not produce a chunk."""
        source = (
            "This is a test\n\nwith extra whitespace and a longer text that might require splitting "
            "based on the given maximum length."
        )
        expected = [
            "This is a test",
            "with extra whitespace and a longer text that might",
            "require splitting based on the given maximum",
            "length.",
        ]
        chunks = list(split_text(source, max_length=50))
        self.assertEqual(chunks, expected)

    def test_value_error(self):
        """Consuming split_text with max_length=0 raises ValueError."""
        source = (
            "This is a very long paragraph that needs to be split into smaller chunks."
        )
        with self.assertRaises(ValueError):
            list(split_text(source, max_length=0))

    def test_max_length_larger_than_text(self):
        """A single line passed with max_length=10000 comes back as one chunk."""
        source = (
            "This is a text that has a length smaller than the provided maximum length."
        )
        chunks = list(split_text(source, max_length=10000))
        self.assertEqual(chunks, [source])


# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()