Significant-Gravitas · vzla0094 · Apr 17, 2023 · Apr 17, 2023 · Apr 17, 2023 · Apr 18, 2023
@@ -1,4 +1,5 @@
 """Text processing functions"""
+import textwrap
 from typing import Dict, Generator, Optional
 
 from selenium.webdriver.remote.webdriver import WebDriver
@@ -8,37 +9,46 @@
 from autogpt.memory import get_memory
 
 CFG = Config()
-MEMORY = get_memory(CFG)
+
+
+def get_memory_instance():
+    return get_memory(CFG)
 
 
 def split_text(text: str, max_length: int = 8192) -> Generator[str, None, None]:
-    """Split text into chunks of a maximum length
+    """Split text into chunks of a maximum length.
+
+    Each line of the input text is processed independently: lines that are empty or
+    contain only whitespace are skipped, and every remaining line is wrapped with
+    textwrap.wrap so that no yielded chunk is longer than max_length characters.
 
     Args:
-        text (str): The text to split
-        max_length (int, optional): The maximum length of each chunk. Defaults to 8192.
+        text (str): The text to split.
+        max_length (int, optional): The maximum length of each chunk. Defaults to 8192.
 
     Yields:
-        str: The next chunk of text
+        str: The next chunk of text.
 
     Raises:
-        ValueError: If the text is longer than the maximum length
+        ValueError: If the max_length is less than or equal to 0.
     """
-    paragraphs = text.split("\n")
-    current_length = 0
-    current_chunk = []
-
-    for paragraph in paragraphs:
-        if current_length + len(paragraph) + 1 <= max_length:
-            current_chunk.append(paragraph)
-            current_length += len(paragraph) + 1
-        else:
-            yield "\n".join(current_chunk)
-            current_chunk = [paragraph]
-            current_length = len(paragraph) + 1
+    if max_length <= 0:
+        raise ValueError("max_length should be greater than 0")
 
-    if current_chunk:
-        yield "\n".join(current_chunk)
+    if not text:
+        return
+
+    lines = text.split("\n")
+
+    for line in lines:
+        if not line.strip():
+            continue
+
+        wrapped_lines = textwrap.wrap(line, width=max_length)
+        if wrapped_lines:
+            yield from wrapped_lines
+        else:
+            yield line
 
 
 def summarize_text(
@@ -57,6 +67,7 @@ def summarize_text(
     """
     if not text:
         return "Error: No text to summarize"
+    MEMORY = get_memory_instance()
 
     text_length = len(text)
     print(f"Text length: {text_length} characters")

diff --git a/tests/unit/test_split_text.py b/tests/unit/test_split_text.py
@@ -0,0 +1,91 @@
+import unittest
+
+from autogpt.processing.text import split_text
+
+
+class TestSplitText(unittest.TestCase):
+    def test_empty_string(self):
+        text = ""
+        result = list(split_text(text))
+        self.assertEqual(result, [])
+
+    def test_no_split_required(self):
+        text = "This is a short text that doesn't require any splitting based on the given maximum length."
+        result = list(split_text(text, max_length=100))
+        self.assertEqual(result, [text])
+
+    def test_split_required(self):
+        text = (
+            "This is a longer piece of text that requires splitting based on the given maximum length.\n"
+            "It contains multiple lines that will need to be divided into smaller chunks."
+        )
+        result = list(split_text(text, max_length=50))
+        self.assertEqual(
+            result,
+            [
+                "This is a longer piece of text that requires",
+                "splitting based on the given maximum length.",
+                "It contains multiple lines that will need to be",
+                "divided into smaller chunks.",
+            ],
+        )
+
+    def test_long_paragraph_split(self):
+        text = (
+            "This is a very long paragraph that needs to be split into smaller chunks. "
+            * 10
+        )
+        result = list(split_text(text, max_length=50))
+        expected = [
+            "This is a very long paragraph that needs to be",
+            "split into smaller chunks. This is a very long",
+            "paragraph that needs to be split into smaller",
+            "chunks. This is a very long paragraph that needs",
+            "to be split into smaller chunks. This is a very",
+            "long paragraph that needs to be split into smaller",
+            "chunks. This is a very long paragraph that needs",
+            "to be split into smaller chunks. This is a very",
+            "long paragraph that needs to be split into smaller",
+            "chunks. This is a very long paragraph that needs",
+            "to be split into smaller chunks. This is a very",
+            "long paragraph that needs to be split into smaller",
+            "chunks. This is a very long paragraph that needs",
+            "to be split into smaller chunks. This is a very",
+            "long paragraph that needs to be split into smaller",
+            "chunks.",
+        ]
+        self.assertEqual(result, expected)
+
+    def test_split_with_whitespace(self):
+        text = (
+            "This is a test\n\nwith extra whitespace and a longer text that might require splitting "
+            "based on the given maximum length."
+        )
+        result = list(split_text(text, max_length=50))
+        self.assertEqual(
+            result,
+            [
+                "This is a test",
+                "with extra whitespace and a longer text that might",
+                "require splitting based on the given maximum",
+                "length.",
+            ],
+        )
+
+    def test_value_error(self):
+        text = (
+            "This is a very long paragraph that needs to be split into smaller chunks."
+        )
+        with self.assertRaises(ValueError):
+            list(split_text(text, max_length=0))
+
+    def test_max_length_larger_than_text(self):
+        text = (
+            "This is a text that has a length smaller than the provided maximum length."
+        )
+        result = list(split_text(text, max_length=10000))
+        self.assertEqual(result, [text])
+
+
+if __name__ == "__main__":
+    unittest.main()