Skip to content

Commit

Permalink
Implement loading MemoryItems from file in JSONFileMemory
Browse files Browse the repository at this point in the history
Further changes:
* remove `init` param from `get_memory()`, replace usages by `memory.clear()`
* make token length calculation optional in `MemoryItem.dump()`
  • Loading branch information
Pwuts committed Jun 15, 2023
1 parent 6e6e7fc commit f16d7ba
Show file tree
Hide file tree
Showing 9 changed files with 93 additions and 20 deletions.
2 changes: 1 addition & 1 deletion autogpt/commands/file_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def ingest_file(

# TODO: differentiate between different types of files
file_memory = MemoryItem.from_text_file(content, filename)
logger.debug(f"Created memory: {file_memory.dump()}")
logger.debug(f"Created memory: {file_memory.dump(True)}")
memory.add(file_memory)

logger.info(f"Ingested {len(file_memory.e_chunks)} chunks from {filename}")
Expand Down
3 changes: 2 additions & 1 deletion autogpt/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ def run_auto_gpt(

# Initialize memory and make sure it is empty.
# this is particularly important for indexing and referencing pinecone memory
memory = get_memory(cfg, init=True)
memory = get_memory(cfg)
memory.clear()
logger.typewriter_log(
"Using memory of type:", Fore.GREEN, f"{memory.__class__.__name__}"
)
Expand Down
4 changes: 2 additions & 2 deletions autogpt/memory/vector/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
# MilvusMemory = None


def get_memory(cfg: Config, init=False) -> VectorMemory:
def get_memory(cfg: Config) -> VectorMemory:
memory = None

match cfg.memory_backend:
Expand All @@ -60,7 +60,7 @@ def get_memory(cfg: Config, init=False) -> VectorMemory:
# )
# else:
# memory = PineconeMemory(cfg)
# if init:
# if clear:
# memory.clear()

case "redis":
Expand Down
48 changes: 38 additions & 10 deletions autogpt/memory/vector/memory_item.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,21 +109,21 @@ def from_ai_action(ai_message: Message, result_message: Message):
# The result_message contains either user feedback
# or the result of the command specified in ai_message

if ai_message["role"] != "assistant":
raise ValueError(f"Invalid role on 'ai_message': {ai_message['role']}")
if ai_message.role != "assistant":
raise ValueError(f"Invalid role on 'ai_message': {ai_message.role}")

result = (
result_message["content"]
if result_message["content"].startswith("Command")
result_message.content
if result_message.content.startswith("Command")
else "None"
)
user_input = (
result_message["content"]
if result_message["content"].startswith("Human feedback")
result_message.content
if result_message.content.startswith("Human feedback")
else "None"
)
memory_content = (
f"Assistant Reply: {ai_message['content']}"
f"Assistant Reply: {ai_message.content}"
"\n\n"
f"Result: {result}"
"\n\n"
Expand All @@ -145,11 +145,14 @@ def from_webpage(content: str, url: str, question: str | None = None):
question_for_summary=question,
)

def dump(self) -> str:
token_length = count_string_tokens(self.raw_content, Config().embedding_model)
def dump(self, calculate_length=False) -> str:
if calculate_length:
token_length = count_string_tokens(
self.raw_content, Config().embedding_model
)
return f"""
=============== MemoryItem ===============
Length: {token_length} tokens in {len(self.e_chunks)} chunks
Size: {f'{token_length} tokens in ' if calculate_length else ''}{len(self.e_chunks)} chunks
Metadata: {json.dumps(self.metadata, indent=2)}
---------------- SUMMARY -----------------
{self.summary}
Expand All @@ -158,6 +161,31 @@ def dump(self) -> str:
==========================================
"""

def __eq__(self, other: object) -> bool:
    """Field-by-field equality for MemoryItems.

    Embeddings can be stored either as list[float] (e.g. after a round-trip
    through the JSON index file) or as np.ndarray[float32], so both sides are
    normalized to float32 ndarrays before comparison.
    """
    if not isinstance(other, MemoryItem):
        # Let Python try the reflected comparison instead of raising
        # AttributeError on an arbitrary object.
        return NotImplemented

    def _as_array(embedding) -> np.ndarray:
        # Normalize list[float] -> np.ndarray[float32]; np.array_equal needs
        # comparable array types on both sides.
        if isinstance(embedding, np.ndarray):
            return embedding
        return np.array(embedding, dtype=np.float32)

    return (
        self.raw_content == other.raw_content
        and self.chunks == other.chunks
        and self.chunk_summaries == other.chunk_summaries
        and np.array_equal(_as_array(self.e_summary), _as_array(other.e_summary))
        # Normalizing per chunk (instead of probing e_chunks[0]) avoids an
        # IndexError when a MemoryItem has no chunks.
        and np.array_equal(
            [_as_array(chunk) for chunk in self.e_chunks],
            [_as_array(chunk) for chunk in other.e_chunks],
        )
    )


@dataclasses.dataclass
class MemoryItemRelevance:
Expand Down
23 changes: 21 additions & 2 deletions autogpt/memory/vector/providers/json_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,17 @@ def __init__(self, cfg: Config) -> None:
workspace_path = Path(cfg.workspace_path)
self.file_path = workspace_path / f"{cfg.memory_index}.json"
self.file_path.touch()
logger.debug(f"Initialized {__name__} with index path {self.file_path}")
logger.debug(
f"Initialized {__class__.__name__} with index path {self.file_path}"
)

self.memories = []
self.save_index()
try:
self.load_index()
logger.debug(f"Loaded {len(self.memories)} MemoryItems from file")
except Exception as e:
logger.warn(f"Could not load MemoryItems from file: {e}")
self.save_index()

def __iter__(self) -> Iterator[MemoryItem]:
    """Yield the stored MemoryItems in insertion order."""
    yield from self.memories
Expand All @@ -48,6 +55,7 @@ def __len__(self) -> int:

def add(self, item: MemoryItem):
    """Store a MemoryItem in the index and persist the index to disk.

    Returns the new number of items in the index.
    """
    logger.debug(f"Adding item to memory: {item.dump()}")
    self.memories.append(item)
    self.save_index()
    return len(self.memories)

Expand All @@ -62,6 +70,17 @@ def clear(self):
self.memories.clear()
self.save_index()

def load_index(self):
    """Read the backing index file and populate self.memories.

    Deserialization errors propagate to the caller (``__init__`` catches
    them and rewrites the index), so no error handling is done here.
    """
    if not self.file_path.is_file():
        logger.debug(f"Index file '{self.file_path}' does not exist")
        return

    logger.debug(f"Loading memories from index file '{self.file_path}'")
    with self.file_path.open("r") as f:
        raw_index = f.read()
    deserialized = orjson.loads(raw_index)
    self.memories.extend(MemoryItem(**item_data) for item_data in deserialized)

def save_index(self):
logger.debug(f"Saving memory index to file {self.file_path}")
with self.file_path.open("wb") as f:
Expand Down
4 changes: 3 additions & 1 deletion data_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ def main() -> None:
args = parser.parse_args()

# Initialize memory
memory = get_memory(cfg, init=args.init)
memory = get_memory(cfg)
if args.init:
memory.clear()
logger.debug("Using memory of type: " + memory.__class__.__name__)

if args.file:
Expand Down
3 changes: 2 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ def agent(config: Config, workspace: Workspace) -> Agent:
ai_config.command_registry = command_registry

config.set_memory_backend("json_file")
memory_json_file = get_memory(config, init=True)
memory_json_file = get_memory(config)
memory_json_file.clear()

system_prompt = ai_config.construct_full_prompt()

Expand Down
4 changes: 3 additions & 1 deletion tests/integration/agent_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ def memory_json_file(agent_test_config: Config):
was_memory_backend = agent_test_config.memory_backend

agent_test_config.set_memory_backend("json_file")
yield get_memory(agent_test_config, init=True)
memory = get_memory(agent_test_config)
memory.clear()
yield memory

agent_test_config.set_memory_backend(was_memory_backend)

Expand Down
22 changes: 21 additions & 1 deletion tests/integration/memory/test_json_file_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ def test_json_memory_init_with_backing_empty_file(config: Config, workspace: Wor
assert index_file.read_text() == "[]"


def test_json_memory_init_with_backing_file(config: Config, workspace: Workspace):
def test_json_memory_init_with_backing_invalid_file(
config: Config, workspace: Workspace
):
index_file = workspace.root / f"{config.memory_index}.json"
index_file.touch()

Expand Down Expand Up @@ -78,6 +80,24 @@ def test_json_memory_get(config: Config, memory_item: MemoryItem, mock_get_embed
assert retrieved.memory_item == memory_item


def test_json_memory_load_index(config: Config, memory_item: MemoryItem):
    """load_index should restore previously saved MemoryItems from disk."""
    index = JSONFileMemory(config)
    index.add(memory_item)

    # Sanity-check the fixture setup before exercising load_index itself,
    # so a broken add()/save_index() is reported as a setup error.
    try:
        assert index.file_path.exists(), "index was not saved to file"
        assert len(index) == 1, f"index contains {len(index)} items instead of 1"
        assert index.memories[0] == memory_item, "item in index != added mock item"
    except AssertionError as e:
        # Chain the cause so the failing assertion is visible in the traceback.
        raise ValueError(f"Setting up for load_index test failed: {e}") from e

    # Drop the in-memory copy and reload from the file written by add().
    index.memories = []
    index.load_index()

    assert len(index) == 1
    assert index.memories[0] == memory_item


@pytest.mark.vcr
@requires_api_key("OPENAI_API_KEY")
def test_json_memory_get_relevant(config: Config, patched_api_requestor: None) -> None:
Expand Down

0 comments on commit f16d7ba

Please sign in to comment.