From 0185c7ff7ded42d495348d68b9bcc86874aa5f82 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Sun, 29 Jun 2025 10:02:25 +0330 Subject: [PATCH 1/2] feat: limit batch size to 1! As a temporary fix for the llama-index issue with loading documents into the vectorstore, we limit the batch size to 1. The issue: when the llama-index pipeline loads documents, it first loads them into the docstore and then into the vectorstore. If any problem is raised while loading into the docstore, the data is never loaded into the vectorstore. So we limit the batch size to 1, meaning the documents are loaded one by one into the docstore + vectorstore. --- hivemind_etl/mediawiki/etl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py index 209cd62..b8d8b49 100644 --- a/hivemind_etl/mediawiki/etl.py +++ b/hivemind_etl/mediawiki/etl.py @@ -103,7 +103,7 @@ def load(self, documents: list[Document]) -> None: ) # Process batches in parallel using ThreadPoolExecutor - batch_size = 1000 + batch_size = 1 batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)] with ThreadPoolExecutor(max_workers=10) as executor: From aaf252c1c3c27bf19ba4858984c0be739854ea9a Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Sun, 29 Jun 2025 10:28:27 +0330 Subject: [PATCH 2/2] feat: added tracking for temporary fix! 
--- hivemind_etl/mediawiki/etl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py index b8d8b49..fc26253 100644 --- a/hivemind_etl/mediawiki/etl.py +++ b/hivemind_etl/mediawiki/etl.py @@ -103,6 +103,8 @@ def load(self, documents: list[Document]) -> None: ) # Process batches in parallel using ThreadPoolExecutor + # TODO: Revert to larger batch size once llama-index loading issue is resolved + # See: https://github.com/TogetherCrew/temporal-worker-python/issues/60 batch_size = 1 batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]