From f4bdeb40c4c4dafd6725583afab9a8e13de63a69 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 3 Sep 2025 13:21:20 +0330 Subject: [PATCH 1/5] feat: disable caching in CustomIngestionPipeline across multiple modules! - Updated the CustomIngestionPipeline instantiation in etl.py, pipeline.py, website_etl.py, and activities.py to set use_cache=False, ensuring that caching is disabled during document ingestion. --- hivemind_etl/mediawiki/etl.py | 2 +- hivemind_etl/simple_ingestion/pipeline.py | 2 ++ hivemind_etl/website/website_etl.py | 2 +- hivemind_summarizer/activities.py | 2 ++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py index fc26253..eaf1d6c 100644 --- a/hivemind_etl/mediawiki/etl.py +++ b/hivemind_etl/mediawiki/etl.py @@ -99,7 +99,7 @@ def transform(self) -> list[Document]: def load(self, documents: list[Document]) -> None: logging.info(f"Loading {len(documents)} documents into Qdrant!") ingestion_pipeline = CustomIngestionPipeline( - self.community_id, collection_name=self.platform_id + self.community_id, collection_name=self.platform_id, use_cache=False ) # Process batches in parallel using ThreadPoolExecutor diff --git a/hivemind_etl/simple_ingestion/pipeline.py b/hivemind_etl/simple_ingestion/pipeline.py index 1b954be..d03cb6c 100644 --- a/hivemind_etl/simple_ingestion/pipeline.py +++ b/hivemind_etl/simple_ingestion/pipeline.py @@ -147,6 +147,7 @@ async def process_document( pipeline = CustomIngestionPipeline( community_id=ingestion_request.communityId, collection_name=collection_name, + use_cache=False, ) document = Document( @@ -188,6 +189,7 @@ async def process_documents_batch( pipeline = CustomIngestionPipeline( community_id=batch_chunk.communityId, collection_name=collection_name, + use_cache=False, ) # Convert all documents in this chunk to Document objects diff --git a/hivemind_etl/website/website_etl.py b/hivemind_etl/website/website_etl.py index 2d678bb..4f956db 
100644 --- a/hivemind_etl/website/website_etl.py +++ b/hivemind_etl/website/website_etl.py @@ -30,7 +30,7 @@ def __init__( # preparing the ingestion pipeline self.ingestion_pipeline = CustomIngestionPipeline( - self.community_id, collection_name=self.platform_id + self.community_id, collection_name=self.platform_id, use_cache=False, ) async def extract( diff --git a/hivemind_summarizer/activities.py b/hivemind_summarizer/activities.py index 8e4f008..292d96c 100644 --- a/hivemind_summarizer/activities.py +++ b/hivemind_summarizer/activities.py @@ -97,6 +97,7 @@ async def fetch_platform_summaries_by_date( pipeline = CustomIngestionPipeline( community_id=community_id, collection_name=f"{input.platform_id}_summary", + use_cache=False, ) # get the latest date from the collection latest_date = pipeline.get_latest_document_date( @@ -211,6 +212,7 @@ async def fetch_platform_summaries_by_date_range( extract_text_only=extract_text_only, platform_id=input.platform_id, community_id=community_id, + use_cache=False, ) summaries = await fetch_platform_summaries_by_date(date_input) result[date] = summaries From bdfa26f42453231d46cc5cd5367ed7421d27030c Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 3 Sep 2025 13:21:53 +0330 Subject: [PATCH 2/5] fix: add trailing comma after use_cache=False in the mediawiki ETL pipeline call
--- hivemind_etl/mediawiki/etl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py index eaf1d6c..1ed6fc8 100644 --- a/hivemind_etl/mediawiki/etl.py +++ b/hivemind_etl/mediawiki/etl.py @@ -99,7 +99,7 @@ def transform(self) -> list[Document]: def load(self, documents: list[Document]) -> None: logging.info(f"Loading {len(documents)} documents into Qdrant!") ingestion_pipeline = CustomIngestionPipeline( - self.community_id, collection_name=self.platform_id, use_cache=False + self.community_id, collection_name=self.platform_id, use_cache=False, ) # Process batches in parallel using ThreadPoolExecutor From 69617b70f15f3b5ff25edb3c0cb04c64e5d9011a Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 3 Sep 2025 13:30:23 +0330 Subject: [PATCH 3/5] chore: update tc-hivemind-backend version to 1.4.6 in requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7e9561d..3a474be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ python-dotenv>=1.0.0, <2.0.0 -tc-hivemind-backend==1.4.3 +tc-hivemind-backend==1.4.6 llama-index-storage-docstore-redis==0.1.2 llama-index-storage-docstore-mongodb==0.1.3 crawlee[playwright]==0.3.8 From f0aa5380cac938d3ebf9bdc2886561f77596e062 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 3 Sep 2025 13:36:30 +0330 Subject: [PATCH 4/5] fix: disable caching in MediawikiETL tests! - Updated test assertions in TestMediawikiETL to include use_cache=False when instantiating the ingestion pipeline, ensuring consistency with the recent changes to disable caching during document ingestion. 
--- tests/unit/test_mediawiki_etl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_mediawiki_etl.py b/tests/unit/test_mediawiki_etl.py index 8f42e60..3e78151 100644 --- a/tests/unit/test_mediawiki_etl.py +++ b/tests/unit/test_mediawiki_etl.py @@ -164,7 +164,7 @@ def test_load_with_dump_deletion(self, mock_ingestion_pipeline_class): # Verify that methods were called correctly mock_ingestion_pipeline_class.assert_called_once_with( - self.community_id, collection_name=self.platform_id + self.community_id, collection_name=self.platform_id, use_cache=False ) mock_pipeline.run_pipeline.assert_called_once_with(documents) self.assertFalse(os.path.exists(etl.dump_dir)) @@ -192,7 +192,7 @@ def test_load_without_dump_deletion(self, mock_ingestion_pipeline_class): # Verify that methods were called correctly mock_ingestion_pipeline_class.assert_called_once_with( - self.community_id, collection_name=self.platform_id + self.community_id, collection_name=self.platform_id, use_cache=False ) mock_pipeline.run_pipeline.assert_called_once_with(documents) self.assertTrue(os.path.exists(etl.dump_dir)) From 754d262c736089e669dc1597557d851ef4ce9760 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 3 Sep 2025 13:39:50 +0330 Subject: [PATCH 5/5] fix: switch CI workflows to ci2.yml, which has CodeClimate disabled
--- .github/workflows/production.yml | 2 +- .github/workflows/start.staging.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 39d3f14..8469487 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -9,6 +9,6 @@ on: jobs: ci: - uses: TogetherCrew/operations/.github/workflows/ci.yml@main + uses: TogetherCrew/operations/.github/workflows/ci2.yml@main secrets: CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }} diff --git a/.github/workflows/start.staging.yml b/.github/workflows/start.staging.yml index a53de6e..dcb51ed 100644 --- a/.github/workflows/start.staging.yml +++ b/.github/workflows/start.staging.yml @@ -6,6 +6,6 @@ on: pull_request jobs: ci: - uses: TogetherCrew/operations/.github/workflows/ci.yml@main + uses: TogetherCrew/operations/.github/workflows/ci2.yml@main secrets: CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}