diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml
index 39d3f14..8469487 100644
--- a/.github/workflows/production.yml
+++ b/.github/workflows/production.yml
@@ -9,6 +9,6 @@ on:
 jobs:
   ci:
-    uses: TogetherCrew/operations/.github/workflows/ci.yml@main
+    uses: TogetherCrew/operations/.github/workflows/ci2.yml@main
     secrets:
       CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
diff --git a/.github/workflows/start.staging.yml b/.github/workflows/start.staging.yml
index a53de6e..dcb51ed 100644
--- a/.github/workflows/start.staging.yml
+++ b/.github/workflows/start.staging.yml
@@ -6,6 +6,6 @@ on: pull_request
 jobs:
   ci:
-    uses: TogetherCrew/operations/.github/workflows/ci.yml@main
+    uses: TogetherCrew/operations/.github/workflows/ci2.yml@main
     secrets:
       CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py
index fc26253..1ed6fc8 100644
--- a/hivemind_etl/mediawiki/etl.py
+++ b/hivemind_etl/mediawiki/etl.py
@@ -99,7 +99,7 @@ def transform(self) -> list[Document]:
     def load(self, documents: list[Document]) -> None:
         logging.info(f"Loading {len(documents)} documents into Qdrant!")
         ingestion_pipeline = CustomIngestionPipeline(
-            self.community_id, collection_name=self.platform_id
+            self.community_id, collection_name=self.platform_id, use_cache=False,
         )

         # Process batches in parallel using ThreadPoolExecutor
diff --git a/hivemind_etl/simple_ingestion/pipeline.py b/hivemind_etl/simple_ingestion/pipeline.py
index 1b954be..d03cb6c 100644
--- a/hivemind_etl/simple_ingestion/pipeline.py
+++ b/hivemind_etl/simple_ingestion/pipeline.py
@@ -147,6 +147,7 @@ async def process_document(
     pipeline = CustomIngestionPipeline(
         community_id=ingestion_request.communityId,
         collection_name=collection_name,
+        use_cache=False,
     )

     document = Document(
@@ -188,6 +189,7 @@ async def process_documents_batch(
     pipeline = CustomIngestionPipeline(
         community_id=batch_chunk.communityId,
         collection_name=collection_name,
+        use_cache=False,
     )

     # Convert all documents in this chunk to Document objects
diff --git a/hivemind_etl/website/website_etl.py b/hivemind_etl/website/website_etl.py
index 2d678bb..4f956db 100644
--- a/hivemind_etl/website/website_etl.py
+++ b/hivemind_etl/website/website_etl.py
@@ -30,7 +30,7 @@ def __init__(
         # preparing the ingestion pipeline
         self.ingestion_pipeline = CustomIngestionPipeline(
-            self.community_id, collection_name=self.platform_id
+            self.community_id, collection_name=self.platform_id, use_cache=False,
         )

     async def extract(
diff --git a/hivemind_summarizer/activities.py b/hivemind_summarizer/activities.py
index 8e4f008..292d96c 100644
--- a/hivemind_summarizer/activities.py
+++ b/hivemind_summarizer/activities.py
@@ -97,6 +97,7 @@ async def fetch_platform_summaries_by_date(
     pipeline = CustomIngestionPipeline(
         community_id=community_id,
         collection_name=f"{input.platform_id}_summary",
+        use_cache=False,
     )
     # get the latest date from the collection
     latest_date = pipeline.get_latest_document_date(
@@ -211,6 +212,7 @@ async def fetch_platform_summaries_by_date_range(
             extract_text_only=extract_text_only,
             platform_id=input.platform_id,
             community_id=community_id,
+            use_cache=False,
         )
         summaries = await fetch_platform_summaries_by_date(date_input)
         result[date] = summaries
diff --git a/requirements.txt b/requirements.txt
index 7e9561d..3a474be 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 python-dotenv>=1.0.0, <2.0.0
-tc-hivemind-backend==1.4.3
+tc-hivemind-backend==1.4.6
 llama-index-storage-docstore-redis==0.1.2
 llama-index-storage-docstore-mongodb==0.1.3
 crawlee[playwright]==0.3.8
diff --git a/tests/unit/test_mediawiki_etl.py b/tests/unit/test_mediawiki_etl.py
index 8f42e60..3e78151 100644
--- a/tests/unit/test_mediawiki_etl.py
+++ b/tests/unit/test_mediawiki_etl.py
@@ -164,7 +164,7 @@ def test_load_with_dump_deletion(self, mock_ingestion_pipeline_class):

         # Verify that methods were called correctly
         mock_ingestion_pipeline_class.assert_called_once_with(
-            self.community_id, collection_name=self.platform_id
+            self.community_id, collection_name=self.platform_id, use_cache=False
         )
         mock_pipeline.run_pipeline.assert_called_once_with(documents)
         self.assertFalse(os.path.exists(etl.dump_dir))
@@ -192,7 +192,7 @@ def test_load_without_dump_deletion(self, mock_ingestion_pipeline_class):

         # Verify that methods were called correctly
         mock_ingestion_pipeline_class.assert_called_once_with(
-            self.community_id, collection_name=self.platform_id
+            self.community_id, collection_name=self.platform_id, use_cache=False
         )
         mock_pipeline.run_pipeline.assert_called_once_with(documents)
         self.assertTrue(os.path.exists(etl.dump_dir))
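
Reviewer note: aside from the CI workflow switch to ci2.yml and the tc-hivemind-backend bump to 1.4.6, every change here is the same one-keyword edit, so the pattern only needs to be verified once. Below is a minimal sketch of the new call shape, not the repo's exact code: the import paths and the sample IDs are illustrative assumptions, since the diff only shows the class name in use.

```python
# Assumed imports: the diff's type hints suggest llama-index's Document,
# and CustomIngestionPipeline ships with tc-hivemind-backend.
from llama_index.core import Document
from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline  # hypothetical path

pipeline = CustomIngestionPipeline(
    community_id="sample_community",    # illustrative value
    collection_name="sample_platform",  # illustrative value
    use_cache=False,                    # the keyword this PR adds at every call site
)

# run_pipeline is the entry point the updated unit tests assert against.
pipeline.run_pipeline([Document(text="hello world")])
```

Passing `use_cache=False` at construction time keeps the behavior uniform across the MediaWiki, website, simple-ingestion, and summarizer call sites, and the two test_mediawiki_etl assertions pin the new signature down.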