OpenDCAI · SunnyHaze · Jul 12, 2025 · Jul 12, 2025 · Jul 12, 2025 · Jul 12, 2025
diff --git a/dataflow/operators/conversations/__init__.py b/dataflow/operators/conversations/__init__.py
@@ -0,0 +1,11 @@
+import sys
+from dataflow.utils.registry import LazyLoader
+from .consistent_chat import ConsistentChatGenerator
+
+cur_path = "dataflow/operators/conversations/"
+
+_import_structure = {
+    "ConsistentChatGenerator": (cur_path + "consistent_chat.py", "ConsistentChatGenerator"),
+}
+
+sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/conversations/", _import_structure)
diff --git a/dataflow/operators/dialogue/consistent_chat.py b/dataflow/operators/conversations/consistent_chat.py
@@ -1,7 +1,5 @@
 import random
 import json
-import os
-from dataflow.serving import APILLMServing_request
 from dataflow.utils.registry import OPERATOR_REGISTRY
 from dataflow import get_logger
 from dataflow.core import OperatorABC
@@ -22,7 +20,7 @@ def __init__(self, llm_serving: LLMServingABC = None, num_dialogs_per_intent = 2
         self.prompt = ConsistentChatPrompt()
         self.logger.info(f'{self.__class__.__name__} initialized.')
 
-    def run(self):
+    def run(self, storage: DataFlowStorage):
         all_query_prompts = []
 
         # Step 1: Generate all queries using LLM
@@ -33,6 +31,7 @@ def run(self):
                 query_prompt = self.prompt.get_query_prompt(info_flow, topic)
                 all_query_prompts.append(query_prompt)
         # Step 2: Generate queries by calling llm_serving once
+        self.logger.info("Generating queries...")
         queries_list = self.llm_serving.generate_from_input(user_inputs=all_query_prompts)
         valid_queries = []
         cnt = 0
@@ -50,6 +49,7 @@ def run(self):
             category = queries.get("category")
             turns = queries.get("turns")
             all_response_prompts.append(self.prompt.get_response_prompt(topic=category, queries=turns))
+        self.logger.info("Generating responses...")
         responses_list = self.llm_serving.generate_from_input(user_inputs=all_response_prompts)
 
         final_queries = []
@@ -87,9 +87,7 @@ def run(self):
                 continue 
         self.logger.info(f'Number of synthesized dialogues: {len(formatted_data)}')
 
-        output_filename = "generated_dialogs.json"
-        with open(output_filename, "w") as f:
-            json.dump(formatted_data, f, indent=4)
-
-        print(f"Data generated and saved to {output_filename}")
-        return formatted_data
+        df = pd.DataFrame(formatted_data)
+        storage.write(df)
+        self.logger.info(f'Number of synthesized dialogues: {len(df)} written to storage as DataFrame')
+        return df
diff --git a/dataflow/operators/filter/GeneralText/deduplicators/__init__.py b/dataflow/operators/filter/GeneralText/deduplicators/__init__.py
diff --git a/dataflow/operators/filter/GeneralText/deduplicators/ccnet_deduplicator.py b/dataflow/operators/filter/GeneralText/deduplicators/ccnet_deduplicator.py
diff --git a/dataflow/operators/filter/GeneralText/deduplicators/hash_deduplicator.py b/dataflow/operators/filter/GeneralText/deduplicators/hash_deduplicator.py
diff --git a/dataflow/operators/filter/GeneralText/deduplicators/minhash_deduplicator.py b/dataflow/operators/filter/GeneralText/deduplicators/minhash_deduplicator.py
diff --git a/dataflow/operators/filter/GeneralText/deduplicators/ngramhash_deduplicator.py b/dataflow/operators/filter/GeneralText/deduplicators/ngramhash_deduplicator.py