39 changes: 32 additions & 7 deletions judge.py
@@ -9,6 +9,7 @@

from judge import judge_conversations, judge_single_conversation
from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig, load_conversations
from utils.utils import parse_key_value_list


@@ -26,30 +27,48 @@ async def main(args):
judge_models[model_spec] = 1

models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items())
print(f"🎯 LLM Judge | Models: {models_str} | Rubrics: {', '.join(args.rubrics)}")
print(f"🎯 LLM Judge | Models: {models_str}")

# Load rubric configuration once at startup
print("📚 Loading rubric configuration...")
rubric_config = await RubricConfig.load(rubric_folder="data")

if args.conversation:
# Single conversation with first judge model (single instance)
first_model = next(iter(judge_models.keys()))

# Load single conversation
conversation = await ConversationData.load(args.conversation)

# Create judge with rubric config
judge = LLMJudge(
judge_model=first_model,
rubric_config=rubric_config,
judge_model_extra_params=args.judge_model_extra_params,
)
await judge_single_conversation(
judge, args.conversation, args.rubrics, args.output
)
await judge_single_conversation(judge, conversation, args.output)
else:
# Load all conversations at startup
print(f"📂 Loading conversations from {args.folder}...")
conversations = await load_conversations(args.folder, limit=args.limit)
print(f"✅ Loaded {len(conversations)} conversations")

# Batch evaluation with multiple judges
from pathlib import Path

folder_name = Path(args.folder).name

await judge_conversations(
judge_models=judge_models,
conversation_folder=args.folder,
conversations=conversations,
rubric_config=rubric_config,
max_concurrent=args.max_concurrent,
rubrics=args.rubrics,
output_root=args.output,
limit=args.limit,
conversation_folder_name=folder_name,
verbose=True,
judge_model_extra_params=args.judge_model_extra_params,
per_judge=args.per_judge,
verbose_workers=args.verbose_workers,
)


@@ -145,6 +164,12 @@ async def main(args):
),
)

parser.add_argument(
"--verbose-workers",
action="store_true",
help="Enable verbose worker logging to show concurrency behavior",
)

args = parser.parse_args()

print(f"Running judge on: {args.folder or args.conversation}")
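For orientation, here is a minimal sketch of the single-conversation path after this change, assuming only the signatures visible in the diff above (RubricConfig.load, ConversationData.load, judge_single_conversation); the model name and paths are placeholders, not part of the PR.

import asyncio

from judge import judge_single_conversation
from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig


async def run_one(conversation_path: str, output: str) -> None:
    # Rubric files are now parsed once at startup, not inside LLMJudge.__init__.
    rubric_config = await RubricConfig.load(rubric_folder="data")
    # Conversations are loaded up front; the judge receives data, not a file path.
    conversation = await ConversationData.load(conversation_path)
    judge = LLMJudge(judge_model="placeholder-model", rubric_config=rubric_config)
    await judge_single_conversation(judge, conversation, output)


asyncio.run(run_one("conversations/example.txt", "results"))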
149 changes: 67 additions & 82 deletions judge/llm_judge.py
@@ -7,7 +7,7 @@
from judge.constants import BEST_PRACTICE, DAMAGING, NEUTRAL
from judge.question_navigator import QuestionNavigator
from judge.response_models import QuestionResponse
from judge.utils import load_rubric_structure
from judge.rubric_config import ConversationData, RubricConfig
from llm_clients import LLMFactory
from llm_clients.llm_interface import JudgeLLM

@@ -18,28 +18,20 @@ class LLMJudge:
def __init__(
self,
judge_model: str,
rubric_config: RubricConfig,
judge_model_extra_params: Optional[Dict[str, Any]] = None,
rubric_folder: str = "data",
rubric_prompt_beginning_file: str = "rubric_prompt_beginning.txt",
rubric_file: str = "rubric.tsv",
sep: str = "\t",
log_file: Optional[str] = None,
verbose: bool = False,
question_prompt_file: str = "question_prompt.txt",
):
"""
Initialize the LLM Judge.
Initialize the LLM Judge with pre-loaded rubric configuration.

Args:
judge_model: Model to use for judging.
judge_model_extra_params: Extra parameters for the judge model.
rubric_folder: Folder containing rubric files
rubric_file: File containing the question-flow rubric
sep: Separator for the rubric file
judge_model: Model to use for judging
rubric_config: Pre-loaded rubric configuration data
judge_model_extra_params: Extra parameters for the judge model
log_file: Path to log file (default: logs/judge_{timestamp}.log)
verbose: Whether to print verbose output during initialization

Note: assumes that `rubric_folder` contains the `rubric_file`
"""

# Setup logger
@@ -65,23 +57,6 @@ def __init__(
self.logger.addHandler(file_handler)

self.log_file = log_file

rubric_path = Path(rubric_folder) / rubric_file
rubric_prompt_beginning_path = (
Path(rubric_folder) / rubric_prompt_beginning_file
)
self.question_prompt_file = Path(rubric_folder) / question_prompt_file
if not rubric_path.exists():
raise FileNotFoundError(f"Rubric file not found: {rubric_path}")
if not rubric_prompt_beginning_path.exists():
raise FileNotFoundError(
f"Rubric prompt beginning file not found: "
f"{rubric_prompt_beginning_path}"
)
if not self.question_prompt_file.exists():
raise FileNotFoundError(
f"Question prompt file not found: {self.question_prompt_file}"
)
self.judge_model = judge_model
self.judge_model_extra_params = judge_model_extra_params or {}

@@ -90,23 +65,23 @@ def __init__(
if "temperature" not in self.judge_model_extra_params:
self.judge_model_extra_params["temperature"] = 0

# Store rubric data from config
self.rubric_prompt_beginning = rubric_config.rubric_prompt_beginning
self.question_prompt_template = rubric_config.question_prompt_template
self.dimensions = rubric_config.dimensions
self.question_flow_data = rubric_config.question_flow_data
self.question_order = rubric_config.question_order

# Initialize question navigator with parsed data
self.navigator = QuestionNavigator(
question_flow_data=rubric_config.question_flow_data,
question_order=rubric_config.question_order,
)

# Log initialization info
self.logger.info("=== Initializing LLM Judge ===")
self.logger.info(f"Judge model: {judge_model}")
self.logger.info(f"Rubric folder: {rubric_folder}")
self.logger.info(f"Log file: {log_file}")

with open(rubric_prompt_beginning_path, "r", encoding="utf-8") as f:
self.rubric_prompt_beginning = f.read()

# Initialize question navigator (handles rubric parsing and navigation)
self.navigator = QuestionNavigator(str(rubric_path), sep=sep)
self.question_flow_data = self.navigator.question_flow_data
self.question_order = self.navigator.question_order

# Load dimensions from rubric
self.dimensions, _ = load_rubric_structure(str(rubric_path), sep=sep)

self.logger.info(
f"Loaded question-flow rubric with {len(self.question_flow_data)} questions"
)
@@ -115,29 +90,16 @@ def __init__(
f"Loaded question-flow rubric with {len(self.question_flow_data)} questions"
)

def load_conversation(self, conversation_file: str) -> str:
"""
Load conversation from file.

Args:
conversation_file: Path to conversation file

Returns:
Conversation text
"""
conversation_path = Path(conversation_file)

if not conversation_path.exists():
raise FileNotFoundError(f"Conversation file not found: {conversation_path}")

with open(conversation_path, "r", encoding="utf-8") as f:
return f.read()

def _create_evaluator(
self, conversation: str, conversation_file: str, verbose: bool
self, conversation: str, conversation_filename: str, verbose: bool
) -> JudgeLLM:
"""Create and configure the LLM evaluator with conversation context.

Args:
conversation: Conversation text content
conversation_filename: Filename for logging purposes
verbose: Whether to print verbose output

Returns:
JudgeLLM instance configured for evaluation

@@ -146,7 +108,7 @@
"""
# Log evaluation start
self.logger.info("=" * 80)
self.logger.info(f"Starting evaluation: {conversation_file}")
self.logger.info(f"Starting evaluation: {conversation_filename}")
self.logger.info(f"Model: {self.judge_model}")
self.logger.info("=" * 80)
conv_preview = conversation[:1000]
@@ -185,7 +147,7 @@

async def evaluate_conversation_question_flow(
self,
conversation_file: str,
conversation: ConversationData,
output_folder: str,
auto_save: bool = True,
verbose: bool = False,
@@ -202,7 +164,7 @@ async def evaluate_conversation_question_flow(
3. Save results if requested

Args:
conversation_file: Path to conversation file
conversation: ConversationData with content and metadata
output_folder: Folder to save evaluation results
auto_save: Whether to automatically save results to files
verbose: Whether to print progress information
@@ -220,10 +182,10 @@ async def evaluate_conversation_question_flow(
"Question flow rubric not loaded. Check rubric file exists."
)

# Load conversation and create evaluator
conversation = self.load_conversation(conversation_file)
# Create evaluator with conversation content
conversation_filename = conversation.metadata.get("filename", "unknown")
self.evaluator = self._create_evaluator(
conversation, conversation_file, verbose
conversation.content, conversation_filename, verbose
)

# Step 1: Navigate through questions and collect answers
Expand All @@ -248,7 +210,7 @@ async def evaluate_conversation_question_flow(
self._log_final_results(results)
if auto_save:
self._save_results(
conversation_file, output_folder, results, verbose, judge_instance
conversation, output_folder, results, verbose, judge_instance
)

return results
@@ -351,14 +313,24 @@ def _log_final_results(

def _save_results(
self,
conversation_file: str,
conversation: ConversationData,
output_folder: str,
results: Dict[str, Dict[str, str]],
verbose: bool,
judge_instance: Optional[int] = None,
):
"""Save evaluation results to file."""
conversation_name = Path(conversation_file).stem
"""Save evaluation results to file.

Args:
conversation: ConversationData with metadata
output_folder: Folder to save results
results: Evaluation results dictionary
verbose: Whether to print progress
judge_instance: Optional judge instance number for filename
"""
# Extract conversation name from metadata
filename = conversation.metadata.get("filename", "unknown.txt")
conversation_name = Path(filename).stem

# Build filename with judge model and instance info
judge_suffix = self.judge_model.replace("/", "_").replace(":", "_")
@@ -547,7 +519,9 @@ async def _ask_single_question(
options_str = ", ".join(answer_options)

# Build prompt
prompt = self._build_question_prompt(question_text, examples_text, options_str)
prompt = await self._build_question_prompt(
question_text, examples_text, options_str
)

if verbose:
print(f" Options: {options_str}")
@@ -585,15 +559,26 @@ async def _ask_single_question(
# Return the matched answer and reasoning
return matched_answer, structured_response.reasoning

def _build_question_prompt(self, question: str, examples: str, options: str) -> str:
"""Build the prompt for asking a question."""
async def _build_question_prompt(
self, question: str, examples: str, options: str
) -> str:
"""Build the prompt for asking a question using pre-loaded template.

Args:
question: Question text
examples: Examples text
options: Answer options

Returns:
Formatted question prompt
"""
examples_section = f"\n{examples}\n" if examples else ""
with open(self.question_prompt_file, "r", encoding="utf-8") as f:
prompt = f.read()
prompt = prompt.format(
question=question, examples_section=examples_section, options=options
)
return prompt

# Use pre-loaded template from rubric config
prompt = self.question_prompt_template.format(
question=question, examples_section=examples_section, options=options
)
return prompt

def _store_answer(
self,
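Taken together, the llm_judge.py changes mean a caller constructs the judge from a pre-loaded RubricConfig and passes ConversationData objects rather than file paths. A hypothetical end-to-end sketch, inferred only from the signatures in this diff (the model name and paths are placeholders):

import asyncio

from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig


async def evaluate_one() -> None:
    rubric_config = await RubricConfig.load(rubric_folder="data")
    conversation = await ConversationData.load("conversations/example.txt")
    judge = LLMJudge(judge_model="placeholder-model", rubric_config=rubric_config)
    # evaluate_conversation_question_flow now takes ConversationData directly;
    # the filename used in logs and saved results comes from its metadata.
    results = await judge.evaluate_conversation_question_flow(
        conversation,
        output_folder="results",
        auto_save=True,
        verbose=True,
    )
    print(results)


asyncio.run(evaluate_one())

Note also that _build_question_prompt now formats the pre-loaded question_prompt_template with exactly three placeholders (question, examples_section, options), so a custom question prompt file must supply all three.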