39 changes: 32 additions & 7 deletions judge.py
@@ -9,6 +9,7 @@

from judge import judge_conversations, judge_single_conversation
from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig, load_conversations
from utils.utils import parse_key_value_list


@@ -26,30 +27,48 @@ async def main(args):
judge_models[model_spec] = 1

models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items())
print(f"🎯 LLM Judge | Models: {models_str} | Rubrics: {', '.join(args.rubrics)}")
print(f"🎯 LLM Judge | Models: {models_str}")

# Load rubric configuration once at startup
print("📚 Loading rubric configuration...")
rubric_config = await RubricConfig.load(rubric_folder="data")

if args.conversation:
# Single conversation with first judge model (single instance)
first_model = next(iter(judge_models.keys()))

# Load single conversation
conversation = await ConversationData.load(args.conversation)

# Create judge with rubric config
judge = LLMJudge(
judge_model=first_model,
rubric_config=rubric_config,
judge_model_extra_params=args.judge_model_extra_params,
)
await judge_single_conversation(
judge, args.conversation, args.rubrics, args.output
)
await judge_single_conversation(judge, conversation, args.output)
else:
# Load all conversations at startup
print(f"📂 Loading conversations from {args.folder}...")
conversations = await load_conversations(args.folder, limit=args.limit)
print(f"✅ Loaded {len(conversations)} conversations")

# Batch evaluation with multiple judges
from pathlib import Path

folder_name = Path(args.folder).name

await judge_conversations(
judge_models=judge_models,
conversation_folder=args.folder,
conversations=conversations,
rubric_config=rubric_config,
max_concurrent=args.max_concurrent,
rubrics=args.rubrics,
output_root=args.output,
limit=args.limit,
conversation_folder_name=folder_name,
verbose=True,
judge_model_extra_params=args.judge_model_extra_params,
per_judge=args.per_judge,
verbose_workers=args.verbose_workers,
)


@@ -145,6 +164,12 @@ async def main(args):
),
)

parser.add_argument(
"--verbose-workers",
action="store_true",
help="Enable verbose worker logging to show concurrency behavior",
)

args = parser.parse_args()

print(f"Running judge on: {args.folder or args.conversation}")
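For orientation, here is a minimal sketch of the single-conversation path after this change, assuming only the signatures visible in the diff above (RubricConfig.load, ConversationData.load, judge_single_conversation); the model name and paths are placeholders, not part of the PR.

import asyncio

from judge import judge_single_conversation
from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig


async def run_one(conversation_path: str, output: str) -> None:
    # Rubric files are now parsed once at startup, not inside LLMJudge.__init__.
    rubric_config = await RubricConfig.load(rubric_folder="data")
    # Conversations are loaded up front; the judge receives data, not a file path.
    conversation = await ConversationData.load(conversation_path)
    judge = LLMJudge(judge_model="placeholder-model", rubric_config=rubric_config)
    await judge_single_conversation(judge, conversation, output)


asyncio.run(run_one("conversations/example.txt", "results"))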
149 changes: 67 additions & 82 deletions judge/llm_judge.py
@@ -7,7 +7,7 @@
from judge.constants import BEST_PRACTICE, DAMAGING, NEUTRAL
from judge.question_navigator import QuestionNavigator
from judge.response_models import QuestionResponse
from judge.utils import load_rubric_structure
from judge.rubric_config import ConversationData, RubricConfig
from llm_clients import LLMFactory
from llm_clients.llm_interface import JudgeLLM

@@ -18,28 +18,20 @@ class LLMJudge:
def __init__(
self,
judge_model: str,
rubric_config: RubricConfig,
judge_model_extra_params: Optional[Dict[str, Any]] = None,
rubric_folder: str = "data",
rubric_prompt_beginning_file: str = "rubric_prompt_beginning.txt",
rubric_file: str = "rubric.tsv",
sep: str = "\t",
log_file: Optional[str] = None,
verbose: bool = False,
question_prompt_file: str = "question_prompt.txt",
):
"""
Initialize the LLM Judge.
Initialize the LLM Judge with pre-loaded rubric configuration.

Args:
judge_model: Model to use for judging.
judge_model_extra_params: Extra parameters for the judge model.
rubric_folder: Folder containing rubric files
rubric_file: File containing the question-flow rubric
sep: Separator for the rubric file
judge_model: Model to use for judging
rubric_config: Pre-loaded rubric configuration data
judge_model_extra_params: Extra parameters for the judge model
log_file: Path to log file (default: logs/judge_{timestamp}.log)
verbose: Whether to print verbose output during initialization

Note: assumes that `rubric_folder` contains the `rubric_file`
"""

# Setup logger
@@ -65,23 +57,6 @@ def __init__(
self.logger.addHandler(file_handler)

self.log_file = log_file

rubric_path = Path(rubric_folder) / rubric_file
rubric_prompt_beginning_path = (
Path(rubric_folder) / rubric_prompt_beginning_file
)
self.question_prompt_file = Path(rubric_folder) / question_prompt_file
if not rubric_path.exists():
raise FileNotFoundError(f"Rubric file not found: {rubric_path}")
if not rubric_prompt_beginning_path.exists():
raise FileNotFoundError(
f"Rubric prompt beginning file not found: "
f"{rubric_prompt_beginning_path}"
)
if not self.question_prompt_file.exists():
raise FileNotFoundError(
f"Question prompt file not found: {self.question_prompt_file}"
)
self.judge_model = judge_model
self.judge_model_extra_params = judge_model_extra_params or {}

@@ -90,23 +65,23 @@ def __init__(
if "temperature" not in self.judge_model_extra_params:
self.judge_model_extra_params["temperature"] = 0

# Store rubric data from config
self.rubric_prompt_beginning = rubric_config.rubric_prompt_beginning
self.question_prompt_template = rubric_config.question_prompt_template
self.dimensions = rubric_config.dimensions
self.question_flow_data = rubric_config.question_flow_data
self.question_order = rubric_config.question_order

# Initialize question navigator with parsed data
self.navigator = QuestionNavigator(
question_flow_data=rubric_config.question_flow_data,
question_order=rubric_config.question_order,
)

# Log initialization info
self.logger.info("=== Initializing LLM Judge ===")
self.logger.info(f"Judge model: {judge_model}")
self.logger.info(f"Rubric folder: {rubric_folder}")
self.logger.info(f"Log file: {log_file}")

with open(rubric_prompt_beginning_path, "r", encoding="utf-8") as f:
self.rubric_prompt_beginning = f.read()

# Initialize question navigator (handles rubric parsing and navigation)
self.navigator = QuestionNavigator(str(rubric_path), sep=sep)
self.question_flow_data = self.navigator.question_flow_data
self.question_order = self.navigator.question_order

# Load dimensions from rubric
self.dimensions, _ = load_rubric_structure(str(rubric_path), sep=sep)

self.logger.info(
f"Loaded question-flow rubric with {len(self.question_flow_data)} questions"
)
@@ -115,29 +90,16 @@ def __init__(
f"Loaded question-flow rubric with {len(self.question_flow_data)} questions"
)

def load_conversation(self, conversation_file: str) -> str:
"""
Load conversation from file.

Args:
conversation_file: Path to conversation file

Returns:
Conversation text
"""
conversation_path = Path(conversation_file)

if not conversation_path.exists():
raise FileNotFoundError(f"Conversation file not found: {conversation_path}")

with open(conversation_path, "r", encoding="utf-8") as f:
return f.read()

def _create_evaluator(
self, conversation: str, conversation_file: str, verbose: bool
self, conversation: str, conversation_filename: str, verbose: bool
) -> JudgeLLM:
"""Create and configure the LLM evaluator with conversation context.

Args:
conversation: Conversation text content
conversation_filename: Filename for logging purposes
verbose: Whether to print verbose output

Returns:
JudgeLLM instance configured for evaluation

@@ -146,7 +108,7 @@
"""
# Log evaluation start
self.logger.info("=" * 80)
self.logger.info(f"Starting evaluation: {conversation_file}")
self.logger.info(f"Starting evaluation: {conversation_filename}")
self.logger.info(f"Model: {self.judge_model}")
self.logger.info("=" * 80)
conv_preview = conversation[:1000]
@@ -185,7 +147,7 @@

async def evaluate_conversation_question_flow(
self,
conversation_file: str,
conversation: ConversationData,
output_folder: str,
auto_save: bool = True,
verbose: bool = False,
@@ -202,7 +164,7 @@ async def evaluate_conversation_question_flow(
3. Save results if requested

Args:
conversation_file: Path to conversation file
conversation: ConversationData with content and metadata
output_folder: Folder to save evaluation results
auto_save: Whether to automatically save results to files
verbose: Whether to print progress information
@@ -220,10 +182,10 @@ async def evaluate_conversation_question_flow(
"Question flow rubric not loaded. Check rubric file exists."
)

# Load conversation and create evaluator
conversation = self.load_conversation(conversation_file)
# Create evaluator with conversation content
conversation_filename = conversation.metadata.get("filename", "unknown")
self.evaluator = self._create_evaluator(
conversation, conversation_file, verbose
conversation.content, conversation_filename, verbose
)

# Step 1: Navigate through questions and collect answers
Expand All @@ -248,7 +210,7 @@ async def evaluate_conversation_question_flow(
self._log_final_results(results)
if auto_save:
self._save_results(
conversation_file, output_folder, results, verbose, judge_instance
conversation, output_folder, results, verbose, judge_instance
)

return results
@@ -351,14 +313,24 @@ def _log_final_results(

def _save_results(
self,
conversation_file: str,
conversation: ConversationData,
output_folder: str,
results: Dict[str, Dict[str, str]],
verbose: bool,
judge_instance: Optional[int] = None,
):
"""Save evaluation results to file."""
conversation_name = Path(conversation_file).stem
"""Save evaluation results to file.

Args:
conversation: ConversationData with metadata
output_folder: Folder to save results
results: Evaluation results dictionary
verbose: Whether to print progress
judge_instance: Optional judge instance number for filename
"""
# Extract conversation name from metadata
filename = conversation.metadata.get("filename", "unknown.txt")
conversation_name = Path(filename).stem

# Build filename with judge model and instance info
judge_suffix = self.judge_model.replace("/", "_").replace(":", "_")
@@ -547,7 +519,9 @@ async def _ask_single_question(
options_str = ", ".join(answer_options)

# Build prompt
prompt = self._build_question_prompt(question_text, examples_text, options_str)
prompt = await self._build_question_prompt(
question_text, examples_text, options_str
)

if verbose:
print(f" Options: {options_str}")
@@ -585,15 +559,26 @@ async def _ask_single_question(
# Return the matched answer and reasoning
return matched_answer, structured_response.reasoning

def _build_question_prompt(self, question: str, examples: str, options: str) -> str:
"""Build the prompt for asking a question."""
async def _build_question_prompt(
self, question: str, examples: str, options: str
) -> str:
"""Build the prompt for asking a question using pre-loaded template.

Args:
question: Question text
examples: Examples text
options: Answer options

Returns:
Formatted question prompt
"""
examples_section = f"\n{examples}\n" if examples else ""
with open(self.question_prompt_file, "r", encoding="utf-8") as f:
prompt = f.read()
prompt = prompt.format(
question=question, examples_section=examples_section, options=options
)
return prompt

# Use pre-loaded template from rubric config
prompt = self.question_prompt_template.format(
question=question, examples_section=examples_section, options=options
)
return prompt

def _store_answer(
self,
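Taken together, the llm_judge.py changes mean a caller constructs the judge from a pre-loaded RubricConfig and passes ConversationData objects rather than file paths. A hypothetical end-to-end sketch, inferred only from the signatures in this diff (the model name and paths are placeholders):

import asyncio

from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig


async def evaluate_one() -> None:
    rubric_config = await RubricConfig.load(rubric_folder="data")
    conversation = await ConversationData.load("conversations/example.txt")
    judge = LLMJudge(judge_model="placeholder-model", rubric_config=rubric_config)
    # evaluate_conversation_question_flow now takes ConversationData directly;
    # the filename used in logs and saved results comes from its metadata.
    results = await judge.evaluate_conversation_question_flow(
        conversation,
        output_folder="results",
        auto_save=True,
        verbose=True,
    )
    print(results)


asyncio.run(evaluate_one())

Note also that _build_question_prompt now formats the pre-loaded question_prompt_template with exactly three placeholders (question, examples_section, options), so a custom question prompt file must supply all three.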