diff --git a/src/art/rewards/ruler.py b/src/art/rewards/ruler.py index 3724105d..2ea33312 100644 --- a/src/art/rewards/ruler.py +++ b/src/art/rewards/ruler.py @@ -39,7 +39,7 @@ class Response(BaseModel): DEFAULT_RUBRIC = dedent( - """ + """ - A trajectory that achieves its goal should always get a significantly higher score than a trajectory that does not achieve its goal. - A trajectory that achieves its goal more efficiently (eg. by avoiding unproductive detours) should get a higher score than a trajectory that achieves its goal less efficiently. - If one trajectory is only slightly better than another, the difference in scores should be small. If it is significantly better, the difference in scores should be large. @@ -117,23 +117,44 @@ async def ruler( else: break + # Detect if all trajectories are identical + all_identical = all( + len(msg_list) == common_prefix_len for msg_list in message_lists + ) + + if all_identical and len(message_lists) > 1: + print( + f"[RULER] Warning: All {len(message_lists)} trajectories are identical. " + "Using absolute scoring (loses relative grounding benefit)." + ) + # If there is a non-empty common prefix, serialize it once to save tokens. + # Skip this optimization if all trajectories are identical (we'll send the full trajectory instead). user_text = "" - if common_prefix_len > 0: + if common_prefix_len > 0 and not all_identical: common_prefix_messages = message_lists[0][:common_prefix_len] user_text += ( "\n" + json.dumps(common_prefix_messages) + "\n\n\n" ) # Serialize each trajectory (minus the common prefix) for the judge. + # If all trajectories are identical, only serialize one full trajectory to save tokens. serialized_trajectories: List[str] = [] - for idx, full_messages in enumerate(message_lists, start=1): - trimmed_messages = full_messages[common_prefix_len:] + if all_identical: + # Send the full trajectory since they're all identical + full_trajectory = message_lists[0] serialized_trajectories.append( - f'\n' - + json.dumps(trimmed_messages) - + "\n" + f'\n' + json.dumps(full_trajectory) + "\n" ) + else: + # Serialize each unique trajectory + for idx, full_messages in enumerate(message_lists, start=1): + trimmed_messages = full_messages[common_prefix_len:] + serialized_trajectories.append( + f'\n' + + json.dumps(trimmed_messages) + + "\n" + ) user_text += "Trajectories:\n\n" + "\n\n".join(serialized_trajectories) @@ -175,9 +196,25 @@ async def ruler( content = first_choice.message.content or "{}" # type: ignore[attr-defined] parsed = Response.model_validate_json(content) - assert len(parsed.scores) == len(message_lists) - return parsed.scores + # If all trajectories were identical, we only sent one to the judge + # Duplicate the score for all trajectories + if all_identical: + if len(parsed.scores) != 1: + raise ValueError( + f"Expected 1 score for identical trajectories, but got {len(parsed.scores)}" + ) + single_score = parsed.scores[0] + return [ + single_score.model_copy(update={"trajectory_id": str(i)}) + for i in range(1, len(message_lists) + 1) + ] + else: + if len(parsed.scores) != len(message_lists): + raise ValueError( + f"Expected {len(message_lists)} scores, but got {len(parsed.scores)}" + ) + return parsed.scores async def ruler_score_group(