Skip to content

Commit 79868ae

Browse files
Port over Critic system from benchmark project (#1171)
Co-authored-by: openhands <openhands@all-hands.dev>
1 parent 3ed2971 commit 79868ae

File tree

8 files changed

+534
-0
lines changed

8 files changed

+534
-0
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from openhands.sdk.critic.base import CriticBase, CriticResult
2+
from openhands.sdk.critic.impl import (
3+
AgentFinishedCritic,
4+
EmptyPatchCritic,
5+
PassCritic,
6+
)
7+
8+
9+
__all__ = [
10+
"CriticBase",
11+
"CriticResult",
12+
"AgentFinishedCritic",
13+
"EmptyPatchCritic",
14+
"PassCritic",
15+
]
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import abc
2+
from collections.abc import Sequence
3+
from typing import ClassVar
4+
5+
from pydantic import BaseModel, Field
6+
7+
from openhands.sdk.event import LLMConvertibleEvent
8+
from openhands.sdk.utils.models import DiscriminatedUnionMixin
9+
10+
11+
class CriticResult(BaseModel):
12+
"""A critic result is a score and a message."""
13+
14+
THRESHOLD: ClassVar[float] = 0.5
15+
16+
score: float = Field(
17+
description="A predicted probability of success between 0 and 1.",
18+
ge=0.0,
19+
le=1.0,
20+
)
21+
message: str | None = Field(description="An optional message explaining the score.")
22+
23+
@property
24+
def success(self) -> bool:
25+
"""Whether the agent is successful."""
26+
return self.score >= CriticResult.THRESHOLD
27+
28+
29+
class CriticBase(DiscriminatedUnionMixin, abc.ABC):
30+
"""A critic is a function that takes in a list of events,
31+
optional git patch, and returns a score about the quality of agent's action.
32+
"""
33+
34+
@abc.abstractmethod
35+
def evaluate(
36+
self, events: Sequence[LLMConvertibleEvent], git_patch: str | None = None
37+
) -> CriticResult:
38+
pass
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
"""Critic implementations module."""
2+
3+
from openhands.sdk.critic.impl.agent_finished import AgentFinishedCritic
4+
from openhands.sdk.critic.impl.empty_patch import EmptyPatchCritic
5+
from openhands.sdk.critic.impl.pass_critic import PassCritic
6+
7+
8+
__all__ = [
9+
"AgentFinishedCritic",
10+
"EmptyPatchCritic",
11+
"PassCritic",
12+
]
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""
2+
AgentFinishedCritic implementation.
3+
4+
This critic evaluates whether an agent properly finished a task by checking:
5+
1. The agent's last action was a FinishAction (proper completion)
6+
2. The generated git patch is non-empty (actual changes were made)
7+
"""
8+
9+
from collections.abc import Sequence
10+
11+
from openhands.sdk.critic.base import CriticBase, CriticResult
12+
from openhands.sdk.event import ActionEvent, LLMConvertibleEvent
13+
from openhands.sdk.logger import get_logger
14+
from openhands.sdk.tool.builtins.finish import FinishAction
15+
16+
17+
logger = get_logger(__name__)
18+
19+
20+
class AgentFinishedCritic(CriticBase):
21+
"""
22+
Critic that evaluates whether an agent properly finished a task.
23+
24+
This critic checks two main criteria:
25+
1. The agent's last action was a FinishAction (proper completion)
26+
2. The generated git patch is non-empty (actual changes were made)
27+
"""
28+
29+
def evaluate(
30+
self, events: Sequence[LLMConvertibleEvent], git_patch: str | None = None
31+
) -> CriticResult:
32+
"""
33+
Evaluate if an agent properly finished with a non-empty git patch.
34+
35+
Args:
36+
events: List of events from the agent's execution
37+
git_patch: Optional git patch generated by the agent
38+
39+
Returns:
40+
CriticResult with score 1.0 if successful, 0.0 otherwise
41+
"""
42+
reasons = []
43+
44+
# Check if git patch is non-empty
45+
if not git_patch or not git_patch.strip():
46+
reasons.append("Empty git patch")
47+
logger.debug("AgentFinishedCritic: Empty git patch")
48+
return CriticResult(
49+
score=0.0,
50+
message="Agent did not produce a non-empty git patch. "
51+
+ "; ".join(reasons),
52+
)
53+
54+
# Check if agent properly finished with FinishAction
55+
if not self._has_finish_action(events):
56+
reasons.append("No FinishAction found")
57+
logger.debug("AgentFinishedCritic: No FinishAction")
58+
return CriticResult(
59+
score=0.0,
60+
message="Agent did not finish properly. " + "; ".join(reasons),
61+
)
62+
63+
logger.debug("AgentFinishedCritic: Successfully completed")
64+
return CriticResult(
65+
score=1.0,
66+
message="Agent completed with FinishAction and non-empty patch",
67+
)
68+
69+
def _has_finish_action(self, events: Sequence[LLMConvertibleEvent]) -> bool:
70+
"""Check if the last action was a FinishAction."""
71+
if not events:
72+
return False
73+
74+
# Look for the last ActionEvent in the history
75+
for event in reversed(events):
76+
if isinstance(event, ActionEvent):
77+
# Check if this is a FinishAction
78+
if event.action and isinstance(event.action, FinishAction):
79+
return True
80+
# If we find any other action type, the agent didn't finish
81+
return False
82+
83+
return False
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
"""
2+
EmptyPatchCritic implementation.
3+
4+
This critic only evaluates whether a git patch is non-empty.
5+
Unlike AgentFinishedCritic, it does not check for proper agent completion.
6+
"""
7+
8+
from collections.abc import Sequence
9+
10+
from openhands.sdk.critic.base import CriticBase, CriticResult
11+
from openhands.sdk.event import LLMConvertibleEvent
12+
from openhands.sdk.logger import get_logger
13+
14+
15+
logger = get_logger(__name__)
16+
17+
18+
class EmptyPatchCritic(CriticBase):
19+
"""
20+
Critic that only evaluates whether a git patch is non-empty.
21+
22+
This critic checks only one criterion:
23+
- The generated git patch is non-empty (actual changes were made)
24+
25+
Unlike AgentFinishedCritic, this critic does not check for proper
26+
agent completion with FinishAction.
27+
"""
28+
29+
def evaluate(
30+
self,
31+
events: Sequence[LLMConvertibleEvent], # noqa: ARG002
32+
git_patch: str | None = None,
33+
) -> CriticResult:
34+
"""
35+
Evaluate if a git patch is non-empty.
36+
37+
Args:
38+
events: List of events from the agent's execution (not used)
39+
git_patch: Optional git patch generated by the agent
40+
41+
Returns:
42+
CriticResult with score 1.0 if patch is non-empty, 0.0 otherwise
43+
"""
44+
if not git_patch or not git_patch.strip():
45+
logger.debug("EmptyPatchCritic: Empty git patch")
46+
return CriticResult(score=0.0, message="Git patch is empty or missing")
47+
48+
logger.debug("EmptyPatchCritic: Non-empty git patch found")
49+
return CriticResult(score=1.0, message="Git patch is non-empty")
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""
2+
PassCritic implementation.
3+
4+
This critic always returns success, useful when no evaluation is needed
5+
or when all instances should be considered successful.
6+
"""
7+
8+
from collections.abc import Sequence
9+
10+
from openhands.sdk.critic.base import CriticBase, CriticResult
11+
from openhands.sdk.event import LLMConvertibleEvent
12+
from openhands.sdk.logger import get_logger
13+
14+
15+
logger = get_logger(__name__)
16+
17+
18+
class PassCritic(CriticBase):
19+
"""
20+
Critic that always returns success.
21+
22+
This critic can be used when no evaluation is needed or when
23+
all instances should be considered successful regardless of their output.
24+
"""
25+
26+
def evaluate(
27+
self,
28+
events: Sequence[LLMConvertibleEvent], # noqa: ARG002
29+
git_patch: str | None = None, # noqa: ARG002
30+
) -> CriticResult:
31+
"""
32+
Always evaluate as successful.
33+
34+
Args:
35+
events: List of events from the agent's execution (not used)
36+
git_patch: Optional git patch generated by the agent (not used)
37+
38+
Returns:
39+
CriticResult with score 1.0 (always successful)
40+
"""
41+
logger.debug("PassCritic: Always returns success")
42+
return CriticResult(score=1.0, message="PassCritic always succeeds")

tests/sdk/critic/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Tests for the critic module."""

0 commit comments

Comments
 (0)