Port over Critic system from benchmark project (#1171)

xingyaoww · openhands-agent · web-flow · commit 79868ae53ec3 · 2025-11-18T11:01:18.000+08:00
Co-authored-by: openhands <openhands@all-hands.dev>
diff --git a/openhands-sdk/openhands/sdk/critic/__init__.py b/openhands-sdk/openhands/sdk/critic/__init__.py
@@ -0,0 +1,15 @@
+from openhands.sdk.critic.base import CriticBase, CriticResult
+from openhands.sdk.critic.impl import (
+    AgentFinishedCritic,
+    EmptyPatchCritic,
+    PassCritic,
+)
+
+
+__all__ = [  # Public names re-exported by this package.
+    "CriticBase",
+    "CriticResult",
+    "AgentFinishedCritic",
+    "EmptyPatchCritic",
+    "PassCritic",
+]
diff --git a/openhands-sdk/openhands/sdk/critic/base.py b/openhands-sdk/openhands/sdk/critic/base.py
@@ -0,0 +1,38 @@
+import abc
+from collections.abc import Sequence
+from typing import ClassVar
+
+from pydantic import BaseModel, Field
+
+from openhands.sdk.event import LLMConvertibleEvent
+from openhands.sdk.utils.models import DiscriminatedUnionMixin
+
+
+class CriticResult(BaseModel):
+    """A critic result is a score and a message."""
+
+    THRESHOLD: ClassVar[float] = 0.5
+
+    score: float = Field(
+        description="A predicted probability of success between 0 and 1.",
+        ge=0.0,
+        le=1.0,
+    )
+    message: str | None = Field(description="An optional message explaining the score.")
+
+    @property
+    def success(self) -> bool:
+        """Whether the agent is successful."""
+        return self.score >= CriticResult.THRESHOLD
+
+
+class CriticBase(DiscriminatedUnionMixin, abc.ABC):
+    """A critic is a function that takes in a list of events,
+    optional git patch, and returns a score about the quality of agent's action.
+    """
+
+    @abc.abstractmethod
+    def evaluate(
+        self, events: Sequence[LLMConvertibleEvent], git_patch: str | None = None
+    ) -> CriticResult:
+        pass
diff --git a/openhands-sdk/openhands/sdk/critic/impl/__init__.py b/openhands-sdk/openhands/sdk/critic/impl/__init__.py
@@ -0,0 +1,12 @@
+"""Critic implementations module."""
+
+from openhands.sdk.critic.impl.agent_finished import AgentFinishedCritic
+from openhands.sdk.critic.impl.empty_patch import EmptyPatchCritic
+from openhands.sdk.critic.impl.pass_critic import PassCritic
+
+
+__all__ = [  # Critic implementations re-exported by this package.
+    "AgentFinishedCritic",
+    "EmptyPatchCritic",
+    "PassCritic",
+]
diff --git a/openhands-sdk/openhands/sdk/critic/impl/agent_finished.py b/openhands-sdk/openhands/sdk/critic/impl/agent_finished.py
@@ -0,0 +1,83 @@
+"""
+AgentFinishedCritic implementation.
+
+This critic evaluates whether an agent properly finished a task by checking:
+1. The agent's last action was a FinishAction (proper completion)
+2. The generated git patch is non-empty (actual changes were made)
+"""
+
+from collections.abc import Sequence
+
+from openhands.sdk.critic.base import CriticBase, CriticResult
+from openhands.sdk.event import ActionEvent, LLMConvertibleEvent
+from openhands.sdk.logger import get_logger
+from openhands.sdk.tool.builtins.finish import FinishAction
+
+
+logger = get_logger(__name__)
+
+
+class AgentFinishedCritic(CriticBase):
+    """
+    Critic that evaluates whether an agent properly finished a task.
+
+    This critic checks two main criteria:
+    1. The agent's last action was a FinishAction (proper completion)
+    2. The generated git patch is non-empty (actual changes were made)
+    """
+
+    def evaluate(
+        self, events: Sequence[LLMConvertibleEvent], git_patch: str | None = None
+    ) -> CriticResult:
+        """
+        Evaluate if an agent properly finished with a non-empty git patch.
+
+        Args:
+            events: List of events from the agent's execution
+            git_patch: Optional git patch generated by the agent
+
+        Returns:
+            CriticResult with score 1.0 if successful, 0.0 otherwise
+        """
+        reasons = []
+
+        # Check if git patch is non-empty
+        if not git_patch or not git_patch.strip():
+            reasons.append("Empty git patch")
+            logger.debug("AgentFinishedCritic: Empty git patch")
+            return CriticResult(
+                score=0.0,
+                message="Agent did not produce a non-empty git patch. "
+                + "; ".join(reasons),
+            )
+
+        # Check if agent properly finished with FinishAction
+        if not self._has_finish_action(events):
+            reasons.append("No FinishAction found")
+            logger.debug("AgentFinishedCritic: No FinishAction")
+            return CriticResult(
+                score=0.0,
+                message="Agent did not finish properly. " + "; ".join(reasons),
+            )
+
+        logger.debug("AgentFinishedCritic: Successfully completed")
+        return CriticResult(
+            score=1.0,
+            message="Agent completed with FinishAction and non-empty patch",
+        )
+
+    def _has_finish_action(self, events: Sequence[LLMConvertibleEvent]) -> bool:
+        """Check if the last action was a FinishAction."""
+        if not events:
+            return False
+
+        # Look for the last ActionEvent in the history
+        for event in reversed(events):
+            if isinstance(event, ActionEvent):
+                # Check if this is a FinishAction
+                if event.action and isinstance(event.action, FinishAction):
+                    return True
+                # If we find any other action type, the agent didn't finish
+                return False
+
+        return False
diff --git a/openhands-sdk/openhands/sdk/critic/impl/empty_patch.py b/openhands-sdk/openhands/sdk/critic/impl/empty_patch.py
@@ -0,0 +1,49 @@
+"""
+EmptyPatchCritic implementation.
+
+This critic only evaluates whether a git patch is non-empty.
+Unlike AgentFinishedCritic, it does not check for proper agent completion.
+"""
+
+from collections.abc import Sequence
+
+from openhands.sdk.critic.base import CriticBase, CriticResult
+from openhands.sdk.event import LLMConvertibleEvent
+from openhands.sdk.logger import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class EmptyPatchCritic(CriticBase):
+    """
+    Critic that only evaluates whether a git patch is non-empty.
+
+    This critic checks only one criterion:
+    - The generated git patch is non-empty (actual changes were made)
+
+    Unlike AgentFinishedCritic, this critic does not check for proper
+    agent completion with FinishAction.
+    """
+
+    def evaluate(
+        self,
+        events: Sequence[LLMConvertibleEvent],  # noqa: ARG002
+        git_patch: str | None = None,
+    ) -> CriticResult:
+        """
+        Evaluate if a git patch is non-empty.
+
+        Args:
+            events: List of events from the agent's execution (not used)
+            git_patch: Optional git patch generated by the agent
+
+        Returns:
+            CriticResult with score 1.0 if patch is non-empty, 0.0 otherwise
+        """
+        if not git_patch or not git_patch.strip():
+            logger.debug("EmptyPatchCritic: Empty git patch")
+            return CriticResult(score=0.0, message="Git patch is empty or missing")
+
+        logger.debug("EmptyPatchCritic: Non-empty git patch found")
+        return CriticResult(score=1.0, message="Git patch is non-empty")
diff --git a/openhands-sdk/openhands/sdk/critic/impl/pass_critic.py b/openhands-sdk/openhands/sdk/critic/impl/pass_critic.py
@@ -0,0 +1,42 @@
+"""
+PassCritic implementation.
+
+This critic always returns success, useful when no evaluation is needed
+or when all instances should be considered successful.
+"""
+
+from collections.abc import Sequence
+
+from openhands.sdk.critic.base import CriticBase, CriticResult
+from openhands.sdk.event import LLMConvertibleEvent
+from openhands.sdk.logger import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class PassCritic(CriticBase):
+    """
+    Critic that always returns success.
+
+    This critic can be used when no evaluation is needed or when
+    all instances should be considered successful regardless of their output.
+    """
+
+    def evaluate(
+        self,
+        events: Sequence[LLMConvertibleEvent],  # noqa: ARG002
+        git_patch: str | None = None,  # noqa: ARG002
+    ) -> CriticResult:
+        """
+        Always evaluate as successful.
+
+        Args:
+            events: List of events from the agent's execution (not used)
+            git_patch: Optional git patch generated by the agent (not used)
+
+        Returns:
+            CriticResult with score 1.0 (always successful)
+        """
+        logger.debug("PassCritic: Always returns success")
+        return CriticResult(score=1.0, message="PassCritic always succeeds")
diff --git a/tests/sdk/critic/__init__.py b/tests/sdk/critic/__init__.py
@@ -0,0 +1 @@
+"""Tests for the critic module."""
diff --git a/tests/sdk/critic/test_critic.py b/tests/sdk/critic/test_critic.py