In [2]:
# Notebook setup: import path, project imports, and environment loading.
from dotenv import load_dotenv
import sys
# The project imports below are written as `src.…`, so the parent directory must
# be on the import path first (assumes the notebook sits one level below the
# repository root — TODO confirm).
sys.path.append('..')

# Reward agents and the step-selection config object used by the comparison cells.
from src.phyvpuzzle.agents.reward_agent import ActionCandidate, OpenAIRewardAgent, TransformersRewardAgent
from src.phyvpuzzle.core.config import StepSelectionConfig
from PIL import Image
import os

# Presumably reads API credentials from a local .env file; the call's return
# value is what this cell displays — verify that the file is actually found.
load_dotenv()

False

In [7]:
import json

# Toy scenario used only to preview the supervision prompt.
task_description = "Pick up the red cube and place it in the blue container"

# Scene objects keyed by name; each entry carries id, position, colour and shape.
object_mapping = dict(
    red_cube=dict(id=1, position=[0.5, 0.3, 0.1], color="red", shape="cube"),
    blue_container=dict(id=2, position=[0.8, 0.5, 0.0], color="blue", shape="container"),
)

# Steps already taken by the agent, oldest first, numbered from 1.
interaction_history = [
    f"Step {step_number}: {event}"
    for step_number, event in enumerate(
        ("Moved gripper to position [0.5, 0.3, 0.2]", "Opened gripper"),
        start=1,
    )
]

# Minimal stand-in for a candidate action; it exposes only the three attributes
# that the prompt template reads.
class Candidate:
    """Lightweight candidate action used to preview the supervision prompt.

    Args:
        action_type: Name of the proposed action. Defaults to ``"grasp"``.
        parameters: Arguments of the action. When omitted, a fresh dict
            describing the default grasp is created for this instance.
        response: The agent's stated reasoning. When omitted, the default
            grasp rationale is used.
    """

    def __init__(self, action_type="grasp", parameters=None, response=None):
        self.action_type = action_type
        # Build the default inside the call so instances never share one
        # mutable dict.
        self.parameters = (
            {"object_id": 1, "position": [0.5, 0.3, 0.1], "force": 0.5}
            if parameters is None
            else parameters
        )
        self.response = (
            "The red cube is within reach and the gripper is positioned correctly above it. Grasping it now will allow us to proceed with the task."
            if response is None
            else response
        )

# Default instance: the grasp example rendered into the prompt.
candidate = Candidate()

# Prompt skeleton with named placeholders; the values are filled in below so
# the wording and the data stay visually separate.
prompt_template = """You are a process supervision model for evaluating actions in physics-based tasks.

## Task Description:
{task_description}

## Current Object Information:
{object_mapping}

## Previous Interaction History:
{interaction_history}

## Proposed Action:
- Action Type: {action_type}
- Parameters: {parameters_json}
- Agent's Reasoning: {reasoning}

## Your Task:
Evaluate whether this action is correct and helpful for completing the task.

Consider:
1. Visual Accuracy: Are visual elements from the image correctly identified (shapes, colors, positions, quantities, spatial relationships)?
2. Physical Validity: Does this action make physical sense given the current state?
3. Task Progress: Does it move toward the task goal efficiently?
4. Logical Reasoning: Is the agent's reasoning sound and logical?
5. Risk Assessment: Are there any risks of making the task harder to complete?

Response:
• "+" if the action is correct and helpful for task completion
• "-" if the action has any errors or is unhelpful

Only respond with "+" or "-". No explanations."""

# Object mapping and history are inserted via their default string form;
# only the action parameters are pretty-printed as JSON.
prompt = prompt_template.format(
    task_description=task_description,
    object_mapping=object_mapping,
    interaction_history=interaction_history,
    action_type=candidate.action_type,
    parameters_json=json.dumps(candidate.parameters, indent=2),
    reasoning=candidate.response,
)

print(prompt)

You are a process supervision model for evaluating actions in physics-based tasks.

## Task Description:
Pick up the red cube and place it in the blue container

## Current Object Information:
{'red_cube': {'id': 1, 'position': [0.5, 0.3, 0.1], 'color': 'red', 'shape': 'cube'}, 'blue_container': {'id': 2, 'position': [0.8, 0.5, 0.0], 'color': 'blue', 'shape': 'container'}}

## Previous Interaction History:
['Step 1: Moved gripper to position [0.5, 0.3, 0.2]', 'Step 2: Opened gripper']

## Proposed Action:
- Action Type: grasp
- Parameters: {
  "object_id": 1,
  "position": [
    0.5,
    0.3,
    0.1
  ],
  "force": 0.5
}
- Agent's Reasoning: The red cube is within reach and the gripper is positioned correctly above it. Grasping it now will allow us to proceed with the task.

## Your Task:
Evaluate whether this action is correct and helpful for completing the task.

Consider:
1. Visual Accuracy: Are visual elements from the image correctly identified (shapes, colors, positions, quantit

In [8]:
# Load the reward model and its processor directly, then run one generation
# on a demo image as a smoke test.
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image
import requests

processor = AutoProcessor.from_pretrained("ob11/Qwen-VL-PRM-3B")
model = AutoModelForImageTextToText.from_pretrained("ob11/Qwen-VL-PRM-3B", device_map="cpu")

# Fetch the test image. A timeout keeps the cell from hanging on a stalled
# connection, and the status check stops an HTTP error page from being handed
# to PIL as if it were image data.
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image_response = requests.get(url, stream=True, timeout=30)
image_response.raise_for_status()
image = Image.open(image_response.raw)

# Single-turn conversation: one image placeholder followed by the supervision prompt.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt},
        ],
    }
]

# Render the chat template to text, then tokenize text and image together.
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=text_prompt, images=[image], return_tensors="pt").to("cpu")

# Generate, then slice off the prompt tokens so only the new tokens are decoded.
output_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("模型输出:", output_text[0])


from torch.nn import functional as F

# One forward pass over the prompt. Only the logits are needed, so autograd is
# switched off to avoid building a graph and holding activations in memory.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Logits at the last position of the first sequence, i.e. the distribution
# over the token that would be generated next.
next_token_logits = logits[0, -1, :]

# Token ids for "+" and "-"; only the first id of each encoding is used.
plus_token_id = processor.tokenizer.encode("+", add_special_tokens=False)[0]
minus_token_id = processor.tokenizer.encode("-", add_special_tokens=False)[0]

# Raw logits of the two verdict tokens.
plus_logit = next_token_logits[plus_token_id].item()
minus_logit = next_token_logits[minus_token_id].item()

print(plus_logit, minus_logit)

# Softmax over just these two logits, so the two probabilities sum to 1.
logits_tensor = torch.tensor([plus_logit, minus_logit])
probs = F.softmax(logits_tensor, dim=0)

print(probs)

# Probability of "+" serves as the reward score, in the 0-1 range.
# No rescaling to a 1-5 scale is performed in this cell.
prob_plus = probs[0].item()

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 34952.53it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.33it/s]


模型输出: +
31.007083892822266 30.146175384521484
tensor([0.7029, 0.2971])


# Step Selection 方法对比示例

本示例展示三种 Step Selection 方法：
1. **Reward Model - Generative** (Transformers 本地模型)：1-5 分评分
2. **Reward Model - Discriminative** (Transformers 本地模型)：+/- 二元评分
3. **Pairwise Judge** (OpenAI GPT-5)：成对比较排序

---

## 方法 1: Reward Model - Generative (1-5 分)

In [None]:
# Settings for the reward model in generative mode (scores on a 1-5 scale).
generative_settings = {
    "enabled": True,
    "selection_method": "reward_model",
    "num_candidates": 3,
    "top_k": 3,
    "reward_model_type": "transformers",
    "reward_model_name": "ob11/Qwen-VL-PRM-3B",
    "reward_model_mode": "generative",  # selects the generative scoring mode
    "reward_device": "auto",
    "reward_torch_dtype": "auto",
}
config_reward_generative = StepSelectionConfig(**generative_settings)

# Transformers-backed reward agent built from the configuration above.
reward_agent_generative = TransformersRewardAgent(config_reward_generative)

print("✓ Reward Model Agent - Generative (Qwen-VL-PRM-3B) 初始化完成")

In [None]:
# Shared test inputs for the comparison cells.

# Placeholder observation: a blank white 512x512 RGB image.
current_image = Image.new(mode='RGB', size=(512, 512), color='white')

# Task statement handed to the scorer.
task_description = """
3D CUBE STACKING PUZZLE

You have 7 3D puzzle pieces and one container.

TASK:
Assemble all pieces into the container to form a solid 3×3×3 cube (27 unit cubes total).

GOAL:
- Fit every piece completely inside the container.
- No gaps, overlaps, or floating pieces.
- The final structure must be stable and form a perfect cube.
"""

# Pre-formatted text table listing each movable object's id and RGB colour.
object_mapping = """
OBJECT MAPPING (object_id → properties):
============================================================
object_id=2, RGB=(153, 48, 97)
object_id=3, RGB=(169, 27, 0)
object_id=4, RGB=(0, 42, 113)
============================================================
Total movable objects: 3
"""

# No actions have been taken yet, so the history is a single line.
interaction_history = "Step 0: Initial state"

print("✓ 测试数据准备完成")

In [None]:
# Candidate actions for the reward model. Each row is
# (tool-call id, object id, target position, agent's stated reasoning).
reward_candidate_rows = [
    ("call_1", 2, [0.1, 0.2, 0.3], "I will move object 2 to position (0.1, 0.2, 0.3) to start building the structure."),
    ("call_2", 3, [0.0, 0.0, 0.1], "I will move object 3 to the base position to create a stable foundation."),
    ("call_3", 4, [-0.1, 0.1, 0.2], "I will place object 4 at this position to complete the first layer."),
]

# Every candidate is a `move_object` call; the tool call's `arguments` string
# spells out the same object id and target position as `parameters`.
candidates_reward = [
    ActionCandidate(
        action_type="move_object",
        parameters={"object_id": object_id, "target_position": list(target)},
        response=reasoning,
        tool_call={
            "id": call_id,
            "type": "function",
            "function": {
                "name": "move_object",
                "arguments": f'{{"object_id": {object_id}, "target_position": {target}}}',
            },
        },
    )
    for call_id, object_id, target, reasoning in reward_candidate_rows
]

# Score the candidates with the generative reward model and report the ranking.
print("使用 Reward Model - Generative (1-5 分) 方法评分...")
print("=" * 60)

# Inputs gathered in one mapping, then passed as keyword arguments.
generative_inputs = dict(
    candidates=candidates_reward,
    current_image=current_image,
    task_description=task_description,
    object_mapping=object_mapping,
    interaction_history=interaction_history,
)
best_actions_generative = reward_agent_generative.select_best_actions(**generative_inputs)

print(f"\nReward Model - Generative 选择了 top-{len(best_actions_generative)} 个最佳 actions:\n")

# One entry per returned action, numbered from 1; reasoning is cut to 100 characters.
for rank, ranked_action in enumerate(best_actions_generative, start=1):
    print(f"第 {rank} 名:")
    print(f"  Action: {ranked_action.action_type}")
    print(f"  Parameters: {ranked_action.parameters}")
    print(f"  Score: {ranked_action.score:.3f} (1-5 分)")
    print(f"  Reasoning: {ranked_action.response[:100]}...")
    print()

---

## 方法 2: Reward Model - Discriminative (+/- 二元)

In [3]:
# Settings for the reward model in discriminative mode (binary +/- verdict).
discriminative_settings = dict(
    enabled=True,
    selection_method="reward_model",
    num_candidates=3,
    top_k=2,
    reward_model_type="transformers",
    reward_model_name="ob11/Qwen-VL-PRM-3B",
    reward_model_mode="discriminative",  # selects the discriminative scoring mode
    reward_device="auto",
    reward_torch_dtype="auto",
)
config_reward_discriminative = StepSelectionConfig(**discriminative_settings)

# Transformers-backed reward agent built from the configuration above.
reward_agent_discriminative = TransformersRewardAgent(config_reward_discriminative)

print("✓ Reward Model Agent - Discriminative (Qwen-VL-PRM-3B) 初始化完成")

✓ Reward Model Agent - Discriminative (Qwen-VL-PRM-3B) 初始化完成


In [4]:
# Inputs for the discriminative run: a blank white 512x512 observation plus
# three `move_object` candidates.
current_image = Image.new(mode='RGB', size=(512, 512), color='white')

# One record per candidate: tool-call id, object id, target position, reasoning.
discriminative_specs = [
    {
        "call": "call_1",
        "object_id": 2,
        "target": [0.1, 0.2, 0.3],
        "why": "I will move object 2 to position (0.1, 0.2, 0.3) to start building the structure.",
    },
    {
        "call": "call_2",
        "object_id": 3,
        "target": [0.0, 0.0, 0.1],
        "why": "I will move object 3 to the base position to create a stable foundation.",
    },
    {
        "call": "call_3",
        "object_id": 4,
        "target": [-0.1, 0.1, 0.2],
        "why": "I will place object 4 at this position to complete the first layer.",
    },
]

# The tool call's `arguments` string repeats the object id and target position
# that are also passed as `parameters`.
candidates_discriminative = [
    ActionCandidate(
        action_type="move_object",
        parameters={"object_id": spec["object_id"], "target_position": list(spec["target"])},
        response=spec["why"],
        tool_call={
            "id": spec["call"],
            "type": "function",
            "function": {
                "name": "move_object",
                "arguments": f'{{"object_id": {spec["object_id"]}, "target_position": {spec["target"]}}}',
            },
        },
    )
    for spec in discriminative_specs
]

# Score the candidates with the discriminative reward model and report the ranking.
print("使用 Reward Model - Discriminative (+/-) 方法评分...")
print("=" * 60)

# This run passes its own inline task text, an id-to-name object mapping and an
# empty history.
discriminative_inputs = dict(
    candidates=candidates_discriminative,
    current_image=current_image,
    task_description="Build a stable structure using the available objects",
    object_mapping={2: "red_cube", 3: "blue_cube", 4: "green_cube"},
    interaction_history=[],
)
best_actions_discriminative = reward_agent_discriminative.select_best_actions(**discriminative_inputs)

print(f"\nReward Model - Discriminative 选择了 top-{len(best_actions_discriminative)} 个最佳 actions:\n")

# One entry per returned action, numbered from 1; reasoning is cut to 100 characters.
for rank, ranked_action in enumerate(best_actions_discriminative, start=1):
    print(f"第 {rank} 名:")
    print(f"  Action: {ranked_action.action_type}")
    print(f"  Parameters: {ranked_action.parameters}")
    print(f"  Score: {ranked_action.score:.3f} (基于 +/- logits 的概率)")
    print(f"  Reasoning: {ranked_action.response[:100]}...")
    print()

使用 Reward Model - Discriminative (+/-) 方法评分...


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 38304.15it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 35.91it/s]



Reward Model - Discriminative 选择了 top-3 个最佳 actions:

第 1 名:
  Action: move_object
  Parameters: {'object_id': 2, 'target_position': [0.1, 0.2, 0.3]}
  Score: 3.490 (基于 +/- logits 的概率)
  Reasoning: I will move object 2 to position (0.1, 0.2, 0.3) to start building the structure....

第 2 名:
  Action: move_object
  Parameters: {'object_id': 3, 'target_position': [0.0, 0.0, 0.1]}
  Score: 3.490 (基于 +/- logits 的概率)
  Reasoning: I will move object 3 to the base position to create a stable foundation....

第 3 名:
  Action: move_object
  Parameters: {'object_id': 4, 'target_position': [-0.1, 0.1, 0.2]}
  Score: 3.490 (基于 +/- logits 的概率)
  Reasoning: I will place object 4 at this position to complete the first layer....



---

## 方法 3: Pairwise Judge (OpenAI GPT-5)

In [None]:
# Settings for the pairwise judge: an OpenAI-type judge using the "gpt-5"
# model, with the "merge" sort method.
pairwise_settings = {
    "enabled": True,
    "selection_method": "pairwise_judge",
    "num_candidates": 3,
    "top_k": 3,
    "pairwise_judge_type": "openai",
    "pairwise_judge_model": "gpt-5",
    "pairwise_sort_method": "merge",
}
config_pairwise = StepSelectionConfig(**pairwise_settings)

# OpenAI-backed agent that performs the pairwise judging.
reward_agent_pairwise = OpenAIRewardAgent(config_pairwise)

print("✓ Pairwise Judge Agent (GPT-5) 初始化完成")

In [None]:
# Candidate actions for the pairwise judge, described column-wise: position k
# of each tuple below belongs to candidate k.
pairwise_call_ids = ("call_1", "call_2", "call_3")
pairwise_object_ids = (2, 3, 4)
pairwise_targets = ([0.1, 0.2, 0.3], [0.0, 0.0, 0.1], [-0.1, 0.1, 0.2])
pairwise_reasons = (
    "I will move object 2 to position (0.1, 0.2, 0.3) to start building the structure.",
    "I will move object 3 to the base position to create a stable foundation.",
    "I will place object 4 at this position to complete the first layer.",
)

# Every candidate is a `move_object` call; `arguments` repeats the object id
# and target position that are also passed as `parameters`.
candidates_pairwise = [
    ActionCandidate(
        action_type="move_object",
        parameters={"object_id": object_id, "target_position": list(target)},
        response=reasoning,
        tool_call={
            "id": call_id,
            "type": "function",
            "function": {
                "name": "move_object",
                "arguments": f'{{"object_id": {object_id}, "target_position": {target}}}',
            },
        },
    )
    for call_id, object_id, target, reasoning in zip(
        pairwise_call_ids, pairwise_object_ids, pairwise_targets, pairwise_reasons
    )
]

# Rank the candidates with the pairwise judge and report the ordering.
print("使用 Pairwise Judge (GPT-5) 方法评分...")
print("=" * 60)

# Reuses the image, task text, object mapping and history already defined in
# the notebook.
pairwise_inputs = dict(
    candidates=candidates_pairwise,
    current_image=current_image,
    task_description=task_description,
    object_mapping=object_mapping,
    interaction_history=interaction_history,
)
best_actions_pairwise = reward_agent_pairwise.select_best_actions(**pairwise_inputs)

print(f"\nPairwise Judge 选择了 top-{len(best_actions_pairwise)} 个最佳 actions:\n")

# One entry per returned action, numbered from 1; reasoning is cut to 100 characters.
for rank, ranked_action in enumerate(best_actions_pairwise, start=1):
    print(f"第 {rank} 名:")
    print(f"  Action: {ranked_action.action_type}")
    print(f"  Parameters: {ranked_action.parameters}")
    print(f"  Score: {ranked_action.score:.3f} (相对得分)")
    print(f"  Reasoning: {ranked_action.response[:100]}...")
    print()

## 总结

### 方法 1: Reward Model - Generative (Transformers 本地模型)
- **模型**: ob11/Qwen-VL-PRM-3B (本地)
- **模式**: Generative
- **优点**: 独立评分，速度快（O(n) 次调用）
- **评分范围**: 1-5 分（带详细推理）
- **适用场景**: 候选较多时更高效，需要解释性评分

### 方法 2: Reward Model - Discriminative (Transformers 本地模型)
- **模型**: ob11/Qwen-VL-PRM-3B (本地)
- **模式**: Discriminative
- **优点**: 基于 logits 的概率评分，更快速
- **评分方式**: 提取 +/- token 的 logits，softmax 后得到概率（转换为 1-5 分）
- **适用场景**: 需要快速二元判断的场景，适合 PRM 模型

### 方法 3: Pairwise Judge (OpenAI GPT-5)
- **模型**: GPT-5 (API 调用)
- **优点**: 相对比较，排序更准确
- **评分方式**: 基于胜负次数的相对得分 (0-1)
- **复杂度**: 
  - bubble: O(n²) 次比较
  - merge: O(n log n) 次比较
  - full: O(n²) 次比较（最准确）
- **适用场景**: 需要精确排序，候选较少时使用

### 推荐组合
- **Reward Model - Generative**: 适合需要详细评分解释的场景
- **Reward Model - Discriminative**: 适合快速评分，特别是使用 PRM (Process Reward Model) 时
- **Pairwise Judge**: OpenAI GPT-5 用于最准确的排序（成本较高）

In [None]:
# Closing banner followed by a one-line recap of each method.
banner = "=" * 60
print("\n" + banner)
print("三种方法对比完成！")
print(banner)
print("\n对比总结:")
for recap_line in (
    "1. Generative 模式: 提供详细的 1-5 分评分和推理",
    "2. Discriminative 模式: 基于 +/- token logits 的快速评分",
    "3. Pairwise Judge: 通过成对比较获得最准确的排序",
):
    print(recap_line)

True