22 changes: 18 additions & 4 deletions docs/simulation_and_benchmarking/rai_bench.md
@@ -6,6 +6,7 @@ RAI Bench is a comprehensive package that both provides benchmarks with ready-to

- [Manipulation O3DE Benchmark](#manipulation-o3de-benchmark)
- [Tool Calling Agent Benchmark](#tool-calling-agent-benchmark)
- [VLM Benchmark](#vlm-benchmark)

## Manipulation O3DE Benchmark

@@ -94,9 +95,9 @@ Evaluates agent performance independently from any simulation, based only on too
The `SubTask` class is used to validate a single tool call. The following classes are available (a short usage sketch follows the list):

- `CheckArgsToolCallSubTask` - verify if a certain tool was called with expected arguments
- `CheckTopicFieldsToolCallSubTask` - verify if a message published to ROS 2topic was of proper type and included expected fields
- `CheckServiceFieldsToolCallSubTask` - verify if a message published to ROS 2service was of proper type and included expected fields
- `CheckActionFieldsToolCallSubTask` - verify if a message published to ROS 2action was of proper type and included expected fields
- `CheckTopicFieldsToolCallSubTask` - verify if a message published to ROS2 topic was of proper type and included expected fields
- `CheckServiceFieldsToolCallSubTask` - verify if a message published to ROS2 service was of proper type and included expected fields
- `CheckActionFieldsToolCallSubTask` - verify if a message published to ROS2 action was of proper type and included expected fields
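
A minimal sketch of defining one such subtask is shown below. The import path, constructor argument names, and the tool name are assumptions made for illustration; check `rai_bench/tool_calling_agent/tasks/subtasks.py` for the real signatures.

```python
# Hypothetical sketch: the import path, argument names, and tool name are
# assumptions for illustration, not the confirmed rai_bench API.
from rai_bench.tool_calling_agent.tasks.subtasks import CheckArgsToolCallSubTask

# Passes only if the agent called the image tool with the expected topic argument.
camera_image_subtask = CheckArgsToolCallSubTask(
    expected_tool_name="get_ros2_image",
    expected_args={"topic": "/camera/color/image_raw"},
)
```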

### Validator

@@ -129,7 +130,6 @@ The ToolCallingAgentBenchmark class manages the execution of tasks and collects
There are predefined Tasks available which are grouped by categories:

- Basic - requires retrieving info from certain topics
- Spatial reasoning - questions about surroundings with images attached
- Manipulation
- Custom Interfaces - requires using messages with custom interfaces

@@ -164,3 +164,17 @@ class TaskArgs(BaseModel):
- `GetROS2RGBCameraTask` has 1 required tool call and 1 optional one. With `extra_tool_calls` set to 5, the agent can correct itself a couple of times and still pass, even with 7 tool calls. There are 2 types of invalid tool calls: first, when a tool is used incorrectly and the agent receives an error, which makes it easier for the agent to correct itself; second, when a tool is called properly but it is not the tool that should be called, or it is called with the wrong params. In that case the agent gets no error, so it is harder to correct, but BOTH of these cases are counted as `extra tool calls` (see the configuration sketch below).
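
For context, a hedged configuration sketch using this option follows. It reuses only the field names visible in the tutorial snippet (`extra_tool_calls`, `task_types`, `N_shots`); the values are illustrative.

```python
# Sketch based on fields shown in the tutorial example; values are illustrative.
from rai_bench import ToolCallingAgentBenchmarkConfig

tool_conf = ToolCallingAgentBenchmarkConfig(
    extra_tool_calls=[5],   # each task may use up to 5 extra tool calls and still pass
    task_types=["basic"],   # run only the basic tasks
    N_shots=[0],            # no examples in the system prompt
)
```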

If you want to know the details of every task, visit `rai_bench/tool_calling_agent/tasks`.

## VLM Benchmark

The VLM Benchmark evaluates vision language models (VLMs). It includes a set of tasks with questions about images and measures how well the agent returns answers in a structured format.

### Running

To run the benchmark:

```bash
cd rai
source setup_shell.sh
python src/rai_bench/rai_bench/examples/vlm_benchmark.py --model-name gemma3:4b --vendor ollama
```
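
For programmatic use, the following is a condensed sketch based on the `examples/vlm_benchmark.py` script added in this PR; the output directory and model name are placeholders.

```python
from pathlib import Path

from rai_bench import define_benchmark_logger
from rai_bench.utils import get_llm_for_benchmark
from rai_bench.vlm_benchmark import get_spatial_tasks, run_benchmark

# Placeholder output directory; the example script takes it from CLI args.
out_dir = Path("out/vlm_benchmark")
out_dir.mkdir(parents=True, exist_ok=True)
bench_logger = define_benchmark_logger(out_dir=out_dir)

tasks = get_spatial_tasks()
for task in tasks:
    task.set_logger(bench_logger)

llm = get_llm_for_benchmark(model_name="gemma3:4b", vendor="ollama")
run_benchmark(llm=llm, out_dir=out_dir, tasks=tasks, bench_logger=bench_logger)
```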
3 changes: 1 addition & 2 deletions docs/tutorials/benchmarking.md
@@ -73,7 +73,6 @@ if __name__ == "__main__":
extra_tool_calls=[0, 5], # how many extra tool calls allowed to still pass
task_types=[ # what types of tasks to include
"basic",
"spatial_reasoning",
"custom_interfaces",
],
N_shots=[0, 2], # examples in system prompt
@@ -95,7 +94,7 @@
)
```

Based on the example above the `Tool Calling` benchmark will run basic, spatial_reasoning and custom_interfaces tasks with every configuration of [extra_tool_calls x N_shots x prompt_detail] provided which will result in almost 500 tasks. Manipulation benchmark will run all specified task level once as there is no additional params. Reapeat is set to 1 in both configs so there will be no additional runs.
Based on the example above, the `Tool Calling` benchmark will run basic and custom_interfaces tasks with every provided combination of [extra_tool_calls x N_shots x prompt_detail], which will result in almost 500 tasks. The Manipulation benchmark will run every specified task level once, as it has no additional params. Repeat is set to 1 in both configs, so there will be no additional runs.

!!! note

4 changes: 2 additions & 2 deletions src/rai_bench/rai_bench/__init__.py
@@ -14,14 +14,14 @@
from .test_models import (
ManipulationO3DEBenchmarkConfig,
ToolCallingAgentBenchmarkConfig,
test_dual_agents,
test_models,
)
from .utils import (
define_benchmark_logger,
get_llm_for_benchmark,
parse_manipulation_o3de_benchmark_args,
parse_tool_calling_benchmark_args,
parse_vlm_benchmark_args,
)

__all__ = [
@@ -31,6 +31,6 @@
"get_llm_for_benchmark",
"parse_manipulation_o3de_benchmark_args",
"parse_tool_calling_benchmark_args",
"test_dual_agents",
"parse_vlm_benchmark_args",
"test_models",
]
123 changes: 0 additions & 123 deletions src/rai_bench/rai_bench/agents.py

This file was deleted.

@@ -14,4 +14,4 @@ Implementations can be found:

- Validators [Validators](../tool_calling_agent/validators.py)
- Subtasks [Subtasks](../tool_calling_agent/tasks/subtasks.py)
- Tasks, including basic, spatial, custom interfaces and manipulation [Tasks](../tool_calling_agent/tasks/)
- Tasks, including basic, custom interfaces and manipulation [Tasks](../tool_calling_agent/tasks/)
11 changes: 3 additions & 8 deletions src/rai_bench/rai_bench/examples/benchmarking_models.py
@@ -20,7 +20,7 @@

if __name__ == "__main__":
# Define models you want to benchmark
model_names = ["qwen3:4b", "llama3.2:3b"]
model_names = ["qwen2.5:3b", "llama3.2:3b"]
vendors = ["ollama", "ollama"]

# Define benchmarks that will be used
Expand All @@ -36,7 +36,7 @@
extra_tool_calls=[0, 5], # how many extra tool calls allowed to still pass
task_types=[ # what types of tasks to include
"basic",
"spatial_reasoning",
"manipulation",
"custom_interfaces",
],
N_shots=[0, 2], # examples in system prompt
@@ -48,11 +48,6 @@
test_models(
model_names=model_names,
vendors=vendors,
benchmark_configs=[mani_conf, tool_conf],
benchmark_configs=[tool_conf, mani_conf],
out_dir=out_dir,
# if you want to pass any additinal args to model
additional_model_args=[
{"reasoning": False},
{},
],
)
53 changes: 0 additions & 53 deletions src/rai_bench/rai_bench/examples/dual_agent.py

This file was deleted.

48 changes: 48 additions & 0 deletions src/rai_bench/rai_bench/examples/vlm_benchmark.py
@@ -0,0 +1,48 @@
# Copyright (C) 2025 Robotec.AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

from rai_bench import (
define_benchmark_logger,
parse_vlm_benchmark_args,
)
from rai_bench.utils import get_llm_for_benchmark
from rai_bench.vlm_benchmark import get_spatial_tasks, run_benchmark

if __name__ == "__main__":
args = parse_vlm_benchmark_args()
experiment_dir = Path(args.out_dir)
experiment_dir.mkdir(parents=True, exist_ok=True)
bench_logger = define_benchmark_logger(out_dir=experiment_dir)
try:
tasks = get_spatial_tasks()
for task in tasks:
task.set_logger(bench_logger)

llm = get_llm_for_benchmark(
model_name=args.model_name,
vendor=args.vendor,
)
run_benchmark(
llm=llm,
out_dir=experiment_dir,
tasks=tasks,
bench_logger=bench_logger,
)
except Exception as e:
bench_logger.critical(
msg=f"Benchmark failed with error: {e}",
exc_info=True,
)
4 changes: 2 additions & 2 deletions src/rai_bench/rai_bench/manipulation_o3de/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .benchmark import run_benchmark, run_benchmark_dual_agent
from .benchmark import run_benchmark
from .predefined.scenarios import get_scenarios

__all__ = ["get_scenarios", "run_benchmark", "run_benchmark_dual_agent"]
__all__ = ["get_scenarios", "run_benchmark"]