In [1]:
import json

from examples.gaia_agent.eval import tape_correct
from examples.gaia_agent.tape import GaiaTape
from tapeagents.io import load_tapes

# Load the stored tapes of the two runs that are being compared.
tapes_a = load_tapes(
    GaiaTape,
    "../../../outputs/gaia/runs/gpt4o_mini_val_simplebrowser_fromconf1/tapes",
    file_extension=".json",
)
tapes_b = load_tapes(
    GaiaTape,
    "../../../outputs/gaia/runs/gpt4o_mini_val_browsergym_fromconf2/tapes",
    file_extension=".json",
)
print(f"A: {len(tapes_a)}, B: {len(tapes_b)}")

# Tally, per run, the tapes that tape_correct accepts.
correct_a = len([tape for tape in tapes_a if tape_correct(tape)])
correct_b = len([tape for tape in tapes_b if tape_correct(tape)])

print(f"Correct tapes in A: {correct_a}")
print(f"Correct tapes in B: {correct_b}")

  from .autonotebook import tqdm as notebook_tqdm


A: 165, B: 164
Correct tapes in A: 44
Correct tapes in B: 47


In [2]:
# Pair up the tapes of both runs by task id and sort the pairs into three groups:
#   tape_pairs      - exactly one of the two tapes is correct (keyed by task id)
#   correct_pairs   - both tapes are correct
#   incorrect_pairs - both tapes are incorrect
tape_pairs = {}
incorrect_pairs = []
correct_pairs = []

# Index run B by task id so each tape of run A is matched with one dictionary
# lookup instead of a scan over all of tapes_b. setdefault keeps the first tape
# seen for a task id, which is the same tape a front-to-back scan would stop at.
tapes_b_by_task_id = {}
for tape_b in tapes_b:
    tapes_b_by_task_id.setdefault(tape_b.metadata.task["task_id"], tape_b)

for tape_a in tapes_a:
    task_id_a = tape_a.metadata.task["task_id"]
    tape_b = tapes_b_by_task_id.get(task_id_a)
    if tape_b is None:
        # No tape for this task in run B: the task cannot be compared.
        continue

    # Evaluate correctness once per tape and reuse the result below.
    a_correct = tape_correct(tape_a)
    b_correct = tape_correct(tape_b)
    if a_correct != b_correct:
        tape_pairs[task_id_a] = {"tape_a": tape_a, "tape_b": tape_b}
    elif a_correct and b_correct:
        correct_pairs.append((tape_a, tape_b))
    else:
        incorrect_pairs.append((tape_a, tape_b))

print(f"Found {len(tape_pairs)} matching tape pairs with different result.")
print(f"Found {len(correct_pairs)} both correct tape pairs.")
print(f"Found {len(incorrect_pairs)} both incorrect tape pairs.")

Found 19 matching tape pairs with different result.
Found 36 both correct tape pairs.
Found 109 both incorrect tape pairs.


In [3]:
# Among the pairs whose results differ, tally how often each side is the correct one.
correct_a = len([pair for pair in tape_pairs.values() if tape_correct(pair["tape_a"])])
correct_b = len([pair for pair in tape_pairs.values() if tape_correct(pair["tape_b"])])

print(f"Different pairs, correct tapes in A: {correct_a}")
print(f"Different pairs, correct tapes in B: {correct_b}")

Different pairs, correct tapes in A: 8
Different pairs, correct tapes in B: 11


In [4]:
from tapeagents.core import Observation


def render_tape(tape, name):
    """Render a tape as a plain-text trace.

    The text consists of a ``Trace <name>`` header, one ``Step <i>. <view>``
    line per step, a separator of 20 dashes and a final line stating whether
    ``tape_correct`` accepts the tape.

    Args:
        tape: Tape whose ``steps`` are rendered.
        name: Label put into the trace header.

    Returns:
        The rendered trace as a single newline-joined string.
    """

    def step_view(step):
        # Observations are rendered with their short view, every other step with its LLM view.
        return step.short_view() if isinstance(step, Observation) else step.llm_view()

    numbered_steps = "\n".join(f"Step {idx}. {step_view(step)}" for idx, step in enumerate(tape.steps))
    verdict = "This answer is correct" if tape_correct(tape) else "This answer is incorrect!"
    return "\n".join([f"Trace {name}", numbered_steps, "-" * 20, verdict])


def render_task(task_dict):
    """Render a task dictionary as plain text.

    Args:
        task_dict: Mapping with the keys ``"Question"``, ``"Final answer"`` and
            ``"Annotator Metadata"`` (itself a mapping with a ``"Steps"`` entry).

    Returns:
        The question, the human solution steps and the correct answer,
        joined by newlines.
    """
    question = task_dict["Question"]
    human_steps = task_dict["Annotator Metadata"]["Steps"]
    answer = task_dict["Final answer"]
    return f"Task: {question}\nHuman solution steps:\n{human_steps}\nCorrect Answer: {answer}"


# Render every pair with differing results to plain text, keyed by task id.
rendered_pairs = {}
for task_id, pair in tape_pairs.items():
    tape_a, tape_b = pair["tape_a"], pair["tape_b"]
    rendered_pairs[task_id] = {
        # Both tapes of a pair belong to the same task id, so the task is read from tape A.
        "task": render_task(tape_a.metadata.task),
        "tape_a": render_tape(tape_a, "SimpleB"),
        # Spelled "BrowserGym" so the trace header matches the agent name
        # that the analysis prompts use when referring to this agent.
        "tape_b": render_tape(tape_b, "BrowserGym"),
        # tape_pairs only holds pairs where exactly one tape is correct,
        # so "A is not correct" implies that B is.
        "correct": "A" if tape_correct(tape_a) else "B",
    }

In [None]:
import os

from tapeagents.llms import LiteLLM

# Do not store the API key in the notebook: it would be saved and shared with the file.
# Use OPENAI_API_KEY from the environment when it is already set, otherwise ask
# for it interactively (getpass does not echo the input into the cell output).
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
llm = LiteLLM(model_name="o3-mini-2025-01-31")

## LLM Analysis

In [6]:
# parameters

# Display names of the two agents, as used in all prompts and output file names.
name_a, name_b = "SimpleB", "BrowserGym"

# One-sentence description of how the two agents differ; inserted into the comparison prompt.
difference = (
    f"The agent {name_a} uses simple text browser to access the web, "
    f"while agent {name_b} uses a more advanced browser with additional capabilities."
)

# Instruction that tells the report prompts which aspect to concentrate on.
focus = (
    "Pay attention to the differences in the web browsing capabilities of the agents "
    "as it is the main focus of the analysis."
)

In [7]:
from tqdm import tqdm

# Instructions placed before the task and the two rendered traces.
prompt_prefix = f"""You are the expert in analysing solutions of the complex information processing tasks. 
You are given the task description, its solution and the steps used by human to arrive at the solution.
You are also given traces of two solutions generated by two different AI agents {name_a} and {name_b}.
{difference}
Each trace annotated with the correctness of the solution in the end.
Your task is to analyse the two traces and the task description and answer the following questions:
- Which agent is correct?
- Why is the other agent incorrect?
- What are the main differences between the correct and incorrect traces?
- Which differences can be attributed to the web browsing capabilities of the agents?
- Which steps in the incorrect trace marks the beginning of the incorrect reasoning?
Base your answers on the traces and the task description only, do not guess or hypothesize anything about the agents thoughts which are not visible in the traces.
"""
# Instructions placed after the traces. The last line used to read "Do not not use
# markdown formatting": the double negation inverted the request for plain text.
prompt_postfix = """Thoroughly analyze the traces and answer the questions in detail. 
Do not assume or guess anything about agent decisions, only use the information provided in the traces.
Do not use markdown formatting, answer in plain text."""

# One LLM comparison per pair with differing results, collected separately
# depending on which agent produced the correct answer.
b_better = []  # agent B correct, agent A incorrect
b_worse = []  # agent A correct, agent B incorrect
for task_id, pair in tqdm(rendered_pairs.items()):
    prompt = f"{prompt_prefix}\n\n{pair['task']}\n\n{pair['tape_a']}\n\n{pair['tape_b']}\n\n{prompt_postfix}"
    result = llm.quick_response(prompt)
    if pair["correct"] == "A":
        b_worse.append({"task_id": task_id, "result": result})
    else:
        b_better.append({"task_id": task_id, "result": result})
print(f"Comparisons collected. Agent B is better in {len(b_better)} tasks and worse in {len(b_worse)} tasks.")

100%|██████████| 19/19 [04:25<00:00, 13.98s/it]

Comparisons collected. Agent B is better in 11 tasks and worse in 8 tasks.





In [8]:
# Persist the per-task comparisons so the LLM calls do not have to be repeated.
comparison_results = {
    "b_better": b_better,
    "b_worse": b_worse,
}
# ensure_ascii=False writes non-ASCII characters of the LLM answers verbatim, so the
# file encoding is fixed to UTF-8 instead of relying on the platform default, which
# may be unable to encode them.
with open(f"{name_a}_{name_b}_comparison_results.json", "w", encoding="utf-8") as f:
    json.dump(comparison_results, f, indent=2, ensure_ascii=False)

In [9]:
# with open(f"{name_a}_{name_b}_comparison_results.json") as f:
#     comparison_results = json.load(f)
# b_better = comparison_results["b_better"]
# b_worse = comparison_results["b_worse"]
# print(f"Agent B is better in {len(b_better)} tasks and worse in {len(b_worse)} tasks.")

In [17]:
# Report on agent B's weaknesses, built from the cases where agent A was the correct one.
# The prompt text is assembled line by line so trailing whitespace stays visible.
prompt_prefix = (
    "You are the expert in indcuctive reasoning and analysis of complex information processing tasks.\n"
    f"You are given the cases where two different AI agents {name_a} and {name_b} solved the same task and their solutions were compared by an expert.\n"
    f"For all given cases, the agent {name_a} was better than {name_b}.\n"
    f"You task is to analyze all provided cases, find the common patterns of {name_b} errors and capabilities.\n"
    f"Based on your analysis, write a detailed report about the drawbacks and limitations of the agent {name_b}.\n"
    f"{focus}\n"
)
prompt_postfix = (
    "Thoroughly analyze the cases and write a detailed report. \n"
    "It should contain practical generalizations and conclusions about the differences between the two agents.\n"
    "Every statement in the report should be supported by the evidence from more than one case.\n"
    "Do not assume or guess anything about agent decisions, only use the information provided in the comparisons.\n"
    "Be consice and do not repeat yourself, limit the report to 600 words, make report structured and easy to read.\n"
)
# Each case is labelled with the part of its task id that precedes the first dash.
cases = [f"Case {entry['task_id'].partition('-')[0]}:\n{entry['result']}" for entry in b_worse]
cases_str = "\n\n".join(cases)
prompt = "".join([prompt_prefix, "\n<CASES START>\n\n", cases_str, "\n\n<CASES END>\n\n", prompt_postfix])
b_worse_conclusions = llm.quick_response(prompt)
print(b_worse_conclusions)

Report on BrowserGym Drawbacks and Limitations

1. Overview  
An analysis of the provided cases reveals that, despite BrowserGym’s advanced web browsing features, its performance is consistently inferior to SimpleB. The evidence across all cases highlights that BrowserGym’s errors stem from misinterpretation of extracted data, deviation from prescribed methods, and unnecessary extra steps. These issues consistently compromise its final answer accuracy.

2. Misinterpretation and Incomplete Data Extraction  
Multiple cases (a1e91b78, 0ff53813, 544b7f0c) show that BrowserGym does not reliably extract all necessary details. In case a1e91b78, BrowserGym miscounted visible bird species by omitting one detail that SimpleB correctly captured. Similarly, in case 0ff53813, BrowserGym overlooked a critical textual reference in a document and incorrectly generalized “beta geometric” to “predictive model.” In the horror film task (544b7f0c), BrowserGym’s failure to locate the specific reference led

In [18]:
# Report on agent A's weaknesses, built from the cases where agent B was the correct one.
# The prompt text is assembled line by line so trailing whitespace stays visible.
prompt_prefix = (
    "You are the expert in indcuctive reasoning and analysis of complex information processing tasks.\n"
    f"You are given the cases where two different AI agents {name_a} and {name_b} solved the same task and their solutions were compared by an expert.\n"
    f"For all given cases, the agent {name_a} was worse than {name_b}.\n"
    f"You task is to analyze all provided cases, find the common patterns of {name_a} errors and capabilities.\n"
    f"Based on your analysis, write a detailed report about the drawbacks and limitations of the agent {name_a}.\n"
    f"{focus}\n"
)
prompt_postfix = (
    "Thoroughly analyze the cases and write a detailed report. \n"
    "It should contain practical generalizations and conclusions about the differences between the two agents.\n"
    "Every statement in the report should be supported by the evidence from more than one case.\n"
    "Do not assume or guess anything about agent decisions, only use the information provided in the comparisons.\n"
    "Be consice and do not repeat yourself, limit the report to 600 words, make report structured and easy to read.\n"
)
# Each case is labelled with the part of its task id that precedes the first dash.
cases = [f"Case {entry['task_id'].partition('-')[0]}:\n{entry['result']}" for entry in b_better]
cases_str = "\n\n".join(cases)
prompt = "".join([prompt_prefix, "\n<CASES START>\n\n", cases_str, "\n\n<CASES END>\n\n", prompt_postfix])
b_better_conclusions = llm.quick_response(prompt)
print(b_better_conclusions)

Report: Comparative Analysis of BrowserGym and SimpleB

Overview
This report reviews multiple cases where BrowserGym and SimpleB attempted the same tasks, consistently showing BrowserGym’s superiority. The analysis highlights common errors and limitations in SimpleB’s approach, particularly regarding its web browsing and data extraction capabilities.

Key Issues with SimpleB

1. Data Extraction and Conversion Errors  
Across several cases (e1fc63a2, e0c10771, 99c9cc74), SimpleB repeatedly made mistakes while processing data. For example, it incorrectly converted a pace value—treating seconds as minutes—leading to catastrophic numeric errors. Similarly, SimpleB failed to capture full ingredient details by dropping qualifiers (e.g., “freshly squeezed” in a recipe ingredient). These errors are not isolated but reflect a broader limitation in handling nuanced data extraction and transformation correctly.

2. Inadequate File Processing and Limited Parsing  
In cases like 5cfb274c and de9887

In [12]:
import random

# Create a shuffled list with both b_better and b_worse results.
# A dedicated generator with a fixed seed makes the order reproducible after a kernel
# restart (given the same inputs) and leaves the global random state untouched.
b_all_shuffled = b_worse + b_better
random.Random(0).shuffle(b_all_shuffled)
print(f"Shuffled mix contains {len(b_all_shuffled)} comparisons")

Shuffled mix contains 19 comparisons


In [19]:
# Overall report on the differences between the agents, built from the shuffled mix of all cases.
# The prompt text is assembled line by line so trailing whitespace stays visible.
prompt_prefix = (
    "You are the expert in indcuctive reasoning and analysis of complex information processing tasks.\n"
    f"You are given the cases where two different AI agents {name_a} and {name_b} solved the same task and their solutions were compared by an expert.\n"
    f"In some cases, the agent {name_a} was worse than {name_b}, in some cases it was better.\n"
    "You task is to analyze all provided cases, find the common patterns of agent errors and capabilities.\n"
    "Based on your analysis, write a detailed report about the differences between the two agents.\n"
    f"{focus}\n"
)
prompt_postfix = (
    "Thoroughly analyze the cases and write a detailed report. \n"
    "It should contain practical generalizations and conclusions about the differences between the two agents.\n"
    "Every statement in the report should be supported by the evidence from more than one case.\n"
    "Do not assume or guess anything about agent decisions, only use the information provided in the comparisons.\n"
    "Be consice and do not repeat yourself, limit the report to 600 words, make report structured and easy to read.\n"
)
# Each case is labelled with the part of its task id that precedes the first dash.
cases = [f"Case {entry['task_id'].partition('-')[0]}:\n{entry['result']}" for entry in b_all_shuffled]
cases_str = "\n\n".join(cases)
prompt = "".join([prompt_prefix, "\n<CASES START>\n\n", cases_str, "\n\n<CASES END>\n\n", prompt_postfix])
b_all_conclusions = llm.quick_response(prompt)
print(b_all_conclusions)

Report: Analysis of Differences in Web Browsing and Information Extraction Between SimpleB and BrowserGym

1. Overview  
The comparison of multiple cases reveals systematic differences in how the two agents navigate documents, execute code, and extract details from web sources. Both agents start with similar search steps; however, the manner in which they process content and follow prescribed procedures leads to clear differences in final outcomes.

2. Navigation and Data Extraction  
• SimpleB typically follows a straightforward, step‐by‐step navigation strategy. In cases such as 544b7f0c and 0ff53813, its repeated page_down and page_observation actions allowed it to extract the precise textual details (e.g., “A Nightmare on Elm Street” or “beta geometric”) required for the task. This method consistently led to correct answers when the task depended on careful scrolling and repeated reading of document pages.  
• In contrast, BrowserGym often leverages an advanced browser to perform m

In [14]:
# Persist the three generated reports next to the notebook.
conclusions = {
    "b_better": b_better_conclusions,
    "b_worse": b_worse_conclusions,
    "all": b_all_conclusions,
}
# ensure_ascii=False writes non-ASCII characters of the reports verbatim, so the file
# encoding is fixed to UTF-8 instead of relying on the platform default, which may be
# unable to encode them.
with open(f"{name_a}_{name_b}_conclusions.json", "w", encoding="utf-8") as f:
    json.dump(conclusions, f, indent=2, ensure_ascii=False)