Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
91 commits
Select commit Hold shift + click to select a range
d5d067e
core functions
recursix Sep 4, 2024
df2aaeb
switch to dask
recursix Sep 4, 2024
edb162c
removing joblib dependency and adding dask
recursix Sep 4, 2024
82ff348
fixing imports
TLSDC Sep 4, 2024
0dbdd98
handles multiple backends
recursix Sep 11, 2024
7da5cac
ensure asyncio loop creation
recursix Sep 11, 2024
25e241a
more tests
recursix Sep 11, 2024
01c8652
setting dashboard address to None
recursix Sep 11, 2024
c6370bd
minor
recursix Sep 11, 2024
775f135
Merge branch 'main' into switch-to-dask
recursix Sep 11, 2024
0198811
Merge branch 'main' into switch-to-dask
recursix Sep 12, 2024
7ad0e67
Finally found a way to make it work
recursix Sep 16, 2024
a396d9a
initial reproducibility files
recursix Sep 16, 2024
3db84f7
Seems to be superfluous
recursix Sep 19, 2024
ed9e568
adding a reproducibility journal
recursix Sep 19, 2024
85ac6fa
minor update
recursix Sep 19, 2024
ad5110e
more robust
recursix Sep 19, 2024
baf9afa
adding reproducibility tools
recursix Sep 19, 2024
b0268b6
fix white listing
recursix Sep 20, 2024
bb7ddb0
minor
recursix Sep 20, 2024
8b4884f
minor
recursix Sep 20, 2024
e685f10
minor
recursix Sep 20, 2024
c99bdf7
Merge branch 'main' into reproducibility
recursix Sep 20, 2024
ac8b7f8
minor
recursix Sep 20, 2024
295f010
minor fix
recursix Sep 20, 2024
5ac4a7c
more tests
recursix Sep 20, 2024
d4cf969
more results yay
recursix Sep 20, 2024
1dc720b
disabling this test
recursix Sep 20, 2024
82f6181
update
recursix Sep 20, 2024
eb871ac
update
recursix Sep 20, 2024
fa0c489
black
recursix Sep 20, 2024
abd3212
maybe fixing github workflow ?
TLSDC Sep 20, 2024
4ebee28
make get_git_username great again
recursix Sep 20, 2024
58f5ec7
trigger change
recursix Sep 20, 2024
37bbb5f
Merge branch 'reproducibility' of github.com:ServiceNow/AgentLab into…
recursix Sep 20, 2024
f621648
new browsergym
recursix Sep 20, 2024
60a1b22
GPT-4o result (and new comment column)
recursix Sep 21, 2024
dd9aa0d
Seems like there was a change to 4o flags, trying these
recursix Sep 21, 2024
54ea0af
minor comment
recursix Sep 21, 2024
24214e5
better xray
recursix Sep 21, 2024
b8da07b
minor fix
recursix Sep 21, 2024
1ecaf9b
adding a comment field
recursix Sep 21, 2024
5aba9bc
new agent
recursix Sep 21, 2024
fe561b9
Merge branch 'main' into reproducibility
recursix Sep 21, 2024
7bf424e
another test with GPT-4o
recursix Sep 21, 2024
7e0ab03
adding llama3 from openrouter
recursix Sep 21, 2024
03eae32
fix naming
recursix Sep 21, 2024
796c37e
unused import
recursix Sep 23, 2024
8fc49e9
new summary tools and remove "_args" from columns in results
recursix Sep 23, 2024
7e2afd3
add Llama
recursix Sep 23, 2024
f08e47b
initial code for reproducibility agent
recursix Sep 23, 2024
326710a
Merge branch 'main' into reproducibility
recursix Sep 23, 2024
f7494cb
adjust inspect results
recursix Sep 25, 2024
37d8961
Merge branch 'main' into reproducibility
recursix Sep 25, 2024
4066da3
infer from benchmark
recursix Sep 26, 2024
ef204d3
fix reproducibility agent
recursix Sep 26, 2024
5112abe
prevent the repro_dir to be an index variable
recursix Sep 26, 2024
5325c69
updating repro agent stats
recursix Sep 27, 2024
02e028f
Merge branch 'main' into reproducibility
recursix Sep 27, 2024
d8ad4bd
Reproducibility agent
recursix Oct 1, 2024
fe27819
instructions to setup workarena
recursix Oct 1, 2024
4a8f078
fixing tests
TLSDC Oct 1, 2024
6474558
handles better a few edge cases
recursix Oct 1, 2024
42fdcf1
Merge branch 'reproducibility' of github.com:ServiceNow/AgentLab into…
recursix Oct 1, 2024
628d1c8
default progress function to None
recursix Oct 2, 2024
69f147a
minor formatting
recursix Oct 2, 2024
146ad62
minor
recursix Oct 2, 2024
a7562c3
initial commit
recursix Oct 2, 2024
1b47afb
refactoring with Study class
recursix Oct 2, 2024
f58b5a0
refactor to adapt for study class
recursix Oct 3, 2024
95c787e
minor
recursix Oct 3, 2024
641320c
fix pricy test
recursix Oct 3, 2024
57b789f
Merge branch 'main' of github.com:ServiceNow/AgentLab into study-class
TLSDC Oct 3, 2024
3d14f5a
fixing tests
TLSDC Oct 3, 2024
431be1b
Merge branch 'study-class' of github.com:ServiceNow/AgentLab into stu…
TLSDC Oct 3, 2024
fd9d72a
tmp
recursix Oct 4, 2024
a452c88
print report
recursix Oct 4, 2024
de7eba1
minor fix
recursix Oct 4, 2024
e78e89c
refine little details about reproducibility
recursix Oct 4, 2024
a52d883
minor
recursix Oct 4, 2024
af712a0
Merge branch 'main' into reproducibility_again
recursix Oct 4, 2024
3804c89
no need for set_temp anymore
recursix Oct 4, 2024
aadf86b
sanity check before running main
recursix Oct 5, 2024
1620640
minor update
recursix Oct 5, 2024
ab447e9
minor
recursix Oct 5, 2024
43e2163
new results with 4o on workarena.l1
recursix Oct 5, 2024
3f6d179
sharing is caring
recursix Oct 5, 2024
a98fa24
add llama to main.py
recursix Oct 5, 2024
da30461
new journal entry
recursix Oct 7, 2024
c847dbd
Merge branch 'main' into reproducibility_again
recursix Oct 7, 2024
35f408e
format
TLSDC Oct 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 26 additions & 21 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,27 @@
Note: This script is a convenience script to launch experiments instead of using
the command line.

Don't push your changes to this file to git unless you are making structural changes.
Copy this script and modify at will, but don't push your changes to the
repository.
"""

import logging

from agentlab.agents.generic_agent import AGENT_CUSTOM, RANDOM_SEARCH_AGENT, AGENT_4o, AGENT_4o_MINI
from agentlab.agents.generic_agent import (
RANDOM_SEARCH_AGENT,
AGENT_4o,
AGENT_4o_MINI,
AGENT_LLAMA3_70B,
AGENT_LLAMA31_70B,
)
from agentlab.analyze.inspect_results import get_most_recent_folder
from agentlab.experiments import study_generators
from agentlab.experiments.exp_utils import RESULTS_DIR

logging.getLogger().setLevel(logging.INFO)

# choose your agent or provide a new agent
agent_args = [AGENT_4o_MINI]
# agent = AGENT_4o

# agent_args = [AGENT_4o]

## select the benchmark to run on
benchmark = "miniwob_tiny_test"
Expand All @@ -27,34 +32,34 @@
# benchmark = "workarena.l3"
# benchmark = "webarena"

# Set reproducibility_mode = True for reproducibility
# this will "ask" agents to be deterministic. Also, it will prevent you from launching if you have
# local changes. For your custom agents you need to implement set_reproducibility_mode
reproducibility_mode = False

## select the kind of experiment (study)
## Or define new studies, you only have to return a list of ExpArgs to run and a name for the study


## alternatively, relaunch an existing study
# study_dir = get_most_recent_folder(RESULTS_DIR, contains=None)
# exp_args_list, study_dir = relaunch_study(study_dir, relaunch_mode="incomplete_or_error")

# Set relaunch = True to relaunch an existing study, this will continue incomplete
# experiments and relaunch errored experiments
relaunch = False

## Number of parallel jobs
n_jobs = 1 # Make sure to use 1 job when debugging in VSCode
n_jobs = 4 # Make sure to use 1 job when debugging in VSCode
# n_jobs = -1 # to use all available cores

# run the experiments
if __name__ == "__main__":

if __name__ == "__main__": # necessary for dask backend

if reproducibility_mode:
[a.set_reproducibility_mode() for a in agent_args]

if relaunch:
# relaunch an existing study
study_dir = get_most_recent_folder(RESULTS_DIR, contains=None)
study_dir = get_most_recent_folder()
study = study_generators.make_relaunch_study(study_dir, relaunch_mode="incomplete_or_error")

else:
study = study_generators.run_agents_on_benchmark(agent_args, benchmark)

study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=False)
study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=reproducibility_mode)

# Uncomment the following line if you think your study represents a
# reproducible result. You can run in relaunch mode to avoid re-running the experiments.
# study.append_to_journal(strict_reproducibility=True)
if reproducibility_mode:
study.append_to_journal(strict_reproducibility=True)
3 changes: 3 additions & 0 deletions reproducibility_journal.csv
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43,
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,0.539,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe27819a99b163fd9240ba3e144e010413bff24d,,0.7.1,b0ad675572e01cac0d7255100112de0828877148,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140, M: main.py,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
17 changes: 17 additions & 0 deletions src/agentlab/agents/agent_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ class AgentArgs(AbstractAgentArgs):
def set_benchmark(self, benchmark: str, demo_mode: bool):
"""Optional method to set benchmark specific flags.

This allows the agent to have minor adjustments based on the benchmark.
E.g. using a benchmark specific action space. Or letting the agent see
HTML on MiniWoB since AXTree is not enough. Users should avoid making
extensive benchmark specific prompt engineering.

Args:
benchmark: str
Name of the benchmark.
Expand All @@ -14,3 +19,15 @@ def set_benchmark(self, benchmark: str, demo_mode: bool):
the demo_mode flag in the browsergym action space.
"""
pass

def set_reproducibility_mode(self):
"""Optional method to set the agent in a reproducibility mode.

This should adjust the agent configuration to make it as deterministic
as possible e.g. setting the temperature of the model to 0.

This is only called when reproducibility is requested.

Raises:
NotImplementedError: always, in this base implementation. The error
message includes the concrete class name so the agent args class
lacking an override is easy to identify.
"""
# No generic way to make an arbitrary agent deterministic is provided here,
# so the base implementation fails loudly instead of silently doing nothing.
raise NotImplementedError(
f"set_reproducibility_mode is not implemented for agent_args {self.__class__.__name__}"
)
2 changes: 2 additions & 0 deletions src/agentlab/agents/generic_agent/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
AGENT_3_5,
AGENT_8B,
AGENT_LLAMA3_70B,
AGENT_LLAMA31_70B,
AGENT_CUSTOM,
RANDOM_SEARCH_AGENT,
AGENT_4o,
Expand All @@ -15,6 +16,7 @@
"AGENT_4o_MINI",
"AGENT_4o_VISION",
"AGENT_LLAMA3_70B",
"AGENT_LLAMA31_70B",
"AGENT_8B",
"RANDOM_SEARCH_AGENT",
"AGENT_CUSTOM",
Expand Down
3 changes: 3 additions & 0 deletions src/agentlab/agents/generic_agent/generic_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ def set_benchmark(self, benchmark, demo_mode):
if demo_mode:
self.flags.action.demo_mode = "all_blue"

def set_reproducibility_mode(self):
self.chat_model_args.temperature = 0

def prepare(self):
return self.chat_model_args.prepare_server()

Expand Down
30 changes: 16 additions & 14 deletions src/agentlab/agents/most_basic_agent/most_basic_agent.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,44 @@
import logging
import os
import re
from dataclasses import asdict, dataclass
from typing import TYPE_CHECKING, Any

from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.experiments.agent import Agent, AgentInfo
from browsergym.experiments.loop import AbstractAgentArgs, EnvArgs, ExpArgs
import bgym

from agentlab.llm.chat_api import make_system_message, make_user_message
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
from agentlab.llm.llm_utils import ParseError, extract_code_blocks, retry
from agentlab.llm.tracking import cost_tracker_decorator
from agentlab.agents.agent_args import AgentArgs

if TYPE_CHECKING:
from agentlab.llm.chat_api import BaseModelArgs


@dataclass
class MostBasicAgentArgs(AbstractAgentArgs):
class MostBasicAgentArgs(AgentArgs):
agent_name: str = "BasicAgent"
temperature: float = 0.1
use_chain_of_thought: bool = False
chat_model_args: "BaseModelArgs" = None

def make_agent(self) -> Agent:
def make_agent(self) -> bgym.Agent:
return MostBasicAgent(
temperature=self.temperature,
use_chain_of_thought=self.use_chain_of_thought,
chat_model_args=self.chat_model_args,
)

def set_reproducibility_mode(self):
self.temperature = 0

def prepare(self):
return self.chat_model_args.prepare_server()

def close(self):
return self.chat_model_args.close_server()


class MostBasicAgent(Agent):
class MostBasicAgent(bgym.Agent):
def __init__(
self, temperature: float, use_chain_of_thought: bool, chat_model_args: "BaseModelArgs"
):
Expand All @@ -47,7 +47,7 @@ def __init__(
self.chat = chat_model_args.make_model()
self.chat_model_args = chat_model_args

self.action_set = HighLevelActionSet(["bid"], multiaction=False)
self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False)

@cost_tracker_decorator
def get_action(self, obs: Any) -> tuple[str, dict]:
Expand Down Expand Up @@ -104,7 +104,7 @@ def parser(response: str) -> tuple[dict, bool, str]:

return (
action,
AgentInfo(
bgym.AgentInfo(
think=thought,
chat_messages=messages,
# put any stats that you care about as long as it is a number or a dict of numbers
Expand All @@ -115,17 +115,19 @@ def parser(response: str) -> tuple[dict, bool, str]:
)


env_args = EnvArgs(
# example for a single task
env_args = bgym.EnvArgs(
task_name="miniwob.click-button",
task_seed=0,
max_steps=10,
headless=True,
)

chat_model_args = CHAT_MODEL_ARGS_DICT["azure/gpt-35-turbo/gpt-35-turbo"]
chat_model_args = CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"]

# example for 2 experiments testing chain of thoughts on a miniwob task
exp_args = [
ExpArgs(
bgym.ExpArgs(
agent_args=MostBasicAgentArgs(
temperature=0.1,
use_chain_of_thought=True,
Expand All @@ -134,7 +136,7 @@ def parser(response: str) -> tuple[dict, bool, str]:
env_args=env_args,
logging_level=logging.INFO,
),
ExpArgs(
bgym.ExpArgs(
agent_args=MostBasicAgentArgs(
temperature=0.1,
use_chain_of_thought=False,
Expand Down
2 changes: 1 addition & 1 deletion src/agentlab/analyze/agent_xray.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ def run_gradio(results_dir: Path):
tabs.select(tab_select)

demo.queue()
demo.launch(server_port=7899)
demo.launch(server_port=7899, share=True)


def tab_select(evt: gr.SelectData):
Expand Down
66 changes: 1 addition & 65 deletions src/agentlab/experiments/reproducibility_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def get_reproducibility_info(
changes_white_list=( # Files that are often modified during experiments but do not affect reproducibility
"*/reproducibility_script.py",
"*reproducibility_journal.csv",
"*/launch_command.py",
"*main.py",
),
ignore_changes=False,
):
Expand Down Expand Up @@ -347,63 +347,6 @@ def _verify_report(report_df: pd.DataFrame, agent_names=list[str], strict_reprod
)
return report_df

# def add_reward(info, study_dir, ignore_incomplete=False):
# """Add the average reward and standard error to the info dict.

# Verifies that all tasks are completed and that there are no errors.
# """
# result_df = inspect_results.load_result_df(study_dir)
# report = inspect_results.summarize_study(result_df)

# if len(report) > 1:
# raise ValueError("Multi agent not implemented yet")

# if isinstance(info["agent_names"], (list, tuple)):
# if len(info["agent_names"]) > 1:
# raise ValueError("Multi agent not implemented yet")

# idx = report.index[0]
# n_err = report.loc[idx, "n_err"].item()
# n_completed, n_total = report.loc[idx, "n_completed"].split("/")
# if n_err > 0 and not ignore_incomplete:
# raise ValueError(
# f"Experiment has {n_err} errors. Please rerun the study and make sure all tasks are completed."
# )
# if n_completed != n_total and not ignore_incomplete:
# raise ValueError(
# f"Experiment has {n_completed} completed tasks out of {n_total}. "
# f"Please rerun the study and make sure all tasks are completed."
# )

# for key in ("avg_reward", "std_err", "n_err", "n_completed"):
# value = report.loc[idx, key]
# if hasattr(value, "item"):
# value = value.item()
# info[key] = value

if isinstance(info["agent_name"], (list, tuple)):
if len(info["agent_name"]) > 1:
raise ValueError("Multi agent not implemented yet")

idx = report.index[0]
n_err = report.loc[idx, "n_err"].item()
n_completed, n_total = report.loc[idx, "n_completed"].split("/")
if n_err > 0 and not ignore_incomplete:
raise ValueError(
f"Experiment has {n_err} errors. Please rerun the study and make sure all tasks are completed."
)
if n_completed != n_total and not ignore_incomplete:
raise ValueError(
f"Experiment has {n_completed} completed tasks out of {n_total}. "
f"Please rerun the study and make sure all tasks are completed."
)

for key in ("avg_reward", "std_err", "n_err", "n_completed"):
value = report.loc[idx, key]
if hasattr(value, "item"):
value = value.item()
info[key] = value


def _get_csv_headers(file_path: str) -> list[str]:
with open(file_path, "r", newline="") as file:
Expand Down Expand Up @@ -464,10 +407,3 @@ def append_to_journal(
writer = csv.writer(file)
for row in rows:
writer.writerow(row)


def set_temp(agent_args: GenericAgentArgs, temperature=0):
"""Set temperature to 0. Assumes a GenericAgent structure."""
agent_args = deepcopy(agent_args)
agent_args.chat_model_args.temperature = temperature
return agent_args
3 changes: 3 additions & 0 deletions src/agentlab/experiments/study_generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False)
self.write_reproducibility_info(strict_reproducibility=strict_reproducibility)

run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend)
report_df = self.get_report(ignore_cache=True)
logging.info(f"Study {self.name} finished.")
logging.info("\n" + str(report_df))

def append_to_journal(self, strict_reproducibility=True):
"""Append the study to the journal.
Expand Down
2 changes: 0 additions & 2 deletions src/agentlab/llm/llm_configs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import os

from agentlab.llm.chat_api import (
AzureModelArgs,
OpenAIModelArgs,
Expand Down
5 changes: 0 additions & 5 deletions tests/experiments/test_reproducibility_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,6 @@
import json


def test_set_temp():
agent_args = reproducibility_util.set_temp(AGENT_4o_MINI)
assert agent_args.chat_model_args.temperature == 0


@pytest.mark.parametrize(
"benchmark_name",
["miniwob", "workarena.l1", "webarena", "visualwebarena"],
Expand Down
24 changes: 24 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from pathlib import Path
import subprocess
import pytest


@pytest.mark.pricy
def test_main_script_execution():
    """Smoke-test ``main.py``: verify its default configuration, then run it.

    This should trigger agent_4o_mini on miniwob_tiny_test unless the script
    was reconfigured differently. The configuration assertions run first so
    that a modified (and potentially expensive) launch script fails fast
    instead of being executed.

    Raises:
        AssertionError: if ``main.py`` is not in its expected default state or
            if running it returns a non-zero exit code.
    """
    # Function-scope imports keep this block independent of the file header.
    import importlib.util
    import sys

    script_path = Path(__file__).parent.parent / "main.py"

    # Load the script from its explicit path: a plain ``__import__("main")``
    # only works when the repository root happens to be on ``sys.path``.
    # The module is loaded under a name other than "__main__".
    spec = importlib.util.spec_from_file_location("agentlab_main_script", script_path)
    main = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(main)

    # just make sure it's in the right state
    assert main.benchmark == "miniwob_tiny_test"
    assert main.reproducibility_mode is False
    assert main.relaunch is False
    assert main.n_jobs <= 10

    # Use the interpreter running the tests; a bare "python" may resolve to a
    # different environment (or not exist at all) on the PATH.
    result = subprocess.run(
        [sys.executable, str(script_path)],
        capture_output=True,
        text=True,
        timeout=5 * 60,
    )
    # Surface the script's stderr in the failure message for easier debugging.
    assert result.returncode == 0, result.stderr


if __name__ == "__main__":
test_main_script_execution()