Skip to content

Commit

Permalink
Fix benchmark being stateful
Browse files Browse the repository at this point in the history
  • Loading branch information
waynehamadi committed Sep 18, 2023
1 parent 2cf350b commit 0afcde3
Show file tree
Hide file tree
Showing 16 changed files with 340 additions and 176 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/benchmark-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,5 +127,8 @@ jobs:
echo "Running the following command: ${prefix}agbenchmark --test=WriteFile"
${prefix}agbenchmark --test=WriteFile
sh run_benchmark &
sleep 5
python ../../benchmark/tests/test_web_server.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
2 changes: 2 additions & 0 deletions autogpts/forge/advanced_commands/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Advanced commands for developing on the forge and the benchmark.
Stability not guaranteed.
9 changes: 9 additions & 0 deletions autogpts/forge/advanced_commands/run_benchmark_dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

# Free port 8080 before starting: kill any process still listening on it.
# Redirect lsof's output so the PID list is not printed as noise; the exit
# status alone tells us whether anything holds the port.
if lsof -t -i :8080 > /dev/null 2>&1; then
    kill $(lsof -t -i :8080)
fi
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Serve the benchmark app with auto-reload, watching the benchmark sources.
uvicorn agbenchmark.app:app --host localhost --port 8080 --reload --log-level info --reload-dir "$SCRIPT_DIR/../../../benchmark/agbenchmark"
1 change: 0 additions & 1 deletion autogpts/forge/forge/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

load_dotenv()
import forge.sdk.forge_log

forge.sdk.forge_log.setup_logger()


Expand Down
6 changes: 4 additions & 2 deletions autogpts/forge/run_benchmark
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

kill $(lsof -t -i :8080)

# Kill processes using port 8080 if any.
if lsof -t -i :8080; then
kill $(lsof -t -i :8080)
fi
poetry run agbenchmark serve
91 changes: 42 additions & 49 deletions benchmark/agbenchmark/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@
from helicone.lock import HeliconeLockManager

from agbenchmark.app import app
from agbenchmark.reports.ReportManager import SingletonReportManager
from agbenchmark.utils.data_types import AgentBenchmarkConfig

from .reports.ReportManager import ReportManager
from .utils.data_types import AgentBenchmarkConfig

BENCHMARK_START_TIME_DT = datetime.now(timezone.utc)
BENCHMARK_START_TIME = BENCHMARK_START_TIME_DT.strftime("%Y-%m-%dT%H:%M:%S+00:00")
TEMP_FOLDER_ABS_PATH = Path.cwd() / "agbenchmark_config" / "temp_folder"
Expand All @@ -26,50 +24,6 @@
UPDATES_JSON_PATH = Path.cwd() / "agbenchmark_config" / "updates.json"


def get_agent_benchmark_config() -> AgentBenchmarkConfig:
    """Load the benchmark configuration from ``agbenchmark_config/config.json``.

    The path the config was loaded from is recorded back onto the object's
    ``agent_benchmark_config_path`` attribute so downstream code can locate
    sibling files relative to it.

    Returns:
        AgentBenchmarkConfig: the parsed configuration.

    Raises:
        json.JSONDecodeError: if the config file is not valid JSON
            (re-raised after printing a diagnostic).
        FileNotFoundError: if the config file does not exist.
    """
    agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
    try:
        with open(agent_benchmark_config_path, "r") as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
            agent_benchmark_config.agent_benchmark_config_path = (
                agent_benchmark_config_path
            )
            return agent_benchmark_config
    except json.JSONDecodeError:
        # Fix: the file opened above is config.json; the old message
        # misleadingly referred to "benchmark_config.json".
        print("Error: config.json is not a valid JSON file.")
        raise


def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
    """Build the three report managers used during a benchmark run.

    Returns:
        A ``(regression, info, internal_info)`` tuple of ReportManager
        instances, all anchored to the same benchmark start time.
    """
    config = get_agent_benchmark_config()

    # Tests that consistently pass are considered regression tests.
    regression = ReportManager(
        config.get_regression_reports_path(), BENCHMARK_START_TIME_DT
    )

    # User-facing reporting information, written as report.json under the
    # per-run reports directory.
    report_file = (
        config.get_reports_path(benchmark_start_time=BENCHMARK_START_TIME_DT)
        / "report.json"
    )
    info = ReportManager(str(report_file), BENCHMARK_START_TIME_DT)

    # Internal db stand-in used to track the pass/fail rate.
    internal_info = ReportManager(
        config.get_success_rate_path(), BENCHMARK_START_TIME_DT
    )

    return regression, info, internal_info


(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()


if os.environ.get("HELICONE_API_KEY"):
HeliconeLockManager.write_custom_property(
"benchmark_start_time", BENCHMARK_START_TIME
Expand Down Expand Up @@ -122,6 +76,9 @@ def run_benchmark(
) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty

initialize_updates_file()
SingletonReportManager()
agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
try:
with open(agent_benchmark_config_path, "r") as f:
Expand Down Expand Up @@ -214,7 +171,8 @@ def run_benchmark(
current_dir = Path(__file__).resolve().parent
print(f"Current directory: {current_dir}")
pytest_args.extend((str(current_dir), "--cache-clear"))
return pytest.main(pytest_args)
exit_code = pytest.main(pytest_args)
SingletonReportManager().clear_instance()


@click.group(invoke_without_command=True)
Expand All @@ -226,7 +184,7 @@ def run_benchmark(
multiple=True,
help="Skips preventing the tests from this category from running",
)
@click.option("--test", help="Specific test to run")
@click.option("--test", multiple=True, help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option(
Expand Down Expand Up @@ -314,12 +272,47 @@ def version():
print(f"Benchmark Tool Version {version}")


from pathlib import Path

# class CustomStatReload(StatReload):
# def __init__(self, *args, **kwargs):
# super().__init__(*args, **kwargs)
# # Overriding the directories to watch
# self.dirs = [Path(__file__).absolute().parent]

# def serve():
# current_file_path = Path(__file__).absolute().parent
# config = Config(
# "agbenchmark.app:app", # Reference to your FastAPI application
# host="localhost", # Host to bind
# port=8080, # Port to bind
# reload=True, # Enable reload
# log_level="info", # Logging level
# # reload_dirs=[str(current_file_path)], # Directories to watch
# )
# server = Server(config)
# server.run()


def serve():
    """Serve the benchmark FastAPI application over HTTP on port 8080."""
    # Imported lazily so merely importing this module does not require uvicorn.
    import uvicorn

    # Bind on all interfaces so the agent under test can reach the server.
    bind_host, bind_port = "0.0.0.0", 8080
    uvicorn.run(app, host=bind_host, port=bind_port)


def initialize_updates_file():
    """Reset ``agbenchmark_config/updates.json`` to an empty JSON array.

    The updates file accumulates entries across benchmark runs; clearing it
    at startup keeps each run independent of the previous one.
    """
    # Fix: both branches of the original performed the identical write —
    # mode "w" already truncates an existing file and creates a missing one.
    # Only the log message depends on whether the file existed beforehand.
    existed = os.path.exists(UPDATES_JSON_PATH)
    with open(UPDATES_JSON_PATH, "w") as file:
        json.dump([], file, indent=2)
    if existed:
        print("Initialized updates.json by overwriting with an empty array")
    else:
        print("Created updates.json and initialized it with an empty array")


if __name__ == "__main__":
cli()
85 changes: 4 additions & 81 deletions benchmark/agbenchmark/agent_interface.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,11 @@
import os
import platform
import queue
import select
import shutil
import subprocess
import sys
import time
from threading import Thread
from typing import Any, List
from typing import List

import psutil
from dotenv import load_dotenv

from agbenchmark.utils.data_types import AgentBenchmarkConfig
from agbenchmark.execute_sub_process import execute_subprocess

load_dotenv()

Expand All @@ -22,82 +15,12 @@
)


def run_linux_env(process: Any, start_time: float, timeout: float) -> None:
    """Stream a subprocess's stdout until it exits or *timeout* elapses.

    On timeout, the process and all of its children are killed via psutil.
    *start_time* is the time.time() value taken when the process was started.
    """
    while True:
        try:
            # This checks if there's data to be read from stdout without blocking.
            if process.stdout and select.select([process.stdout], [], [], 0)[0]:
                output = process.stdout.readline()
                print(output.strip())
        except Exception as e:
            # NOTE(review): broad catch-and-continue silently swallows read
            # errors; presumably guards against transient select/readline
            # failures — confirm before narrowing.
            continue

        # Check if process has ended, has no more output, or exceeded timeout
        # NOTE(review): zero-timeout select makes this a busy-wait loop
        # (no sleep), so it spins at full CPU while the process runs.
        if process.poll() is not None or (time.time() - start_time > timeout):
            break

    # NOTE(review): time.time() is re-evaluated here, so a process that
    # exited just before the deadline can still be classified as timed out.
    if time.time() - start_time > timeout:
        print("The Python function has exceeded the time limit and was terminated.")
        # Kill the whole process tree: children first, then the parent.
        parent = psutil.Process(process.pid)
        for child in parent.children(recursive=True):
            child.kill()
        parent.kill()

    else:
        print("The Python function has finished running.")


def enqueue_output(out: Any, my_queue: Any) -> None:
    """Drain *out* line by line into *my_queue*, then close the stream.

    Intended to run on a background thread so the main loop can poll the
    queue without blocking on reads.
    """
    lines = iter(out.readline, b"")
    for line in lines:
        my_queue.put(line)
    out.close()


def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
    """Stream a subprocess's stdout until it exits or *timeout* elapses.

    Windows lacks select() on pipes, so a daemon thread feeds stdout lines
    into a queue that the main loop polls. On timeout the process is
    terminated. *start_time* is the time.time() value at process start.
    """
    my_queue: Any = queue.Queue()
    # Daemon thread: dies with the interpreter, so a stuck readline cannot
    # keep the program alive after the main loop returns.
    thread = Thread(target=enqueue_output, args=(process.stdout, my_queue))
    thread.daemon = True
    thread.start()

    while True:
        try:
            output = my_queue.get_nowait().strip()
            print(output)
        except queue.Empty:
            # NOTE(review): no sleep here — this busy-waits at full CPU
            # while the queue is empty; confirm whether that is acceptable.
            pass

        if process.poll() is not None or (time.time() - start_time > timeout):
            break

    if time.time() - start_time > timeout:
        print("The Python function has exceeded the time limit and was terminated.")
        # NOTE(review): terminate() kills only the direct child, unlike the
        # Linux path which kills the whole process tree via psutil.
        process.terminate()


def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None:
def run_agent(task: str, timeout: int) -> None:
print(f"Running agbenchmark/benchmarks.py with timeout {timeout}")

command = [sys.executable, "-m", "agbenchmark_config.benchmarks", str(task)]

process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
)

start_time = time.time()

if platform.system() == "Windows":
run_windows_env(process, start_time, timeout)
else:
run_linux_env(process, start_time, timeout)

process.wait()

if process.returncode != 0:
print(f"The agent timed out")
execute_subprocess(command, timeout)


def get_list_of_file_paths(
Expand Down
Loading

0 comments on commit 0afcde3

Please sign in to comment.