Fix benchmark being stateful
Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
waynehamadi committed Sep 18, 2023
1 parent 3ea9d6a commit fd8f601
Showing 14 changed files with 290 additions and 177 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/benchmark-ci.yml
@@ -127,5 +127,9 @@ jobs:
echo "Running the following command: ${prefix}agbenchmark --test=WriteFile"
${prefix}agbenchmark --test=WriteFile
sh run_benchmark &
poetry run python ../../benchmark/tests/test_web_server.py &
poetry run ../../benchmark/tests/test_web_server.py &
poetry run ../../benchmark/tests/test_web_server.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
1 change: 0 additions & 1 deletion autogpts/forge/forge/__main__.py
@@ -4,7 +4,6 @@

load_dotenv()
import forge.sdk.forge_log

forge.sdk.forge_log.setup_logger()


4 changes: 3 additions & 1 deletion autogpts/forge/run_benchmark
@@ -2,4 +2,6 @@

kill $(lsof -t -i :8080)

poetry run agbenchmark serve
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

uvicorn agbenchmark.app:app --host localhost --port 8080 --reload --log-level info --reload-dir "$SCRIPT_DIR/../../benchmark/agbenchmark"
90 changes: 37 additions & 53 deletions benchmark/agbenchmark/__main__.py
@@ -11,12 +11,9 @@
import toml
from helicone.lock import HeliconeLockManager

from agbenchmark.app import app
from agbenchmark.reports.ReportManager import SingletonReportManager
from agbenchmark.utils.data_types import AgentBenchmarkConfig

from .reports.ReportManager import ReportManager
from .utils.data_types import AgentBenchmarkConfig

BENCHMARK_START_TIME_DT = datetime.now(timezone.utc)
BENCHMARK_START_TIME = BENCHMARK_START_TIME_DT.strftime("%Y-%m-%dT%H:%M:%S+00:00")
TEMP_FOLDER_ABS_PATH = Path.cwd() / "agbenchmark_config" / "temp_folder"
@@ -26,50 +23,6 @@
UPDATES_JSON_PATH = Path.cwd() / "agbenchmark_config" / "updates.json"


def get_agent_benchmark_config() -> AgentBenchmarkConfig:
agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
try:
with open(agent_benchmark_config_path, "r") as f:
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
agent_benchmark_config.agent_benchmark_config_path = (
agent_benchmark_config_path
)
return agent_benchmark_config
except json.JSONDecodeError:
print("Error: benchmark_config.json is not a valid JSON file.")
raise


def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
agent_benchmark_config = get_agent_benchmark_config()
# tests that consistently pass are considered regression tests
REGRESSION_MANAGER = ReportManager(
agent_benchmark_config.get_regression_reports_path(), BENCHMARK_START_TIME_DT
)

# print(f"Using {REPORTS_PATH} for reports")
# user facing reporting information
INFO_MANAGER = ReportManager(
str(
agent_benchmark_config.get_reports_path(
benchmark_start_time=BENCHMARK_START_TIME_DT
)
/ "report.json"
),
BENCHMARK_START_TIME_DT,
)

# internal db step in replacement track pass/fail rate
INTERNAL_INFO_MANAGER = ReportManager(
agent_benchmark_config.get_success_rate_path(), BENCHMARK_START_TIME_DT
)

return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER


(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()


if os.environ.get("HELICONE_API_KEY"):
HeliconeLockManager.write_custom_property(
"benchmark_start_time", BENCHMARK_START_TIME
@@ -122,6 +75,8 @@ def run_benchmark(
) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty
initialize_updates_file()
SingletonReportManager()
agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
try:
with open(agent_benchmark_config_path, "r") as f:
@@ -214,7 +169,8 @@ def run_benchmark(
current_dir = Path(__file__).resolve().parent
print(f"Current directory: {current_dir}")
pytest_args.extend((str(current_dir), "--cache-clear"))
return pytest.main(pytest_args)
exit_code = pytest.main(pytest_args)
SingletonReportManager().clear_instance()


@click.group(invoke_without_command=True)
@@ -314,11 +270,39 @@ def version():
print(f"Benchmark Tool Version {version}")


def serve():
import uvicorn
from pathlib import Path

# Run the FastAPI application using uvicorn
uvicorn.run(app, host="0.0.0.0", port=8080)
# class CustomStatReload(StatReload):
# def __init__(self, *args, **kwargs):
# super().__init__(*args, **kwargs)
# # Overriding the directories to watch
# self.dirs = [Path(__file__).absolute().parent]

# def serve():
# current_file_path = Path(__file__).absolute().parent
# config = Config(
# "agbenchmark.app:app", # Reference to your FastAPI application
# host="localhost", # Host to bind
# port=8080, # Port to bind
# reload=True, # Enable reload
# log_level="info", # Logging level
# # reload_dirs=[str(current_file_path)], # Directories to watch
# )
# server = Server(config)
# server.run()


def initialize_updates_file():
if os.path.exists(UPDATES_JSON_PATH):
# If the file already exists, overwrite it with an empty list
with open(UPDATES_JSON_PATH, "w") as file:
json.dump([], file, indent=2)
print("Initialized updates.json by overwriting with an empty array")
else:
# If the file doesn't exist, create it and write an empty list
with open(UPDATES_JSON_PATH, "w") as file:
json.dump([], file, indent=2)
print("Created updates.json and initialized it with an empty array")


if __name__ == "__main__":
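The hunks above replace the module-level ReportManager instances with a SingletonReportManager that run_benchmark creates before invoking pytest and clears once pytest.main returns, so no report state survives into the next run. The singleton's implementation is not part of this excerpt; the following is only a minimal sketch of a clearable singleton in that spirit, with every name other than SingletonReportManager and clear_instance assumed for illustration:

class SingletonReportManager:
    """Per-run holder for report managers; cleared between benchmark runs."""

    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # Hypothetical: build fresh per-run report managers here so each
            # benchmark invocation starts from a clean slate.
            cls._instance.reports = {}
        return cls._instance

    @classmethod
    def clear_instance(cls):
        # Drop the cached instance so the next run recreates everything.
        cls._instance = None

Clearing the instance after pytest.main returns is what keeps consecutive runs in the same process from sharing report state.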
85 changes: 4 additions & 81 deletions benchmark/agbenchmark/agent_interface.py
@@ -1,18 +1,11 @@
import os
import platform
import queue
import select
import shutil
import subprocess
import sys
import time
from threading import Thread
from typing import Any, List
from typing import List

import psutil
from dotenv import load_dotenv

from agbenchmark.utils.data_types import AgentBenchmarkConfig
from agbenchmark.execute_sub_process import execute_subprocess

load_dotenv()

@@ -22,82 +15,12 @@
)


def run_linux_env(process: Any, start_time: float, timeout: float) -> None:
while True:
try:
# This checks if there's data to be read from stdout without blocking.
if process.stdout and select.select([process.stdout], [], [], 0)[0]:
output = process.stdout.readline()
print(output.strip())
except Exception as e:
continue

# Check if process has ended, has no more output, or exceeded timeout
if process.poll() is not None or (time.time() - start_time > timeout):
break

if time.time() - start_time > timeout:
print("The Python function has exceeded the time limit and was terminated.")
parent = psutil.Process(process.pid)
for child in parent.children(recursive=True):
child.kill()
parent.kill()

else:
print("The Python function has finished running.")


def enqueue_output(out: Any, my_queue: Any) -> None:
for line in iter(out.readline, b""):
my_queue.put(line)
out.close()


def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
my_queue: Any = queue.Queue()
thread = Thread(target=enqueue_output, args=(process.stdout, my_queue))
thread.daemon = True
thread.start()

while True:
try:
output = my_queue.get_nowait().strip()
print(output)
except queue.Empty:
pass

if process.poll() is not None or (time.time() - start_time > timeout):
break

if time.time() - start_time > timeout:
print("The Python function has exceeded the time limit and was terminated.")
process.terminate()


def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None:
def run_agent(task: str, timeout: int) -> None:
print(f"Running agbenchmark/benchmarks.py with timeout {timeout}")

command = [sys.executable, "-m", "agbenchmark_config.benchmarks", str(task)]

process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
)

start_time = time.time()

if platform.system() == "Windows":
run_windows_env(process, start_time, timeout)
else:
run_linux_env(process, start_time, timeout)

process.wait()

if process.returncode != 0:
print(f"The agent timed out")
execute_subprocess(command, timeout)


def get_list_of_file_paths(
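Both run_agent above and the /reports endpoint below now delegate to agbenchmark.execute_sub_process.execute_subprocess, which replaces the removed Windows/Linux output-polling helpers. That module is not included in this excerpt; a rough sketch of a helper with the same call signature, assuming it only needs to stream the command's output and enforce the timeout, might look like:

import subprocess


def execute_subprocess(command, timeout):
    """Run command, echo its combined output, and kill it after timeout seconds."""
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    try:
        output, _ = process.communicate(timeout=timeout)
        print(output, end="")
    except subprocess.TimeoutExpired:
        process.kill()
        print("The agent timed out")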
26 changes: 17 additions & 9 deletions benchmark/agbenchmark/app.py
@@ -10,11 +10,11 @@
from fastapi import Request, Response
from fastapi.middleware.cors import CORSMiddleware

# from agbenchmark.app import app
from agbenchmark.execute_sub_process import execute_subprocess

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from fastapi import FastAPI
from pydantic import BaseModel
from pydantic import BaseModel, Extra

# Change the current working directory to the benchmark path
# home_path = find_absolute_benchmark_path()
@@ -28,6 +28,9 @@ class CreateReportRequest(BaseModel):
category: Optional[str] = []
mock: Optional[bool] = False

class Config:
extra = Extra.forbid # this will forbid any extra fields


updates_list = []

@@ -50,25 +53,29 @@ class CreateReportRequest(BaseModel):
)


def stream_output(pipe):
for line in pipe:
print(line, end="")


@app.post("/reports")
def run_single_test(body: CreateReportRequest) -> Any:
from agbenchmark.__main__ import run_benchmark

print(body.dict())
# it's a hack because other parts of the code are using sys.argv
sys.argv = [sys.argv[0]]
sys.argv.append("start")
print(os.getcwd())
command_options = ["agbenchmark"]
if body.category:
sys.argv.append(f"--category={body.category}")
for body_test in body.tests:
sys.argv.append(f"--test={body_test}")
categories = None
command_options.append(f"--test={body_test}")
if body.category:
categories = tuple([body.category])

run_benchmark(category=categories, mock=body.mock, test=tuple(body.tests))
execute_subprocess(command_options, 200)
import json
from pathlib import Path

print("finished running")
# List all folders in the current working directory
path_reports = Path.cwd() / "agbenchmark_config" / "reports"
folders = [folder for folder in path_reports.iterdir() if folder.is_dir()]
@@ -82,6 +89,7 @@ def run_single_test(body: CreateReportRequest) -> Any:
# Read report.json from this folder
if last_folder:
report_path = last_folder / "report.json"
print(report_path)
if report_path.exists():
with report_path.open() as file:
data = json.load(file)
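After this change, POST /reports shells out to the agbenchmark CLI via execute_subprocess and then returns the newest report.json found under agbenchmark_config/reports, instead of calling run_benchmark in-process. Assuming the server is running on localhost:8080 as configured in the run_benchmark script, the endpoint could be exercised like this (the test name and category are illustrative values only):

import requests

# CreateReportRequest now forbids extra fields, so send only what the model defines.
payload = {
    "tests": ["WriteFile"],
    "category": "general",  # optional; omit to run without a category filter
    "mock": True,           # optional; defaults to False
}
response = requests.post("http://localhost:8080/reports", json=payload)
print(response.json())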
@@ -17,7 +17,7 @@
},
"info": {
"difficulty": "basic",
"description": "s ability to generate content based on the content of 2 files.",
"description": "ability to generate content based on the content of 2 files.",
"side_effects": []
}
}