Implement old polling mechanism #5248

Merged: 2 commits, Sep 18, 2023
3 changes: 3 additions & 0 deletions .github/workflows/benchmark-ci.yml
@@ -127,5 +127,8 @@ jobs:

echo "Running the following command: ${prefix}agbenchmark --test=WriteFile"
${prefix}agbenchmark --test=WriteFile
+sh run_benchmark &
+sleep 5
+python ../../benchmark/tests/test_web_server.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
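
The new CI steps start the benchmark server in the background, wait five seconds, and then run benchmark/tests/test_web_server.py against it. That test file is not shown in this diff; a minimal sketch of the poll-then-request check it implies is below. The endpoint and payload are assumptions based on the /reports route and CreateReportRequest model added in app.py later in this PR, not a copy of the real test.

# Hypothetical sketch of a test_web_server.py-style check: poll until the
# benchmark server accepts connections, then fire one mock report request.
import time

import requests

BASE_URL = "http://localhost:8080"


def wait_until_up(timeout: float = 30.0) -> None:
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            requests.get(BASE_URL, timeout=2)
            return  # any HTTP response means the server is listening
        except requests.ConnectionError:
            time.sleep(1)  # not accepting connections yet; retry
    raise TimeoutError("benchmark server never came up on port 8080")


if __name__ == "__main__":
    wait_until_up()
    response = requests.post(
        f"{BASE_URL}/reports",
        json={"test": "WriteFile", "mock": True},  # fields from CreateReportRequest
        timeout=300,
    )
    print(response.status_code, response.json())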
2 changes: 2 additions & 0 deletions autogpts/forge/advanced_commands/README.md
@@ -0,0 +1,2 @@
+Advanced commands to develop on the forge and the benchmark.
+Stability not guaranteed.
9 changes: 9 additions & 0 deletions autogpts/forge/advanced_commands/run_benchmark_dev
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Kill processes using port 8080 if any.
+if lsof -t -i :8080; then
+    kill $(lsof -t -i :8080)
+fi
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+uvicorn agbenchmark.app:app --host localhost --port 8080 --reload --log-level info --reload-dir "$SCRIPT_DIR/../../../benchmark/agbenchmark"
1 change: 0 additions & 1 deletion autogpts/forge/forge/__main__.py
@@ -4,7 +4,6 @@

load_dotenv()
import forge.sdk.forge_log
-
forge.sdk.forge_log.setup_logger()


6 changes: 4 additions & 2 deletions autogpts/forge/run_benchmark
@@ -1,5 +1,7 @@
#!/bin/bash

-kill $(lsof -t -i :8080)
-
+# Kill processes using port 8080 if any.
+if lsof -t -i :8080; then
+    kill $(lsof -t -i :8080)
+fi
poetry run agbenchmark serve
72 changes: 23 additions & 49 deletions benchmark/agbenchmark/__main__.py
@@ -12,11 +12,9 @@
from helicone.lock import HeliconeLockManager

from agbenchmark.app import app
+from agbenchmark.reports.ReportManager import SingletonReportManager
+from agbenchmark.utils.data_types import AgentBenchmarkConfig

-from .reports.ReportManager import ReportManager
-from .utils.data_types import AgentBenchmarkConfig
-
BENCHMARK_START_TIME_DT = datetime.now(timezone.utc)
BENCHMARK_START_TIME = BENCHMARK_START_TIME_DT.strftime("%Y-%m-%dT%H:%M:%S+00:00")
TEMP_FOLDER_ABS_PATH = Path.cwd() / "agbenchmark_config" / "temp_folder"
@@ -26,50 +24,6 @@
UPDATES_JSON_PATH = Path.cwd() / "agbenchmark_config" / "updates.json"


-def get_agent_benchmark_config() -> AgentBenchmarkConfig:
-    agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
-    try:
-        with open(agent_benchmark_config_path, "r") as f:
-            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = (
-                agent_benchmark_config_path
-            )
-            return agent_benchmark_config
-    except json.JSONDecodeError:
-        print("Error: benchmark_config.json is not a valid JSON file.")
-        raise
-
-
-def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
-    agent_benchmark_config = get_agent_benchmark_config()
-    # tests that consistently pass are considered regression tests
-    REGRESSION_MANAGER = ReportManager(
-        agent_benchmark_config.get_regression_reports_path(), BENCHMARK_START_TIME_DT
-    )
-
-    # print(f"Using {REPORTS_PATH} for reports")
-    # user facing reporting information
-    INFO_MANAGER = ReportManager(
-        str(
-            agent_benchmark_config.get_reports_path(
-                benchmark_start_time=BENCHMARK_START_TIME_DT
-            )
-            / "report.json"
-        ),
-        BENCHMARK_START_TIME_DT,
-    )
-
-    # internal db step in replacement track pass/fail rate
-    INTERNAL_INFO_MANAGER = ReportManager(
-        agent_benchmark_config.get_success_rate_path(), BENCHMARK_START_TIME_DT
-    )
-
-    return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
-
-
-(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()


if os.environ.get("HELICONE_API_KEY"):
HeliconeLockManager.write_custom_property(
"benchmark_start_time", BENCHMARK_START_TIME
@@ -122,6 +76,9 @@ def run_benchmark(
) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
    # Check if configuration file exists and is not empty
+
+    initialize_updates_file()
+    SingletonReportManager()
agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
try:
with open(agent_benchmark_config_path, "r") as f:
@@ -214,7 +171,8 @@ def run_benchmark(
current_dir = Path(__file__).resolve().parent
print(f"Current directory: {current_dir}")
pytest_args.extend((str(current_dir), "--cache-clear"))
-    return pytest.main(pytest_args)
+    exit_code = pytest.main(pytest_args)
+    SingletonReportManager().clear_instance()


@click.group(invoke_without_command=True)
@@ -226,7 +184,7 @@ def run_benchmark(
multiple=True,
help="Skips preventing the tests from this category from running",
)
@click.option("--test", help="Specific test to run")
@click.option("--test", multiple=True, help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option(
@@ -314,12 +272,28 @@ def version():
print(f"Benchmark Tool Version {version}")


+from pathlib import Path
+
+
def serve():
import uvicorn

# Run the FastAPI application using uvicorn
uvicorn.run(app, host="0.0.0.0", port=8080)


+def initialize_updates_file():
+    if os.path.exists(UPDATES_JSON_PATH):
+        # If the file already exists, overwrite it with an empty list
+        with open(UPDATES_JSON_PATH, "w") as file:
+            json.dump([], file, indent=2)
+        print("Initialized updates.json by overwriting with an empty array")
+    else:
+        # If the file doesn't exist, create it and write an empty list
+        with open(UPDATES_JSON_PATH, "w") as file:
+            json.dump([], file, indent=2)
+        print("Created updates.json and initialized it with an empty array")
+
+
if __name__ == "__main__":
cli()
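
SingletonReportManager replaces the module-level get_report_managers() machinery deleted above: the three ReportManager instances are now created on first use and torn down via clear_instance() once pytest finishes. The class itself lives in agbenchmark/reports/ReportManager.py and is outside this diff; a sketch of the pattern, with everything beyond the two calls visible in the diff assumed rather than taken from the source:

# Hypothetical sketch of the singleton pattern behind SingletonReportManager;
# only the SingletonReportManager() and clear_instance() calls are grounded in
# this diff. Manager construction details are placeholders.
class SingletonReportManager:
    instance = None

    def __new__(cls):
        if cls.instance is None:
            cls.instance = super().__new__(cls)
            # Build the report managers on first use instead of at import
            # time, as the deleted get_report_managers() did.
            cls.instance.REGRESSION_MANAGER = ...  # ReportManager(...)
            cls.instance.INFO_MANAGER = ...  # ReportManager(...)
            cls.instance.INTERNAL_INFO_MANAGER = ...  # ReportManager(...)
        return cls.instance

    @classmethod
    def clear_instance(cls):
        # Invoked after pytest.main() so the next run starts with fresh state.
        cls.instance = None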
85 changes: 4 additions & 81 deletions benchmark/agbenchmark/agent_interface.py
@@ -1,18 +1,11 @@
import os
-import platform
-import queue
-import select
import shutil
-import subprocess
import sys
-import time
-from threading import Thread
-from typing import Any, List
+from typing import List

-import psutil
from dotenv import load_dotenv

-from agbenchmark.utils.data_types import AgentBenchmarkConfig
+from agbenchmark.execute_sub_process import execute_subprocess

load_dotenv()

@@ -22,82 +15,12 @@
)


-def run_linux_env(process: Any, start_time: float, timeout: float) -> None:
-    while True:
-        try:
-            # This checks if there's data to be read from stdout without blocking.
-            if process.stdout and select.select([process.stdout], [], [], 0)[0]:
-                output = process.stdout.readline()
-                print(output.strip())
-        except Exception as e:
-            continue
-
-        # Check if process has ended, has no more output, or exceeded timeout
-        if process.poll() is not None or (time.time() - start_time > timeout):
-            break
-
-    if time.time() - start_time > timeout:
-        print("The Python function has exceeded the time limit and was terminated.")
-        parent = psutil.Process(process.pid)
-        for child in parent.children(recursive=True):
-            child.kill()
-        parent.kill()
-
-    else:
-        print("The Python function has finished running.")
-
-
-def enqueue_output(out: Any, my_queue: Any) -> None:
-    for line in iter(out.readline, b""):
-        my_queue.put(line)
-    out.close()
-
-
-def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
-    my_queue: Any = queue.Queue()
-    thread = Thread(target=enqueue_output, args=(process.stdout, my_queue))
-    thread.daemon = True
-    thread.start()
-
-    while True:
-        try:
-            output = my_queue.get_nowait().strip()
-            print(output)
-        except queue.Empty:
-            pass
-
-        if process.poll() is not None or (time.time() - start_time > timeout):
-            break
-
-    if time.time() - start_time > timeout:
-        print("The Python function has exceeded the time limit and was terminated.")
-        process.terminate()
-
-
-def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None:
+def run_agent(task: str, timeout: int) -> None:
print(f"Running agbenchmark/benchmarks.py with timeout {timeout}")

command = [sys.executable, "-m", "agbenchmark_config.benchmarks", str(task)]

-    process = subprocess.Popen(
-        command,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        universal_newlines=True,
-        bufsize=1,
-    )
-
-    start_time = time.time()
-
-    if platform.system() == "Windows":
-        run_windows_env(process, start_time, timeout)
-    else:
-        run_linux_env(process, start_time, timeout)
-
-    process.wait()
-
-    if process.returncode != 0:
-        print(f"The agent timed out")
+    execute_subprocess(command, timeout)


def get_list_of_file_paths(
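
run_agent now delegates to execute_subprocess from agbenchmark/execute_sub_process.py, which this diff imports but does not show. Only the execute_subprocess(command, timeout) signature is grounded in the calls above; a minimal cross-platform sketch of such a helper:

# Hypothetical sketch of agbenchmark.execute_sub_process.execute_subprocess.
# subprocess.run enforces the timeout itself, replacing the hand-rolled
# Windows/Linux stdout-polling loops deleted above.
import subprocess
from typing import List


def execute_subprocess(command: List[str], timeout: int) -> None:
    try:
        # Inherit stdout/stderr so the agent's output streams straight
        # through, and let subprocess kill the child on timeout.
        subprocess.run(command, timeout=timeout, check=False)
    except subprocess.TimeoutExpired:
        print("The agent timed out")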
76 changes: 58 additions & 18 deletions benchmark/agbenchmark/app.py
@@ -1,33 +1,67 @@
import json
import os
import sys
-from typing import Any, List, Optional
+from typing import Any, Optional

import psutil
from fastapi import FastAPI
+from fastapi import (
+    HTTPException as FastAPIHTTPException,  # Import HTTPException from FastAPI
+)
+from fastapi import Request, Response
+from fastapi.middleware.cors import CORSMiddleware

+# from agbenchmark.app import app
+from agbenchmark.execute_sub_process import execute_subprocess

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from fastapi import FastAPI
-from pydantic import BaseModel
+from pydantic import BaseModel, Extra

# Change the current working directory to the benchmark path
# home_path = find_absolute_benchmark_path()
# os.chdir(home_path)

general_command = ["poetry", "run", "agbenchmark", "start", "--backend"]

+import psutil


+def find_agbenchmark_without_uvicorn():
+    pids = []
+    for process in psutil.process_iter(
+        attrs=[
+            "pid",
+            "cmdline",
+            "name",
+            "username",
+            "status",
+            "cpu_percent",
+            "memory_info",
+            "create_time",
+            "cwd",
+            "connections",
+        ]
+    ):
+        try:
+            # Convert the process.info dictionary values to strings and concatenate them
+            full_info = " ".join([str(v) for k, v in process.info.items()])
+
+            if "agbenchmark" in full_info and "uvicorn" not in full_info:
+                pids.append(process.info["pid"])
+        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+            pass
+    return pids


class CreateReportRequest(BaseModel):
-    tests: Optional[List[str]] = []
-    category: Optional[str] = []
+    test: str = None
+    test_run_id: str = None
+    # category: Optional[str] = []
    mock: Optional[bool] = False

+    class Config:
+        extra = Extra.forbid  # this will forbid any extra fields


updates_list = []

@@ -50,25 +84,30 @@ class CreateReportRequest(
)


+def stream_output(pipe):
+    for line in pipe:
+        print(line, end="")


@app.post("/reports")
def run_single_test(body: CreateReportRequest) -> Any:
-    from agbenchmark.__main__ import run_benchmark

+    pids = find_agbenchmark_without_uvicorn()
+    print(f"pids already running with agbenchmark: {pids}")
    print(body.dict())
-    # it's a hack because other parts of the code are using sys.argv
-    sys.argv = [sys.argv[0]]
-    sys.argv.append("start")
-    if body.category:
-        sys.argv.append(f"--category={body.category}")
-    for body_test in body.tests:
-        sys.argv.append(f"--test={body_test}")
-    categories = None
-    if body.category:
-        categories = tuple([body.category])

-    run_benchmark(category=categories, mock=body.mock, test=tuple(body.tests))
+    print(os.getcwd())
+    command_options = ["agbenchmark"]
+    # if body.category:
+    #     sys.argv.append(f"--category={body.category}")
+    command_options.append(f"--test={body.test}")
+    if body.mock:
+        command_options.append("--mock")

+    execute_subprocess(command_options, 200)
+    import json
+    from pathlib import Path

+    print("finished running")
# List all folders in the current working directory
path_reports = Path.cwd() / "agbenchmark_config" / "reports"
folders = [folder for folder in path_reports.iterdir() if folder.is_dir()]
@@ -82,6 +121,7 @@ def run_single_test(body: CreateReportRequest) -> Any:
    # Read report.json from this folder
    if last_folder:
        report_path = last_folder / "report.json"
+        print(report_path)
        if report_path.exists():
            with report_path.open() as file:
                data = json.load(file)
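
With the endpoint reworked this way, a frontend or CI check can trigger a single benchmark test over HTTP and read back the newest report.json. A sketch of a client call, assuming the server is listening on port 8080 as configured by run_benchmark:

# Example client call for the /reports endpoint above. The payload mirrors the
# new CreateReportRequest model; unknown fields would be rejected by
# Extra.forbid.
import requests

response = requests.post(
    "http://localhost:8080/reports",
    json={"test": "WriteFile", "mock": True},
    timeout=300,  # the server gives the benchmark subprocess up to 200s
)
response.raise_for_status()
print(response.json())  # contents of the most recent report.json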