Fix benchmark being stateful
Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
waynehamadi committed Sep 18, 2023
1 parent 3ea9d6a commit fd8f601
Showing 14 changed files with 290 additions and 177 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/benchmark-ci.yml
@@ -127,5 +127,9 @@ jobs:
echo "Running the following command: ${prefix}agbenchmark --test=WriteFile"
${prefix}agbenchmark --test=WriteFile
sh run_benchmark &
poetry run python ../../benchmark/tests/test_web_server.py &
poetry run ../../benchmark/tests/test_web_server.py &
poetry run ../../benchmark/tests/test_web_server.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
1 change: 0 additions & 1 deletion autogpts/forge/forge/__main__.py
@@ -4,7 +4,6 @@

load_dotenv()
import forge.sdk.forge_log

forge.sdk.forge_log.setup_logger()


4 changes: 3 additions & 1 deletion autogpts/forge/run_benchmark
@@ -2,4 +2,6 @@

kill $(lsof -t -i :8080)

poetry run agbenchmark serve
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

uvicorn agbenchmark.app:app --host localhost --port 8080 --reload --log-level info --reload-dir "$SCRIPT_DIR/../../benchmark/agbenchmark"
90 changes: 37 additions & 53 deletions benchmark/agbenchmark/__main__.py
@@ -11,12 +11,9 @@
import toml
from helicone.lock import HeliconeLockManager

from agbenchmark.app import app
from agbenchmark.reports.ReportManager import SingletonReportManager
from agbenchmark.utils.data_types import AgentBenchmarkConfig

from .reports.ReportManager import ReportManager
from .utils.data_types import AgentBenchmarkConfig

BENCHMARK_START_TIME_DT = datetime.now(timezone.utc)
BENCHMARK_START_TIME = BENCHMARK_START_TIME_DT.strftime("%Y-%m-%dT%H:%M:%S+00:00")
TEMP_FOLDER_ABS_PATH = Path.cwd() / "agbenchmark_config" / "temp_folder"
@@ -26,50 +23,6 @@
UPDATES_JSON_PATH = Path.cwd() / "agbenchmark_config" / "updates.json"


def get_agent_benchmark_config() -> AgentBenchmarkConfig:
agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
try:
with open(agent_benchmark_config_path, "r") as f:
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
agent_benchmark_config.agent_benchmark_config_path = (
agent_benchmark_config_path
)
return agent_benchmark_config
except json.JSONDecodeError:
print("Error: benchmark_config.json is not a valid JSON file.")
raise


def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
agent_benchmark_config = get_agent_benchmark_config()
# tests that consistently pass are considered regression tests
REGRESSION_MANAGER = ReportManager(
agent_benchmark_config.get_regression_reports_path(), BENCHMARK_START_TIME_DT
)

# print(f"Using {REPORTS_PATH} for reports")
# user facing reporting information
INFO_MANAGER = ReportManager(
str(
agent_benchmark_config.get_reports_path(
benchmark_start_time=BENCHMARK_START_TIME_DT
)
/ "report.json"
),
BENCHMARK_START_TIME_DT,
)

# internal db step in replacement track pass/fail rate
INTERNAL_INFO_MANAGER = ReportManager(
agent_benchmark_config.get_success_rate_path(), BENCHMARK_START_TIME_DT
)

return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER


(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()


if os.environ.get("HELICONE_API_KEY"):
HeliconeLockManager.write_custom_property(
"benchmark_start_time", BENCHMARK_START_TIME
@@ -122,6 +75,8 @@ def run_benchmark(
) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty
initialize_updates_file()
SingletonReportManager()
agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
try:
with open(agent_benchmark_config_path, "r") as f:
@@ -214,7 +169,8 @@ def run_benchmark(
current_dir = Path(__file__).resolve().parent
print(f"Current directory: {current_dir}")
pytest_args.extend((str(current_dir), "--cache-clear"))
return pytest.main(pytest_args)
exit_code = pytest.main(pytest_args)
SingletonReportManager().clear_instance()


@click.group(invoke_without_command=True)
@@ -314,11 +270,39 @@ def version():
print(f"Benchmark Tool Version {version}")


def serve():
import uvicorn
from pathlib import Path

# Run the FastAPI application using uvicorn
uvicorn.run(app, host="0.0.0.0", port=8080)
# class CustomStatReload(StatReload):
# def __init__(self, *args, **kwargs):
# super().__init__(*args, **kwargs)
# # Overriding the directories to watch
# self.dirs = [Path(__file__).absolute().parent]

# def serve():
# current_file_path = Path(__file__).absolute().parent
# config = Config(
# "agbenchmark.app:app", # Reference to your FastAPI application
# host="localhost", # Host to bind
# port=8080, # Port to bind
# reload=True, # Enable reload
# log_level="info", # Logging level
# # reload_dirs=[str(current_file_path)], # Directories to watch
# )
# server = Server(config)
# server.run()


def initialize_updates_file():
if os.path.exists(UPDATES_JSON_PATH):
# If the file already exists, overwrite it with an empty list
with open(UPDATES_JSON_PATH, "w") as file:
json.dump([], file, indent=2)
print("Initialized updates.json by overwriting with an empty array")
else:
# If the file doesn't exist, create it and write an empty list
with open(UPDATES_JSON_PATH, "w") as file:
json.dump([], file, indent=2)
print("Created updates.json and initialized it with an empty array")


if __name__ == "__main__":
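The hunks above replace the module-level ReportManager instances with a SingletonReportManager that run_benchmark creates before invoking pytest and clears once pytest.main returns, so no report state survives into the next run. The singleton's implementation is not part of this excerpt; the following is only a minimal sketch of a clearable singleton in that spirit, with every name other than SingletonReportManager and clear_instance assumed for illustration:

class SingletonReportManager:
    """Per-run holder for report managers; cleared between benchmark runs."""

    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # Hypothetical: build fresh per-run report managers here so each
            # benchmark invocation starts from a clean slate.
            cls._instance.reports = {}
        return cls._instance

    @classmethod
    def clear_instance(cls):
        # Drop the cached instance so the next run recreates everything.
        cls._instance = None

Clearing the instance after pytest.main returns is what keeps consecutive runs in the same process from sharing report state.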
85 changes: 4 additions & 81 deletions benchmark/agbenchmark/agent_interface.py
@@ -1,18 +1,11 @@
import os
import platform
import queue
import select
import shutil
import subprocess
import sys
import time
from threading import Thread
from typing import Any, List
from typing import List

import psutil
from dotenv import load_dotenv

from agbenchmark.utils.data_types import AgentBenchmarkConfig
from agbenchmark.execute_sub_process import execute_subprocess

load_dotenv()

@@ -22,82 +15,12 @@
)


def run_linux_env(process: Any, start_time: float, timeout: float) -> None:
while True:
try:
# This checks if there's data to be read from stdout without blocking.
if process.stdout and select.select([process.stdout], [], [], 0)[0]:
output = process.stdout.readline()
print(output.strip())
except Exception as e:
continue

# Check if process has ended, has no more output, or exceeded timeout
if process.poll() is not None or (time.time() - start_time > timeout):
break

if time.time() - start_time > timeout:
print("The Python function has exceeded the time limit and was terminated.")
parent = psutil.Process(process.pid)
for child in parent.children(recursive=True):
child.kill()
parent.kill()

else:
print("The Python function has finished running.")


def enqueue_output(out: Any, my_queue: Any) -> None:
for line in iter(out.readline, b""):
my_queue.put(line)
out.close()


def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
my_queue: Any = queue.Queue()
thread = Thread(target=enqueue_output, args=(process.stdout, my_queue))
thread.daemon = True
thread.start()

while True:
try:
output = my_queue.get_nowait().strip()
print(output)
except queue.Empty:
pass

if process.poll() is not None or (time.time() - start_time > timeout):
break

if time.time() - start_time > timeout:
print("The Python function has exceeded the time limit and was terminated.")
process.terminate()


def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None:
def run_agent(task: str, timeout: int) -> None:
print(f"Running agbenchmark/benchmarks.py with timeout {timeout}")

command = [sys.executable, "-m", "agbenchmark_config.benchmarks", str(task)]

process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
)

start_time = time.time()

if platform.system() == "Windows":
run_windows_env(process, start_time, timeout)
else:
run_linux_env(process, start_time, timeout)

process.wait()

if process.returncode != 0:
print(f"The agent timed out")
execute_subprocess(command, timeout)


def get_list_of_file_paths(
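Both run_agent above and the /reports endpoint below now delegate to agbenchmark.execute_sub_process.execute_subprocess, which replaces the removed Windows/Linux output-polling helpers. That module is not included in this excerpt; a rough sketch of a helper with the same call signature, assuming it only needs to stream the command's output and enforce the timeout, might look like:

import subprocess


def execute_subprocess(command, timeout):
    """Run command, echo its combined output, and kill it after timeout seconds."""
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    try:
        output, _ = process.communicate(timeout=timeout)
        print(output, end="")
    except subprocess.TimeoutExpired:
        process.kill()
        print("The agent timed out")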
26 changes: 17 additions & 9 deletions benchmark/agbenchmark/app.py
@@ -10,11 +10,11 @@
from fastapi import Request, Response
from fastapi.middleware.cors import CORSMiddleware

# from agbenchmark.app import app
from agbenchmark.execute_sub_process import execute_subprocess

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from fastapi import FastAPI
from pydantic import BaseModel
from pydantic import BaseModel, Extra

# Change the current working directory to the benchmark path
# home_path = find_absolute_benchmark_path()
@@ -28,6 +28,9 @@ class CreateReportRequest(BaseModel):
category: Optional[str] = []
mock: Optional[bool] = False

class Config:
extra = Extra.forbid # this will forbid any extra fields


updates_list = []

@@ -50,25 +53,29 @@ class CreateReportRequest(BaseModel):
)


def stream_output(pipe):
for line in pipe:
print(line, end="")


@app.post("/reports")
def run_single_test(body: CreateReportRequest) -> Any:
from agbenchmark.__main__ import run_benchmark

print(body.dict())
# it's a hack because other parts of the code are using sys.argv
sys.argv = [sys.argv[0]]
sys.argv.append("start")
print(os.getcwd())
command_options = ["agbenchmark"]
if body.category:
sys.argv.append(f"--category={body.category}")
for body_test in body.tests:
sys.argv.append(f"--test={body_test}")
categories = None
command_options.append(f"--test={body_test}")
if body.category:
categories = tuple([body.category])

run_benchmark(category=categories, mock=body.mock, test=tuple(body.tests))
execute_subprocess(command_options, 200)
import json
from pathlib import Path

print("finished running")
# List all folders in the current working directory
path_reports = Path.cwd() / "agbenchmark_config" / "reports"
folders = [folder for folder in path_reports.iterdir() if folder.is_dir()]
@@ -82,6 +89,7 @@ def run_single_test(body: CreateReportRequest) -> Any:
# Read report.json from this folder
if last_folder:
report_path = last_folder / "report.json"
print(report_path)
if report_path.exists():
with report_path.open() as file:
data = json.load(file)
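After this change, POST /reports shells out to the agbenchmark CLI via execute_subprocess and then returns the newest report.json found under agbenchmark_config/reports, instead of calling run_benchmark in-process. Assuming the server is running on localhost:8080 as configured in the run_benchmark script, the endpoint could be exercised like this (the test name and category are illustrative values only):

import requests

# CreateReportRequest now forbids extra fields, so send only what the model defines.
payload = {
    "tests": ["WriteFile"],
    "category": "general",  # optional; omit to run without a category filter
    "mock": True,           # optional; defaults to False
}
response = requests.post("http://localhost:8080/reports", json=payload)
print(response.json())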
@@ -17,7 +17,7 @@
},
"info": {
"difficulty": "basic",
"description": "s ability to generate content based on the content of 2 files.",
"description": "ability to generate content based on the content of 2 files.",
"side_effects": []
}
}