Skip to content

Commit

Permalink
Fix benchmark being stateful
Browse files Browse the repository at this point in the history
  • Loading branch information
waynehamadi committed Sep 18, 2023
1 parent 2cf350b commit 0afcde3
Show file tree
Hide file tree
Showing 16 changed files with 340 additions and 176 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/benchmark-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,5 +127,8 @@ jobs:
echo "Running the following command: ${prefix}agbenchmark --test=WriteFile"
${prefix}agbenchmark --test=WriteFile
sh run_benchmark &
sleep 5
python ../../benchmark/tests/test_web_server.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
2 changes: 2 additions & 0 deletions autogpts/forge/advanced_commands/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Advanced commands for developing on the forge and the benchmark.
Stability not guaranteed.
9 changes: 9 additions & 0 deletions autogpts/forge/advanced_commands/run_benchmark_dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

# Free port 8080 before starting: kill any process still listening on it.
# Redirect lsof's output so the PID list is not printed as noise; the exit
# status alone tells us whether anything holds the port.
if lsof -t -i :8080 > /dev/null 2>&1; then
    kill $(lsof -t -i :8080)
fi
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Serve the benchmark app with auto-reload, watching the benchmark sources.
uvicorn agbenchmark.app:app --host localhost --port 8080 --reload --log-level info --reload-dir "$SCRIPT_DIR/../../../benchmark/agbenchmark"
1 change: 0 additions & 1 deletion autogpts/forge/forge/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

load_dotenv()
import forge.sdk.forge_log

forge.sdk.forge_log.setup_logger()


Expand Down
6 changes: 4 additions & 2 deletions autogpts/forge/run_benchmark
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

kill $(lsof -t -i :8080)

# Kill processes using port 8080 if any.
if lsof -t -i :8080; then
kill $(lsof -t -i :8080)
fi
poetry run agbenchmark serve
91 changes: 42 additions & 49 deletions benchmark/agbenchmark/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@
from helicone.lock import HeliconeLockManager

from agbenchmark.app import app
from agbenchmark.reports.ReportManager import SingletonReportManager
from agbenchmark.utils.data_types import AgentBenchmarkConfig

from .reports.ReportManager import ReportManager
from .utils.data_types import AgentBenchmarkConfig

BENCHMARK_START_TIME_DT = datetime.now(timezone.utc)
BENCHMARK_START_TIME = BENCHMARK_START_TIME_DT.strftime("%Y-%m-%dT%H:%M:%S+00:00")
TEMP_FOLDER_ABS_PATH = Path.cwd() / "agbenchmark_config" / "temp_folder"
Expand All @@ -26,50 +24,6 @@
UPDATES_JSON_PATH = Path.cwd() / "agbenchmark_config" / "updates.json"


def get_agent_benchmark_config() -> AgentBenchmarkConfig:
    """Load the benchmark configuration from ``agbenchmark_config/config.json``.

    The path the config was loaded from is recorded back onto the object's
    ``agent_benchmark_config_path`` attribute so downstream code can locate
    sibling files relative to it.

    Returns:
        AgentBenchmarkConfig: the parsed configuration.

    Raises:
        json.JSONDecodeError: if the config file is not valid JSON
            (re-raised after printing a diagnostic).
        FileNotFoundError: if the config file does not exist.
    """
    agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
    try:
        with open(agent_benchmark_config_path, "r") as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
            agent_benchmark_config.agent_benchmark_config_path = (
                agent_benchmark_config_path
            )
            return agent_benchmark_config
    except json.JSONDecodeError:
        # Fix: the file opened above is config.json; the old message
        # misleadingly referred to "benchmark_config.json".
        print("Error: config.json is not a valid JSON file.")
        raise


def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
    """Build the three report managers used during a benchmark run.

    Returns:
        A ``(regression, info, internal_info)`` tuple of ReportManager
        instances, all anchored to the same benchmark start time.
    """
    config = get_agent_benchmark_config()

    # Tests that consistently pass are considered regression tests.
    regression = ReportManager(
        config.get_regression_reports_path(), BENCHMARK_START_TIME_DT
    )

    # User-facing reporting information, written as report.json under the
    # per-run reports directory.
    report_file = (
        config.get_reports_path(benchmark_start_time=BENCHMARK_START_TIME_DT)
        / "report.json"
    )
    info = ReportManager(str(report_file), BENCHMARK_START_TIME_DT)

    # Internal db stand-in used to track the pass/fail rate.
    internal_info = ReportManager(
        config.get_success_rate_path(), BENCHMARK_START_TIME_DT
    )

    return regression, info, internal_info


(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()


if os.environ.get("HELICONE_API_KEY"):
HeliconeLockManager.write_custom_property(
"benchmark_start_time", BENCHMARK_START_TIME
Expand Down Expand Up @@ -122,6 +76,9 @@ def run_benchmark(
) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty

initialize_updates_file()
SingletonReportManager()
agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
try:
with open(agent_benchmark_config_path, "r") as f:
Expand Down Expand Up @@ -214,7 +171,8 @@ def run_benchmark(
current_dir = Path(__file__).resolve().parent
print(f"Current directory: {current_dir}")
pytest_args.extend((str(current_dir), "--cache-clear"))
return pytest.main(pytest_args)
exit_code = pytest.main(pytest_args)
SingletonReportManager().clear_instance()


@click.group(invoke_without_command=True)
Expand All @@ -226,7 +184,7 @@ def run_benchmark(
multiple=True,
help="Skips preventing the tests from this category from running",
)
@click.option("--test", help="Specific test to run")
@click.option("--test", multiple=True, help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option(
Expand Down Expand Up @@ -314,12 +272,47 @@ def version():
print(f"Benchmark Tool Version {version}")


from pathlib import Path

# class CustomStatReload(StatReload):
# def __init__(self, *args, **kwargs):
# super().__init__(*args, **kwargs)
# # Overriding the directories to watch
# self.dirs = [Path(__file__).absolute().parent]

# def serve():
# current_file_path = Path(__file__).absolute().parent
# config = Config(
# "agbenchmark.app:app", # Reference to your FastAPI application
# host="localhost", # Host to bind
# port=8080, # Port to bind
# reload=True, # Enable reload
# log_level="info", # Logging level
# # reload_dirs=[str(current_file_path)], # Directories to watch
# )
# server = Server(config)
# server.run()


def serve():
    """Serve the benchmark FastAPI application over HTTP on port 8080."""
    # Imported lazily so merely importing this module does not require uvicorn.
    import uvicorn

    # Bind on all interfaces so the agent under test can reach the server.
    bind_host, bind_port = "0.0.0.0", 8080
    uvicorn.run(app, host=bind_host, port=bind_port)


def initialize_updates_file():
    """Reset ``agbenchmark_config/updates.json`` to an empty JSON array.

    The updates file accumulates entries across benchmark runs; clearing it
    at startup keeps each run independent of the previous one.
    """
    # Fix: both branches of the original performed the identical write —
    # mode "w" already truncates an existing file and creates a missing one.
    # Only the log message depends on whether the file existed beforehand.
    existed = os.path.exists(UPDATES_JSON_PATH)
    with open(UPDATES_JSON_PATH, "w") as file:
        json.dump([], file, indent=2)
    if existed:
        print("Initialized updates.json by overwriting with an empty array")
    else:
        print("Created updates.json and initialized it with an empty array")


if __name__ == "__main__":
cli()
85 changes: 4 additions & 81 deletions benchmark/agbenchmark/agent_interface.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,11 @@
import os
import platform
import queue
import select
import shutil
import subprocess
import sys
import time
from threading import Thread
from typing import Any, List
from typing import List

import psutil
from dotenv import load_dotenv

from agbenchmark.utils.data_types import AgentBenchmarkConfig
from agbenchmark.execute_sub_process import execute_subprocess

load_dotenv()

Expand All @@ -22,82 +15,12 @@
)


def run_linux_env(process: Any, start_time: float, timeout: float) -> None:
    """Stream a subprocess's stdout until it exits or *timeout* elapses.

    On timeout, the process and all of its children are killed via psutil.
    *start_time* is the time.time() value taken when the process was started.
    """
    while True:
        try:
            # This checks if there's data to be read from stdout without blocking.
            if process.stdout and select.select([process.stdout], [], [], 0)[0]:
                output = process.stdout.readline()
                print(output.strip())
        except Exception as e:
            # NOTE(review): broad catch-and-continue silently swallows read
            # errors; presumably guards against transient select/readline
            # failures — confirm before narrowing.
            continue

        # Check if process has ended, has no more output, or exceeded timeout
        # NOTE(review): zero-timeout select makes this a busy-wait loop
        # (no sleep), so it spins at full CPU while the process runs.
        if process.poll() is not None or (time.time() - start_time > timeout):
            break

    # NOTE(review): time.time() is re-evaluated here, so a process that
    # exited just before the deadline can still be classified as timed out.
    if time.time() - start_time > timeout:
        print("The Python function has exceeded the time limit and was terminated.")
        # Kill the whole process tree: children first, then the parent.
        parent = psutil.Process(process.pid)
        for child in parent.children(recursive=True):
            child.kill()
        parent.kill()

    else:
        print("The Python function has finished running.")


def enqueue_output(out: Any, my_queue: Any) -> None:
    """Drain *out* line by line into *my_queue*, then close the stream.

    Intended to run on a background thread so the main loop can poll the
    queue without blocking on reads.
    """
    lines = iter(out.readline, b"")
    for line in lines:
        my_queue.put(line)
    out.close()


def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
    """Stream a subprocess's stdout until it exits or *timeout* elapses.

    Windows lacks select() on pipes, so a daemon thread feeds stdout lines
    into a queue that the main loop polls. On timeout the process is
    terminated. *start_time* is the time.time() value at process start.
    """
    my_queue: Any = queue.Queue()
    # Daemon thread: dies with the interpreter, so a stuck readline cannot
    # keep the program alive after the main loop returns.
    thread = Thread(target=enqueue_output, args=(process.stdout, my_queue))
    thread.daemon = True
    thread.start()

    while True:
        try:
            output = my_queue.get_nowait().strip()
            print(output)
        except queue.Empty:
            # NOTE(review): no sleep here — this busy-waits at full CPU
            # while the queue is empty; confirm whether that is acceptable.
            pass

        if process.poll() is not None or (time.time() - start_time > timeout):
            break

    if time.time() - start_time > timeout:
        print("The Python function has exceeded the time limit and was terminated.")
        # NOTE(review): terminate() kills only the direct child, unlike the
        # Linux path which kills the whole process tree via psutil.
        process.terminate()


def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None:
def run_agent(task: str, timeout: int) -> None:
print(f"Running agbenchmark/benchmarks.py with timeout {timeout}")

command = [sys.executable, "-m", "agbenchmark_config.benchmarks", str(task)]

process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
)

start_time = time.time()

if platform.system() == "Windows":
run_windows_env(process, start_time, timeout)
else:
run_linux_env(process, start_time, timeout)

process.wait()

if process.returncode != 0:
print(f"The agent timed out")
execute_subprocess(command, timeout)


def get_list_of_file_paths(
Expand Down
Loading

0 comments on commit 0afcde3

Please sign in to comment.