Implement old polling mechanism #5248

Merged: 2 commits, Sep 18, 2023
3 changes: 3 additions & 0 deletions .github/workflows/benchmark-ci.yml
@@ -127,5 +127,8 @@ jobs:

echo "Running the following command: ${prefix}agbenchmark --test=WriteFile"
${prefix}agbenchmark --test=WriteFile
+sh run_benchmark &
+sleep 5
+python ../../benchmark/tests/test_web_server.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
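
The new CI steps start the benchmark server in the background, wait five seconds, and then run benchmark/tests/test_web_server.py against it. That test file is not shown in this diff; a minimal sketch of the poll-then-request check it implies is below. The endpoint and payload are assumptions based on the /reports route and CreateReportRequest model added in app.py later in this PR, not a copy of the real test.

# Hypothetical sketch of a test_web_server.py-style check: poll until the
# benchmark server accepts connections, then fire one mock report request.
import time

import requests

BASE_URL = "http://localhost:8080"


def wait_until_up(timeout: float = 30.0) -> None:
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            requests.get(BASE_URL, timeout=2)
            return  # any HTTP response means the server is listening
        except requests.ConnectionError:
            time.sleep(1)  # not accepting connections yet; retry
    raise TimeoutError("benchmark server never came up on port 8080")


if __name__ == "__main__":
    wait_until_up()
    response = requests.post(
        f"{BASE_URL}/reports",
        json={"test": "WriteFile", "mock": True},  # fields from CreateReportRequest
        timeout=300,
    )
    print(response.status_code, response.json())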
2 changes: 2 additions & 0 deletions autogpts/forge/advanced_commands/README.md
@@ -0,0 +1,2 @@
+Advanced commands to develop on the forge and the benchmark.
+Stability not guaranteed.
9 changes: 9 additions & 0 deletions autogpts/forge/advanced_commands/run_benchmark_dev
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Kill processes using port 8080 if any.
+if lsof -t -i :8080; then
+    kill $(lsof -t -i :8080)
+fi
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+uvicorn agbenchmark.app:app --host localhost --port 8080 --reload --log-level info --reload-dir "$SCRIPT_DIR/../../../benchmark/agbenchmark"
1 change: 0 additions & 1 deletion autogpts/forge/forge/__main__.py
@@ -4,7 +4,6 @@

load_dotenv()
import forge.sdk.forge_log
-
forge.sdk.forge_log.setup_logger()


6 changes: 4 additions & 2 deletions autogpts/forge/run_benchmark
@@ -1,5 +1,7 @@
#!/bin/bash

-kill $(lsof -t -i :8080)
-
+# Kill processes using port 8080 if any.
+if lsof -t -i :8080; then
+    kill $(lsof -t -i :8080)
+fi
poetry run agbenchmark serve
72 changes: 23 additions & 49 deletions benchmark/agbenchmark/__main__.py
@@ -12,11 +12,9 @@
from helicone.lock import HeliconeLockManager

from agbenchmark.app import app
+from agbenchmark.reports.ReportManager import SingletonReportManager
+from agbenchmark.utils.data_types import AgentBenchmarkConfig

-from .reports.ReportManager import ReportManager
-from .utils.data_types import AgentBenchmarkConfig
-
BENCHMARK_START_TIME_DT = datetime.now(timezone.utc)
BENCHMARK_START_TIME = BENCHMARK_START_TIME_DT.strftime("%Y-%m-%dT%H:%M:%S+00:00")
TEMP_FOLDER_ABS_PATH = Path.cwd() / "agbenchmark_config" / "temp_folder"
@@ -26,50 +24,6 @@
UPDATES_JSON_PATH = Path.cwd() / "agbenchmark_config" / "updates.json"


-def get_agent_benchmark_config() -> AgentBenchmarkConfig:
-    agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
-    try:
-        with open(agent_benchmark_config_path, "r") as f:
-            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = (
-                agent_benchmark_config_path
-            )
-            return agent_benchmark_config
-    except json.JSONDecodeError:
-        print("Error: benchmark_config.json is not a valid JSON file.")
-        raise
-
-
-def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
-    agent_benchmark_config = get_agent_benchmark_config()
-    # tests that consistently pass are considered regression tests
-    REGRESSION_MANAGER = ReportManager(
-        agent_benchmark_config.get_regression_reports_path(), BENCHMARK_START_TIME_DT
-    )
-
-    # print(f"Using {REPORTS_PATH} for reports")
-    # user facing reporting information
-    INFO_MANAGER = ReportManager(
-        str(
-            agent_benchmark_config.get_reports_path(
-                benchmark_start_time=BENCHMARK_START_TIME_DT
-            )
-            / "report.json"
-        ),
-        BENCHMARK_START_TIME_DT,
-    )
-
-    # internal db step in replacement track pass/fail rate
-    INTERNAL_INFO_MANAGER = ReportManager(
-        agent_benchmark_config.get_success_rate_path(), BENCHMARK_START_TIME_DT
-    )
-
-    return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
-
-
-(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()


if os.environ.get("HELICONE_API_KEY"):
HeliconeLockManager.write_custom_property(
"benchmark_start_time", BENCHMARK_START_TIME
@@ -122,6 +76,9 @@ def run_benchmark(
) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
    # Check if configuration file exists and is not empty
+
+    initialize_updates_file()
+    SingletonReportManager()
agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
try:
with open(agent_benchmark_config_path, "r") as f:
@@ -214,7 +171,8 @@ def run_benchmark(
current_dir = Path(__file__).resolve().parent
print(f"Current directory: {current_dir}")
pytest_args.extend((str(current_dir), "--cache-clear"))
-    return pytest.main(pytest_args)
+    exit_code = pytest.main(pytest_args)
+    SingletonReportManager().clear_instance()


@click.group(invoke_without_command=True)
@@ -226,7 +184,7 @@ def run_benchmark(
multiple=True,
help="Skips preventing the tests from this category from running",
)
@click.option("--test", help="Specific test to run")
@click.option("--test", multiple=True, help="Specific test to run")
@click.option("--maintain", is_flag=True, help="Runs only regression tests")
@click.option("--improve", is_flag=True, help="Run only non-regression tests")
@click.option(
@@ -314,12 +272,28 @@ def version():
print(f"Benchmark Tool Version {version}")


+from pathlib import Path
+
+
def serve():
import uvicorn

# Run the FastAPI application using uvicorn
uvicorn.run(app, host="0.0.0.0", port=8080)


+def initialize_updates_file():
+    if os.path.exists(UPDATES_JSON_PATH):
+        # If the file already exists, overwrite it with an empty list
+        with open(UPDATES_JSON_PATH, "w") as file:
+            json.dump([], file, indent=2)
+        print("Initialized updates.json by overwriting with an empty array")
+    else:
+        # If the file doesn't exist, create it and write an empty list
+        with open(UPDATES_JSON_PATH, "w") as file:
+            json.dump([], file, indent=2)
+        print("Created updates.json and initialized it with an empty array")
+
+
if __name__ == "__main__":
cli()
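
SingletonReportManager replaces the module-level get_report_managers() machinery deleted above: the three ReportManager instances are now created on first use and torn down via clear_instance() once pytest finishes. The class itself lives in agbenchmark/reports/ReportManager.py and is outside this diff; a sketch of the pattern, with everything beyond the two calls visible in the diff assumed rather than taken from the source:

# Hypothetical sketch of the singleton pattern behind SingletonReportManager;
# only the SingletonReportManager() and clear_instance() calls are grounded in
# this diff. Manager construction details are placeholders.
class SingletonReportManager:
    instance = None

    def __new__(cls):
        if cls.instance is None:
            cls.instance = super().__new__(cls)
            # Build the report managers on first use instead of at import
            # time, as the deleted get_report_managers() did.
            cls.instance.REGRESSION_MANAGER = ...  # ReportManager(...)
            cls.instance.INFO_MANAGER = ...  # ReportManager(...)
            cls.instance.INTERNAL_INFO_MANAGER = ...  # ReportManager(...)
        return cls.instance

    @classmethod
    def clear_instance(cls):
        # Invoked after pytest.main() so the next run starts with fresh state.
        cls.instance = None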
85 changes: 4 additions & 81 deletions benchmark/agbenchmark/agent_interface.py
@@ -1,18 +1,11 @@
import os
-import platform
-import queue
-import select
import shutil
-import subprocess
import sys
-import time
-from threading import Thread
-from typing import Any, List
+from typing import List

-import psutil
from dotenv import load_dotenv

-from agbenchmark.utils.data_types import AgentBenchmarkConfig
+from agbenchmark.execute_sub_process import execute_subprocess

load_dotenv()

@@ -22,82 +15,12 @@
)


-def run_linux_env(process: Any, start_time: float, timeout: float) -> None:
-    while True:
-        try:
-            # This checks if there's data to be read from stdout without blocking.
-            if process.stdout and select.select([process.stdout], [], [], 0)[0]:
-                output = process.stdout.readline()
-                print(output.strip())
-        except Exception as e:
-            continue
-
-        # Check if process has ended, has no more output, or exceeded timeout
-        if process.poll() is not None or (time.time() - start_time > timeout):
-            break
-
-    if time.time() - start_time > timeout:
-        print("The Python function has exceeded the time limit and was terminated.")
-        parent = psutil.Process(process.pid)
-        for child in parent.children(recursive=True):
-            child.kill()
-        parent.kill()
-
-    else:
-        print("The Python function has finished running.")
-
-
-def enqueue_output(out: Any, my_queue: Any) -> None:
-    for line in iter(out.readline, b""):
-        my_queue.put(line)
-    out.close()
-
-
-def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
-    my_queue: Any = queue.Queue()
-    thread = Thread(target=enqueue_output, args=(process.stdout, my_queue))
-    thread.daemon = True
-    thread.start()
-
-    while True:
-        try:
-            output = my_queue.get_nowait().strip()
-            print(output)
-        except queue.Empty:
-            pass
-
-        if process.poll() is not None or (time.time() - start_time > timeout):
-            break
-
-    if time.time() - start_time > timeout:
-        print("The Python function has exceeded the time limit and was terminated.")
-        process.terminate()
-
-
-def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None:
+def run_agent(task: str, timeout: int) -> None:
print(f"Running agbenchmark/benchmarks.py with timeout {timeout}")

command = [sys.executable, "-m", "agbenchmark_config.benchmarks", str(task)]

-    process = subprocess.Popen(
-        command,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        universal_newlines=True,
-        bufsize=1,
-    )
-
-    start_time = time.time()
-
-    if platform.system() == "Windows":
-        run_windows_env(process, start_time, timeout)
-    else:
-        run_linux_env(process, start_time, timeout)
-
-    process.wait()
-
-    if process.returncode != 0:
-        print(f"The agent timed out")
+    execute_subprocess(command, timeout)


def get_list_of_file_paths(
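
run_agent now delegates to execute_subprocess from agbenchmark/execute_sub_process.py, which this diff imports but does not show. Only the execute_subprocess(command, timeout) signature is grounded in the calls above; a minimal cross-platform sketch of such a helper:

# Hypothetical sketch of agbenchmark.execute_sub_process.execute_subprocess.
# subprocess.run enforces the timeout itself, replacing the hand-rolled
# Windows/Linux stdout-polling loops deleted above.
import subprocess
from typing import List


def execute_subprocess(command: List[str], timeout: int) -> None:
    try:
        # Inherit stdout/stderr so the agent's output streams straight
        # through, and let subprocess kill the child on timeout.
        subprocess.run(command, timeout=timeout, check=False)
    except subprocess.TimeoutExpired:
        print("The agent timed out")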
76 changes: 58 additions & 18 deletions benchmark/agbenchmark/app.py
@@ -1,33 +1,67 @@
import json
import os
import sys
-from typing import Any, List, Optional
+from typing import Any, Optional

import psutil
from fastapi import FastAPI
+from fastapi import (
+    HTTPException as FastAPIHTTPException,  # Import HTTPException from FastAPI
+)
+from fastapi import Request, Response
+from fastapi.middleware.cors import CORSMiddleware

+# from agbenchmark.app import app
+from agbenchmark.execute_sub_process import execute_subprocess

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from fastapi import FastAPI
-from pydantic import BaseModel
+from pydantic import BaseModel, Extra

# Change the current working directory to the benchmark path
# home_path = find_absolute_benchmark_path()
# os.chdir(home_path)

general_command = ["poetry", "run", "agbenchmark", "start", "--backend"]

+import psutil


+def find_agbenchmark_without_uvicorn():
+    pids = []
+    for process in psutil.process_iter(
+        attrs=[
+            "pid",
+            "cmdline",
+            "name",
+            "username",
+            "status",
+            "cpu_percent",
+            "memory_info",
+            "create_time",
+            "cwd",
+            "connections",
+        ]
+    ):
+        try:
+            # Convert the process.info dictionary values to strings and concatenate them
+            full_info = " ".join([str(v) for k, v in process.info.items()])
+
+            if "agbenchmark" in full_info and "uvicorn" not in full_info:
+                pids.append(process.info["pid"])
+        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+            pass
+    return pids


class CreateReportRequest(BaseModel):
-    tests: Optional[List[str]] = []
-    category: Optional[str] = []
+    test: str = None
+    test_run_id: str = None
+    # category: Optional[str] = []
    mock: Optional[bool] = False

+    class Config:
+        extra = Extra.forbid  # this will forbid any extra fields


updates_list = []

@@ -50,25 +84,30 @@ class CreateReportRequest(
)


+def stream_output(pipe):
+    for line in pipe:
+        print(line, end="")


@app.post("/reports")
def run_single_test(body: CreateReportRequest) -> Any:
-    from agbenchmark.__main__ import run_benchmark

+    pids = find_agbenchmark_without_uvicorn()
+    print(f"pids already running with agbenchmark: {pids}")
    print(body.dict())
-    # it's a hack because other parts of the code are using sys.argv
-    sys.argv = [sys.argv[0]]
-    sys.argv.append("start")
-    if body.category:
-        sys.argv.append(f"--category={body.category}")
-    for body_test in body.tests:
-        sys.argv.append(f"--test={body_test}")
-    categories = None
-    if body.category:
-        categories = tuple([body.category])

-    run_benchmark(category=categories, mock=body.mock, test=tuple(body.tests))
+    print(os.getcwd())
+    command_options = ["agbenchmark"]
+    # if body.category:
+    #     sys.argv.append(f"--category={body.category}")
+    command_options.append(f"--test={body.test}")
+    if body.mock:
+        command_options.append("--mock")

+    execute_subprocess(command_options, 200)
+    import json
+    from pathlib import Path

+    print("finished running")
# List all folders in the current working directory
path_reports = Path.cwd() / "agbenchmark_config" / "reports"
folders = [folder for folder in path_reports.iterdir() if folder.is_dir()]
@@ -82,6 +121,7 @@ def run_single_test(body: CreateReportRequest) -> Any:
    # Read report.json from this folder
    if last_folder:
        report_path = last_folder / "report.json"
+        print(report_path)
        if report_path.exists():
            with report_path.open() as file:
                data = json.load(file)
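
With the endpoint reworked this way, a frontend or CI check can trigger a single benchmark test over HTTP and read back the newest report.json. A sketch of a client call, assuming the server is listening on port 8080 as configured by run_benchmark:

# Example client call for the /reports endpoint above. The payload mirrors the
# new CreateReportRequest model; unknown fields would be rejected by
# Extra.forbid.
import requests

response = requests.post(
    "http://localhost:8080/reports",
    json={"test": "WriteFile", "mock": True},
    timeout=300,  # the server gives the benchmark subprocess up to 200s
)
response.raise_for_status()
print(response.json())  # contents of the most recent report.json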