From fd8f601be136c22984ec9af4718b181bfa46d20c Mon Sep 17 00:00:00 2001 From: Merwane Hamadi Date: Sun, 17 Sep 2023 17:11:23 -0700 Subject: [PATCH] Fix benchmark being stateful Signed-off-by: Merwane Hamadi --- .github/workflows/benchmark-ci.yml | 4 + autogpts/forge/forge/__main__.py | 1 - autogpts/forge/run_benchmark | 4 +- benchmark/agbenchmark/__main__.py | 90 ++++++++----------- benchmark/agbenchmark/agent_interface.py | 85 +----------------- benchmark/agbenchmark/app.py | 26 ++++-- .../deprecated/content_gen/2_plan/data.json | 2 +- benchmark/agbenchmark/execute_sub_process.py | 79 ++++++++++++++++ benchmark/agbenchmark/generate_test.py | 16 +--- .../agbenchmark/reports/ReportManager.py | 46 +++++++++- .../reports/agent_benchmark_config.py | 18 ++++ benchmark/agbenchmark/reports/reports.py | 30 +++---- benchmark/tests/__init__.py | 0 benchmark/tests/test_web_server.py | 66 ++++++++++++++ 14 files changed, 290 insertions(+), 177 deletions(-) create mode 100644 benchmark/agbenchmark/execute_sub_process.py create mode 100644 benchmark/agbenchmark/reports/agent_benchmark_config.py create mode 100644 benchmark/tests/__init__.py create mode 100644 benchmark/tests/test_web_server.py diff --git a/.github/workflows/benchmark-ci.yml b/.github/workflows/benchmark-ci.yml index 426f0d45c46..148038d4584 100644 --- a/.github/workflows/benchmark-ci.yml +++ b/.github/workflows/benchmark-ci.yml @@ -127,5 +127,9 @@ jobs: echo "Running the following command: ${prefix}agbenchmark --test=WriteFile" ${prefix}agbenchmark --test=WriteFile + sh run_benchmark & + poetry run python ../../benchmark/tests/test_web_server.py & + poetry run ../../benchmark/tests/test_web_server.py & + poetry run ../../benchmark/tests/test_web_server.py env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/autogpts/forge/forge/__main__.py b/autogpts/forge/forge/__main__.py index f20644b7da9..2dad519dfe1 100644 --- a/autogpts/forge/forge/__main__.py +++ b/autogpts/forge/forge/__main__.py @@ -4,7 +4,6 @@ load_dotenv() import forge.sdk.forge_log - forge.sdk.forge_log.setup_logger() diff --git a/autogpts/forge/run_benchmark b/autogpts/forge/run_benchmark index fa95ee76935..9d9253959bb 100755 --- a/autogpts/forge/run_benchmark +++ b/autogpts/forge/run_benchmark @@ -2,4 +2,6 @@ kill $(lsof -t -i :8080) -poetry run agbenchmark serve +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +uvicorn agbenchmark.app:app --host localhost --port 8080 --reload --log-level info --reload-dir "$SCRIPT_DIR/../../benchmark/agbenchmark" diff --git a/benchmark/agbenchmark/__main__.py b/benchmark/agbenchmark/__main__.py index a2de579e0f0..caafa14f425 100644 --- a/benchmark/agbenchmark/__main__.py +++ b/benchmark/agbenchmark/__main__.py @@ -11,12 +11,9 @@ import toml from helicone.lock import HeliconeLockManager -from agbenchmark.app import app +from agbenchmark.reports.ReportManager import SingletonReportManager from agbenchmark.utils.data_types import AgentBenchmarkConfig -from .reports.ReportManager import ReportManager -from .utils.data_types import AgentBenchmarkConfig - BENCHMARK_START_TIME_DT = datetime.now(timezone.utc) BENCHMARK_START_TIME = BENCHMARK_START_TIME_DT.strftime("%Y-%m-%dT%H:%M:%S+00:00") TEMP_FOLDER_ABS_PATH = Path.cwd() / "agbenchmark_config" / "temp_folder" @@ -26,50 +23,6 @@ UPDATES_JSON_PATH = Path.cwd() / "agbenchmark_config" / "updates.json" -def get_agent_benchmark_config() -> AgentBenchmarkConfig: - agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json") - try: - with 
open(agent_benchmark_config_path, "r") as f: - agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = ( - agent_benchmark_config_path - ) - return agent_benchmark_config - except json.JSONDecodeError: - print("Error: benchmark_config.json is not a valid JSON file.") - raise - - -def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]: - agent_benchmark_config = get_agent_benchmark_config() - # tests that consistently pass are considered regression tests - REGRESSION_MANAGER = ReportManager( - agent_benchmark_config.get_regression_reports_path(), BENCHMARK_START_TIME_DT - ) - - # print(f"Using {REPORTS_PATH} for reports") - # user facing reporting information - INFO_MANAGER = ReportManager( - str( - agent_benchmark_config.get_reports_path( - benchmark_start_time=BENCHMARK_START_TIME_DT - ) - / "report.json" - ), - BENCHMARK_START_TIME_DT, - ) - - # internal db step in replacement track pass/fail rate - INTERNAL_INFO_MANAGER = ReportManager( - agent_benchmark_config.get_success_rate_path(), BENCHMARK_START_TIME_DT - ) - - return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER - - -(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers() - - if os.environ.get("HELICONE_API_KEY"): HeliconeLockManager.write_custom_property( "benchmark_start_time", BENCHMARK_START_TIME @@ -122,6 +75,8 @@ def run_benchmark( ) -> int: """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty + initialize_updates_file() + SingletonReportManager() agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json") try: with open(agent_benchmark_config_path, "r") as f: @@ -214,7 +169,8 @@ def run_benchmark( current_dir = Path(__file__).resolve().parent print(f"Current directory: {current_dir}") pytest_args.extend((str(current_dir), "--cache-clear")) - return pytest.main(pytest_args) + exit_code = pytest.main(pytest_args) + SingletonReportManager().clear_instance() @click.group(invoke_without_command=True) @@ -314,11 +270,39 @@ def version(): print(f"Benchmark Tool Version {version}") -def serve(): - import uvicorn +from pathlib import Path - # Run the FastAPI application using uvicorn - uvicorn.run(app, host="0.0.0.0", port=8080) +# class CustomStatReload(StatReload): +# def __init__(self, *args, **kwargs): +# super().__init__(*args, **kwargs) +# # Overriding the directories to watch +# self.dirs = [Path(__file__).absolute().parent] + +# def serve(): +# current_file_path = Path(__file__).absolute().parent +# config = Config( +# "agbenchmark.app:app", # Reference to your FastAPI application +# host="localhost", # Host to bind +# port=8080, # Port to bind +# reload=True, # Enable reload +# log_level="info", # Logging level +# # reload_dirs=[str(current_file_path)], # Directories to watch +# ) +# server = Server(config) +# server.run() + + +def initialize_updates_file(): + if os.path.exists(UPDATES_JSON_PATH): + # If the file already exists, overwrite it with an empty list + with open(UPDATES_JSON_PATH, "w") as file: + json.dump([], file, indent=2) + print("Initialized updates.json by overwriting with an empty array") + else: + # If the file doesn't exist, create it and write an empty list + with open(UPDATES_JSON_PATH, "w") as file: + json.dump([], file, indent=2) + print("Created updates.json and initialized it with an empty array") if __name__ == "__main__": diff --git 
a/benchmark/agbenchmark/agent_interface.py b/benchmark/agbenchmark/agent_interface.py index 5d1b24c58b3..269e8f8ff49 100644 --- a/benchmark/agbenchmark/agent_interface.py +++ b/benchmark/agbenchmark/agent_interface.py @@ -1,18 +1,11 @@ import os -import platform -import queue -import select import shutil -import subprocess import sys -import time -from threading import Thread -from typing import Any, List +from typing import List -import psutil from dotenv import load_dotenv -from agbenchmark.utils.data_types import AgentBenchmarkConfig +from agbenchmark.execute_sub_process import execute_subprocess load_dotenv() @@ -22,82 +15,12 @@ ) -def run_linux_env(process: Any, start_time: float, timeout: float) -> None: - while True: - try: - # This checks if there's data to be read from stdout without blocking. - if process.stdout and select.select([process.stdout], [], [], 0)[0]: - output = process.stdout.readline() - print(output.strip()) - except Exception as e: - continue - - # Check if process has ended, has no more output, or exceeded timeout - if process.poll() is not None or (time.time() - start_time > timeout): - break - - if time.time() - start_time > timeout: - print("The Python function has exceeded the time limit and was terminated.") - parent = psutil.Process(process.pid) - for child in parent.children(recursive=True): - child.kill() - parent.kill() - - else: - print("The Python function has finished running.") - - -def enqueue_output(out: Any, my_queue: Any) -> None: - for line in iter(out.readline, b""): - my_queue.put(line) - out.close() - - -def run_windows_env(process: Any, start_time: float, timeout: float) -> None: - my_queue: Any = queue.Queue() - thread = Thread(target=enqueue_output, args=(process.stdout, my_queue)) - thread.daemon = True - thread.start() - - while True: - try: - output = my_queue.get_nowait().strip() - print(output) - except queue.Empty: - pass - - if process.poll() is not None or (time.time() - start_time > timeout): - break - - if time.time() - start_time > timeout: - print("The Python function has exceeded the time limit and was terminated.") - process.terminate() - - -def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None: +def run_agent(task: str, timeout: int) -> None: print(f"Running agbenchmark/benchmarks.py with timeout {timeout}") command = [sys.executable, "-m", "agbenchmark_config.benchmarks", str(task)] - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True, - bufsize=1, - ) - - start_time = time.time() - - if platform.system() == "Windows": - run_windows_env(process, start_time, timeout) - else: - run_linux_env(process, start_time, timeout) - - process.wait() - - if process.returncode != 0: - print(f"The agent timed out") + execute_subprocess(command, timeout) def get_list_of_file_paths( diff --git a/benchmark/agbenchmark/app.py b/benchmark/agbenchmark/app.py index 0485528b484..49be8c9c95d 100644 --- a/benchmark/agbenchmark/app.py +++ b/benchmark/agbenchmark/app.py @@ -10,11 +10,11 @@ from fastapi import Request, Response from fastapi.middleware.cors import CORSMiddleware -# from agbenchmark.app import app +from agbenchmark.execute_sub_process import execute_subprocess sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from fastapi import FastAPI -from pydantic import BaseModel +from pydantic import BaseModel, Extra # Change the current working directory to the benchmark path # home_path = find_absolute_benchmark_path() @@ 
-28,6 +28,9 @@ class CreateReportRequest(BaseModel): category: Optional[str] = [] mock: Optional[bool] = False + class Config: + extra = Extra.forbid # this will forbid any extra fields + updates_list = [] @@ -50,25 +53,29 @@ class CreateReportRequest(BaseModel): ) +def stream_output(pipe): + for line in pipe: + print(line, end="") + + @app.post("/reports") def run_single_test(body: CreateReportRequest) -> Any: - from agbenchmark.__main__ import run_benchmark - + print(body.dict()) # it's a hack because other parts of the code are using sys.argv - sys.argv = [sys.argv[0]] - sys.argv.append("start") + print(os.getcwd()) + command_options = ["agbenchmark"] if body.category: sys.argv.append(f"--category={body.category}") for body_test in body.tests: - sys.argv.append(f"--test={body_test}") - categories = None + command_options.append(f"--test={body_test}") if body.category: categories = tuple([body.category]) - run_benchmark(category=categories, mock=body.mock, test=tuple(body.tests)) + execute_subprocess(command_options, 200) import json from pathlib import Path + print("finished running") # List all folders in the current working directory path_reports = Path.cwd() / "agbenchmark_config" / "reports" folders = [folder for folder in path_reports.iterdir() if folder.is_dir()] @@ -82,6 +89,7 @@ def run_single_test(body: CreateReportRequest) -> Any: # Read report.json from this folder if last_folder: report_path = last_folder / "report.json" + print(report_path) if report_path.exists(): with report_path.open() as file: data = json.load(file) diff --git a/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json b/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json index e96994cf267..ed60d428cf9 100644 --- a/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json +++ b/benchmark/agbenchmark/challenges/deprecated/content_gen/2_plan/data.json @@ -17,7 +17,7 @@ }, "info": { "difficulty": "basic", - "description": "s ability to generate content based on the content of 2 files.", + "description": "ability to generate content based on the content of 2 files.", "side_effects": [] } } diff --git a/benchmark/agbenchmark/execute_sub_process.py b/benchmark/agbenchmark/execute_sub_process.py new file mode 100644 index 00000000000..b981e6be57c --- /dev/null +++ b/benchmark/agbenchmark/execute_sub_process.py @@ -0,0 +1,79 @@ +import platform +import queue +import select +import subprocess +import time +from threading import Thread +from typing import Any + +import psutil + + +def run_linux_env(process: Any, start_time: float, timeout: float) -> None: + while True: + try: + # This checks if there's data to be read from stdout without blocking. 
+ if process.stdout and select.select([process.stdout], [], [], 0)[0]: + output = process.stdout.readline() + print(output.strip()) + except Exception as e: + continue + + # Check if process has ended, has no more output, or exceeded timeout + if process.poll() is not None or (time.time() - start_time > timeout): + break + + if time.time() - start_time > timeout: + print("The Python function has exceeded the time limit and was terminated.") + parent = psutil.Process(process.pid) + for child in parent.children(recursive=True): + child.kill() + parent.kill() + + else: + print("The Python function has finished running.") + + +def enqueue_output(out: Any, my_queue: Any) -> None: + for line in iter(out.readline, b""): + my_queue.put(line) + out.close() + + +def run_windows_env(process: Any, start_time: float, timeout: float) -> None: + my_queue: Any = queue.Queue() + thread = Thread(target=enqueue_output, args=(process.stdout, my_queue)) + thread.daemon = True + thread.start() + + while True: + try: + output = my_queue.get_nowait().strip() + print(output) + except queue.Empty: + pass + + if process.poll() is not None or (time.time() - start_time > timeout): + break + + if time.time() - start_time > timeout: + print("The Python function has exceeded the time limit and was terminated.") + process.terminate() + + +def execute_subprocess(command, timeout): + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + bufsize=1, + ) + start_time = time.time() + if platform.system() == "Windows": + run_windows_env(process, start_time, timeout) + else: + run_linux_env(process, start_time, timeout) + process.wait() + if process.returncode != 0: + print(f"The agent timed out") diff --git a/benchmark/agbenchmark/generate_test.py b/benchmark/agbenchmark/generate_test.py index ef7dc40ce65..92efd79844b 100644 --- a/benchmark/agbenchmark/generate_test.py +++ b/benchmark/agbenchmark/generate_test.py @@ -10,7 +10,7 @@ import pytest -from agbenchmark.__main__ import CHALLENGES_ALREADY_BEATEN, UPDATES_JSON_PATH +from agbenchmark.__main__ import CHALLENGES_ALREADY_BEATEN from agbenchmark.agent_api_interface import append_updates_file from agbenchmark.agent_protocol_client.models.step import Step from agbenchmark.utils.challenge import Challenge @@ -218,18 +218,4 @@ def challenge_should_be_ignored(json_file): return "challenges/deprecated" in json_file or "challenges/library" in json_file -def initialize_updates_file(): - if os.path.exists(UPDATES_JSON_PATH): - # If the file already exists, overwrite it with an empty list - with open(UPDATES_JSON_PATH, "w") as file: - json.dump([], file, indent=2) - print("Initialized updates.json by overwriting with an empty array") - else: - # If the file doesn't exist, create it and write an empty list - with open(UPDATES_JSON_PATH, "w") as file: - json.dump([], file, indent=2) - print("Created updates.json and initialized it with an empty array") - - -initialize_updates_file() generate_tests() diff --git a/benchmark/agbenchmark/reports/ReportManager.py b/benchmark/agbenchmark/reports/ReportManager.py index fc4a553bc95..1b9e6ae5d91 100644 --- a/benchmark/agbenchmark/reports/ReportManager.py +++ b/benchmark/agbenchmark/reports/ReportManager.py @@ -1,3 +1,4 @@ +import copy import json import os import sys @@ -11,6 +12,48 @@ from agbenchmark.utils.utils import get_highest_success_difficulty +class SingletonReportManager: + instance = None + + def __new__(cls): + from agbenchmark.reports.agent_benchmark_config import ( + 
get_agent_benchmark_config, + ) + + if not cls.instance: + cls.instance = super(SingletonReportManager, cls).__new__(cls) + + agent_benchmark_config = get_agent_benchmark_config() + benchmark_start_time_dt = ( + datetime.now() + ) # or any logic to fetch the datetime + + # Make the Managers class attributes + cls.REGRESSION_MANAGER = ReportManager( + agent_benchmark_config.get_regression_reports_path(), + benchmark_start_time_dt, + ) + cls.INFO_MANAGER = ReportManager( + str( + agent_benchmark_config.get_reports_path(benchmark_start_time_dt) + / "report.json" + ), + benchmark_start_time_dt, + ) + cls.INTERNAL_INFO_MANAGER = ReportManager( + agent_benchmark_config.get_success_rate_path(), benchmark_start_time_dt + ) + + return cls.instance + + @classmethod + def clear_instance(cls): + cls.instance = None + cls.REGRESSION_MANAGER = None + cls.INFO_MANAGER = None + cls.INTERNAL_INFO_MANAGER = None + + class ReportManager: """Abstracts interaction with the regression tests file""" @@ -81,7 +124,7 @@ def end_info_report(self, config: AgentBenchmarkConfig) -> None: "highest_difficulty": get_highest_success_difficulty(self.tests), "total_cost": self.get_total_costs(), }, - "tests": self.tests, + "tests": copy.copy(self.tests), "config": { k: v for k, v in json.loads(config.json()).items() if v is not None }, @@ -105,6 +148,7 @@ def get_total_costs(self): cost = test_data["metrics"].get( "cost", 0 ) # gets the cost or defaults to 0 if cost is missing + if cost is not None: # check if cost is not None all_costs_none = False total_cost += cost # add cost to total diff --git a/benchmark/agbenchmark/reports/agent_benchmark_config.py b/benchmark/agbenchmark/reports/agent_benchmark_config.py new file mode 100644 index 00000000000..3b45ed713c9 --- /dev/null +++ b/benchmark/agbenchmark/reports/agent_benchmark_config.py @@ -0,0 +1,18 @@ +import json +from pathlib import Path + +from agbenchmark.utils.data_types import AgentBenchmarkConfig + + +def get_agent_benchmark_config() -> AgentBenchmarkConfig: + agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json") + try: + with open(agent_benchmark_config_path, "r") as f: + agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) + return agent_benchmark_config + except json.JSONDecodeError: + print("Error: benchmark_config.json is not a valid JSON file.") + raise diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py index dd70500f251..8a6f04c46bb 100644 --- a/benchmark/agbenchmark/reports/reports.py +++ b/benchmark/agbenchmark/reports/reports.py @@ -3,13 +3,9 @@ import sys from typing import Any, Dict -from agbenchmark.__main__ import ( - CHALLENGES_ALREADY_BEATEN, - INFO_MANAGER, - INTERNAL_INFO_MANAGER, - REGRESSION_MANAGER, - get_agent_benchmark_config, -) +from agbenchmark.__main__ import CHALLENGES_ALREADY_BEATEN +from agbenchmark.reports.agent_benchmark_config import get_agent_benchmark_config +from agbenchmark.reports.ReportManager import SingletonReportManager from agbenchmark.utils.data_types import DifficultyLevel from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone from agbenchmark.utils.utils import calculate_success_percentage @@ -21,12 +17,16 @@ def get_previous_test_results( agent_tests: dict[str, list[bool]] = {} mock = os.getenv("IS_MOCK") # Check if --mock is in sys.argv - prev_test_results = INTERNAL_INFO_MANAGER.tests.get(test_name, []) + 
prev_test_results = SingletonReportManager().INTERNAL_INFO_MANAGER.tests.get( + test_name, [] + ) if not mock: # only add if it's an actual test prev_test_results.append(info_details["metrics"]["success"]) - INTERNAL_INFO_MANAGER.add_test(test_name, prev_test_results) + SingletonReportManager().INTERNAL_INFO_MANAGER.add_test( + test_name, prev_test_results + ) # can calculate success rate regardless of mock info_details["metrics"]["success_%"] = calculate_success_percentage( @@ -45,7 +45,7 @@ def update_regression_tests( if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]: # if the last 3 tests were successful, add to the regression tests info_details["is_regression"] = True - REGRESSION_MANAGER.add_test(test_name, test_details) + SingletonReportManager().REGRESSION_MANAGER.add_test(test_name, test_details) def generate_single_call_report( @@ -95,7 +95,7 @@ def generate_single_call_report( info_details["metrics"]["success"] = True else: if not mock: # don't remove if it's a mock test - REGRESSION_MANAGER.remove_test(test_name) + SingletonReportManager().REGRESSION_MANAGER.remove_test(test_name) info_details["metrics"]["fail_reason"] = str(call.excinfo.value) if call.excinfo.typename == "Skipped": info_details["metrics"]["attempted"] = False @@ -146,7 +146,7 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None: nested_test_info, nested_test_name ) - INFO_MANAGER.add_test(test_name, info_details) + SingletonReportManager().INFO_MANAGER.add_test(test_name, info_details) def update_challenges_already_beaten( @@ -171,6 +171,6 @@ def update_challenges_already_beaten( def session_finish(suite_reports: dict) -> None: agent_benchmark_config = get_agent_benchmark_config() - INTERNAL_INFO_MANAGER.save() - INFO_MANAGER.end_info_report(agent_benchmark_config) - REGRESSION_MANAGER.save() + SingletonReportManager().INTERNAL_INFO_MANAGER.save() + SingletonReportManager().INFO_MANAGER.end_info_report(agent_benchmark_config) + SingletonReportManager().REGRESSION_MANAGER.save() diff --git a/benchmark/tests/__init__.py b/benchmark/tests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/benchmark/tests/test_web_server.py b/benchmark/tests/test_web_server.py new file mode 100644 index 00000000000..b592b4fd9e3 --- /dev/null +++ b/benchmark/tests/test_web_server.py @@ -0,0 +1,66 @@ +import threading +import time +import unittest + +import requests + + +class TestAPIRequests(unittest.TestCase): + URL = "http://localhost:8080" + + def test_post_correct_then_incorrect_request(self): + payload1 = {"tests": ["WriteFile", "ReadFile"], "mock": True} + + # First POST request + response1 = requests.post(self.URL + "/reports", json=payload1) + self.assertEqual(response1.status_code, 200) + # Here you might want to check other aspects of the response, e.g., response1.json() + print(response1.json()) + self.assertNotEqual(response1.json()["tests"], {}) + payload2 = {"tests": ["TestWriteFile", "TestReadFile"], "mock": True} + + # Second POST request + response2 = requests.post(self.URL + "/reports", json=payload2) + print(response2.json()) + + self.assertEqual(response2.json()["tests"], {}) + assert response1.json() != {} + # Here you might want to check other aspects of the response, e.g., response2.json() + + def test_invalid_payload(self): + invalid_payload = {"invalid_key": "value"} + response = requests.post(self.URL + "/reports", json=invalid_payload) + self.assertEqual(response.status_code, 422) # Assuming 400 for Bad Request + + def 
test_post_report_and_poll_updates(self): + payload1 = {"tests": ["WriteFile", "ReadFile"], "mock": True} + last_update_time = int(time.time()) + # First POST request in a separate thread + threading.Thread(target=self.send_post_request, args=(payload1,)).start() + + # Give a short time to ensure POST request is initiated before GET requests start + time.sleep(0.1) + + # Start GET requests + for _ in range(5): + # get the current UNIX time + response = requests.get( + f"{self.URL}/updates?last_update_time={last_update_time}" + ) + last_update_time = int(time.time()) + if response.status_code == 200 and response.json(): + print("Received a non-empty response:", response.json()) + break + + time.sleep(1) # wait for 1 second before the next request + else: + self.fail("No updates received") + + def send_post_request(self, payload): + response = requests.post(f"{self.URL}/reports", json=payload) + if response.status_code == 200: + print(response.json()) + + +if __name__ == "__main__": + unittest.main()
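
The heart of this patch is replacing the module-level `REGRESSION_MANAGER` / `INFO_MANAGER` / `INTERNAL_INFO_MANAGER` globals with a `SingletonReportManager` that is created at the start of `run_benchmark` and torn down with `clear_instance()` when the run finishes. The sketch below is a minimal, self-contained illustration of that reset-able singleton lifecycle; `ReportStore` and `SingletonReports` are simplified stand-ins invented for this example (not names from the patch), and the real `ReportManager` additionally persists reports to disk.

```python
# Minimal sketch of the reset-able singleton pattern this patch introduces,
# under the assumption that only the lifecycle (create once per run, clear at
# the end) matters here. ReportStore/SingletonReports are illustrative names,
# not the patch's ReportManager/SingletonReportManager.
from datetime import datetime


class ReportStore:
    """Stand-in for ReportManager; tracks per-run state, no file I/O."""

    def __init__(self, path: str, started_at: datetime) -> None:
        self.path = path
        self.started_at = started_at
        self.tests: dict[str, dict] = {}


class SingletonReports:
    instance = None

    def __new__(cls) -> "SingletonReports":
        if cls.instance is None:
            cls.instance = super().__new__(cls)
            started_at = datetime.now()
            # Fresh managers are built on first access within a run...
            cls.INFO_MANAGER = ReportStore("report.json", started_at)
        return cls.instance

    @classmethod
    def clear_instance(cls) -> None:
        # ...and discarded at the end of the run, so the next run
        # cannot see the previous run's accumulated results.
        cls.instance = None
        cls.INFO_MANAGER = None


if __name__ == "__main__":
    first = SingletonReports()
    assert SingletonReports() is first       # same object within one run
    SingletonReports.clear_instance()
    assert SingletonReports() is not first   # a new run starts with fresh state
```

This mirrors why `run_benchmark` now calls `SingletonReportManager()` up front and `SingletonReportManager().clear_instance()` after `pytest.main()` returns: report state lives only for the duration of one benchmark invocation, which is what the new `test_web_server.py` checks by issuing two back-to-back `/reports` requests and expecting the second run not to inherit the first run's results.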