32 changes: 26 additions & 6 deletions evaluate.py
@@ -5,12 +5,13 @@
import base64
import json
import openai
import argparse

from dotenv import load_dotenv

# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
TEST_CASES = {
"Go to Github.com": "The Github home page is visible.",
"Go to Github.com": "A Github page is visible.",
"Go to Youtube.com and play a video": "The YouTube video player is visible.",
}

@@ -110,10 +111,10 @@ def evaluate_final_screenshot(guideline):
    return parse_eval_content(eval_content)


def run_test_case(objective, guideline):
    '''Returns True if the result of the test with the given prompt meets the given guideline.'''
    # Run `operate` with the test case prompt
    subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
def run_test_case(objective, guideline, model):
    '''Returns True if the result of the test with the given prompt meets the given guideline for the given model.'''
    # Run `operate` with the model to evaluate and the test case prompt
    subprocess.run(['operate', '-m', model, '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)

    try:
        result = evaluate_final_screenshot(guideline)
@@ -124,17 +125,36 @@ def run_test_case(objective, guideline):
    return result


def get_test_model():
    parser = argparse.ArgumentParser(
        description="Run the self-operating-computer with a specified model."
    )

    parser.add_argument(
        "-m",
        "--model",
        help="Specify the model to evaluate.",
        required=False,
        default="gpt-4-with-ocr",
    )

    return parser.parse_args().model


def main():
    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    model = get_test_model()

    print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}")
    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")

    passed = 0; failed = 0
    for objective, guideline in TEST_CASES.items():
        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")

        result = run_test_case(objective, guideline)
        result = run_test_case(objective, guideline, model)
        if result:
            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
            passed += 1
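For illustration, a minimal sketch of how the updated evaluator could be driven from another Python script after this change, assuming evaluate.py is run from the repository root with a valid OPENAI_API_KEY available in .env (the invocation below is an example, not part of the diff):

import subprocess

# Evaluate a specific model via the new -m/--model flag;
# omitting the flag falls back to the default "gpt-4-with-ocr".
subprocess.run(["python", "evaluate.py", "-m", "gpt-4-with-ocr"], check=True)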