From e2537900e15d5f5688a8298402e7d9c5021e9646 Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Thu, 15 Feb 2024 08:53:12 -0500
Subject: [PATCH 1/2] Add `-m` argument to evaluate.py

---
 evaluate.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/evaluate.py b/evaluate.py
index 19b28d60..5fc29282 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -5,12 +5,13 @@
 import base64
 import json
 import openai
+import argparse
 from dotenv import load_dotenv
 
 
 # "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
 TEST_CASES = {
-    "Go to Github.com": "The Github home page is visible.",
+    "Go to Github.com": "A Github page is visible.",
     "Go to Youtube.com and play a video": "The YouTube video player is visible.",
 }
 
@@ -124,10 +125,29 @@ def run_test_case(objective, guideline):
     return result
 
 
+def get_test_model():
+    parser = argparse.ArgumentParser(
+        description="Run the self-operating-computer with a specified model."
+    )
+
+    parser.add_argument(
+        "-m",
+        "--model",
+        help="Specify the model to evaluate.",
+        required=False,
+        default="gpt-4-with-ocr",
+    )
+
+    return parser.parse_args().model
+
+
 def main():
     load_dotenv()
     openai.api_key = os.getenv("OPENAI_API_KEY")
 
+    model = get_test_model()
+
+    print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}")
     print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")
     passed = 0; failed = 0
 

From f781cfe652c10d6784118fdde9bb9c172b57bb7e Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Thu, 15 Feb 2024 09:51:41 -0500
Subject: [PATCH 2/2] Pass model to `operate`

---
 evaluate.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 5fc29282..124e9ac0 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -111,10 +111,10 @@ def evaluate_final_screenshot(guideline):
     return parse_eval_content(eval_content)
 
 
-def run_test_case(objective, guideline):
-    '''Returns True if the result of the test with the given prompt meets the given guideline.'''
-    # Run `operate` with the test case prompt
-    subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
+def run_test_case(objective, guideline, model):
+    '''Returns True if the result of the test with the given prompt meets the given guideline for the given model.'''
+    # Run `operate` with the model to evaluate and the test case prompt
+    subprocess.run(['operate', '-m', model, '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
 
     try:
         result = evaluate_final_screenshot(guideline)
@@ -154,7 +154,7 @@ def main():
     for objective, guideline in TEST_CASES.items():
         print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")
 
-        result = run_test_case(objective, guideline)
+        result = run_test_case(objective, guideline, model)
         if result:
             print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
             passed += 1
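
Note: with both patches applied, the evaluation can presumably be pointed at a specific model, falling back to the default `gpt-4-with-ocr` when `-m` is omitted. A hypothetical invocation (assuming `evaluate.py` is run directly and that the `operate` CLI accepts `-m`, as the second patch implies):

    python evaluate.py -m gpt-4-with-ocr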