From 05286446841dcf6376d804465b1293e96ca73d84 Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Fri, 8 Dec 2023 14:04:33 -0500
Subject: [PATCH 01/13] Add direct prompt mode

---
 operate/main.py | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/operate/main.py b/operate/main.py
index eb6c82fa..b72102a2 100644
--- a/operate/main.py
+++ b/operate/main.py
@@ -195,7 +195,7 @@ def supports_ansi():
     ANSI_BRIGHT_MAGENTA = ""
 
 
-def main(model, accurate_mode, voice_mode=False):
+def main(model, accurate_mode, prompt, voice_mode=False):
     """
     Main function for the Self-Operating Computer
     """
@@ -216,11 +216,15 @@ def main(model, accurate_mode, prompt, voice_mode=False):
         )
         sys.exit(1)
 
-    message_dialog(
-        title="Self-Operating Computer",
-        text="Ask a computer to do anything.",
-        style=style,
-    ).run()
+    # Skip message dialog if prompt was given directly
+    if not prompt:
+        message_dialog(
+            title="Self-Operating Computer",
+            text="Ask a computer to do anything.",
+            style=style,
+        ).run()
+    else:
+        print("Running direct prompt...")
 
     print("SYSTEM", platform.system())
     # Clear the console
@@ -229,7 +233,9 @@ def main(model, accurate_mode, prompt, voice_mode=False):
     else:
         print("\033c", end="")
 
-    if voice_mode:
+    if prompt:  # Skip objective prompt if it was given as an argument
+        objective = prompt
+    elif voice_mode:
         print(
             f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)"
         )
@@ -835,10 +841,18 @@ def main_entry():
         action="store_true",
         required=False,
     )
+
+    # Allow for direct input of prompt
+    parser.add_argument(
+        "--prompt",
+        help="Directly input the objective prompt",
+        type=str,
+        required=False,
+    )
 
     try:
         args = parser.parse_args()
-        main(args.model, accurate_mode=args.accurate, voice_mode=args.voice)
+        main(args.model, accurate_mode=args.accurate, prompt=args.prompt, voice_mode=args.voice)
     except KeyboardInterrupt:
         print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")

From a16fce91db66cbaae9ca691225d62148addf7425 Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Fri, 8 Dec 2023 19:23:02 -0500
Subject: [PATCH 02/13] Add evaluator.py

---
 evaluator.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 evaluator.py

diff --git a/evaluator.py b/evaluator.py
new file mode 100644
index 00000000..de89027f
--- /dev/null
+++ b/evaluator.py
@@ -0,0 +1,75 @@
+import sys
+import os
+import subprocess
+import platform
+import openai
+
+from dotenv import load_dotenv
+
+
+# Check if on a windows terminal that supports ANSI escape codes
+def supports_ansi():
+    """
+    Check if the terminal supports ANSI escape codes
+    """
+    plat = platform.system()
+    supported_platform = plat != "Windows" or "ANSICON" in os.environ
+    is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
+    return supported_platform and is_a_tty
+
+
+if supports_ansi():
+    # Standard green text
+    ANSI_GREEN = "\033[32m"
+    # Bright/bold green text
+    ANSI_BRIGHT_GREEN = "\033[92m"
+    # Reset to default text color
+    ANSI_RESET = "\033[0m"
+    # ANSI escape code for blue text
+    ANSI_BLUE = "\033[94m"  # This is for bright blue
+
+    # Standard yellow text
+    ANSI_YELLOW = "\033[33m"
+
+    ANSI_RED = "\033[31m"
+
+    # Bright magenta text
+    ANSI_BRIGHT_MAGENTA = "\033[95m"
+else:
+    ANSI_GREEN = ""
+    ANSI_BRIGHT_GREEN = ""
+    ANSI_RESET = ""
+    ANSI_BLUE = ""
+    ANSI_YELLOW = ""
+    ANSI_RED = ""
+    ANSI_BRIGHT_MAGENTA = ""
+
+
+def run_test_case(prompt, guideline):
+    '''Returns True if the result of the test with the given prompt meets the given
+    guideline.'''
+    # Run main.py with the test case prompt
+    subprocess.run(['operate', '--prompt', f'"{prompt}"'])
+
+    return True
+
+
+def main():
+    load_dotenv()
+    openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    # Define the test cases and the guidelines
+    test_cases = {
+        "Open YouTube and play holiday music": "The YouTube video player is loaded and actively playing holiday music.",
+        "Open Google Docs and write a poem": "A Google Doc file is opened in the browser with a poem typed into it.",
+    }
+
+    for prompt, guideline in test_cases.items():
+        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} Test case '{prompt}'")
+
+        result = run_test_case(prompt, guideline)
+        if result:
+            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} Test case '{prompt}'")
+        else:
+            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} Test case '{prompt}'")
+
+
+if __name__ == "__main__":
+    main()
From ffbffb645a1feeda04a0764b1c3209d2bc30d3eb Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Fri, 8 Dec 2023 19:45:56 -0500
Subject: [PATCH 03/13] Silence operator stdout

---
 evaluator.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/evaluator.py b/evaluator.py
index de89027f..058c60a9 100644
--- a/evaluator.py
+++ b/evaluator.py
@@ -46,7 +46,7 @@ def supports_ansi():
 def run_test_case(prompt, guideline):
     '''Returns True if the result of the test with the given prompt meets the given
     guideline.'''
     # Run main.py with the test case prompt
-    subprocess.run(['operate', '--prompt', f'"{prompt}"'])
+    subprocess.run(['operate', '--prompt', f'"{prompt}"'], stdout=subprocess.DEVNULL)
 
     return True
 
@@ -60,15 +60,17 @@ def main():
         "Open YouTube and play holiday music": "The YouTube video player is loaded and actively playing holiday music.",
         "Open Google Docs and write a poem": "A Google Doc file is opened in the browser with a poem typed into it.",
     }
+
+    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET} NOTE: `operate` output is silenced.")
 
     for prompt, guideline in test_cases.items():
-        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} Test case '{prompt}'")
+        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{prompt}'")
 
         result = run_test_case(prompt, guideline)
         if result:
-            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} Test case '{prompt}'")
+            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{prompt}'")
         else:
-            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} Test case '{prompt}'")
+            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{prompt}'")

From ff7f021470c16835c29993388d88934e65145f20 Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Fri, 8 Dec 2023 19:51:46 -0500
Subject: [PATCH 04/13] Rename to `evaluate`

---
 evaluator.py => evaluate.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename evaluator.py => evaluate.py (100%)

diff --git a/evaluator.py b/evaluate.py
similarity index 100%
rename from evaluator.py
rename to evaluate.py
From c9379e14c2ac0602732dbf342e94d303c894bd06 Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Fri, 8 Dec 2023 20:46:10 -0500
Subject: [PATCH 05/13] Use gpt-4v to evaluate summary screenshot

---
 evaluate.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 72 insertions(+), 9 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 058c60a9..8a9edbc1 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -2,10 +2,21 @@
 import os
 import subprocess
 import platform
+import base64
 import openai
 
 from dotenv import load_dotenv
 
+SUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')
+
+EVALUATION_PROMPT = """
+Your job is to look at the given screenshot and determine if the following guideline is met in the image.
+You can only respond in one of two possible ways: 'TRUE' or 'FALSE' with those exact spellings.
+Respond TRUE or FALSE based on whether or not the given guideline is met.
+
+Guideline: {guideline}
+"""
+
 # Check if on a windows terminal that supports ANSI escape codes
 def supports_ansi():
     """
@@ -41,14 +52,65 @@ def supports_ansi():
     ANSI_YELLOW = ""
     ANSI_RED = ""
     ANSI_BRIGHT_MAGENTA = ""
+
+
+def format_evaluation_prompt(guideline):
+    prompt = EVALUATION_PROMPT.format(guideline=guideline)
+    return prompt
+
+
+def parse_eval_content(content):
+    if content == "TRUE":
+        return True
+    elif content == "FALSE":
+        return False
+    else:
+        print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
+        exit(1)
+
+
+def evaluate_summary_screenshot(guideline):
+    '''Load the summary screenshot and return True or False if it meets the given guideline.'''
+    with open(SUMMARY_SCREENSHOT_PATH, "rb") as img_file:
+        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+    eval_message = [{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": format_evaluation_prompt(guideline)},
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+            },
+        ],
+    }]
+
+    response = openai.chat.completions.create(
+        model="gpt-4-vision-preview",
+        messages=eval_message,
+        presence_penalty=1,
+        frequency_penalty=1,
+        temperature=0.7,
+        max_tokens=300,
+    )
+
+    eval_content = response.choices[0].message.content
+
+    return parse_eval_content(eval_content)
 
 
-def run_test_case(prompt, guideline):
+def run_test_case(objective, guideline):
     '''Returns True if the result of the test with the given prompt meets the given
     guideline.'''
-    # Run main.py with the test case prompt
-    subprocess.run(['operate', '--prompt', f'"{prompt}"'], stdout=subprocess.DEVNULL)
+    # Run `operate` with the test case prompt
+    subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
+
+    try:
+        result = evaluate_summary_screenshot(guideline)
+    except(OSError):
+        print("Couldn't open the summary screenshot")
+        return False
 
-    return True
+    return result
@@ ... @@ def main():
     load_dotenv()
     openai.api_key = os.getenv("OPENAI_API_KEY")
 
     # Define the test cases and the guidelines
     test_cases = {
+        "Go to Google.com": "The Google home page is visible with the search bar.",
         "Open YouTube and play holiday music": "The YouTube video player is loaded and actively playing holiday music.",
         "Open Google Docs and write a poem": "A Google Doc file is opened in the browser with a poem typed into it.",
     }
 
     print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET} NOTE: `operate` output is silenced.")
 
-    for prompt, guideline in test_cases.items():
-        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{prompt}'")
+    for objective, guideline in test_cases.items():
+        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")
 
-        result = run_test_case(prompt, guideline)
+        result = run_test_case(objective, guideline)
         if result:
-            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{prompt}'")
+            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
         else:
-            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{prompt}'")
+            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
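A quick way to exercise the new GPT-4V checker on its own, without driving `operate`: the sketch below is illustrative only, and assumes the renamed evaluate.py is importable, an OPENAI_API_KEY is set in a local .env, and a screenshots/summary_screenshot.png already exists from a previous run.

    import os
    import openai
    from dotenv import load_dotenv
    from evaluate import evaluate_summary_screenshot

    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")  # same key the evaluator itself uses

    # True/False from GPT-4V's judgment of the existing summary screenshot.
    met = evaluate_summary_screenshot("The Google home page is visible with the search bar.")
    print("guideline met:", met)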
From ddbbba08923b6392997b5496d784ed03eb6a9bfb Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Fri, 8 Dec 2023 20:59:30 -0500
Subject: [PATCH 06/13] Change test cases

---
 evaluate.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 8a9edbc1..20df4768 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -7,7 +7,10 @@
 
 from dotenv import load_dotenv
 
-SUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')
+TEST_CASES = {
+    "Go to Google.com": "The Google home page is visible with the search bar.",
+    "Play a video on YouTube": "A YouTube video is playing.",
+}
 
 EVALUATION_PROMPT = """
 Your job is to look at the given screenshot and determine if the following guideline is met in the image.
@@ -17,6 +20,8 @@
 Guideline: {guideline}
 """
 
+SUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')
+
 # Check if on a windows terminal that supports ANSI escape codes
 def supports_ansi():
     """
@@ -116,17 +121,10 @@ def run_test_case(objective, guideline):
 def main():
     load_dotenv()
     openai.api_key = os.getenv("OPENAI_API_KEY")
-
-    # Define the test cases and the guidelines
-    test_cases = {
-        "Go to Google.com": "The Google home page is visible with the search bar.",
-        "Open YouTube and play holiday music": "The YouTube video player is loaded and actively playing holiday music.",
-        "Open Google Docs and write a poem": "A Google Doc file is opened in the browser with a poem typed into it.",
-    }
 
     print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET} NOTE: `operate` output is silenced.")
 
-    for objective, guideline in test_cases.items():
+    for objective, guideline in TEST_CASES.items():
         print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")
 
         result = run_test_case(objective, guideline)

From 138012a4e496cc1682a9339b5a615c3bde8252e9 Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Sat, 9 Dec 2023 00:11:48 -0500
Subject: [PATCH 07/13] Add summary message

---
 evaluate.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/evaluate.py b/evaluate.py
index 20df4768..6c28fa68 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -122,8 +122,9 @@ def main():
     load_dotenv()
     openai.api_key = os.getenv("OPENAI_API_KEY")
 
-    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET} NOTE: `operate` output is silenced.")
+    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")
 
+    passed = 0; failed = 0
     for objective, guideline in TEST_CASES.items():
         print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")
 
@@ -133,6 +134,9 @@ def main():
         else:
             print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
 
+    print(
+        f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed"
+    )
 
 if __name__ == "__main__":
     main()
From 8cbd372646928cdde9cdfeae002713f0d7ccd84d Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Sat, 9 Dec 2023 12:12:32 -0500
Subject: [PATCH 08/13] Add evaluation justification

---
 evaluate.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 6c28fa68..37e1330b 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -3,6 +3,7 @@
 import subprocess
 import platform
 import base64
+import json
 import openai
 
 from dotenv import load_dotenv
@@ -14,8 +15,10 @@
 
 EVALUATION_PROMPT = """
 Your job is to look at the given screenshot and determine if the following guideline is met in the image.
-You can only respond in one of two possible ways: 'TRUE' or 'FALSE' with those exact spellings.
-Respond TRUE or FALSE based on whether or not the given guideline is met.
+You must respond in the following format ONLY. Do not add anything else:
+{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
+guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
+reason must be a string containing a justification for your decision.
 
 Guideline: {guideline}
 """
@@ -65,11 +68,13 @@ def format_evaluation_prompt(guideline):
 
 
 def parse_eval_content(content):
-    if content == "TRUE":
-        return True
-    elif content == "FALSE":
-        return False
-    else:
+    try:
+        res = json.loads(content)
+
+        print(res["reason"])
+
+        return res["guideline_met"]
+    except:
         print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
         exit(1)
 
@@ -131,8 +136,10 @@ def main():
         result = run_test_case(objective, guideline)
         if result:
             print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
+            passed += 1
         else:
             print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
+            failed += 1
 
     print(
         f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed"

From 4be8acda76d38e70581654fa306ef07caa990261 Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Sat, 9 Dec 2023 13:10:22 -0500
Subject: [PATCH 09/13] Change default test cases

---
 evaluate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 37e1330b..8189bf96 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -9,8 +9,8 @@
 from dotenv import load_dotenv
 
 TEST_CASES = {
-    "Go to Google.com": "The Google home page is visible with the search bar.",
-    "Play a video on YouTube": "A YouTube video is playing.",
+    "Go to Github.com": "The Github home page is visible.",
+    "Go to Youtube.com and play a video": "The YouTube video player is visible.",
 }

From 33f8e917a035ce50404a9dfc2a4b8292d1de8266 Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Sat, 9 Dec 2023 13:39:38 -0500
Subject: [PATCH 10/13] Add comment to TEST_CASES

---
 evaluate.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/evaluate.py b/evaluate.py
index 8189bf96..f543c82c 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -8,6 +8,7 @@
 
 from dotenv import load_dotenv
 
+# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
 TEST_CASES = {
     "Go to Github.com": "The Github home page is visible.",
     "Go to Youtube.com and play a video": "The YouTube video player is visible.",
 }
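The response contract that PATCH 08 establishes between EVALUATION_PROMPT and parse_eval_content can be sanity-checked offline with a canned model reply; this is a sketch only, with no API call involved, and the reply string is made up for illustration.

    import json

    # Shape requested by EVALUATION_PROMPT: a JSON object with a boolean and a string.
    canned_reply = '{ "guideline_met": true, "reason": "The Github home page is visible." }'

    res = json.loads(canned_reply)  # parse_eval_content() performs the same parse internally
    assert res["guideline_met"] is True
    assert isinstance(res["reason"], str)
    print(res["guideline_met"], "-", res["reason"])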
From 2341c23f83abebefe511fe3ff7ee5a878ff50acc Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Sun, 10 Dec 2023 09:18:13 -0500
Subject: [PATCH 11/13] Remove evaluator contribution idea

---
 CONTRIBUTING.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d9be9bcc..64c73aa8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -14,7 +14,6 @@ We appreciate your contributions!
 3. Run `operate` to test your changes
 
 ## Contribution Ideas
-- **Develop an Automated End-to-End Testing System**: Build an automated testing framework that can be run before merging PRs to `main` to confirm no test cases broke. An example of such a test case would be "go to google docs and write a poem". This testing system should be flexible to add new test cases in the future and reduce the time spent on manually testing each PR.
 - **Improve performance by finding optimal screenshot grid**: A primary element of the framework is that it overlays a percentage grid on the screenshot which GPT-4v uses to estimate click locations. If someone is able to find the optimal grid and some evaluation metrics to confirm it is an improvement on the current method then we will merge that PR.
 - **Improve the `SUMMARY_PROMPT`**
 - **Improve Linux and Windows compatibility**: There are still some issues with Linux and Windows compatibility. PRs to fix the issues are encouraged.

From 9dabcb5ea792b52abac3751e23947fc0c9d0f01e Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Tue, 12 Dec 2023 18:13:29 -0800
Subject: [PATCH 12/13] Change name of parameter to `terminal_prompt` to avoid
 override of imported `prompt`

---
 operate/main.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/operate/main.py b/operate/main.py
index 66631e2e..e1c38ff9 100644
--- a/operate/main.py
+++ b/operate/main.py
@@ -133,7 +133,6 @@
 """
 
 
-
 class ModelNotRecognizedException(Exception):
     """Exception raised for unrecognized models."""
 
@@ -195,7 +194,7 @@ def supports_ansi():
     ANSI_BRIGHT_MAGENTA = ""
 
 
-def main(model, accurate_mode, prompt, voice_mode=False):
+def main(model, accurate_mode, terminal_prompt, voice_mode=False):
     """
     Main function for the Self-Operating Computer
     """
@@ -217,7 +216,7 @@ def main(model, accurate_mode, terminal_prompt, voice_mode=False):
         sys.exit(1)
 
     # Skip message dialog if prompt was given directly
-    if not prompt:
+    if not terminal_prompt:
         message_dialog(
             title="Self-Operating Computer",
             text="Ask a computer to do anything.",
@@ -233,8 +232,8 @@ def main(model, accurate_mode, terminal_prompt, voice_mode=False):
     else:
         print("\033c", end="")
 
-    if prompt:  # Skip objective prompt if it was given as an argument
-        objective = prompt
+    if terminal_prompt:  # Skip objective prompt if it was given as an argument
+        objective = terminal_prompt
     elif voice_mode:
         print(
             f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)"
         )
@@ -843,7 +842,7 @@ def main_entry():
         action="store_true",
         required=False,
     )
-
+
     # Allow for direct input of prompt
     parser.add_argument(
         "--prompt",
@@ -854,7 +853,12 @@ def main_entry():
 
     try:
         args = parser.parse_args()
-        main(args.model, accurate_mode=args.accurate, prompt=args.prompt, voice_mode=args.voice)
+        main(
+            args.model,
+            accurate_mode=args.accurate,
+            terminal_prompt=args.prompt,
+            voice_mode=args.voice,
+        )
     except KeyboardInterrupt:
         print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")

From 979840cc9cf7185c68b07383c2e4d69c8c1476df Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Tue, 12 Dec 2023 18:14:36 -0800
Subject: [PATCH 13/13] Remove extra function docstring

---
 operate/main.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/operate/main.py b/operate/main.py
index e1c38ff9..fe556e0d 100644
--- a/operate/main.py
+++ b/operate/main.py
@@ -200,9 +200,6 @@ def main(model, accurate_mode, terminal_prompt, voice_mode=False):
     """
     mic = None
     # Initialize WhisperMic if voice_mode is True if voice_mode is True
-    """
-    Main function for the Self-Operating Computer
-    """
     if voice_mode:
         try:
             from whisper_mic import WhisperMic
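With the whole series applied, a full evaluation run is just `python evaluate.py`. Below is a sketch of driving it programmatically with one extra case; the Wikipedia case is hypothetical and not part of the series, and the sketch assumes the `operate` CLI is installed and an OPENAI_API_KEY is set in a local .env.

    import os
    import openai
    from dotenv import load_dotenv
    import evaluate

    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    # Key is the objective handed to `operate --prompt`; value is the guideline for GPT-4V.
    evaluate.TEST_CASES["Go to Wikipedia.org"] = "The Wikipedia home page is visible."

    for objective, guideline in evaluate.TEST_CASES.items():
        passed = evaluate.run_test_case(objective, guideline)
        print(("PASSED" if passed else "FAILED"), objective)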