diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d9be9bcc..64c73aa8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -14,7 +14,6 @@ We appreciate your contributions!
 3. Run `operate` to test your changes
 
 ## Contribution Ideas
-- **Develop an Automated End-to-End Testing System**: Build an automated testing framework that can be run before merging PRs to `main` to confirm no test cases broke. An example of such a test case would be "go to google docs and write a poem". This testing system should be flexible to add new test cases in the future and reduce the time spent on manually testing each PR.
 - **Improve performance by finding optimal screenshot grid**: A primary element of the framework is that it overlays a percentage grid on the screenshot which GPT-4v uses to estimate click locations. If someone is able to find the optimal grid and some evaluation metrics to confirm it is an improvement on the current method then we will merge that PR.
 - **Improve the `SUMMARY_PROMPT`**
 - **Improve Linux and Windows compatibility**: There are still some issues with Linux and Windows compatibility. PRs to fix the issues are encouraged.
diff --git a/evaluate.py b/evaluate.py
new file mode 100644
index 00000000..f543c82c
--- /dev/null
+++ b/evaluate.py
@@ -0,0 +1,150 @@
+import sys
+import os
+import subprocess
+import platform
+import base64
+import json
+import openai
+
+from dotenv import load_dotenv
+
+# "Objective for `operate`" : "Guideline for passing this test case, given to GPT-4v"
+TEST_CASES = {
+    "Go to Github.com": "The Github home page is visible.",
+    "Go to Youtube.com and play a video": "The YouTube video player is visible.",
+}
+
+EVALUATION_PROMPT = """
+Your job is to look at the given screenshot and determine if the following guideline is met in the image.
+You must respond in the following format ONLY. Do not add anything else:
+{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
+guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
+reason must be a string containing a justification for your decision.
+
+Guideline: {guideline}
+"""
+
+SUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')
+
+
+# Check whether the current terminal supports ANSI escape codes
+def supports_ansi():
+    """
+    Check if the terminal supports ANSI escape codes
+    """
+    plat = platform.system()
+    supported_platform = plat != "Windows" or "ANSICON" in os.environ
+    is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
+    return supported_platform and is_a_tty
+
+
+if supports_ansi():
+    # Standard green text
+    ANSI_GREEN = "\033[32m"
+    # Bright/bold green text
+    ANSI_BRIGHT_GREEN = "\033[92m"
+    # Reset to default text color
+    ANSI_RESET = "\033[0m"
+    # Bright blue text
+    ANSI_BLUE = "\033[94m"
+    # Standard yellow text
+    ANSI_YELLOW = "\033[33m"
+    # Standard red text
+    ANSI_RED = "\033[31m"
+    # Bright magenta text
+    ANSI_BRIGHT_MAGENTA = "\033[95m"
+else:
+    ANSI_GREEN = ""
+    ANSI_BRIGHT_GREEN = ""
+    ANSI_RESET = ""
+    ANSI_BLUE = ""
+    ANSI_YELLOW = ""
+    ANSI_RED = ""
+    ANSI_BRIGHT_MAGENTA = ""
+
+
+def format_evaluation_prompt(guideline):
+    prompt = EVALUATION_PROMPT.format(guideline=guideline)
+    return prompt
+
+
+def parse_eval_content(content):
+    try:
+        res = json.loads(content)
+
+        print(res["reason"])
+
+        return res["guideline_met"]
+    except (json.JSONDecodeError, KeyError):
+        print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
+        exit(1)
+
+
+def evaluate_summary_screenshot(guideline):
+    '''Load the summary screenshot and return True or False if it meets the given guideline.'''
+    with open(SUMMARY_SCREENSHOT_PATH, "rb") as img_file:
+        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+    eval_message = [{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": format_evaluation_prompt(guideline)},
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+            },
+        ],
+    }]
+
+    response = openai.chat.completions.create(
+        model="gpt-4-vision-preview",
+        messages=eval_message,
+        presence_penalty=1,
+        frequency_penalty=1,
+        temperature=0.7,
+        max_tokens=300,
+    )
+
+    eval_content = response.choices[0].message.content
+
+    return parse_eval_content(eval_content)
+
+
+def run_test_case(objective, guideline):
+    '''Returns True if the result of the test with the given prompt meets the given guideline.'''
+    # Run `operate` with the test case prompt; list-form argv needs no extra shell quoting
+    subprocess.run(["operate", "--prompt", objective], stdout=subprocess.DEVNULL)
+
+    try:
+        result = evaluate_summary_screenshot(guideline)
+    except OSError:
+        print("Couldn't open the summary screenshot")
+        return False
+
+    return result
+
+
+def main():
+    load_dotenv()
+    openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")
+
+    passed = 0
+    failed = 0
+    for objective, guideline in TEST_CASES.items():
+        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")
+
+        result = run_test_case(objective, guideline)
+        if result:
+            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
+            passed += 1
+        else:
+            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
+            failed += 1
+
+    print(
+        f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed"
+    )
+
+
+if __name__ == "__main__":
+    main()
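A note on the evaluation contract above: GPT-4V must reply with a single JSON object and nothing else, which is what keeps parse_eval_content trivial. A minimal sketch of the round trip, exercising the parser with a canned response instead of a live GPT-4V call (illustrative only, not part of this diff; assumes evaluate.py is importable from the repo root):

    from evaluate import parse_eval_content

    # A well-formed verdict: the reason is printed and the boolean is returned.
    canned = '{ "guideline_met": true, "reason": "The Github home page is visible." }'
    assert parse_eval_content(canned) is True

    # Anything malformed is treated as fatal: an error is printed and the
    # script exits with status 1.
    try:
        parse_eval_content("this is not JSON")
    except SystemExit as exc:
        assert exc.code == 1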
Exiting...") + exit(1) + + +def evaluate_summary_screenshot(guideline): + '''Load the summary screenshot and return True or False if it meets the given guideline.''' + with open(SUMMARY_SCREENSHOT_PATH, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + eval_message = [{ + "role": "user", + "content": [ + {"type": "text", "text": format_evaluation_prompt(guideline)}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, + }, + ], + }] + + response = openai.chat.completions.create( + model="gpt-4-vision-preview", + messages=eval_message, + presence_penalty=1, + frequency_penalty=1, + temperature=0.7, + max_tokens=300, + ) + + eval_content = response.choices[0].message.content + + return parse_eval_content(eval_content) + + +def run_test_case(objective, guideline): + '''Returns True if the result of the test with the given prompt meets the given guideline.''' + # Run `operate` with the test case prompt + subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL) + + try: + result = evaluate_summary_screenshot(guideline) + except(OSError): + print("Couldn't open the summary screenshot") + return False + + return result + + +def main(): + load_dotenv() + openai.api_key = os.getenv("OPENAI_API_KEY") + + print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}") + + passed = 0; failed = 0 + for objective, guideline in TEST_CASES.items(): + print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'") + + result = run_test_case(objective, guideline) + if result: + print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'") + passed += 1 + else: + print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'") + failed += 1 + + print( + f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed" + ) + +if __name__ == "__main__": + main() diff --git a/operate/main.py b/operate/main.py index 5a3b3660..fe556e0d 100644 --- a/operate/main.py +++ b/operate/main.py @@ -133,7 +133,6 @@ """ - class ModelNotRecognizedException(Exception): """Exception raised for unrecognized models.""" @@ -195,15 +194,12 @@ def supports_ansi(): ANSI_BRIGHT_MAGENTA = "" -def main(model, accurate_mode, voice_mode=False): +def main(model, accurate_mode, terminal_prompt, voice_mode=False): """ Main function for the Self-Operating Computer """ mic = None # Initialize WhisperMic if voice_mode is True if voice_mode is True - """ - Main function for the Self-Operating Computer - """ if voice_mode: try: from whisper_mic import WhisperMic @@ -216,11 +212,15 @@ def main(model, accurate_mode, voice_mode=False): ) sys.exit(1) - message_dialog( - title="Self-Operating Computer", - text="Ask a computer to do anything.", - style=style, - ).run() + # Skip message dialog if prompt was given directly + if not terminal_prompt: + message_dialog( + title="Self-Operating Computer", + text="Ask a computer to do anything.", + style=style, + ).run() + else: + print("Running direct prompt...") print("SYSTEM", platform.system()) # Clear the console @@ -229,7 +229,9 @@ def main(model, accurate_mode, voice_mode=False): else: print("\033c", end="") - if voice_mode: + if terminal_prompt: # Skip objective prompt if it was given as an argument + objective = terminal_prompt + elif voice_mode: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... 
(speak now)" ) @@ -838,9 +840,22 @@ def main_entry(): required=False, ) + # Allow for direct input of prompt + parser.add_argument( + "--prompt", + help="Directly input the objective prompt", + type=str, + required=False, + ) + try: args = parser.parse_args() - main(args.model, accurate_mode=args.accurate, voice_mode=args.voice) + main( + args.model, + accurate_mode=args.accurate, + terminal_prompt=args.prompt, + voice_mode=args.voice, + ) except KeyboardInterrupt: print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")