diff --git a/operate/models/prompts.py b/operate/models/prompts.py index af5725f8..f0bd5195 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -1,8 +1,10 @@ +import platform + # General user Prompts USER_QUESTION = "Hello, I can help you with anything. What would you like done?" -SYSTEM_PROMPT = """ +SYSTEM_PROMPT_MAC = """ You are operating a computer, using the same operating system as a human. From looking at the screen, the objective, and your previous actions, take the next best series of action. @@ -47,8 +49,53 @@ Objective: {objective} # take the next best action for this objective """ +SYSTEM_PROMPT_WIN_LINUX= """ +You are operating a computer, using the same operating system as a human. + +From looking at the screen, the objective, and your previous actions, take the next best series of action. + +You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. + +1. click - Move mouse and click +[{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format + +2. write - Write with your keyboard +[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] + +3. press - Use a hotkey or press key to operate the computer +[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] + +4. done - The objective is completed +[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] + +Return the actions in array format `[]`. You can take just one action or multiple actions. 
+ +Here are some helpful combinations: + +# Opens Menu Search on Windows and Linux +[ + {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }}, + {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, + {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }} +] + +# Focuses on the address bar in a browser before typing a website +[ + {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["ctrl", "l"] }}, + {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, + {{ "thought": "I'll need to press enter to go to the URL now", "operation": "press", "keys": ["enter"] }} +] + +A few important notes: + +- Go to Google Docs and Google Sheets by typing in the Chrome Address bar +- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. + +Objective: {objective} # take the next best action for this objective +""" + -SYSTEM_PROMPT_LABELED = """ +SYSTEM_PROMPT_LABELED_MAC = """ You are operating a computer, using the same operating system as a human. From looking at the screen, the objective, and your previous actions, take the next best series of action. @@ -98,6 +145,56 @@ Objective: {objective} # take the next best action for this objective """ +SYSTEM_PROMPT_LABELED_WIN_LINUX = """ +You are operating a computer, using the same operating system as a human. + +From looking at the screen, the objective, and your previous actions, take the next best series of action. + +You have 4 possible operation actions available to you. 
The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. + +1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` +[{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'label' refers to the ID shown on the red bounding box of the element to click + +2. write - Write with your keyboard +[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] + +3. press - Use a hotkey or press key to operate the computer +[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] + +4. done - The objective is completed +[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] + +Return the actions in array format `[]`. You can take just one action or multiple actions. + +Here are some helpful combinations: + +# Opens Menu Search on Windows and Linux +[ + {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }}, + {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }} +] + +# Focuses on the address bar in a browser before typing a website +[ + {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["ctrl", "l"] }}, + {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, + {{ "thought": "I'll need to press enter to go to the URL now", "operation": "press", "keys": ["enter"] }} +] + +# Send a "Hello World" message in the chat +[ + {{ "thought": "I see a message field on this page near the button. 
It looks like it has a label", "operation": "click", "label": "~34" }}, + {{ "thought": "Now that I am focused on the message field, I'll go ahead and write the message", "operation": "write", "content": "Hello World" }} +] + +A few important notes: + +- Go to Google Docs and Google Sheets by typing in the Chrome Address bar +- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. + +Objective: {objective} # take the next best action for this objective +""" + OPERATE_FIRST_MESSAGE_PROMPT = """ Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done @@ -115,7 +212,11 @@ def get_system_prompt(objective): """ Format the vision prompt """ - prompt = SYSTEM_PROMPT.format(objective=objective) + if platform.system() == "Darwin": + prompt = SYSTEM_PROMPT_MAC.format(objective=objective) + else: + prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective) + return prompt @@ -123,7 +224,11 @@ def get_system_prompt_labeled(objective): """ Format the vision prompt """ - prompt = SYSTEM_PROMPT_LABELED.format(objective=objective) + if platform.system() == "Darwin": + prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective) + else: + prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective) + return prompt