1 change: 0 additions & 1 deletion CONTRIBUTING.md
@@ -14,7 +14,6 @@ We appreciate your contributions!
3. Run `operate` to test your changes

## Contribution Ideas
-- **Develop an Automated End-to-End Testing System**: Build an automated testing framework that can be run before merging PRs to `main` to confirm no test cases broke. An example of such a test case would be "go to google docs and write a poem". This testing system should be flexible to add new test cases in the future and reduce the time spent on manually testing each PR.
- **Improve performance by finding optimal screenshot grid**: A primary element of the framework is that it overlays a percentage grid on the screenshot which GPT-4v uses to estimate click locations. If someone is able to find the optimal grid and some evaluation metrics to confirm it is an improvement on the current method then we will merge that PR.
- **Improve the `SUMMARY_PROMPT`**
- **Improve Linux and Windows compatibility**: There are still some issues with Linux and Windows compatibility. PRs to fix the issues are encouraged.
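For the screenshot-grid idea above, a minimal sketch of how such a percentage grid could be overlaid, assuming Pillow and with a hypothetical helper name and styling, might look like:

from PIL import Image, ImageDraw

def overlay_percentage_grid(screenshot_path, step=10):
    """Draw grid lines every `step` percent of the image size and label them."""
    img = Image.open(screenshot_path)
    draw = ImageDraw.Draw(img)
    width, height = img.size
    for pct in range(step, 100, step):
        x = width * pct // 100
        y = height * pct // 100
        draw.line([(x, 0), (x, height)], fill="red")  # vertical line at pct% of width
        draw.line([(0, y), (width, y)], fill="red")   # horizontal line at pct% of height
        draw.text((x + 2, 2), f"{pct}%", fill="red")  # column label
        draw.text((2, y + 2), f"{pct}%", fill="red")  # row label
    return img

Finding the optimal `step` (and line/label styling), together with an evaluation metric that confirms the improvement, is the open question this idea describes.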
150 changes: 150 additions & 0 deletions evaluate.py
@@ -0,0 +1,150 @@
import sys
import os
import subprocess
import platform
import base64
import json
import openai

from dotenv import load_dotenv

# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
TEST_CASES = {
    "Go to Github.com": "The Github home page is visible.",
    "Go to Youtube.com and play a video": "The YouTube video player is visible.",
}
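# Additional cases follow the same "objective": "guideline" pattern; an
# illustrative (hypothetical) entry:
#   "Go to google docs and write a poem": "A Google Docs document containing a poem is visible.",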

EVALUATION_PROMPT = """
Your job is to look at the given screenshot and determine if the following guideline is met in the image.
You must respond in the following format ONLY. Do not add anything else:
{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
reason must be a string containing a justification for your decision.

Guideline: {guideline}
"""

SUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')

def supports_ansi():
    """
    Check if the terminal supports ANSI escape codes
    """
    plat = platform.system()
    supported_platform = plat != "Windows" or "ANSICON" in os.environ
    is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
    return supported_platform and is_a_tty

if supports_ansi():
    # Standard green text
    ANSI_GREEN = "\033[32m"
    # Bright/bold green text
    ANSI_BRIGHT_GREEN = "\033[92m"
    # Reset to default text color
    ANSI_RESET = "\033[0m"
    # Bright blue text
    ANSI_BLUE = "\033[94m"
    # Standard yellow text
    ANSI_YELLOW = "\033[33m"
    # Standard red text
    ANSI_RED = "\033[31m"
    # Bright magenta text
    ANSI_BRIGHT_MAGENTA = "\033[95m"
else:
    # Fall back to empty strings so prints still work without ANSI support
    ANSI_GREEN = ""
    ANSI_BRIGHT_GREEN = ""
    ANSI_RESET = ""
    ANSI_BLUE = ""
    ANSI_YELLOW = ""
    ANSI_RED = ""
    ANSI_BRIGHT_MAGENTA = ""


def format_evaluation_prompt(guideline):
    prompt = EVALUATION_PROMPT.format(guideline=guideline)
    return prompt


def parse_eval_content(content):
    try:
        res = json.loads(content)

        print(res["reason"])

        return res["guideline_met"]
    except (json.JSONDecodeError, KeyError):
        print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
        sys.exit(1)


def evaluate_summary_screenshot(guideline):
    '''Load the summary screenshot and return True or False if it meets the given guideline.'''
    with open(SUMMARY_SCREENSHOT_PATH, "rb") as img_file:
        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

    eval_message = [{
        "role": "user",
        "content": [
            {"type": "text", "text": format_evaluation_prompt(guideline)},
            {
                "type": "image_url",
                # The summary screenshot is a PNG, so use the matching MIME type
                "image_url": {"url": f"data:image/png;base64,{img_base64}"},
            },
        ],
    }]

    response = openai.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=eval_message,
        presence_penalty=1,
        frequency_penalty=1,
        temperature=0.7,
        max_tokens=300,
    )

    eval_content = response.choices[0].message.content

    return parse_eval_content(eval_content)


def run_test_case(objective, guideline):
    '''Return True if the result of the test with the given prompt meets the given guideline.'''
    # Run `operate` with the test case prompt; subprocess passes each list element
    # as its own argument, so no extra quoting is needed
    subprocess.run(['operate', '--prompt', objective], stdout=subprocess.DEVNULL)

    try:
        result = evaluate_summary_screenshot(guideline)
    except OSError:
        print("Couldn't open the summary screenshot")
        return False

    return result


def main():
    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")

    passed = 0
    failed = 0
    for objective, guideline in TEST_CASES.items():
        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")

        result = run_test_case(objective, guideline)
        if result:
            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
            passed += 1
        else:
            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
            failed += 1

    print(
        f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed"
    )

if __name__ == "__main__":
main()
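Assuming `operate` is installed on the PATH and OPENAI_API_KEY is available (for example via `.env`), the whole suite can then be run with:

python evaluate.py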
39 changes: 27 additions & 12 deletions operate/main.py
@@ -133,7 +133,6 @@
"""



class ModelNotRecognizedException(Exception):
    """Exception raised for unrecognized models."""

@@ -195,15 +194,12 @@ def supports_ansi():
    ANSI_BRIGHT_MAGENTA = ""


-def main(model, accurate_mode, voice_mode=False):
+def main(model, accurate_mode, terminal_prompt, voice_mode=False):
"""
Main function for the Self-Operating Computer
"""
mic = None
# Initialize WhisperMic if voice_mode is True if voice_mode is True
"""
Main function for the Self-Operating Computer
"""
if voice_mode:
try:
from whisper_mic import WhisperMic
@@ -216,11 +212,15 @@ def main(model, accurate_mode, voice_mode=False):
        )
        sys.exit(1)

-    message_dialog(
-        title="Self-Operating Computer",
-        text="Ask a computer to do anything.",
-        style=style,
-    ).run()
+    # Skip message dialog if prompt was given directly
+    if not terminal_prompt:
+        message_dialog(
+            title="Self-Operating Computer",
+            text="Ask a computer to do anything.",
+            style=style,
+        ).run()
+    else:
+        print("Running direct prompt...")

print("SYSTEM", platform.system())
# Clear the console
@@ -229,7 +229,9 @@
    else:
        print("\033c", end="")

-    if voice_mode:
+    if terminal_prompt:  # Skip objective prompt if it was given as an argument
+        objective = terminal_prompt
+    elif voice_mode:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)"
        )
@@ -838,9 +840,22 @@ def main_entry():
        required=False,
    )

+    # Allow for direct input of prompt
+    parser.add_argument(
+        "--prompt",
+        help="Directly input the objective prompt",
+        type=str,
+        required=False,
+    )

    try:
        args = parser.parse_args()
-        main(args.model, accurate_mode=args.accurate, voice_mode=args.voice)
+        main(
+            args.model,
+            accurate_mode=args.accurate,
+            terminal_prompt=args.prompt,
+            voice_mode=args.voice,
+        )
    except KeyboardInterrupt:
        print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")

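The `--prompt` flag added above is what evaluate.py relies on: each test case drives `operate` non-interactively, for example:

operate --prompt "Go to Github.com"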