Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
4a0a5df
Add `call_agent_1`
joshbickett Jan 9, 2024
1edc76c
Add `call_agent_1`
joshbickett Jan 9, 2024
8f8787a
Remove `async` from `fetch_agent_1_response`
joshbickett Jan 9, 2024
3830598
Minor fixes and `capture_screen_with_cursor`
joshbickett Jan 9, 2024
eecc0f7
Create `execute_operations` function
joshbickett Jan 11, 2024
639d3f6
Add back `DECISION_PROMPT`
joshbickett Jan 11, 2024
48f17b1
Fixes two bugs
joshbickett Jan 11, 2024
ccf7edf
Iterate `execute_operations_new`
joshbickett Jan 12, 2024
6855df6
Update to `execute_operations1`
joshbickett Jan 12, 2024
c6d92c8
remove `ACCURATE_MODE_VISION_PROMPT`
joshbickett Jan 12, 2024
6caa6a5
remove `capture_mini_screenshot_with_cursor`, small iteration to new …
joshbickett Jan 12, 2024
d0f47e6
Add `session_id` for `agent-1` api
joshbickett Jan 12, 2024
909efd7
better prints some fixes
joshbickett Jan 12, 2024
4e5168d
Update `README.md` to temp version until updated
joshbickett Jan 12, 2024
4e40001
Add `SYSTEM_PROMPT`
joshbickett Jan 12, 2024
d504679
Update `call_gpt_4_v` for keycommands
joshbickett Jan 12, 2024
2291810
Update `execute_operations` & remove search
joshbickett Jan 12, 2024
52d01c0
Remove `add_grid_to_image` approach
joshbickett Jan 12, 2024
1068ab0
Remove `add_grid_to_image`
joshbickett Jan 12, 2024
a204ca0
Update `message_dialog`
joshbickett Jan 12, 2024
df002b4
Fix `convert_percent_to_decimal` to match new prompt
joshbickett Jan 12, 2024
b2a63de
Move `get_last_assistant_message`
joshbickett Jan 12, 2024
cab5912
`SYSTEM_PROMPT` updates
joshbickett Jan 13, 2024
d19a376
Create `OperatingSystem` class
joshbickett Jan 13, 2024
6a1666c
Update `operating`
joshbickett Jan 13, 2024
862e047
name updates, `operate()`,etc.
joshbickett Jan 13, 2024
e5df2b7
Add `operation.get("summmary")`
joshbickett Jan 13, 2024
196e091
Add `ANSI_BLUE`
joshbickett Jan 13, 2024
6624da5
Update some file names, add `get_user_prompt`
joshbickett Jan 13, 2024
35a0047
Remove `monitor_size`, no longer used
joshbickett Jan 13, 2024
1444c1a
Add missing `__init__.py`
joshbickett Jan 13, 2024
b31c39c
Add `config.verbose` and better `print`
joshbickett Jan 13, 2024
27c6284
Increase loop max
joshbickett Jan 13, 2024
3fddff3
Add better error handling for `OperatingSystem`
joshbickett Jan 13, 2024
cee42ce
Update to `operating_system.py`
joshbickett Jan 13, 2024
6cd36fd
Iterate `call_gpt_4_vision_preview_labeled`
joshbickett Jan 14, 2024
42a8786
remove `format_label_prompt`
joshbickett Jan 14, 2024
8474227
fix `get_click_position_in_percent`
joshbickett Jan 14, 2024
182acce
Remove `img_base64_original` from `add_labels` function
joshbickett Jan 14, 2024
76c28c9
Update `call_gemini_pro_vision` to new method
joshbickett Jan 14, 2024
e36d774
remove extra list dimension in `call_gemini_pro_vision`
joshbickett Jan 14, 2024
e15bbd6
Change `mouse` to `click` as it is clear what the purpose is
joshbickett Jan 14, 2024
4aa05bc
update `operate_type == "click"` condition
joshbickett Jan 14, 2024
c79b065
Remove unused prompts, `VISION_PROMPT`, etc.
joshbickett Jan 14, 2024
f0b606c
improve `OPERATE_FIRST_MESSAGE_PROMPT` and `OPERATE_PROMPT`
joshbickett Jan 14, 2024
9ce0aa1
fix `call_gpt_4_vision_preview_labeled`
joshbickett Jan 14, 2024
cd8e6e3
bump up `max_tokens`
joshbickett Jan 14, 2024
6604300
increase `max_tokens`
joshbickett Jan 14, 2024
0f54222
Update `README.md`
joshbickett Jan 14, 2024
e590323
Clean up `README.md`
joshbickett Jan 14, 2024
a071bb3
Remove `Additional Thoughts` now that hotkeys are added
joshbickett Jan 14, 2024
0072093
Add multi-platform system prompts
michaelhhogue Jan 14, 2024
0fed49d
Merge pull request #134 from michaelhhogue/multiplatform-system-prompts
joshbickett Jan 15, 2024
ec891c9
remove `fetch_agent_1_response` for now
joshbickett Jan 15, 2024
de93bb4
update name to `config`
joshbickett Jan 15, 2024
6975d16
fix validation bug
joshbickett Jan 15, 2024
62fe6a9
update `initialize_apis1`
joshbickett Jan 15, 2024
5b3c75c
move `yolo_model` to only be required if using that mode
joshbickett Jan 15, 2024
c010fc3
Fix `Config` bug
joshbickett Jan 15, 2024
849ce90
remove `print`
joshbickett Jan 15, 2024
5bf795c
add back clearing
joshbickett Jan 15, 2024
b16d21a
remove unused `fetch_openai_response_async`
joshbickett Jan 15, 2024
47b0581
turn off `verbose`
joshbickett Jan 15, 2024
e9bef10
Switch back `README.md` for now
joshbickett Jan 16, 2024
d03805a
Merge branch 'main' into add-agent-1
joshbickett Jan 16, 2024
12b29d0
Add another useful `print`
joshbickett Jan 16, 2024
01bd850
Minor prompt improvements
joshbickett Jan 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
409 changes: 0 additions & 409 deletions operate/actions.py

This file was deleted.

68 changes: 68 additions & 0 deletions operate/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
import sys
from dotenv import load_dotenv
from openai import OpenAI
from prompt_toolkit.shortcuts import input_dialog


class Config:
    """
    Configuration class for managing settings.

    Attributes:
        verbose (bool): Flag indicating whether verbose logging is enabled.
        openai_api_key (str): API key for OpenAI (from OPENAI_API_KEY).
        google_api_key (str): API key for Google (from GOOGLE_API_KEY).
    """

    def __init__(self):
        load_dotenv()
        self.verbose = False
        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        self.google_api_key = os.getenv("GOOGLE_API_KEY")

    def initialize_openai(self):
        """
        Build an OpenAI client configured with this config's API key.

        Returns:
            OpenAI | None: A configured client, or None when no key is set.
        """
        if self.openai_api_key:
            client = OpenAI()
            client.api_key = self.openai_api_key
            # Allow pointing at a compatible proxy/alternate endpoint via env.
            client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
            return client
        return None

    def validation(self, model, voice_mode):
        """
        Ensure the API keys required by the chosen model/mode are present,
        prompting the user for any that are missing.
        """
        self.require_api_key(
            "OPENAI_API_KEY", "OpenAI API key", model == "gpt-4" or voice_mode
        )
        self.require_api_key(
            "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision"
        )

    def require_api_key(self, key_name, key_description, is_required):
        """
        Prompt for and persist `key_name` when it is required but unset.

        Args:
            key_name (str): Environment variable name, e.g. "OPENAI_API_KEY".
                The matching instance attribute is its lowercase form.
            key_description (str): Human-readable name shown in the dialog.
            is_required (bool): Whether this key is needed for the current run.
        """
        if is_required and not getattr(self, key_name.lower()):
            self.prompt_and_save_api_key(key_name, key_description)

    def prompt_and_save_api_key(self, key_name, key_description):
        """
        Ask the user for an API key via a dialog, persist it to `.env`,
        and update the matching instance attribute.

        Exits the process if the user cancels the dialog.
        """
        key_value = input_dialog(
            title="API Key Required", text=f"Please enter your {key_description}:"
        ).run()

        if key_value is None:  # User pressed cancel or closed the dialog
            sys.exit("Operation cancelled by user.")

        # Bug fix: this save/reload block was duplicated, appending the key
        # to .env twice per prompt. Persist and apply it exactly once.
        if key_value:
            self.save_api_key_to_env(key_name, key_value)
            load_dotenv()  # Reload environment variables
            # Update the instance attribute with the new key
            setattr(self, key_name.lower(), key_value)

    @staticmethod
    def save_api_key_to_env(key_name, key_value):
        """Append `key_name`='`key_value`' to the local .env file."""
        with open(".env", "a") as file:
            file.write(f"\n{key_name}='{key_value}'")
2 changes: 1 addition & 1 deletion operate/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
import argparse
from operate.utils.style import ANSI_BRIGHT_MAGENTA
from operate.dialog import main
from operate.operate import main


def main_entry():
Expand Down
Empty file added operate/models/__init__.py
Empty file.
321 changes: 321 additions & 0 deletions operate/models/apis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
import os
import time
import json
import base64
import traceback
import io


from PIL import Image
from ultralytics import YOLO
import google.generativeai as genai
from operate.config import Config
from operate.exceptions import ModelNotRecognizedException
from operate.utils.screenshot import (
capture_screen_with_cursor,
)
from operate.models.prompts import (
get_user_first_message_prompt,
get_user_prompt,
get_system_prompt,
)


from operate.utils.label import (
add_labels,
get_click_position_in_percent,
get_label_coordinates,
)
from operate.utils.style import (
ANSI_GREEN,
ANSI_RED,
ANSI_RESET,
)


# Load configuration
VERBOSE = Config().verbose


async def get_next_action(model, messages, objective, session_id):
    """
    Dispatch to the model-specific function that produces the next operation.

    Args:
        model (str): One of "gpt-4", "gpt-4-with-som", "agent-1",
            "gemini-pro-vision".
        messages (list): Conversation history; the chosen backend appends to it.
        objective (str): The user's overall objective.
        session_id: Session identifier (currently unused by the implemented
            backends; reserved for the agent-1 API).

    Returns:
        tuple: (operation_content, session_id_or_None).

    Raises:
        ModelNotRecognizedException: If `model` is not a known backend.
    """
    if model == "gpt-4":
        return call_gpt_4_vision_preview(messages), None
    if model == "gpt-4-with-som":
        operation = await call_gpt_4_vision_preview_labeled(messages, objective)
        return operation, None
    if model == "agent-1":
        # Bug fix: this branch returned a bare string while every other branch
        # returns a (result, session_id) tuple; a caller unpacking two values
        # would raise ValueError. Keep the tuple shape consistent.
        return "coming soon", None
    if model == "gemini-pro-vision":
        return call_gemini_pro_vision(messages, objective), None

    raise ModelNotRecognizedException(model)


def call_gpt_4_vision_preview(messages):
    """
    Get the next operation from gpt-4-vision-preview for the current screen.

    Captures a screenshot (with cursor), sends it plus a prompt as a new user
    message, appends the assistant's reply to `messages`, and returns the
    reply parsed as JSON (the model is prompted to answer in JSON, optionally
    fenced in a ```json block which is stripped here).

    Args:
        messages (list): Conversation history; mutated in place.

    Returns:
        The JSON-decoded assistant content (expected: a list of operations).
    """
    config = Config()
    client = config.initialize_openai()
    if VERBOSE:
        print("[Self Operating Computer][get_next_action][call_gpt_4_v]")
    time.sleep(1)
    # Bug fix: `content` must exist before the try block — if the screenshot
    # or API call raises, the except handler below prints `content` and would
    # otherwise die with UnboundLocalError, masking the real error.
    content = None
    try:
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Call the function to capture the screen with the cursor
        capture_screen_with_cursor(screenshot_filename)

        with open(screenshot_filename, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        # The very first exchange gets the objective-setting prompt; later
        # turns get the shorter follow-up prompt.
        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        if VERBOSE:
            print(
                "[Self Operating Computer][get_next_action][call_gpt_4_v] user_prompt",
                user_prompt,
            )

        vision_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                },
            ],
        }
        messages.append(vision_message)

        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=messages,
            presence_penalty=1,
            frequency_penalty=1,
            temperature=0.7,
            max_tokens=3000,
        )

        content = response.choices[0].message.content

        # Strip an optional markdown code fence around the JSON payload.
        if content.startswith("```json"):
            content = content[len("```json") :]  # Remove starting ```json
        if content.endswith("```"):
            content = content[: -len("```")]  # Remove ending

        assistant_message = {"role": "assistant", "content": content}
        if VERBOSE:
            print(
                "[Self Operating Computer][get_next_action][call_gpt_4_v] content",
                content,
            )
        content = json.loads(content)

        messages.append(assistant_message)

        return content

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying again {ANSI_RESET}",
            e,
        )
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
            content,
        )
        traceback.print_exc()
        # NOTE(review): this retries recursively with no depth limit — a
        # persistent failure (e.g. network down) recurses until the stack
        # overflows. Consider a bounded retry count.
        return call_gpt_4_vision_preview(messages)


def call_gemini_pro_vision(messages, objective):
    """
    Get the next action for Self-Operating Computer using Gemini Pro Vision.

    Captures a screenshot (with cursor), sends it together with the system
    prompt to the gemini-pro-vision model, and returns the reply parsed as
    JSON. On any failure, falls back to the gpt-4-vision-preview path.

    Args:
        messages (list): Conversation history; only used for the fallback call.
        objective (str): The user's overall objective, baked into the prompt.

    Returns:
        The JSON-decoded model reply (expected: a list of operations).
    """
    # Brief pause so the UI from the previous action settles before capture.
    time.sleep(1)
    try:
        screenshots_dir = "screenshots"
        os.makedirs(screenshots_dir, exist_ok=True)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        capture_screen_with_cursor(screenshot_filename)
        # A second pause to make sure the screenshot file is fully written.
        time.sleep(1)

        prompt = get_system_prompt(objective)
        model = genai.GenerativeModel("gemini-pro-vision")
        response = model.generate_content([prompt, Image.open(screenshot_filename)])

        # The first character of the reply is dropped before parsing —
        # presumably a leading artifact in this model's output; verify.
        content = json.loads(response.text[1:])
        if VERBOSE:
            print(
                "[Self Operating Computer][get_next_action][call_gemini_pro_vision] content",
                content,
            )
        return content

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
            e,
        )
        return call_gpt_4_vision_preview(messages)


async def call_gpt_4_vision_preview_labeled(messages, objective):
    """
    Get the next operation using gpt-4-vision-preview with Set-of-Mark labels.

    Captures a screenshot, overlays numbered labels on YOLO-detected UI
    elements, asks the model to choose operations by label, then converts any
    "click" operation's label back into percentage screen coordinates.

    Args:
        messages (list): Conversation history; mutated in place (the new
            user and assistant messages are appended).
        objective (str): The user's overall objective (not read directly here;
            the prompt helpers below choose the prompt by message count).

    Returns:
        list: Operations, where each "click" entry gains "x"/"y" percentage
        strings. Falls back to call_gpt_4_vision_preview on any error or when
        a label's coordinates cannot be resolved.
    """
    config = Config()
    client = config.initialize_openai()
    time.sleep(1)
    try:
        # NOTE(review): the YOLO weights are re-loaded from disk on every
        # call — hoisting this to module scope would avoid the repeated cost;
        # confirm thread-safety before doing so.
        yolo_model = YOLO("./operate/models/weights/best.pt")  # Load your trained model
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Call the function to capture the screen with the cursor
        capture_screen_with_cursor(screenshot_filename)

        with open(screenshot_filename, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        # Draw numbered boxes on detected elements; label_coordinates maps
        # each label back to its pixel box for the click conversion below.
        img_base64_labeled, label_coordinates = add_labels(img_base64, yolo_model)

        # First exchange gets the objective-setting prompt; later turns get
        # the shorter follow-up prompt.
        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        if VERBOSE:
            print(
                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] user_prompt",
                user_prompt,
            )

        vision_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{img_base64_labeled}"
                    },
                },
            ],
        }
        messages.append(vision_message)

        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=messages,
            presence_penalty=1,
            frequency_penalty=1,
            temperature=0.7,
            max_tokens=1000,
        )

        content = response.choices[0].message.content

        # Strip an optional markdown code fence around the JSON payload.
        if content.startswith("```json"):
            content = content[len("```json") :]  # Remove starting ```json
        if content.endswith("```"):
            content = content[: -len("```")]  # Remove ending

        assistant_message = {"role": "assistant", "content": content}
        if VERBOSE:
            print(
                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] content",
                content,
            )
        # NOTE(review): the assistant message is appended before json.loads;
        # if parsing raises, the unparsed reply stays in the history and the
        # fallback path runs with it — confirm that is intended.
        messages.append(assistant_message)

        content = json.loads(content)

        processed_content = []

        # Rewrite "click" operations from label references into x/y percents;
        # pass every other operation through unchanged.
        for operation in content:
            if operation.get("operation") == "click":
                label = operation.get("label")
                if VERBOSE:
                    print(
                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] label",
                        label,
                    )

                coordinates = get_label_coordinates(label, label_coordinates)
                if VERBOSE:
                    print(
                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] coordinates",
                        coordinates,
                    )
                # Use the unlabeled original screenshot for sizing, so the
                # percent conversion is relative to the true screen capture.
                image = Image.open(
                    io.BytesIO(base64.b64decode(img_base64))
                )  # Load the image to get its size
                image_size = image.size  # Get the size of the image (width, height)
                click_position_percent = get_click_position_in_percent(
                    coordinates, image_size
                )
                if VERBOSE:
                    print(
                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] click_position_percent",
                        click_position_percent,
                    )
                if not click_position_percent:
                    print(
                        f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}"
                    )
                    return call_gpt_4_vision_preview(messages)

                x_percent = f"{click_position_percent[0]:.2f}"
                y_percent = f"{click_position_percent[1]:.2f}"
                operation["x"] = x_percent
                operation["y"] = y_percent
                if VERBOSE:
                    print(
                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new click operation",
                        operation,
                    )
                processed_content.append(operation)
            else:
                processed_content.append(operation)

        if VERBOSE:
            print(
                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new processed_content",
                processed_content,
            )
        return processed_content

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
            e,
        )
        return call_gpt_4_vision_preview(messages)


def get_last_assistant_message(messages):
    """
    Return the most recent assistant message from `messages`.

    The very first message is never returned: if the last assistant message
    sits at index 0, or no assistant message exists at all, the result is None.
    """
    # Scan backwards, deliberately stopping before index 0 — an assistant
    # message in the first slot is treated the same as no assistant message.
    for position in range(len(messages) - 1, 0, -1):
        if messages[position]["role"] == "assistant":
            return messages[position]
    return None
Loading