Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion operate/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class Config:
Configuration class for managing settings.

Attributes:
debug (bool): Flag indicating whether debug mode is enabled.
verbose (bool): Flag indicating whether verbose mode is enabled.
openai_api_key (str): API key for OpenAI.
google_api_key (str): API key for Google.
"""
Expand Down
9 changes: 9 additions & 0 deletions operate/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ def main_entry():
help="Use voice input mode",
action="store_true",
)

# Add a flag for verbose mode
parser.add_argument(
"--verbose",
help="Run operate in verbose mode",
action="store_true",
)

# Allow for direct input of prompt
parser.add_argument(
"--prompt",
Expand All @@ -38,6 +46,7 @@ def main_entry():
args.model,
terminal_prompt=args.prompt,
voice_mode=args.voice,
verbose_mode=args.verbose
)
except KeyboardInterrupt:
print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")
Expand Down
64 changes: 31 additions & 33 deletions operate/models/apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,9 @@

# Load configuration
config = Config()
VERBOSE = config.verbose


async def get_next_action(model, messages, objective, session_id):
if VERBOSE:
if config.verbose:
print("[Self-Operating Computer][get_next_action]")
print("[Self-Operating Computer][get_next_action] model", model)
if model == "gpt-4":
Expand All @@ -61,7 +59,7 @@ async def get_next_action(model, messages, objective, session_id):


def call_gpt_4_vision_preview(messages):
if VERBOSE:
if config.verbose:
print("[call_gpt_4_v]")
time.sleep(1)
client = config.initialize_openai()
Expand All @@ -82,7 +80,7 @@ def call_gpt_4_vision_preview(messages):
else:
user_prompt = get_user_prompt()

if VERBOSE:
if config.verbose:
print(
"[call_gpt_4_v] user_prompt",
user_prompt,
Expand Down Expand Up @@ -117,7 +115,7 @@ def call_gpt_4_vision_preview(messages):
content = content[: -len("```")] # Remove ending

assistant_message = {"role": "assistant", "content": content}
if VERBOSE:
if config.verbose:
print(
"[call_gpt_4_v] content",
content,
Expand All @@ -137,7 +135,7 @@ def call_gpt_4_vision_preview(messages):
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
content,
)
if VERBOSE:
if config.verbose:
traceback.print_exc()
return call_gpt_4_vision_preview(messages)

Expand All @@ -146,7 +144,7 @@ def call_gemini_pro_vision(messages, objective):
"""
Get the next action for Self-Operating Computer using Gemini Pro Vision
"""
if VERBOSE:
if config.verbose:
print(
"[Self Operating Computer][call_gemini_pro_vision]",
)
Expand All @@ -165,18 +163,18 @@ def call_gemini_pro_vision(messages, objective):
prompt = get_system_prompt("gemini-pro-vision", objective)

model = config.initialize_google()
if VERBOSE:
if config.verbose:
print("[call_gemini_pro_vision] model", model)

response = model.generate_content([prompt, Image.open(screenshot_filename)])

content = response.text[1:]
if VERBOSE:
if config.verbose:
print("[call_gemini_pro_vision] response", response)
print("[call_gemini_pro_vision] content", content)

content = json.loads(content)
if VERBOSE:
if config.verbose:
print(
"[get_next_action][call_gemini_pro_vision] content",
content,
Expand All @@ -188,14 +186,14 @@ def call_gemini_pro_vision(messages, objective):
print(
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying another method {ANSI_RESET}"
)
if VERBOSE:
if config.verbose:
print("[Self-Operating Computer][Operate] error", e)
traceback.print_exc()
return call_gpt_4_vision_preview(messages)


async def call_gpt_4_vision_preview_ocr(messages, objective, model):
if VERBOSE:
if config.verbose:
print("[call_gpt_4_vision_preview_ocr]")

# Construct the path to the file within the package
Expand Down Expand Up @@ -260,7 +258,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
# Normalize line breaks and strip leading/trailing whitespace from each line
content = "\n".join(line.strip() for line in content.splitlines())

if VERBOSE:
if config.verbose:
print(
"\n\n\n[call_gpt_4_vision_preview_ocr] content after cleaning", content
)
Expand All @@ -274,7 +272,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
for operation in content:
if operation.get("operation") == "click":
text_to_click = operation.get("text")
if VERBOSE:
if config.verbose:
print(
"[call_gpt_4_vision_preview_ocr][click] text_to_click",
text_to_click,
Expand All @@ -296,7 +294,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
operation["x"] = coordinates["x"]
operation["y"] = coordinates["y"]

if VERBOSE:
if config.verbose:
print(
"[call_gpt_4_vision_preview_ocr][click] text_element_index",
text_element_index,
Expand Down Expand Up @@ -324,7 +322,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
print(
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying another method {ANSI_RESET}"
)
if VERBOSE:
if config.verbose:
print("[Self-Operating Computer][Operate] error", e)
traceback.print_exc()
return gpt_4_fallback(messages, objective, model)
Expand Down Expand Up @@ -356,7 +354,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
else:
user_prompt = get_user_prompt()

if VERBOSE:
if config.verbose:
print(
"[call_gpt_4_vision_preview_labeled] user_prompt",
user_prompt,
Expand Down Expand Up @@ -393,7 +391,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
content = content[: -len("```")] # Remove ending

assistant_message = {"role": "assistant", "content": content}
if VERBOSE:
if config.verbose:
print(
"[call_gpt_4_vision_preview_labeled] content",
content,
Expand All @@ -407,14 +405,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
for operation in content:
if operation.get("operation") == "click":
label = operation.get("label")
if VERBOSE:
if config.verbose:
print(
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] label",
label,
)

coordinates = get_label_coordinates(label, label_coordinates)
if VERBOSE:
if config.verbose:
print(
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates",
coordinates,
Expand All @@ -426,7 +424,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
click_position_percent = get_click_position_in_percent(
coordinates, image_size
)
if VERBOSE:
if config.verbose:
print(
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent",
click_position_percent,
Expand All @@ -441,7 +439,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
y_percent = f"{click_position_percent[1]:.2f}"
operation["x"] = x_percent
operation["y"] = y_percent
if VERBOSE:
if config.verbose:
print(
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation",
operation,
Expand All @@ -450,7 +448,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
else:
processed_content.append(operation)

if VERBOSE:
if config.verbose:
print(
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content",
processed_content,
Expand All @@ -461,14 +459,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
print(
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying another method {ANSI_RESET}"
)
if VERBOSE:
if config.verbose:
print("[Self-Operating Computer][Operate] error", e)
traceback.print_exc()
return call_gpt_4_vision_preview(messages)


def call_ollama_llava(messages):
if VERBOSE:
if config.verbose:
print("[call_ollama_llava]")
time.sleep(1)
try:
Expand All @@ -485,7 +483,7 @@ def call_ollama_llava(messages):
else:
user_prompt = get_user_prompt()

if VERBOSE:
if config.verbose:
print(
"[call_ollama_llava] user_prompt",
user_prompt,
Expand Down Expand Up @@ -516,7 +514,7 @@ def call_ollama_llava(messages):
content = content[: -len("```")] # Remove ending

assistant_message = {"role": "assistant", "content": content}
if VERBOSE:
if config.verbose:
print(
"[call_ollama_llava] content",
content,
Expand All @@ -542,7 +540,7 @@ def call_ollama_llava(messages):
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
content,
)
if VERBOSE:
if config.verbose:
traceback.print_exc()
return call_ollama_llava(messages)

Expand All @@ -562,15 +560,15 @@ def get_last_assistant_message(messages):


def gpt_4_fallback(messages, objective, model):
if VERBOSE:
if config.verbose:
print("[gpt_4_fallback]")
system_prompt = get_system_prompt("gpt-4-vision-preview", objective)
new_system_message = {"role": "system", "content": system_prompt}
# Replace the first message in `messages` with `new_system_message`

messages[0] = new_system_message

if VERBOSE:
if config.verbose:
print("[gpt_4_fallback][updated]")
print("[gpt_4_fallback][updated] len(messages)", len(messages))

Expand All @@ -581,7 +579,7 @@ def confirm_system_prompt(messages, objective, model):
"""
On `Exception` we fall back to `call_gpt_4_vision_preview`, so this function reassigns the system prompt in case a previous call failed.
"""
if VERBOSE:
if config.verbose:
print("[confirm_system_prompt] model", model)

system_prompt = get_system_prompt(model, objective)
Expand All @@ -590,7 +588,7 @@ def confirm_system_prompt(messages, objective, model):

messages[0] = new_system_message

if VERBOSE:
if config.verbose:
print("[confirm_system_prompt]")
print("[confirm_system_prompt] len(messages)", len(messages))
for m in messages:
Expand Down
4 changes: 2 additions & 2 deletions operate/models/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from operate.config import Config

# Load configuration
VERBOSE = Config().verbose
config = Config()

# General user Prompts
USER_QUESTION = "Hello, I can help you with anything. What would you like done?"
Expand Down Expand Up @@ -380,7 +380,7 @@ def get_system_prompt(model, objective):
prompt = prompt_string.format(objective=objective)

# Optional verbose output
if VERBOSE:
if config.verbose:
print("[get_system_prompt] model:", model)
print("[get_system_prompt] prompt name:", prompt_name)
# print("[get_system_prompt] prompt:", prompt)
Expand Down
14 changes: 6 additions & 8 deletions operate/operate.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,7 @@
config = Config()
operating_system = OperatingSystem()

VERBOSE = config.verbose


def main(model, terminal_prompt, voice_mode=False):
def main(model, terminal_prompt, voice_mode=False, verbose_mode=False):
"""
Main function for the Self-Operating Computer.

Expand All @@ -48,6 +45,7 @@ def main(model, terminal_prompt, voice_mode=False):
mic = None
# Initialize `WhisperMic`, if `voice_mode` is True

config.verbose = verbose_mode
config.validation(model, voice_mode)

if voice_mode:
Expand Down Expand Up @@ -104,7 +102,7 @@ def main(model, terminal_prompt, voice_mode=False):
session_id = None

while True:
if VERBOSE:
if config.verbose:
print("[Self Operating Computer] loop_count", loop_count)
try:
operations, session_id = asyncio.run(
Expand All @@ -131,17 +129,17 @@ def main(model, terminal_prompt, voice_mode=False):


def operate(operations):
if VERBOSE:
if config.verbose:
print("[Self Operating Computer][operate]")
for operation in operations:
if VERBOSE:
if config.verbose:
print("[Self Operating Computer][operate] operation", operation)
# wait one second
time.sleep(1)
operate_type = operation.get("operation").lower()
operate_thought = operation.get("thought")
operate_detail = ""
if VERBOSE:
if config.verbose:
print("[Self Operating Computer][operate] operate_type", operate_type)

if operate_type == "press" or operate_type == "hotkey":
Expand Down
Loading