diff --git a/operate/config.py b/operate/config.py index 00228686..278869bf 100644 --- a/operate/config.py +++ b/operate/config.py @@ -11,7 +11,7 @@ class Config: Configuration class for managing settings. Attributes: - debug (bool): Flag indicating whether debug mode is enabled. + verbose (bool): Flag indicating whether verbose mode is enabled. openai_api_key (str): API key for OpenAI. google_api_key (str): API key for Google. """ diff --git a/operate/main.py b/operate/main.py index 73065854..86832e4e 100644 --- a/operate/main.py +++ b/operate/main.py @@ -24,6 +24,14 @@ def main_entry(): help="Use voice input mode", action="store_true", ) + + # Add a flag for verbose mode + parser.add_argument( + "--verbose", + help="Run operate in verbose mode", + action="store_true", + ) + # Allow for direct input of prompt parser.add_argument( "--prompt", @@ -38,6 +46,7 @@ def main_entry(): args.model, terminal_prompt=args.prompt, voice_mode=args.voice, + verbose_mode=args.verbose ) except KeyboardInterrupt: print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...") diff --git a/operate/models/apis.py b/operate/models/apis.py index d81f597c..0d75b036 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -34,11 +34,9 @@ # Load configuration config = Config() -VERBOSE = config.verbose - async def get_next_action(model, messages, objective, session_id): - if VERBOSE: + if config.verbose: print("[Self-Operating Computer][get_next_action]") print("[Self-Operating Computer][get_next_action] model", model) if model == "gpt-4": @@ -61,7 +59,7 @@ async def get_next_action(model, messages, objective, session_id): def call_gpt_4_vision_preview(messages): - if VERBOSE: + if config.verbose: print("[call_gpt_4_v]") time.sleep(1) client = config.initialize_openai() @@ -82,7 +80,7 @@ def call_gpt_4_vision_preview(messages): else: user_prompt = get_user_prompt() - if VERBOSE: + if config.verbose: print( "[call_gpt_4_v] user_prompt", user_prompt, @@ -117,7 +115,7 @@ def call_gpt_4_vision_preview(messages): content = content[: -len("```")] # Remove ending assistant_message = {"role": "assistant", "content": content} - if VERBOSE: + if config.verbose: print( "[call_gpt_4_v] content", content, @@ -137,7 +135,7 @@ def call_gpt_4_vision_preview(messages): f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", content, ) - if VERBOSE: + if config.verbose: traceback.print_exc() return call_gpt_4_vision_preview(messages) @@ -146,7 +144,7 @@ def call_gemini_pro_vision(messages, objective): """ Get the next action for Self-Operating Computer using Gemini Pro Vision """ - if VERBOSE: + if config.verbose: print( "[Self Operating Computer][call_gemini_pro_vision]", ) @@ -165,18 +163,18 @@ def call_gemini_pro_vision(messages, objective): prompt = get_system_prompt("gemini-pro-vision", objective) model = config.initialize_google() - if VERBOSE: + if config.verbose: print("[call_gemini_pro_vision] model", model) response = model.generate_content([prompt, Image.open(screenshot_filename)]) content = response.text[1:] - if VERBOSE: + if config.verbose: print("[call_gemini_pro_vision] response", response) print("[call_gemini_pro_vision] content", content) content = json.loads(content) - if VERBOSE: + if config.verbose: print( "[get_next_action][call_gemini_pro_vision] content", content, @@ -188,14 +186,14 @@ def call_gemini_pro_vision(messages, objective): print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying another method {ANSI_RESET}" ) - if VERBOSE: + if config.verbose: print("[Self-Operating Computer][Operate] error", e) traceback.print_exc() return call_gpt_4_vision_preview(messages) async def call_gpt_4_vision_preview_ocr(messages, objective, model): - if VERBOSE: + if config.verbose: print("[call_gpt_4_vision_preview_ocr]") # Construct the path to the file within the package @@ -260,7 +258,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model): # Normalize line breaks and remove any unwanted characters content = "\n".join(line.strip() for line in content.splitlines()) - if VERBOSE: + if config.verbose: print( "\n\n\n[call_gpt_4_vision_preview_ocr] content after cleaning", content ) @@ -274,7 +272,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model): for operation in content: if operation.get("operation") == "click": text_to_click = operation.get("text") - if VERBOSE: + if config.verbose: print( "[call_gpt_4_vision_preview_ocr][click] text_to_click", text_to_click, @@ -296,7 +294,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model): operation["x"] = coordinates["x"] operation["y"] = coordinates["y"] - if VERBOSE: + if config.verbose: print( "[call_gpt_4_vision_preview_ocr][click] text_element_index", text_element_index, @@ -324,7 +322,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model): print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying another method {ANSI_RESET}" ) - if VERBOSE: + if config.verbose: print("[Self-Operating Computer][Operate] error", e) traceback.print_exc() return gpt_4_fallback(messages, objective, model) @@ -356,7 +354,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): else: user_prompt = get_user_prompt() - if VERBOSE: + if config.verbose: print( "[call_gpt_4_vision_preview_labeled] user_prompt", user_prompt, @@ -393,7 +391,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): content = content[: -len("```")] # Remove ending assistant_message = {"role": "assistant", "content": content} - if VERBOSE: + if config.verbose: print( "[call_gpt_4_vision_preview_labeled] content", content, @@ -407,14 +405,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): for operation in content: if operation.get("operation") == "click": label = operation.get("label") - if VERBOSE: + if config.verbose: print( "[Self Operating Computer][call_gpt_4_vision_preview_labeled] label", label, ) coordinates = get_label_coordinates(label, label_coordinates) - if VERBOSE: + if config.verbose: print( "[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates", coordinates, @@ -426,7 +424,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): click_position_percent = get_click_position_in_percent( coordinates, image_size ) - if VERBOSE: + if config.verbose: print( "[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent", click_position_percent, @@ -441,7 +439,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): y_percent = f"{click_position_percent[1]:.2f}" operation["x"] = x_percent operation["y"] = y_percent - if VERBOSE: + if config.verbose: print( "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation", operation, @@ -450,7 +448,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): else: processed_content.append(operation) - if VERBOSE: + if config.verbose: print( "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content", processed_content, @@ -461,14 +459,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying another method {ANSI_RESET}" ) - if VERBOSE: + if config.verbose: print("[Self-Operating Computer][Operate] error", e) traceback.print_exc() return call_gpt_4_vision_preview(messages) def call_ollama_llava(messages): - if VERBOSE: + if config.verbose: print("[call_ollama_llava]") time.sleep(1) try: @@ -485,7 +483,7 @@ def call_ollama_llava(messages): else: user_prompt = get_user_prompt() - if VERBOSE: + if config.verbose: print( "[call_ollama_llava] user_prompt", user_prompt, @@ -516,7 +514,7 @@ def call_ollama_llava(messages): content = content[: -len("```")] # Remove ending assistant_message = {"role": "assistant", "content": content} - if VERBOSE: + if config.verbose: print( "[call_ollama_llava] content", content, @@ -542,7 +540,7 @@ def call_ollama_llava(messages): f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", content, ) - if VERBOSE: + if config.verbose: traceback.print_exc() return call_ollama_llava(messages) @@ -562,7 +560,7 @@ def get_last_assistant_message(messages): def gpt_4_fallback(messages, objective, model): - if VERBOSE: + if config.verbose: print("[gpt_4_fallback]") system_prompt = get_system_prompt("gpt-4-vision-preview", objective) new_system_message = {"role": "system", "content": system_prompt} @@ -570,7 +568,7 @@ def gpt_4_fallback(messages, objective, model): messages[0] = new_system_message - if VERBOSE: + if config.verbose: print("[gpt_4_fallback][updated]") print("[gpt_4_fallback][updated] len(messages)", len(messages)) @@ -581,7 +579,7 @@ def confirm_system_prompt(messages, objective, model): """ On `Exception` we default to `call_gpt_4_vision_preview` so we have this function to reassign system prompt in case of a previous failure """ - if VERBOSE: + if config.verbose: print("[confirm_system_prompt] model", model) system_prompt = get_system_prompt(model, objective) @@ -590,7 +588,7 @@ def confirm_system_prompt(messages, objective, model): messages[0] = new_system_message - if VERBOSE: + if config.verbose: print("[confirm_system_prompt]") print("[confirm_system_prompt] len(messages)", len(messages)) for m in messages: diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 43a3864e..96e679fb 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -2,7 +2,7 @@ from operate.config import Config # Load configuration -VERBOSE = Config().verbose +config = Config() # General user Prompts USER_QUESTION = "Hello, I can help you with anything. What would you like done?" @@ -380,7 +380,7 @@ def get_system_prompt(model, objective): prompt = prompt_string.format(objective=objective) # Optional verbose output - if VERBOSE: + if config.verbose: print("[get_system_prompt] model:", model) print("[get_system_prompt] prompt name:", prompt_name) # print("[get_system_prompt] prompt:", prompt) diff --git a/operate/operate.py b/operate/operate.py index beeb3651..fa379bc6 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -29,10 +29,7 @@ config = Config() operating_system = OperatingSystem() -VERBOSE = config.verbose - - -def main(model, terminal_prompt, voice_mode=False): +def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): """ Main function for the Self-Operating Computer. @@ -48,6 +45,7 @@ def main(model, terminal_prompt, voice_mode=False): mic = None # Initialize `WhisperMic`, if `voice_mode` is True + config.verbose = verbose_mode config.validation(model, voice_mode) if voice_mode: @@ -104,7 +102,7 @@ def main(model, terminal_prompt, voice_mode=False): session_id = None while True: - if VERBOSE: + if config.verbose: print("[Self Operating Computer] loop_count", loop_count) try: operations, session_id = asyncio.run( @@ -131,17 +129,17 @@ def main(model, terminal_prompt, voice_mode=False): def operate(operations): - if VERBOSE: + if config.verbose: print("[Self Operating Computer][operate]") for operation in operations: - if VERBOSE: + if config.verbose: print("[Self Operating Computer][operate] operation", operation) # wait one second time.sleep(1) operate_type = operation.get("operation").lower() operate_thought = operation.get("thought") operate_detail = "" - if VERBOSE: + if config.verbose: print("[Self Operating Computer][operate] operate_type", operate_type) if operate_type == "press" or operate_type == "hotkey": diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index 68b4d435..937511b0 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -4,7 +4,7 @@ from datetime import datetime # Load configuration -VERBOSE = Config().verbose +config = Config() def get_text_element(result, search_text, image_path): @@ -21,7 +21,7 @@ def get_text_element(result, search_text, image_path): Raises: Exception: If the text element is not found in the results. """ - if VERBOSE: + if config.verbose: print("[get_text_element]") print("[get_text_element] search_text", search_text) # Create /ocr directory if it doesn't exist @@ -38,17 +38,17 @@ def get_text_element(result, search_text, image_path): text = element[1] box = element[0] - if VERBOSE: + if config.verbose: # Draw bounding box in blue draw.polygon([tuple(point) for point in box], outline="blue") if search_text in text: found_index = index - if VERBOSE: + if config.verbose: print("[get_text_element][loop] found search_text, index:", index) if found_index is not None: - if VERBOSE: + if config.verbose: # Draw bounding box of the found text in red box = result[found_index][0] draw.polygon([tuple(point) for point in box], outline="red")