diff --git a/README.md b/README.md
index 120b83f9..f52399bf 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,13 @@ operate -m gemini-pro-vision
 **Enter your Google AI Studio API key when terminal prompts you for it** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working, if anyone knows a simpler way, please make a PR:

+### Optical Character Recognition Mode `-m gpt-4-with-ocr`
+The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable elements and their coordinates. GPT-4 can decide to `click` an element by its text, and the code then looks that text up in the hash map to get the coordinates of the element GPT-4 wants to click.
+
+Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use the OCR mode, simply run:
+
+`operate` or, equivalently, `operate -m gpt-4-with-ocr`.
+
 ### Set-of-Mark Prompting `-m gpt-4-with-som`
 The Self-Operating Computer Framework now supports Set-of-Mark (SoM) Prompting with the `gpt-4-with-som` command. This new visual prompting method enhances the visual grounding capabilities of large multimodal models.
diff --git a/operate/config.py b/operate/config.py
index d68382da..581af043 100644
--- a/operate/config.py
+++ b/operate/config.py
@@ -49,9 +49,6 @@ def initialize_openai(self):
             )
         api_key = os.getenv("OPENAI_API_KEY")
-        if self.verbose:
-            print("[Config][initialize_openai] api_key", api_key)
-
         client = OpenAI(
             api_key=api_key,
         )
@@ -65,9 +62,10 @@ def initialize_google(self):
             print("[Config][initialize_google] using cached google_api_key")
             api_key = self.google_api_key
         else:
-            print(
-                "[Config][initialize_google] no cached google_api_key, try to get from env."
-            )
+            if self.verbose:
+                print(
+                    "[Config][initialize_google] no cached google_api_key, try to get from env."
+                )
         api_key = os.getenv("GOOGLE_API_KEY")
         genai.configure(api_key=api_key, transport="rest")
         model = genai.GenerativeModel("gemini-pro-vision")
diff --git a/operate/main.py b/operate/main.py
index 3cf991da..73065854 100644
--- a/operate/main.py
+++ b/operate/main.py
@@ -15,7 +15,7 @@ def main_entry():
         "--model",
         help="Specify the model to use",
         required=False,
-        default="gpt-4",
+        default="gpt-4-with-ocr",
     )

     # Add a voice flag
diff --git a/operate/models/apis.py b/operate/models/apis.py
index 2e1d8bf5..64499774 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -4,6 +4,7 @@
 import base64
 import traceback
 import io
+import easyocr

 from PIL import Image
@@ -19,6 +20,7 @@
     get_user_prompt,
     get_system_prompt,
 )
+from operate.utils.ocr import get_text_element, get_text_coordinates

 from operate.utils.label import (
@@ -48,6 +50,9 @@ async def get_next_action(model, messages, objective, session_id):
     if model == "gpt-4-with-som":
         operation = await call_gpt_4_vision_preview_labeled(messages, objective)
         return operation, None
+    if model == "gpt-4-with-ocr":
+        operation = await call_gpt_4_vision_preview_ocr(messages, objective, model)
+        return operation, None
     elif model == "agent-1":
         return "coming soon"
     elif model == "gemini-pro-vision":
@@ -58,7 +63,7 @@ async def get_next_action(model, messages, objective, session_id):

 def call_gpt_4_vision_preview(messages):
     if VERBOSE:
-        print("[Self Operating Computer][get_next_action][call_gpt_4_v]")
+        print("[call_gpt_4_v]")
     time.sleep(1)
     client = config.initialize_openai()
     try:
@@ -80,7 +85,7 @@ def call_gpt_4_vision_preview(messages):
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_v] user_prompt",
+                "[call_gpt_4_v] user_prompt",
                 user_prompt,
             )
@@ -115,7 +120,7 @@ def call_gpt_4_vision_preview(messages):
         assistant_message = {"role": "assistant", "content": content}
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_v] content",
+                "[call_gpt_4_v] content",
                 content,
             )
         content = json.loads(content)
@@ -157,25 +162,23 @@ def call_gemini_pro_vision(messages, objective):
         capture_screen_with_cursor(screenshot_filename)
         # sleep for a second
         time.sleep(1)
-        prompt = get_system_prompt(objective)
+        prompt = get_system_prompt("gemini-pro-vision", objective)
         model = config.initialize_google()
         if VERBOSE:
-            print("[Self Operating Computer][call_gemini_pro_vision] model", model)
+            print("[call_gemini_pro_vision] model", model)
         response = model.generate_content([prompt, Image.open(screenshot_filename)])
         content = response.text[1:]
         if VERBOSE:
-            print(
-                "[Self Operating Computer][call_gemini_pro_vision] response", response
-            )
-            print("[Self Operating Computer][call_gemini_pro_vision] content", content)
+            print("[call_gemini_pro_vision] response", response)
+            print("[call_gemini_pro_vision] content", content)
         content = json.loads(content)
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gemini_pro_vision] content",
+                "[get_next_action][call_gemini_pro_vision] content",
                 content,
             )
@@ -189,6 +192,132 @@ def call_gemini_pro_vision(messages, objective):
         return call_gpt_4_vision_preview(messages)


+async def call_gpt_4_vision_preview_ocr(messages, objective, model):
+    if VERBOSE:
+        print("[call_gpt_4_vision_preview_ocr]")
+
+    # Construct the path to the file within the package
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if VERBOSE:
+            print(
+                "[call_gpt_4_vision_preview_ocr] user_prompt",
+                user_prompt,
+            )
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-4-vision-preview",
+            messages=messages,
+            presence_penalty=1,
+            frequency_penalty=1,
+            temperature=0.7,
+            max_tokens=1000,
+        )
+
+        content = response.choices[0].message.content
+
+        if content.startswith("```json"):
+            content = content[len("```json") :]  # Remove starting ```json
+        if content.endswith("```"):
+            content = content[: -len("```")]  # Remove ending
+
+        content_str = content
+
+        content = json.loads(content)
+        if VERBOSE:
+            print("[call_gpt_4_vision_preview_ocr] content", content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if VERBOSE:
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # add `coordinates` to `content`
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if VERBOSE:
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Wait to append the assistant message so that, if the `processed_content` step fails, we don't append a message and mess up the message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
+            e,
+        )
+        traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
 async def call_gpt_4_vision_preview_labeled(messages, objective):
     time.sleep(1)
     client = config.initialize_openai()
@@ -217,7 +346,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] user_prompt",
+                "[call_gpt_4_vision_preview_labeled] user_prompt",
                 user_prompt,
             )
@@ -254,7 +383,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
         assistant_message = {"role": "assistant", "content": content}
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] content",
+                "[call_gpt_4_vision_preview_labeled] content",
                 content,
             )
         messages.append(assistant_message)
@@ -268,14 +397,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
                 label = operation.get("label")
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] label",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] label",
                         label,
                     )
                 coordinates = get_label_coordinates(label, label_coordinates)
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] coordinates",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates",
                         coordinates,
                     )
                 image = Image.open(
@@ -287,7 +416,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
                 )
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] click_position_percent",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent",
                         click_position_percent,
                     )
                 if not click_position_percent:
@@ -302,7 +431,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
                 operation["y"] = y_percent
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new click operation",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation",
                         operation,
                     )
                 processed_content.append(operation)
@@ -311,7 +440,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new processed_content",
+                "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content",
                 processed_content,
             )
         return processed_content
@@ -321,6 +450,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
             f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
             e,
         )
+        traceback.print_exc()
         return call_gpt_4_vision_preview(messages)
@@ -336,3 +466,39 @@ def get_last_assistant_message(messages):
         else:
             return messages[index]
     return None  # Return None if no assistant message is found
+
+
+def gpt_4_fallback(messages, objective, model):
+    if VERBOSE:
+        print("[gpt_4_fallback]")
+    system_prompt = get_system_prompt("gpt-4-vision-preview", objective)
+    new_system_message = {"role": "system", "content": system_prompt}
+    # remove and replace the first message in `messages` with `new_system_message`
+
+    messages[0] = new_system_message
+    if VERBOSE:
+        print("[gpt_4_fallback] new messages", messages)
+
+    if VERBOSE:
+        print("[gpt_4_fallback][updated]")
+        print("[gpt_4_fallback][updated] len(messages)", len(messages))
+
+    return call_gpt_4_vision_preview(messages)
+
+
+def confirm_system_prompt(messages, objective, model):
+    """
+    On `Exception` we default to `call_gpt_4_vision_preview`, so this function reassigns the system prompt in case of a previous failure
+    """
+    if VERBOSE:
+        print("[confirm_system_prompt]")
+
+    system_prompt = get_system_prompt(model, objective)
+    new_system_message = {"role": "system", "content": system_prompt}
+    # remove and replace the first message in `messages` with `new_system_message`
+
+    messages[0] = new_system_message
+
+    if VERBOSE:
+        print("[confirm_system_prompt][updated]")
+        print("[confirm_system_prompt][updated] len(messages)", len(messages))
diff --git a/operate/models/prompts.py b/operate/models/prompts.py
index c567bf48..5ba4ec67 100644
--- a/operate/models/prompts.py
+++ b/operate/models/prompts.py
@@ -1,4 +1,8 @@
 import platform
+from operate.config import Config
+
+# Load configuration
+VERBOSE = Config().verbose

 # General user Prompts
 USER_QUESTION = "Hello, I can help you with anything. What would you like done?"
@@ -46,7 +50,7 @@
 - Go to Google Docs and Google Sheets by typing in the Chrome Address bar
 - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.

-Objective: {objective} # take the next best action for this objective
+Objective: {objective}
 """

 SYSTEM_PROMPT_WIN_LINUX = """
@@ -91,7 +95,7 @@
 - Go to Google Docs and Google Sheets by typing in the Chrome Address bar
 - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.

-Objective: {objective} # take the next best action for this objective
+Objective: {objective}
 """
@@ -142,7 +146,7 @@
 - Go to Google Docs and Google Sheets by typing in the Chrome Address bar
 - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.

-Objective: {objective} # take the next best action for this objective
+Objective: {objective}
 """

 SYSTEM_PROMPT_LABELED_WIN_LINUX = """
@@ -192,9 +196,112 @@
 - Go to Google Docs and Google Sheets by typing in the Chrome Address bar
 - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.

-Objective: {objective} # take the next best action for this objective
+Objective: {objective}
+"""
+
+
+SYSTEM_PROMPT_OCR_MAC = """
+You are operating a computer, using the same operating system as a human.
+
+From looking at the screen, the objective, and your previous actions, take the next best series of actions.
+
+You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` statement.
+
+1. click - Move mouse and click
+[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons or links with text to click. If the button you want to click doesn't have text you can say `"no button"` for the text value and we'll try a different method.
+
+2. write - Write with your keyboard
+[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
+
+3. press - Use a hotkey or press key to operate the computer
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
+
+4. done - The objective is completed
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
+
+Return the actions in array format `[]`. You can take just one action or multiple actions.
+
+Here is a helpful example:
+
+# Opens Spotlight Search on Mac to see if Google Chrome is available to use
+[
+    {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }},
+    {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
+    {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
+]
+
+# Go to a website (LinkedIn) when the browser is already open
+
+[
+    {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["command", "t"] }},
+    {{ "thought": "Now I'll write LinkedIn's website to go there", "operation": "write", "content": "https://www.linkedin.com/feed/" }},
+    {{ "thought": "Finally I'll press enter to go to LinkedIn", "operation": "press", "keys": ["enter"] }}
+]
+
+# Search for someone on LinkedIn when already on linkedin.com
+[
+    {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }},
+    {{ "thought": "Now that the field is active I can write the name of the person I'd like to search for", "operation": "write", "content": "John Doe" }},
+    {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }},
+]
+
+A very important note: don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+Objective: {objective}
 """
+SYSTEM_PROMPT_OCR_WIN_LINUX = """
+You are operating a computer, using the same operating system as a human.
+
+From looking at the screen, the objective, and your previous actions, take the next best series of actions.
+
+You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` statement.
+
+1. click - Move mouse and click
+[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons or links with text to click. If the button you want to click doesn't have text you can say `"no button"` for the text value and we'll try a different method.
+
+2. write - Write with your keyboard
+[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
+
+3. press - Use a hotkey or press key to operate the computer
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
+
+4. done - The objective is completed
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
+
+Return the actions in array format `[]`. You can take just one action or multiple actions.
+
+Here are some helpful combinations:
+
+# Opens the OS search (Start menu on Windows) to see if Google Chrome is available to use
+[
+    {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }},
+    {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
+    {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
+]
+
+# Go to a website (LinkedIn) when the browser is already open
+
+[
+    {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["ctrl", "t"] }},
+    {{ "thought": "Now I'll write LinkedIn's website to go there", "operation": "write", "content": "https://www.linkedin.com/feed/" }},
+    {{ "thought": "Finally I'll press enter to go to LinkedIn", "operation": "press", "keys": ["enter"] }}
+]
+
+# Search for someone on LinkedIn when already on linkedin.com
+[
+    {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }},
+    {{ "thought": "Now that the field is active I can write the name of the person I'd like to search for", "operation": "write", "content": "John Doe" }},
+    {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }},
+]
+
+A few important notes:
+
+- Go to Google Docs and Google Sheets by typing in the Chrome Address bar
+- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+Objective: {objective}
+"""

 OPERATE_FIRST_MESSAGE_PROMPT = """
 Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done
@@ -208,26 +315,45 @@ Action:"""
-def get_system_prompt(objective):
+def get_system_prompt(model, objective):
     """
-    Format the vision prompt
+    Select and format the system prompt for the given model and operating system, and log the name of the prompt used when verbose output is enabled
     """
-    if platform.system() == "Darwin":
-        prompt = SYSTEM_PROMPT_MAC.format(objective=objective)
-    else:
-        prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective)
-    return prompt
-
-
-def get_system_prompt_labeled(objective):
-    """
-    Format the vision prompt
-    """
-    if platform.system() == "Darwin":
-        prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective)
-    else:
-        prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective)
+    prompt_map = {
+        ("gpt-4-with-som", "Darwin"): (
+            SYSTEM_PROMPT_LABELED_MAC,
+            "SYSTEM_PROMPT_LABELED_MAC",
+        ),
+        ("gpt-4-with-som", "Other"): (
+            SYSTEM_PROMPT_LABELED_WIN_LINUX,
+            "SYSTEM_PROMPT_LABELED_WIN_LINUX",
+        ),
+        ("gpt-4-with-ocr", "Darwin"): (SYSTEM_PROMPT_OCR_MAC, "SYSTEM_PROMPT_OCR_MAC"),
+        ("gpt-4-with-ocr", "Other"): (
+            SYSTEM_PROMPT_OCR_WIN_LINUX,
+            "SYSTEM_PROMPT_OCR_WIN_LINUX",
+        ),
+        ("default", "Darwin"): (SYSTEM_PROMPT_MAC, "SYSTEM_PROMPT_MAC"),
+        ("default", "Other"): (SYSTEM_PROMPT_WIN_LINUX, "SYSTEM_PROMPT_WIN_LINUX"),
+    }
+
+    os_type = "Darwin" if platform.system() == "Darwin" else "Other"
+
+    # Fetching the prompt tuple (string and name) based on the model and OS
+    prompt_tuple = prompt_map.get((model, os_type), prompt_map[("default", os_type)])
+
+    # Extracting the prompt string and its name
+    prompt_string, prompt_name = prompt_tuple
+
+    # Formatting the prompt
+    prompt = prompt_string.format(objective=objective)
+
+    # Optional verbose output
+    if VERBOSE:
+        print("[get_system_prompt] model:", model)
+        print("[get_system_prompt] prompt name:", prompt_name)
+        # print("[get_system_prompt] prompt:", prompt)
     return prompt
diff --git a/operate/operate.py b/operate/operate.py
index 29b9ca8e..ab79d4f7 100644
--- a/operate/operate.py
+++ b/operate/operate.py
@@ -11,7 +11,6 @@ from operate.models.prompts import (
     USER_QUESTION,
     get_system_prompt,
-    get_system_prompt_labeled,
 )
 from operate.config import Config
 from operate.utils.style import (
@@ -96,10 +95,7 @@ def main(model, terminal_prompt, voice_mode=False):
         print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
         objective = prompt(style=style)

-    if model == "gpt-4-with-som":
-        system_prompt = get_system_prompt_labeled(objective)
-    else:
-        system_prompt = get_system_prompt(objective)
+    system_prompt = get_system_prompt(model, objective)
     system_message = {"role": "system", "content": system_prompt}
     messages = [system_message]
diff --git a/operate/utils/label.py b/operate/utils/label.py
index 14232391..20d55fee 100644
--- a/operate/utils/label.py
+++ b/operate/utils/label.py
@@ -68,11 +68,11 @@ def add_labels(base64_data, yolo_model):
     )  # Create a separate draw object for the debug image
     font_size = 45

-    detections_dir = "detections"
+    labeled_images_dir = "labeled_images"
     label_coordinates = {}  # Dictionary to store coordinates

-    if not os.path.exists(detections_dir):
-        os.makedirs(detections_dir)
+    if not os.path.exists(labeled_images_dir):
+        os.makedirs(labeled_images_dir)

     counter = 0
     drawn_boxes = []  # List to keep track of boxes already drawn
@@ -116,9 +116,11 @@ def add_labels(base64_data, yolo_model):

     # Save the image
     timestamp = time.strftime("%Y%m%d-%H%M%S")
-    output_path = os.path.join(detections_dir, f"img_{timestamp}_labeled.png")
-    output_path_debug = os.path.join(detections_dir, f"img_{timestamp}_debug.png")
-    output_path_original = os.path.join(detections_dir, f"img_{timestamp}_original.png")
+    output_path = os.path.join(labeled_images_dir, f"img_{timestamp}_labeled.png")
+    output_path_debug = os.path.join(labeled_images_dir, f"img_{timestamp}_debug.png")
+    output_path_original = os.path.join(
+        labeled_images_dir, f"img_{timestamp}_original.png"
+    )

     image_labeled.save(output_path)
     image_debug.save(output_path_debug)
diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py
new file mode 100644
index 00000000..68b4d435
--- /dev/null
+++ b/operate/utils/ocr.py
@@ -0,0 +1,100 @@
+from operate.config import Config
+from PIL import Image, ImageDraw
+import os
+from datetime import datetime
+
+# Load configuration
+VERBOSE = Config().verbose
+
+
+def get_text_element(result, search_text, image_path):
+    """
+    Searches for a text element in the OCR results and returns its index. Also draws bounding boxes on the image.
+    Args:
+        result (list): The list of results returned by EasyOCR.
+        search_text (str): The text to search for in the OCR results.
+        image_path (str): Path to the original image.
+
+    Returns:
+        int: The index of the element containing the search text.
+
+    Raises:
+        Exception: If the text element is not found in the results.
+    """
+    if VERBOSE:
+        print("[get_text_element]")
+        print("[get_text_element] search_text", search_text)
+        # Create /ocr directory if it doesn't exist
+        ocr_dir = "ocr"
+        if not os.path.exists(ocr_dir):
+            os.makedirs(ocr_dir)
+
+        # Open the original image
+        image = Image.open(image_path)
+        draw = ImageDraw.Draw(image)
+
+    found_index = None
+    for index, element in enumerate(result):
+        text = element[1]
+        box = element[0]
+
+        if VERBOSE:
+            # Draw bounding box in blue
+            draw.polygon([tuple(point) for point in box], outline="blue")
+
+        if search_text in text:
+            found_index = index
+            if VERBOSE:
+                print("[get_text_element][loop] found search_text, index:", index)
+
+    if found_index is not None:
+        if VERBOSE:
+            # Draw bounding box of the found text in red
+            box = result[found_index][0]
+            draw.polygon([tuple(point) for point in box], outline="red")
+            # Save the image with bounding boxes
+            datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
+            ocr_image_path = os.path.join(ocr_dir, f"ocr_image_{datetime_str}.png")
+            image.save(ocr_image_path)
+            print("[get_text_element] OCR image saved at:", ocr_image_path)
+
+        return found_index
+
+    raise Exception("The text element was not found in the image")
+
+
+def get_text_coordinates(result, index, image_path):
+    """
+    Gets the coordinates of the text element at the specified index as a percentage of screen width and height.
+    Args:
+        result (list): The list of results returned by EasyOCR.
+        index (int): The index of the text element in the results list.
+        image_path (str): Path to the screenshot image.
+
+    Returns:
+        dict: A dictionary containing the 'x' and 'y' coordinates as percentages of the screen width and height.
+    """
+    if index >= len(result):
+        raise Exception("Index out of range in OCR results")
+
+    # Get the bounding box of the text element
+    bounding_box = result[index][0]
+
+    # Calculate the center of the bounding box
+    min_x = min([coord[0] for coord in bounding_box])
+    max_x = max([coord[0] for coord in bounding_box])
+    min_y = min([coord[1] for coord in bounding_box])
+    max_y = max([coord[1] for coord in bounding_box])
+
+    center_x = (min_x + max_x) / 2
+    center_y = (min_y + max_y) / 2
+
+    # Get image dimensions
+    with Image.open(image_path) as img:
+        width, height = img.size
+
+    # Convert to percentages
+    percent_x = round((center_x / width), 3)
+    percent_y = round((center_y / height), 3)
+
+    return {"x": percent_x, "y": percent_y}
diff --git a/requirements.txt b/requirements.txt
index 2c796cd9..f2727e69 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -49,4 +49,5 @@ wcwidth==0.2.9
 zipp==3.17.0
 google-generativeai==0.3.0
 aiohttp==3.9.1
-ultralytics==8.0.227
\ No newline at end of file
+ultralytics==8.0.227
+easyocr==1.7.1
\ No newline at end of file
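Below is a minimal, illustrative sketch (not part of the patch) of how the OCR pieces added in this diff fit together at runtime: EasyOCR reads the screenshot, `get_text_element` finds the OCR result whose text matches the `click` operation returned by the model, and `get_text_coordinates` converts that element's bounding box into percentage coordinates stored on the operation. The helper name `resolve_click_by_text`, the screenshot path, and the button text are hypothetical; only the imported functions and the EasyOCR calls come from the patch itself.

# Illustrative sketch only. It mirrors the click-handling branch of
# call_gpt_4_vision_preview_ocr in operate/models/apis.py.
import easyocr

from operate.utils.ocr import get_text_coordinates, get_text_element


def resolve_click_by_text(screenshot_filename, text_to_click):
    # EasyOCR returns a list of (bounding_box, text, confidence) tuples,
    # which is the `result` structure the ocr.py helpers expect.
    reader = easyocr.Reader(["en"])
    result = reader.readtext(screenshot_filename)

    # Find the OCR element whose text contains the requested string.
    # If nothing matches, get_text_element raises, which upstream triggers gpt_4_fallback.
    index = get_text_element(result, text_to_click, screenshot_filename)

    # Convert the element's bounding-box center to percentages of the image size,
    # the same values apis.py stores in operation["x"] and operation["y"].
    return get_text_coordinates(result, index, screenshot_filename)


# Hypothetical usage, given a model-returned operation such as
# {"operation": "click", "text": "Sign in"}:
#
#   coords = resolve_click_by_text("screenshots/screenshot.png", "Sign in")
#   # -> e.g. {"x": 0.512, "y": 0.087}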