Merged

22 commits
b229923
Add initial ocr approach with `call_gpt_4_vision_preview_ocr`
joshbickett Jan 21, 2024
4bca53e
Add `SYSTEM_PROMPT_OCR_MAC` and `SYSTEM_PROMPT_OCR_WIN_LINUX`
joshbickett Jan 21, 2024
8f7765c
Iterate ocr method with `get_text_element`
joshbickett Jan 21, 2024
a156a48
Bug fixes
joshbickett Jan 21, 2024
f6b61c6
Experimental prompt fixes
joshbickett Jan 21, 2024
c90ce67
Add `get_text_coordinates`
joshbickett Jan 21, 2024
ff7a716
Change dir to `labeled_images_dir`
joshbickett Jan 21, 2024
5f5ed69
Iterate `get_text_coordinates`
joshbickett Jan 21, 2024
4790a4f
Fix `get_text_coordinates`
joshbickett Jan 21, 2024
3281fac
iterate get text function
joshbickett Jan 21, 2024
df1cbd2
add `content_str` for proper message structure
joshbickett Jan 21, 2024
14f3a8a
Adjust `SYSTEM_PROMPT_OCR_MAC`
joshbickett Jan 21, 2024
2716b92
Add `easyocr` to `requirements.txt`
joshbickett Jan 21, 2024
817d137
Update `SYSTEM_PROMPT_OCR_WIN_LINUX`
joshbickett Jan 21, 2024
ff35f17
Create `gpt_4_fallback` method
joshbickett Jan 22, 2024
135b3ea
New click prompt 'no button' approach for `SYSTEM_PROMPT_OCR_MAC`
joshbickett Jan 22, 2024
849cc7e
update `get_system_prompt`
joshbickett Jan 24, 2024
e1fa953
Turn off `verbose`
joshbickett Jan 24, 2024
39ddc45
Add missing `if self.verbose:`
joshbickett Jan 24, 2024
e8ffbb5
Default to `gpt-4-with-ocr` since it performs best
joshbickett Jan 24, 2024
24e6884
Add `### Optical Character Recognition Mode `-m gpt-4-with-ocr``
joshbickett Jan 24, 2024
b1cc4fa
Update `readme.md`
joshbickett Jan 24, 2024
7 changes: 7 additions & 0 deletions README.md
@@ -91,6 +91,13 @@ operate -m gemini-pro-vision

**Enter your Google AI Studio API key when the terminal prompts you for it.** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need to [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working; if anyone knows a simpler way, please make a PR.

### Optical Character Recognition Mode `-m gpt-4-with-ocr`
The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable elements and their coordinates. GPT-4 can decide to `click` an element by its text, and the code then looks that element up in the hash map to retrieve the coordinates of the element GPT-4 chose.

Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use OCR mode, simply run:

`operate`, or equivalently `operate -m gpt-4-with-ocr`.
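
For a concrete picture of the flow, here is a minimal sketch (illustrative only: the target text `"Sign in"` is hypothetical, and the real handling lives in `operate/models/apis.py`):

```python
import easyocr

# GPT-4 names the element it wants to click by its visible text:
operation = {"operation": "click", "text": "Sign in"}

# The framework OCRs the current screenshot; each EasyOCR result is a
# (bounding_box, text, confidence) tuple.
reader = easyocr.Reader(["en"])
results = reader.readtext("screenshots/screenshot.png")

# Find the recognized text that matches, and target the center of its
# bounding box (pixel coordinates here; the framework may normalize them).
for bounding_box, text, confidence in results:
    if operation["text"] in text:
        operation["x"] = sum(point[0] for point in bounding_box) / 4
        operation["y"] = sum(point[1] for point in bounding_box) / 4
        break
```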

### Set-of-Mark Prompting `-m gpt-4-with-som`
The Self-Operating Computer Framework now supports Set-of-Mark (SoM) Prompting with the `gpt-4-with-som` command. This new visual prompting method enhances the visual grounding capabilities of large multimodal models.

10 changes: 4 additions & 6 deletions operate/config.py
@@ -49,9 +49,6 @@ def initialize_openai(self):
)
api_key = os.getenv("OPENAI_API_KEY")

if self.verbose:
print("[Config][initialize_openai] api_key", api_key)

client = OpenAI(
api_key=api_key,
)
@@ -65,9 +62,10 @@ def initialize_google(self):
print("[Config][initialize_google] using cached google_api_key")
api_key = self.google_api_key
else:
print(
"[Config][initialize_google] no cached google_api_key, try to get from env."
)
if self.verbose:
print(
"[Config][initialize_google] no cached google_api_key, try to get from env."
)
api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=api_key, transport="rest")
model = genai.GenerativeModel("gemini-pro-vision")
2 changes: 1 addition & 1 deletion operate/main.py
@@ -15,7 +15,7 @@ def main_entry():
"--model",
help="Specify the model to use",
required=False,
default="gpt-4",
default="gpt-4-with-ocr",
)

# Add a voice flag
200 changes: 183 additions & 17 deletions operate/models/apis.py
@@ -4,6 +4,7 @@
import base64
import traceback
import io
import easyocr


from PIL import Image
@@ -19,6 +20,7 @@
get_user_prompt,
get_system_prompt,
)
from operate.utils.ocr import get_text_element, get_text_coordinates


from operate.utils.label import (
Expand Down Expand Up @@ -48,6 +50,9 @@ async def get_next_action(model, messages, objective, session_id):
if model == "gpt-4-with-som":
operation = await call_gpt_4_vision_preview_labeled(messages, objective)
return operation, None
if model == "gpt-4-with-ocr":
operation = await call_gpt_4_vision_preview_ocr(messages, objective, model)
return operation, None
elif model == "agent-1":
return "coming soon"
elif model == "gemini-pro-vision":
@@ -58,7 +63,7 @@ async def get_next_action(model, messages, objective, session_id):

def call_gpt_4_vision_preview(messages):
if VERBOSE:
print("[Self Operating Computer][get_next_action][call_gpt_4_v]")
print("[call_gpt_4_v]")
time.sleep(1)
client = config.initialize_openai()
try:
@@ -80,7 +85,7 @@ def call_gpt_4_vision_preview(messages):

if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_v] user_prompt",
"[call_gpt_4_v] user_prompt",
user_prompt,
)

@@ -115,7 +120,7 @@ def call_gpt_4_vision_preview(messages):
assistant_message = {"role": "assistant", "content": content}
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_v] content",
"[call_gpt_4_v] content",
content,
)
content = json.loads(content)
@@ -157,25 +162,23 @@ def call_gemini_pro_vision(messages, objective):
capture_screen_with_cursor(screenshot_filename)
# sleep for a second
time.sleep(1)
prompt = get_system_prompt(objective)
prompt = get_system_prompt("gemini-pro-vision", objective)

model = config.initialize_google()
if VERBOSE:
print("[Self Operating Computer][call_gemini_pro_vision] model", model)
print("[call_gemini_pro_vision] model", model)

response = model.generate_content([prompt, Image.open(screenshot_filename)])

content = response.text[1:]
if VERBOSE:
print(
"[Self Operating Computer][call_gemini_pro_vision] response", response
)
print("[Self Operating Computer][call_gemini_pro_vision] content", content)
print("[call_gemini_pro_vision] response", response)
print("[call_gemini_pro_vision] content", content)

content = json.loads(content)
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gemini_pro_vision] content",
"[get_next_action][call_gemini_pro_vision] content",
content,
)

@@ -189,6 +192,132 @@ def call_gemini_pro_vision(messages, objective):
return call_gpt_4_vision_preview(messages)


async def call_gpt_4_vision_preview_ocr(messages, objective, model):
if VERBOSE:
print("[call_gpt_4_vision_preview_ocr]")

# Capture a screenshot, send it to GPT-4, and resolve click targets via OCR
try:
time.sleep(1)
client = config.initialize_openai()

confirm_system_prompt(messages, objective, model)
screenshots_dir = "screenshots"
if not os.path.exists(screenshots_dir):
os.makedirs(screenshots_dir)

screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
# Call the function to capture the screen with the cursor
capture_screen_with_cursor(screenshot_filename)

with open(screenshot_filename, "rb") as img_file:
img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

if len(messages) == 1:
user_prompt = get_user_first_message_prompt()
else:
user_prompt = get_user_prompt()

if VERBOSE:
print(
"[call_gpt_4_vision_preview_ocr] user_prompt",
user_prompt,
)

vision_message = {
"role": "user",
"content": [
{"type": "text", "text": user_prompt},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
},
],
}
messages.append(vision_message)

response = client.chat.completions.create(
model="gpt-4-vision-preview",
messages=messages,
presence_penalty=1,
frequency_penalty=1,
temperature=0.7,
max_tokens=1000,
)

content = response.choices[0].message.content

if content.startswith("```json"):
content = content[len("```json") :] # Remove starting ```json
if content.endswith("```"):
content = content[: -len("```")] # Remove ending

content_str = content

content = json.loads(content)
if VERBOSE:
print("[call_gpt_4_vision_preview_ocr] content", content)

processed_content = []

for operation in content:
if operation.get("operation") == "click":
text_to_click = operation.get("text")
if VERBOSE:
print(
"[call_gpt_4_vision_preview_ocr][click] text_to_click",
text_to_click,
)
# Initialize EasyOCR Reader
reader = easyocr.Reader(["en"])

# Read the screenshot
result = reader.readtext(screenshot_filename)

text_element_index = get_text_element(
result, text_to_click, screenshot_filename
)
coordinates = get_text_coordinates(
result, text_element_index, screenshot_filename
)

# add the located `coordinates` to the click `operation`
operation["x"] = coordinates["x"]
operation["y"] = coordinates["y"]

if VERBOSE:
print(
"[call_gpt_4_vision_preview_ocr][click] text_element_index",
text_element_index,
)
print(
"[call_gpt_4_vision_preview_ocr][click] coordinates",
coordinates,
)
print(
"[call_gpt_4_vision_preview_ocr][click] final operation",
operation,
)
processed_content.append(operation)

else:
processed_content.append(operation)

# Wait to append the assistant message so that, if the `processed_content` step fails, we don't append a message and corrupt the message history
assistant_message = {"role": "assistant", "content": content_str}
messages.append(assistant_message)

return processed_content

except Exception as e:
print(
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
e,
)
traceback.print_exc()
return gpt_4_fallback(messages, objective, model)
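
The `get_text_element` and `get_text_coordinates` helpers imported from `operate.utils.ocr` are not shown in this diff. A plausible sketch of their behavior, inferred only from the call sites above (the actual implementations may differ, e.g. in how they match text or whether they write debug images to `labeled_images_dir`):

```python
from PIL import Image


def get_text_element(result, text_to_click, screenshot_filename):
    # `result` is EasyOCR's readtext() output: a list of
    # (bounding_box, text, confidence) tuples. Return the index of the
    # first element whose recognized text contains the target.
    # `screenshot_filename` is unused in this sketch.
    for index, (bounding_box, text, confidence) in enumerate(result):
        if text_to_click in text:
            return index
    raise Exception(f"Text element '{text_to_click}' not found")


def get_text_coordinates(result, text_element_index, screenshot_filename):
    # Average the four corners of the bounding box to get its center, then
    # normalize by the screenshot size. Percent coordinates are an
    # assumption here, mirroring the x_percent/y_percent values used by
    # the `gpt-4-with-som` path below.
    bounding_box = result[text_element_index][0]
    center_x = sum(point[0] for point in bounding_box) / 4
    center_y = sum(point[1] for point in bounding_box) / 4

    width, height = Image.open(screenshot_filename).size
    return {"x": center_x / width, "y": center_y / height}
```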


async def call_gpt_4_vision_preview_labeled(messages, objective):
time.sleep(1)
client = config.initialize_openai()
@@ -217,7 +346,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):

if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] user_prompt",
"[call_gpt_4_vision_preview_labeled] user_prompt",
user_prompt,
)

@@ -254,7 +383,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
assistant_message = {"role": "assistant", "content": content}
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] content",
"[call_gpt_4_vision_preview_labeled] content",
content,
)
messages.append(assistant_message)
@@ -268,14 +397,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
label = operation.get("label")
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] label",
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] label",
label,
)

coordinates = get_label_coordinates(label, label_coordinates)
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] coordinates",
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates",
coordinates,
)
image = Image.open(
@@ -287,7 +416,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
)
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] click_position_percent",
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent",
click_position_percent,
)
if not click_position_percent:
@@ -302,7 +431,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
operation["y"] = y_percent
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new click operation",
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation",
operation,
)
processed_content.append(operation)
@@ -311,7 +440,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):

if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new processed_content",
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content",
processed_content,
)
return processed_content
@@ -321,6 +450,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
e,
)
traceback.print_exc()
return call_gpt_4_vision_preview(messages)


@@ -336,3 +466,39 @@ def get_last_assistant_message(messages):
else:
return messages[index]
return None # Return None if no assistant message is found


def gpt_4_fallback(messages, objective, model):
if VERBOSE:
print("[gpt_4_fallback]")
system_prompt = get_system_prompt("gpt-4-vision-preview", objective)
new_system_message = {"role": "system", "content": system_prompt}
# replace the first message in `messages` with `new_system_message`

messages[0] = new_system_message
if VERBOSE:
print("[gpt_4_fallback] new messages", messages)

if VERBOSE:
print("[gpt_4_fallback][updated]")
print("[gpt_4_fallback][updated] len(messages)", len(messages))

return call_gpt_4_vision_preview(messages)


def confirm_system_prompt(messages, objective, model):
"""
On `Exception` we fall back to `call_gpt_4_vision_preview`, so this function reassigns the system prompt in case a previous failure left the wrong one in place
"""
if VERBOSE:
print("[confirm_system_prompt]")

system_prompt = get_system_prompt(model, objective)
new_system_message = {"role": "system", "content": system_prompt}
# remove and replace the first message in `messages` with `new_system_message`

messages[0] = new_system_message

if VERBOSE:
print("[confirm_system_prompt][updated]")
print("[confirm_system_prompt][updated] len(messages)", len(messages))