### Import dependencies & define prompt

In [72]:
import base64, json, re, copy
import os
from pathlib import Path

def image_file_to_data_url(img_path: str) -> str:
    """Encode an image file as a base64 ``data:`` URL.

    The MIME type is chosen from the file extension (case-insensitive).
    PNG, JPEG, GIF, WebP and BMP extensions are recognised; any other
    extension falls back to ``image/jpeg``, which is what this helper has
    always used for non-PNG files.

    Args:
        img_path: Path of the image file to read.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be read (propagated from
            ``Path.read_bytes``).
    """
    mime_by_suffix = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".bmp": "image/bmp",
    }
    p = Path(img_path)
    # Unknown extensions keep the historical JPEG default.
    mime = mime_by_suffix.get(p.suffix.lower(), "image/jpeg")
    b64 = base64.b64encode(p.read_bytes()).decode("utf-8")
    return f"data:{mime};base64,{b64}"

# System prompt used as the first message of the conversation.
# It declares a single callable tool, `mobile_use` (actions: key, click,
# long_press, swipe, type, system_button, open, wait, terminate), and fixes the
# reply format to an "Action:" line followed by one <tool_call> JSON block.
# The "Action:" line is what extract_action_line() parses out of each reply.
# NOTE: this is not a raw string, so every `\n` inside the tool description is
# converted to a real newline character by Python.
# NOTE(review): "Be brief: one for Action." reads as if a word is missing
# (e.g. "one sentence") - confirm against the prompt the model expects before
# changing the text.
SYSTEM_PROMPT = """# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{"type":"function","function":{"name_for_human":"mobile_use","name":"mobile_use","description":"Use a touchscreen to interact with a mobile device, and take screenshots.\n* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.\n* The screen's resolution is 1000x1000.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.","parameters":{"properties":{"action":{"description":"The action to perform.","enum":["key","click","long_press","swipe","type","system_button","open","wait","terminate"],"type":"string"},"coordinate":{"type":"array"},"coordinate2":{"type":"array"},"text":{"type":"string"},"time":{"type":"number"},"button":{"enum":["Back","Home","Menu","Enter"],"type":"string"},"status":{"enum":["success","failure"],"type":"string"}},"required":["action"],"type":"object"},"args_format":"Format the arguments as a JSON object."}}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call>

# Response format

Response format for every step:
1) Action: a short imperative describing what to do in the UI.
2) A single <tool_call>...</tool_call> block containing only the JSON: {"name": <function-name>, "arguments": <args-json-object>}.

Rules:
- Output exactly in the order: Action, <tool_call>.
- Be brief: one for Action.
- Do not output anything else outside those two parts.
- If finishing, use action=terminate in the tool call.
"""

### Define helper functions

In [73]:
def extract_action_line(assistant_text: str) -> str | None:
    m = re.search(r"Action:\s*(.+)", assistant_text)
    if not m:
        return None
    return m.group(1).strip()

def build_initial_user_text(goal: str, previous_actions: list[str]) -> str:
    """Compose the text part of the opening user message.

    The text contains a fixed request sentence, the task instruction, and a
    numbered list of earlier actions (``Step1: ...``, ``Step2: ...``). When
    there are no earlier actions the list is replaced by
    ``No previous action.``.

    Args:
        goal: The task instruction given to the agent.
        previous_actions: Action summaries to list, in order.

    Returns:
        The assembled prompt text.
    """
    if previous_actions:
        history_lines = []
        for step_no, action in enumerate(previous_actions, start=1):
            history_lines.append(f"Step{step_no}: {action}")
        prev = "\n".join(history_lines)
    else:
        prev = "No previous action."

    request = "Please generate the next move according to the UI screenshot, instruction and previous actions.\n\n"
    instruction = f"Instruction: {goal}\n\n"
    history = f"Previous actions:\n{prev}"
    return request + instruction + history

### Model server

In [74]:
from openai import OpenAI

# Connection settings for the OpenAI-compatible endpoint. Each value can be
# supplied through an environment variable (VLLM_BASE_URL, VLLM_API_KEY,
# VLLM_MODEL_NAME) so that real hosts and keys never have to be written into
# the notebook; the placeholder defaults are used when a variable is not set.
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://YOUR_VLLM_HOST:8000/v1")
VLLM_API_KEY = os.environ.get("VLLM_API_KEY", "YOUR_API_KEY")
MODEL_NAME = os.environ.get("VLLM_MODEL_NAME", "YOUR_MODEL_NAME")

# Shared client used by call_model().
client = OpenAI(
    base_url=VLLM_BASE_URL,
    api_key=VLLM_API_KEY,
)

def call_model(messages: list[dict]) -> str:
    """Send a chat history to the model and return the reply text.

    Args:
        messages: Conversation in the chat-completions message format
            (role/content dictionaries).

    Returns:
        The text content of the first returned choice. If the server returns
        no text content (``content`` is ``None``), an empty string is returned
        so that callers always receive a ``str``.
    """
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.0,  # zero temperature to keep replies as repeatable as possible
    )
    content = resp.choices[0].message.content
    # The API types `content` as optional; normalise to honour the `-> str` contract.
    return content if content is not None else ""

### Step 1

In [None]:
# Episode setup: number of screenshot turns kept in the prompt, the task
# instruction, and the running list of action summaries (one per finished step).
LAST_IMAGE_TURNS = 5
goal = "Turn on the dark mode"
previous_actions = []

# First observation, embedded as a base64 data URL.
img_path = "screenshot_1.png"
data_url_1 = image_file_to_data_url(img_path)

# Full conversation log: the system prompt, then the opening user turn that
# carries the instruction text together with the first screenshot.
messages = []
messages.append(
    {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]}
)
messages.append(
    {
        "role": "user",
        "content": [
            {"type": "text", "text": build_initial_user_text(goal, previous_actions)},
            {"type": "image_url", "image_url": {"url": data_url_1}},
        ],
    }
)

assistant_text_1 = call_model(messages)
print(assistant_text_1)

# Store the reply in the log and keep its "Action:" summary for later prompts
# (extract_action_line returns None when the reply has no "Action:" line).
messages.append(
    {"role": "assistant", "content": [{"type": "text", "text": assistant_text_1}]}
)
previous_actions.append(extract_action_line(assistant_text_1))

### Step 2

In [None]:
img_path = "screenshot_2.png"
data_url_2 = image_file_to_data_url(img_path)

# Append the new observation to the full, untrimmed conversation log.
messages.append(
    {"role": "user", "content": [{"type": "image_url", "image_url": {"url": data_url_2}}]}
)

# Build the prompt on a deep copy so that trimming never alters `messages`.
cur_messages = copy.deepcopy(messages)

# Once there are more than LAST_IMAGE_TURNS screenshots, only the newest
# LAST_IMAGE_TURNS stay in the prompt as image turns; the steps that fall out
# of that window are summarised as text in the opening user message.
if len(previous_actions) > LAST_IMAGE_TURNS - 1:
    # The steps dropped from the window are the OLDEST ones, so summarise the
    # head of the list (the tail is still present as assistant turns).
    cur_previous_actions = previous_actions[: len(previous_actions) - LAST_IMAGE_TURNS + 1]
    # Newest window: user(image), assistant, ..., user(image) = 2*N - 1 messages.
    cur_messages_current = cur_messages[-(2 * LAST_IMAGE_TURNS - 1):]
    cur_messages[1]["content"][0]["text"] = build_initial_user_text(goal, cur_previous_actions)
    # Remove the first screenshot from the opening user turn.
    del cur_messages[1]["content"][1]
    cur_messages = cur_messages[:2] + cur_messages_current
    # Fold the oldest kept screenshot into the opening user turn so that the
    # prompt does not contain two consecutive user messages.
    cur_messages[1]["content"].append(cur_messages[2]["content"][0])
    del cur_messages[2]

assistant_text_2 = call_model(cur_messages)
print(assistant_text_2)

messages.append({"role": "assistant", "content": [{"type": "text", "text": assistant_text_2}]})

# Keep the "Action:" summary of this step for later prompts.
previous_actions.append(extract_action_line(assistant_text_2))

### ... (repeat the Step 2 cell for each intermediate step)
### Step n

In [None]:
img_path = "screenshot_n.png"
data_url_n = image_file_to_data_url(img_path)

# Append the new observation to the full, untrimmed conversation log.
messages.append(
    {"role": "user", "content": [{"type": "image_url", "image_url": {"url": data_url_n}}]}
)

# Build the prompt on a deep copy so that trimming never alters `messages`.
cur_messages = copy.deepcopy(messages)

# Once there are more than LAST_IMAGE_TURNS screenshots, only the newest
# LAST_IMAGE_TURNS stay in the prompt as image turns; the steps that fall out
# of that window are summarised as text in the opening user message.
if len(previous_actions) > LAST_IMAGE_TURNS - 1:
    # The steps dropped from the window are the OLDEST ones, so summarise the
    # head of the list (the tail is still present as assistant turns).
    cur_previous_actions = previous_actions[: len(previous_actions) - LAST_IMAGE_TURNS + 1]
    # Newest window: user(image), assistant, ..., user(image) = 2*N - 1 messages.
    cur_messages_current = cur_messages[-(2 * LAST_IMAGE_TURNS - 1):]
    cur_messages[1]["content"][0]["text"] = build_initial_user_text(goal, cur_previous_actions)
    # Remove the first screenshot from the opening user turn.
    del cur_messages[1]["content"][1]
    cur_messages = cur_messages[:2] + cur_messages_current
    # Fold the oldest kept screenshot into the opening user turn so that the
    # prompt does not contain two consecutive user messages.
    cur_messages[1]["content"].append(cur_messages[2]["content"][0])
    del cur_messages[2]

assistant_text_n = call_model(cur_messages)
print(assistant_text_n)

messages.append({"role": "assistant", "content": [{"type": "text", "text": assistant_text_n}]})

# Keep the "Action:" summary of this step for later prompts.
previous_actions.append(extract_action_line(assistant_text_n))