Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

iframes support #405

Merged
merged 7 commits into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions skyvern/forge/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,6 +811,11 @@ async def _build_and_record_step_prompt(
artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP,
data=json.dumps(scraped_page.id_to_xpath_dict, indent=2).encode(),
)
await app.ARTIFACT_MANAGER.create_artifact(
step=step,
artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP,
data=json.dumps(scraped_page.id_to_frame_dict, indent=2).encode(),
)
await app.ARTIFACT_MANAGER.create_artifact(
step=step,
artifact_type=ArtifactType.VISIBLE_ELEMENTS_TREE,
Expand Down
1 change: 1 addition & 0 deletions skyvern/forge/sdk/artifact/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class ArtifactType(StrEnum):
LLM_RESPONSE = "llm_response"
LLM_RESPONSE_PARSED = "llm_response_parsed"
VISIBLE_ELEMENTS_ID_XPATH_MAP = "visible_elements_id_xpath_map"
VISIBLE_ELEMENTS_ID_FRAME_MAP = "visible_elements_id_frame_map"
VISIBLE_ELEMENTS_TREE = "visible_elements_tree"
VISIBLE_ELEMENTS_TREE_TRIMMED = "visible_elements_tree_trimmed"
VISIBLE_ELEMENTS_TREE_IN_PROMPT = "visible_elements_tree_in_prompt"
Expand Down
1 change: 1 addition & 0 deletions skyvern/forge/sdk/artifact/storage/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
ArtifactType.LLM_RESPONSE: "json",
ArtifactType.LLM_RESPONSE_PARSED: "json",
ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP: "json",
ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE_TRIMMED: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE_IN_PROMPT: "txt",
Expand Down
115 changes: 76 additions & 39 deletions skyvern/webeye/actions/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from deprecation import deprecated
from playwright.async_api import Locator, Page

from skyvern.constants import REPO_ROOT_DIR
from skyvern.constants import REPO_ROOT_DIR, SKYVERN_ID_ATTR
from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
Expand Down Expand Up @@ -175,7 +175,7 @@ async def handle_click_action(
num_downloaded_files_before=num_downloaded_files_before,
download_dir=download_dir,
)
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
await asyncio.sleep(0.3)
if action.download:
results = await handle_click_to_download_file_action(action, page, scraped_page)
Expand All @@ -185,6 +185,7 @@ async def handle_click_action(
page,
action,
xpath,
frame,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)

Expand All @@ -208,10 +209,12 @@ async def handle_click_to_download_file_action(
page: Page,
scraped_page: ScrapedPage,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)

locator = resolve_locator(page, frame, xpath)

try:
await page.click(
f"xpath={xpath}",
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
modifiers=["Alt"],
)
Expand All @@ -229,8 +232,9 @@ async def handle_input_text_action(
task: Task,
step: Step,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
locator = page.locator(f"xpath={xpath}")
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)

locator = resolve_locator(page, frame, xpath)

current_text = await locator.input_value()
if current_text == action.text:
Expand Down Expand Up @@ -269,20 +273,28 @@ async def handle_upload_file_action(
file_url=action.file_url,
)
return [ActionFailure(ImaginaryFileUrl(action.file_url))]
xpath = await validate_actions_in_dom(action, page, scraped_page)

xpath, frame = await validate_actions_in_dom(action, page, scraped_page)

file_path = await download_file(file_url)
locator = page.locator(f"xpath={xpath}")

locator = resolve_locator(page, frame, xpath)

is_file_input = await is_file_input_element(locator)

if is_file_input:
LOG.info("Taking UploadFileAction. Found file input tag", action=action)
if file_path:
await page.locator(f"xpath={xpath}").set_input_files(
locator = resolve_locator(page, frame, xpath)

await locator.set_input_files(
file_path,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)

# Sleep for 10 seconds after uploading a file to let the page process it
await asyncio.sleep(10)

return [ActionSuccess()]
else:
return [ActionFailure(Exception(f"Failed to download file from {action.file_url}"))]
Expand All @@ -295,6 +307,7 @@ async def handle_upload_file_action(
page,
action,
xpath,
frame,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)

Expand All @@ -307,15 +320,17 @@ async def handle_download_file_action(
task: Task,
step: Step,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
file_name = f"{action.file_name or uuid.uuid4()}"
full_file_path = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}/{file_name}"
try:
# Start waiting for the download
async with page.expect_download() as download_info:
await asyncio.sleep(0.3)
await page.click(
f"xpath={xpath}",

locator = resolve_locator(page, frame, xpath)

await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
modifiers=["Alt"],
)
Expand Down Expand Up @@ -355,9 +370,10 @@ async def handle_select_option_action(
task: Task,
step: Step,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)

locator = resolve_locator(page, frame, xpath)

locator = page.locator(f"xpath={xpath}")
tag_name = await get_tag_name_lowercase(locator)
element_dict = scraped_page.id_to_element_dict[action.element_id]
LOG.info(
Expand Down Expand Up @@ -400,7 +416,7 @@ async def handle_select_option_action(
child_anchor_xpath=child_anchor_xpath,
)
click_action = ClickAction(element_id=action.element_id)
return await chain_click(task, page, click_action, child_anchor_xpath)
return await chain_click(task, page, click_action, child_anchor_xpath, frame)

# handler the select action on <label>
select_element_id = get_select_id_in_label_children(scraped_page, action.element_id)
Expand Down Expand Up @@ -432,7 +448,7 @@ async def handle_select_option_action(
action=action,
)
click_action = ClickAction(element_id=action.element_id)
action_result = await chain_click(task, page, click_action, xpath)
action_result = await chain_click(task, page, click_action, xpath, frame)
return action_result
elif tag_name == "ul" or tag_name == "div" or tag_name == "li":
# if the role is listbox, find the option with the "label" or "value" and click that option element
Expand Down Expand Up @@ -464,7 +480,7 @@ async def handle_select_option_action(
)
# click the option element
click_action = ClickAction(element_id=action.element_id)
return await chain_click(task, page, click_action, xpath)
return await chain_click(task, page, click_action, xpath, frame)
else:
LOG.error(
"SelectOptionAction on a non-listbox element. Cannot handle this action",
Expand All @@ -481,19 +497,17 @@ async def handle_select_option_action(
current_text = await locator.input_value()
if current_text == action.option.label:
return [ActionSuccess()]

try:
# First click by label (if it matches)
await page.click(
f"xpath={xpath}",
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
await page.select_option(
xpath,
await locator.select_option(
label=action.option.label,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
await page.click(
f"xpath={xpath}",
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
return [ActionSuccess()]
Expand Down Expand Up @@ -536,7 +550,7 @@ async def handle_select_option_action(


async def handle_checkbox_action(
self: actions.CheckboxAction,
action: actions.CheckboxAction,
page: Page,
scraped_page: ScrapedPage,
task: Task,
Expand All @@ -549,11 +563,14 @@ async def handle_checkbox_action(
Treating checkbox actions as click actions seem to perform way more reliably
Developers who tried this and failed: 2 (Suchintan and Shu 😂)
"""
xpath = await validate_actions_in_dom(self, page, scraped_page)
if self.is_checked:
await page.check(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)

locator = resolve_locator(page, frame, xpath)

if action.is_checked:
await locator.check(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
else:
await page.uncheck(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
await locator.uncheck(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)

# TODO (suchintan): Why does checking the label work, but not the actual input element?
return [ActionSuccess()]
Expand Down Expand Up @@ -630,9 +647,11 @@ def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
return secret_value if secret_value is not None else parameter


async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> str:
async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> tuple[str, str]:
xpath = scraped_page.id_to_xpath_dict[action.element_id]
locator = page.locator(xpath)
frame = scraped_page.id_to_frame_dict[action.element_id]

locator = resolve_locator(page, frame, xpath)

num_elements = await locator.count()
if num_elements < 1:
Expand All @@ -652,14 +671,15 @@ async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: S
else:
LOG.info("Validated action xpath in DOM", action=action)

return xpath
return xpath, frame


async def chain_click(
task: Task,
page: Page,
action: ClickAction | UploadFileAction,
xpath: str,
frame: str,
timeout: int = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
) -> List[ActionResult]:
# Add a defensive page handler here in case a click action opens a file chooser.
Expand Down Expand Up @@ -689,9 +709,11 @@ async def chain_click(
Clicks on an element identified by the xpath and its parent if failed.
:param xpath: xpath of the element to click
"""
javascript_triggered = await is_javascript_triggered(page, xpath)
javascript_triggered = await is_javascript_triggered(page, frame, xpath)
try:
await page.click(f"xpath={xpath}", timeout=timeout)
locator = resolve_locator(page, frame, xpath)
await locator.click(timeout=timeout)

LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
return [
ActionSuccess(
Expand All @@ -718,10 +740,12 @@ async def chain_click(

parent_xpath = f"{xpath}/.."
try:
parent_javascript_triggered = await is_javascript_triggered(page, parent_xpath)
parent_javascript_triggered = await is_javascript_triggered(page, frame, parent_xpath)
javascript_triggered = javascript_triggered or parent_javascript_triggered
parent_locator = page.locator(xpath).locator("..")

parent_locator = resolve_locator(page, frame, xpath).locator("..")
await parent_locator.click(timeout=timeout)

LOG.info(
"Chain click: successfully clicked parent element",
action=action,
Expand Down Expand Up @@ -806,9 +830,10 @@ def get_checkbox_id_in_label_children(scraped_page: ScrapedPage, element_id: str
return None


async def is_javascript_triggered(page: Page, xpath: str) -> bool:
locator = page.locator(f"xpath={xpath}")
async def is_javascript_triggered(page: Page, frame: str, xpath: str) -> bool:
locator = resolve_locator(page, frame, xpath)
element = locator.first

tag_name = await element.evaluate("e => e.tagName")
if tag_name.lower() == "a":
href = await element.evaluate("e => e.href")
Expand Down Expand Up @@ -928,8 +953,13 @@ async def click_listbox_option(
text = child["text"] if "text" in child else ""
if text and (text == action.option.label or text == action.option.value):
option_xpath = scraped_page.id_to_xpath_dict[child["id"]]
option_frame = scraped_page.id_to_frame_dict[child["id"]]

try:
await page.click(f"xpath={option_xpath}", timeout=1000)
locator = resolve_locator(page, option_frame, option_xpath)

await locator.click(timeout=1000)

return True
except Exception:
LOG.error(
Expand All @@ -941,3 +971,10 @@ async def click_listbox_option(
if "children" in child:
bfs_queue.extend(child["children"])
return False


def resolve_locator(page: Page, frame: str, xpath: str) -> Locator:
    """
    Build a locator for the element identified by the xpath, scoped to its frame.
    :param page: page that the element (or the iframe hosting it) belongs to
    :param frame: "main" when the element is looked up directly on the page;
        otherwise the value of the SKYVERN_ID_ATTR attribute used to select the hosting iframe
    :param xpath: xpath of the element to locate
    :return: locator for the element, created on the page or inside the selected iframe
    """
    xpath_selector = f"xpath={xpath}"

    # "main" is looked up directly on the page; any other frame value selects
    # the iframe carrying that id attribute and the lookup happens inside it.
    search_root = page if frame == "main" else page.frame_locator(f"[{SKYVERN_ID_ATTR}='{frame}']")

    return search_root.locator(xpath_selector)
8 changes: 7 additions & 1 deletion skyvern/webeye/scraper/domUtils.js
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,7 @@ function uniqueId() {
return result;
}

function buildTreeFromBody() {
function buildTreeFromBody(frame = "main") {
var elements = [];
var resultArray = [];

Expand Down Expand Up @@ -679,6 +679,7 @@ function buildTreeFromBody() {

let elementObj = {
id: element_id,
frame: frame,
interactable: interactable,
tagName: elementTagNameLower,
attributes: attrs,
Expand Down Expand Up @@ -760,6 +761,11 @@ function buildTreeFromBody() {
processElement(child, elementObj.id);
});
return elementObj;
} else if (element.tagName.toLowerCase() === "iframe") {
let iframeElementObject = buildElementObject(element, true);

elements.push(iframeElementObject);
resultArray.push(iframeElementObject);
} else {
// For a non-interactable element, if it has direct text, we also tagged
// it with unique_id, but with interatable=false in the element.
Expand Down
Loading
Loading