Skyvern-AI · LawyZheng · Jun 6, 2024 · May 24, 2024 · May 28, 2024 · May 28, 2024
diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py
@@ -811,6 +811,11 @@ async def _build_and_record_step_prompt(
             artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP,
             data=json.dumps(scraped_page.id_to_xpath_dict, indent=2).encode(),
         )
+        await app.ARTIFACT_MANAGER.create_artifact(
+            step=step,
+            artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP,
+            data=json.dumps(scraped_page.id_to_frame_dict, indent=2).encode(),
+        )
         await app.ARTIFACT_MANAGER.create_artifact(
             step=step,
             artifact_type=ArtifactType.VISIBLE_ELEMENTS_TREE,

diff --git a/skyvern/forge/sdk/artifact/models.py b/skyvern/forge/sdk/artifact/models.py
@@ -22,6 +22,7 @@ class ArtifactType(StrEnum):
     LLM_RESPONSE = "llm_response"
     LLM_RESPONSE_PARSED = "llm_response_parsed"
     VISIBLE_ELEMENTS_ID_XPATH_MAP = "visible_elements_id_xpath_map"
+    VISIBLE_ELEMENTS_ID_FRAME_MAP = "visible_elements_id_frame_map"
     VISIBLE_ELEMENTS_TREE = "visible_elements_tree"
     VISIBLE_ELEMENTS_TREE_TRIMMED = "visible_elements_tree_trimmed"
     VISIBLE_ELEMENTS_TREE_IN_PROMPT = "visible_elements_tree_in_prompt"

diff --git a/skyvern/forge/sdk/artifact/storage/base.py b/skyvern/forge/sdk/artifact/storage/base.py
@@ -14,6 +14,7 @@
     ArtifactType.LLM_RESPONSE: "json",
     ArtifactType.LLM_RESPONSE_PARSED: "json",
     ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP: "json",
+    ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP: "json",
     ArtifactType.VISIBLE_ELEMENTS_TREE: "json",
     ArtifactType.VISIBLE_ELEMENTS_TREE_TRIMMED: "json",
     ArtifactType.VISIBLE_ELEMENTS_TREE_IN_PROMPT: "txt",

diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py
@@ -8,7 +8,7 @@
 from deprecation import deprecated
 from playwright.async_api import Locator, Page
 
-from skyvern.constants import REPO_ROOT_DIR
+from skyvern.constants import REPO_ROOT_DIR, SKYVERN_ID_ATTR
 from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound
 from skyvern.forge import app
 from skyvern.forge.prompts import prompt_engine
@@ -175,7 +175,7 @@ async def handle_click_action(
             num_downloaded_files_before=num_downloaded_files_before,
             download_dir=download_dir,
         )
-    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
     await asyncio.sleep(0.3)
     if action.download:
         results = await handle_click_to_download_file_action(action, page, scraped_page)
@@ -185,6 +185,7 @@ async def handle_click_action(
             page,
             action,
             xpath,
+            frame,
             timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
         )
 
@@ -208,10 +209,12 @@ async def handle_click_to_download_file_action(
     page: Page,
     scraped_page: ScrapedPage,
 ) -> list[ActionResult]:
-    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
+
+    locator = resolve_locator(page, frame, xpath)
+
     try:
-        await page.click(
-            f"xpath={xpath}",
+        await locator.click(
             timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
             modifiers=["Alt"],
         )
@@ -229,8 +232,9 @@ async def handle_input_text_action(
     task: Task,
     step: Step,
 ) -> list[ActionResult]:
-    xpath = await validate_actions_in_dom(action, page, scraped_page)
-    locator = page.locator(f"xpath={xpath}")
+    xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
+
+    locator = resolve_locator(page, frame, xpath)
 
     current_text = await locator.input_value()
     if current_text == action.text:
@@ -269,20 +273,28 @@ async def handle_upload_file_action(
             file_url=action.file_url,
         )
         return [ActionFailure(ImaginaryFileUrl(action.file_url))]
-    xpath = await validate_actions_in_dom(action, page, scraped_page)
+
+    xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
+
     file_path = await download_file(file_url)
-    locator = page.locator(f"xpath={xpath}")
+
+    locator = resolve_locator(page, frame, xpath)
+
     is_file_input = await is_file_input_element(locator)
+
     if is_file_input:
         LOG.info("Taking UploadFileAction. Found file input tag", action=action)
         if file_path:
-            await page.locator(f"xpath={xpath}").set_input_files(
+            locator = resolve_locator(page, frame, xpath)
+
+            await locator.set_input_files(
                 file_path,
                 timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
             )
 
             # Sleep for 10 seconds after uploading a file to let the page process it
             await asyncio.sleep(10)
+
             return [ActionSuccess()]
         else:
             return [ActionFailure(Exception(f"Failed to download file from {action.file_url}"))]
@@ -295,6 +307,7 @@ async def handle_upload_file_action(
             page,
             action,
             xpath,
+            frame,
             timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
         )
 
@@ -307,15 +320,17 @@ async def handle_download_file_action(
     task: Task,
     step: Step,
 ) -> list[ActionResult]:
-    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
     file_name = f"{action.file_name or uuid.uuid4()}"
     full_file_path = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}/{file_name}"
     try:
         # Start waiting for the download
         async with page.expect_download() as download_info:
             await asyncio.sleep(0.3)
-            await page.click(
-                f"xpath={xpath}",
+
+            locator = resolve_locator(page, frame, xpath)
+
+            await locator.click(
                 timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
                 modifiers=["Alt"],
             )
@@ -355,9 +370,10 @@ async def handle_select_option_action(
     task: Task,
     step: Step,
 ) -> list[ActionResult]:
-    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
+
+    locator = resolve_locator(page, frame, xpath)
 
-    locator = page.locator(f"xpath={xpath}")
     tag_name = await get_tag_name_lowercase(locator)
     element_dict = scraped_page.id_to_element_dict[action.element_id]
     LOG.info(
@@ -400,7 +416,7 @@ async def handle_select_option_action(
                 child_anchor_xpath=child_anchor_xpath,
             )
             click_action = ClickAction(element_id=action.element_id)
-            return await chain_click(task, page, click_action, child_anchor_xpath)
+            return await chain_click(task, page, click_action, child_anchor_xpath, frame)
 
         # handler the select action on <label>
         select_element_id = get_select_id_in_label_children(scraped_page, action.element_id)
@@ -432,7 +448,7 @@ async def handle_select_option_action(
             action=action,
         )
         click_action = ClickAction(element_id=action.element_id)
-        action_result = await chain_click(task, page, click_action, xpath)
+        action_result = await chain_click(task, page, click_action, xpath, frame)
         return action_result
     elif tag_name == "ul" or tag_name == "div" or tag_name == "li":
         # if the role is listbox, find the option with the "label" or "value" and click that option element
@@ -464,7 +480,7 @@ async def handle_select_option_action(
             )
             # click the option element
             click_action = ClickAction(element_id=action.element_id)
-            return await chain_click(task, page, click_action, xpath)
+            return await chain_click(task, page, click_action, xpath, frame)
         else:
             LOG.error(
                 "SelectOptionAction on a non-listbox element. Cannot handle this action",
@@ -481,19 +497,17 @@ async def handle_select_option_action(
     current_text = await locator.input_value()
     if current_text == action.option.label:
         return [ActionSuccess()]
+
     try:
         # First click by label (if it matches)
-        await page.click(
-            f"xpath={xpath}",
+        await locator.click(
             timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
         )
-        await page.select_option(
-            xpath,
+        await locator.select_option(
             label=action.option.label,
             timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
         )
-        await page.click(
-            f"xpath={xpath}",
+        await locator.click(
             timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
         )
         return [ActionSuccess()]
@@ -536,7 +550,7 @@ async def handle_select_option_action(
 
 
 async def handle_checkbox_action(
-    self: actions.CheckboxAction,
+    action: actions.CheckboxAction,
     page: Page,
     scraped_page: ScrapedPage,
     task: Task,
@@ -549,11 +563,14 @@ async def handle_checkbox_action(
     Treating checkbox actions as click actions seem to perform way more reliably
     Developers who tried this and failed: 2 (Suchintan and Shu 😂)
     """
-    xpath = await validate_actions_in_dom(self, page, scraped_page)
-    if self.is_checked:
-        await page.check(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+    xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
+
+    locator = resolve_locator(page, frame, xpath)
+
+    if action.is_checked:
+        await locator.check(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
     else:
-        await page.uncheck(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+        await locator.uncheck(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
 
     # TODO (suchintan): Why does checking the label work, but not the actual input element?
     return [ActionSuccess()]
@@ -630,9 +647,11 @@ def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
     return secret_value if secret_value is not None else parameter
 
 
-async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> str:
+async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> tuple[str, str]:
     xpath = scraped_page.id_to_xpath_dict[action.element_id]
-    locator = page.locator(xpath)
+    frame = scraped_page.id_to_frame_dict[action.element_id]
+
+    locator = resolve_locator(page, frame, xpath)
 
     num_elements = await locator.count()
     if num_elements < 1:
@@ -652,14 +671,15 @@ async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: S
     else:
         LOG.info("Validated action xpath in DOM", action=action)
 
-    return xpath
+    return xpath, frame
 
 
 async def chain_click(
     task: Task,
     page: Page,
     action: ClickAction | UploadFileAction,
     xpath: str,
+    frame: str,
     timeout: int = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
 ) -> List[ActionResult]:
     # Add a defensive page handler here in case a click action opens a file chooser.
@@ -689,9 +709,11 @@ async def chain_click(
     Clicks on an element identified by the xpath and its parent if failed.
     :param xpath: xpath of the element to click
     """
-    javascript_triggered = await is_javascript_triggered(page, xpath)
+    javascript_triggered = await is_javascript_triggered(page, frame, xpath)
     try:
-        await page.click(f"xpath={xpath}", timeout=timeout)
+        locator = resolve_locator(page, frame, xpath)
+        await locator.click(timeout=timeout)
+
         LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
         return [
             ActionSuccess(
@@ -718,10 +740,12 @@ async def chain_click(
 
         parent_xpath = f"{xpath}/.."
         try:
-            parent_javascript_triggered = await is_javascript_triggered(page, parent_xpath)
+            parent_javascript_triggered = await is_javascript_triggered(page, frame, parent_xpath)
             javascript_triggered = javascript_triggered or parent_javascript_triggered
-            parent_locator = page.locator(xpath).locator("..")
+
+            parent_locator = resolve_locator(page, frame, xpath).locator("..")
             await parent_locator.click(timeout=timeout)
+
             LOG.info(
                 "Chain click: successfully clicked parent element",
                 action=action,
@@ -806,9 +830,10 @@ def get_checkbox_id_in_label_children(scraped_page: ScrapedPage, element_id: str
     return None
 
 
-async def is_javascript_triggered(page: Page, xpath: str) -> bool:
-    locator = page.locator(f"xpath={xpath}")
+async def is_javascript_triggered(page: Page, frame: str, xpath: str) -> bool:
+    locator = resolve_locator(page, frame, xpath)
     element = locator.first
+
     tag_name = await element.evaluate("e => e.tagName")
     if tag_name.lower() == "a":
         href = await element.evaluate("e => e.href")
@@ -928,8 +953,13 @@ async def click_listbox_option(
             text = child["text"] if "text" in child else ""
             if text and (text == action.option.label or text == action.option.value):
                 option_xpath = scraped_page.id_to_xpath_dict[child["id"]]
+                option_frame = scraped_page.id_to_frame_dict[child["id"]]
+
                 try:
-                    await page.click(f"xpath={option_xpath}", timeout=1000)
+                    locator = resolve_locator(page, option_frame, option_xpath)
+
+                    await locator.click(timeout=1000)
+
                     return True
                 except Exception:
                     LOG.error(
@@ -941,3 +971,10 @@ async def click_listbox_option(
         if "children" in child:
             bfs_queue.extend(child["children"])
     return False
+
+
+def resolve_locator(page: Page, frame: str, xpath: str) -> Locator:
+    if frame == "main":
+        return page.locator(f"xpath={xpath}")
+
+    return page.frame_locator(f"[{SKYVERN_ID_ATTR}='{frame}']").locator(f"xpath={xpath}")
diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js
@@ -576,7 +576,7 @@ function uniqueId() {
   return result;
 }
 
-function buildTreeFromBody() {
+function buildTreeFromBody(frame = "main") {
   var elements = [];
   var resultArray = [];
 
@@ -679,6 +679,7 @@ function buildTreeFromBody() {
 
     let elementObj = {
       id: element_id,
+      frame: frame,
       interactable: interactable,
       tagName: elementTagNameLower,
       attributes: attrs,
@@ -760,6 +761,11 @@ function buildTreeFromBody() {
         processElement(child, elementObj.id);
       });
       return elementObj;
+    } else if (element.tagName.toLowerCase() === "iframe") {
+      let iframeElementObject = buildElementObject(element, true);
+
+      elements.push(iframeElementObject);
+      resultArray.push(iframeElementObject);
     } else {
       // For a non-interactable element, if it has direct text, we also tagged
       // it with unique_id, but with interatable=false in the element.