Skip to content

Commit

Permalink
iframes support (#405)
Browse files Browse the repository at this point in the history
Co-authored-by: Aleksei Zarubin <12220926+alexzarbn@users.noreply.github.com>
  • Loading branch information
LawyZheng and alexzarbn committed Jun 6, 2024
1 parent 064c831 commit 2854193
Show file tree
Hide file tree
Showing 7 changed files with 207 additions and 51 deletions.
5 changes: 5 additions & 0 deletions skyvern/forge/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,11 @@ async def _build_and_record_step_prompt(
artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP,
data=json.dumps(scraped_page.id_to_xpath_dict, indent=2).encode(),
)
await app.ARTIFACT_MANAGER.create_artifact(
step=step,
artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP,
data=json.dumps(scraped_page.id_to_frame_dict, indent=2).encode(),
)
await app.ARTIFACT_MANAGER.create_artifact(
step=step,
artifact_type=ArtifactType.VISIBLE_ELEMENTS_TREE,
Expand Down
1 change: 1 addition & 0 deletions skyvern/forge/sdk/artifact/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class ArtifactType(StrEnum):
LLM_RESPONSE = "llm_response"
LLM_RESPONSE_PARSED = "llm_response_parsed"
VISIBLE_ELEMENTS_ID_XPATH_MAP = "visible_elements_id_xpath_map"
VISIBLE_ELEMENTS_ID_FRAME_MAP = "visible_elements_id_frame_map"
VISIBLE_ELEMENTS_TREE = "visible_elements_tree"
VISIBLE_ELEMENTS_TREE_TRIMMED = "visible_elements_tree_trimmed"
VISIBLE_ELEMENTS_TREE_IN_PROMPT = "visible_elements_tree_in_prompt"
Expand Down
1 change: 1 addition & 0 deletions skyvern/forge/sdk/artifact/storage/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
ArtifactType.LLM_RESPONSE: "json",
ArtifactType.LLM_RESPONSE_PARSED: "json",
ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP: "json",
ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE_TRIMMED: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE_IN_PROMPT: "txt",
Expand Down
140 changes: 99 additions & 41 deletions skyvern/webeye/actions/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@

import structlog
from deprecation import deprecated
from playwright.async_api import Locator, Page
from playwright.async_api import FrameLocator, Locator, Page

from skyvern.constants import REPO_ROOT_DIR
from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound
from skyvern.constants import REPO_ROOT_DIR, SKYVERN_ID_ATTR
from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound, SkyvernException
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.files import (
Expand Down Expand Up @@ -175,16 +175,18 @@ async def handle_click_action(
num_downloaded_files_before=num_downloaded_files_before,
download_dir=download_dir,
)
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
await asyncio.sleep(0.3)
if action.download:
results = await handle_click_to_download_file_action(action, page, scraped_page)
else:
results = await chain_click(
task,
scraped_page,
page,
action,
xpath,
frame,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)

Expand All @@ -208,10 +210,12 @@ async def handle_click_to_download_file_action(
page: Page,
scraped_page: ScrapedPage,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)

locator = resolve_locator(scraped_page, page, frame, xpath)

try:
await page.click(
f"xpath={xpath}",
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
modifiers=["Alt"],
)
Expand All @@ -229,8 +233,9 @@ async def handle_input_text_action(
task: Task,
step: Step,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
locator = page.locator(f"xpath={xpath}")
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)

locator = resolve_locator(scraped_page, page, frame, xpath)

current_text = await locator.input_value()
if current_text == action.text:
Expand Down Expand Up @@ -269,20 +274,28 @@ async def handle_upload_file_action(
file_url=action.file_url,
)
return [ActionFailure(ImaginaryFileUrl(action.file_url))]
xpath = await validate_actions_in_dom(action, page, scraped_page)

xpath, frame = await validate_actions_in_dom(action, page, scraped_page)

file_path = await download_file(file_url)
locator = page.locator(f"xpath={xpath}")

locator = resolve_locator(scraped_page, page, frame, xpath)

is_file_input = await is_file_input_element(locator)

if is_file_input:
LOG.info("Taking UploadFileAction. Found file input tag", action=action)
if file_path:
await page.locator(f"xpath={xpath}").set_input_files(
locator = resolve_locator(scraped_page, page, frame, xpath)

await locator.set_input_files(
file_path,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)

# Sleep for 10 seconds after uploading a file to let the page process it
await asyncio.sleep(10)

return [ActionSuccess()]
else:
return [ActionFailure(Exception(f"Failed to download file from {action.file_url}"))]
Expand All @@ -292,9 +305,11 @@ async def handle_upload_file_action(
action.is_upload_file_tag = False
return await chain_click(
task,
scraped_page,
page,
action,
xpath,
frame,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)

Expand All @@ -307,15 +322,17 @@ async def handle_download_file_action(
task: Task,
step: Step,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
file_name = f"{action.file_name or uuid.uuid4()}"
full_file_path = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}/{file_name}"
try:
# Start waiting for the download
async with page.expect_download() as download_info:
await asyncio.sleep(0.3)
await page.click(
f"xpath={xpath}",

locator = resolve_locator(scraped_page, page, frame, xpath)

await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
modifiers=["Alt"],
)
Expand Down Expand Up @@ -355,9 +372,10 @@ async def handle_select_option_action(
task: Task,
step: Step,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)

locator = resolve_locator(scraped_page, page, frame, xpath)

locator = page.locator(f"xpath={xpath}")
tag_name = await get_tag_name_lowercase(locator)
element_dict = scraped_page.id_to_element_dict[action.element_id]
LOG.info(
Expand Down Expand Up @@ -400,7 +418,7 @@ async def handle_select_option_action(
child_anchor_xpath=child_anchor_xpath,
)
click_action = ClickAction(element_id=action.element_id)
return await chain_click(task, page, click_action, child_anchor_xpath)
return await chain_click(task, scraped_page, page, click_action, child_anchor_xpath, frame)

# handler the select action on <label>
select_element_id = get_select_id_in_label_children(scraped_page, action.element_id)
Expand Down Expand Up @@ -432,7 +450,7 @@ async def handle_select_option_action(
action=action,
)
click_action = ClickAction(element_id=action.element_id)
action_result = await chain_click(task, page, click_action, xpath)
action_result = await chain_click(task, scraped_page, page, click_action, xpath, frame)
return action_result
elif tag_name == "ul" or tag_name == "div" or tag_name == "li":
# if the role is listbox, find the option with the "label" or "value" and click that option element
Expand Down Expand Up @@ -464,7 +482,7 @@ async def handle_select_option_action(
)
# click the option element
click_action = ClickAction(element_id=action.element_id)
return await chain_click(task, page, click_action, xpath)
return await chain_click(task, scraped_page, page, click_action, xpath, frame)
else:
LOG.error(
"SelectOptionAction on a non-listbox element. Cannot handle this action",
Expand All @@ -481,19 +499,17 @@ async def handle_select_option_action(
current_text = await locator.input_value()
if current_text == action.option.label:
return [ActionSuccess()]

try:
# First click by label (if it matches)
await page.click(
f"xpath={xpath}",
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
await page.select_option(
xpath,
await locator.select_option(
label=action.option.label,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
await page.click(
f"xpath={xpath}",
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
return [ActionSuccess()]
Expand Down Expand Up @@ -536,7 +552,7 @@ async def handle_select_option_action(


async def handle_checkbox_action(
self: actions.CheckboxAction,
action: actions.CheckboxAction,
page: Page,
scraped_page: ScrapedPage,
task: Task,
Expand All @@ -549,11 +565,14 @@ async def handle_checkbox_action(
Treating checkbox actions as click actions seem to perform way more reliably
Developers who tried this and failed: 2 (Suchintan and Shu 😂)
"""
xpath = await validate_actions_in_dom(self, page, scraped_page)
if self.is_checked:
await page.check(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)

locator = resolve_locator(scraped_page, page, frame, xpath)

if action.is_checked:
await locator.check(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
else:
await page.uncheck(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
await locator.uncheck(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)

# TODO (suchintan): Why does checking the label work, but not the actual input element?
return [ActionSuccess()]
Expand Down Expand Up @@ -630,9 +649,11 @@ def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
return secret_value if secret_value is not None else parameter


async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> str:
async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> tuple[str, str]:
xpath = scraped_page.id_to_xpath_dict[action.element_id]
locator = page.locator(xpath)
frame = scraped_page.id_to_frame_dict[action.element_id]

locator = resolve_locator(scraped_page, page, frame, xpath)

num_elements = await locator.count()
if num_elements < 1:
Expand All @@ -652,14 +673,16 @@ async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: S
else:
LOG.info("Validated action xpath in DOM", action=action)

return xpath
return xpath, frame


async def chain_click(
task: Task,
scraped_page: ScrapedPage,
page: Page,
action: ClickAction | UploadFileAction,
xpath: str,
frame: str,
timeout: int = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
) -> List[ActionResult]:
# Add a defensive page handler here in case a click action opens a file chooser.
Expand Down Expand Up @@ -689,9 +712,11 @@ async def chain_click(
Clicks on an element identified by the xpath and its parent if failed.
:param xpath: xpath of the element to click
"""
javascript_triggered = await is_javascript_triggered(page, xpath)
javascript_triggered = await is_javascript_triggered(scraped_page, page, frame, xpath)
try:
await page.click(f"xpath={xpath}", timeout=timeout)
locator = resolve_locator(scraped_page, page, frame, xpath)
await locator.click(timeout=timeout)

LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
return [
ActionSuccess(
Expand All @@ -718,10 +743,12 @@ async def chain_click(

parent_xpath = f"{xpath}/.."
try:
parent_javascript_triggered = await is_javascript_triggered(page, parent_xpath)
parent_javascript_triggered = await is_javascript_triggered(scraped_page, page, frame, parent_xpath)
javascript_triggered = javascript_triggered or parent_javascript_triggered
parent_locator = page.locator(xpath).locator("..")

parent_locator = resolve_locator(scraped_page, page, frame, xpath).locator("..")
await parent_locator.click(timeout=timeout)

LOG.info(
"Chain click: successfully clicked parent element",
action=action,
Expand Down Expand Up @@ -806,9 +833,10 @@ def get_checkbox_id_in_label_children(scraped_page: ScrapedPage, element_id: str
return None


async def is_javascript_triggered(page: Page, xpath: str) -> bool:
locator = page.locator(f"xpath={xpath}")
async def is_javascript_triggered(scraped_page: ScrapedPage, page: Page, frame: str, xpath: str) -> bool:
locator = resolve_locator(scraped_page, page, frame, xpath)
element = locator.first

tag_name = await element.evaluate("e => e.tagName")
if tag_name.lower() == "a":
href = await element.evaluate("e => e.href")
Expand Down Expand Up @@ -928,8 +956,13 @@ async def click_listbox_option(
text = child["text"] if "text" in child else ""
if text and (text == action.option.label or text == action.option.value):
option_xpath = scraped_page.id_to_xpath_dict[child["id"]]
option_frame = scraped_page.id_to_frame_dict[child["id"]]

try:
await page.click(f"xpath={option_xpath}", timeout=1000)
locator = resolve_locator(scraped_page, page, option_frame, option_xpath)

await locator.click(timeout=1000)

return True
except Exception:
LOG.error(
Expand All @@ -941,3 +974,28 @@ async def click_listbox_option(
if "children" in child:
bfs_queue.extend(child["children"])
return False


def resolve_locator(scrape_page: ScrapedPage, page: Page, frame: str, xpath: str) -> Locator:
    """Build a Playwright locator for *xpath*, descending into nested iframes.

    Walks the parent-frame chain recorded in the scraped page from *frame* up
    to the main frame, then descends back down through ``frame_locator`` calls
    so the final xpath is evaluated inside the correct (possibly nested)
    iframe.

    :param scrape_page: scraped page holding the element/frame metadata
    :param page: the Playwright page the element lives on
    :param frame: frame id of the target element ("main.frame" for top level)
    :param xpath: xpath of the target element within its frame
    :raises MissingElement: if a frame id has no entry in the element map
    :raises SkyvernException: if a frame element lacks a parent frame record
    """
    # Collect frame ids from the target frame up to (excluding) the main frame.
    frame_chain: list[str] = []
    current = frame
    while current != "main.frame":
        frame_chain.append(current)

        frame_element = scrape_page.id_to_element_dict.get(current)
        if frame_element is None:
            raise MissingElement(element_id=current)

        parent_frame = frame_element.get("frame")
        if not parent_frame:
            raise SkyvernException(f"element without frame: {frame_element}")

        LOG.info(f"{current} is a child frame of {parent_frame}")
        current = parent_frame

    # Descend from the outermost iframe down to the innermost one.
    context: Page | FrameLocator = page
    for child_frame in reversed(frame_chain):
        context = context.frame_locator(f"[{SKYVERN_ID_ATTR}='{child_frame}']")

    return context.locator(f"xpath={xpath}")
Loading

0 comments on commit 2854193

Please sign in to comment.