ServiceNow · gasse · Sep 5, 2024 · Aug 26, 2024 · Aug 27, 2024 · Aug 27, 2024
diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py
@@ -491,7 +491,7 @@ def _get_obs(self):
                     logger.warning(
                         f"An error occured while extracting the dom and axtree. Retrying ({retries_left}/{EXTRACT_OBS_MAX_TRIES} tries left).\n{repr(e)}"
                     )
-                    # post-extract cleanup (aria-roledescription attribute)
+                    # post-extract cleanup (ARIA attributes)
                     _post_extract(self.page)
                     time.sleep(0.5)
                     continue

diff --git a/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js b/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js
@@ -1,26 +1,8 @@
 /**
  * Go through all DOM elements in the frame (including shadowDOMs), give them unique browsergym
- * identifiers (bid), and store custom data in the aria-roledescription attribute.
+ * identifiers (bid), and store custom data in ARIA attributes.
  */
 async ([parent_bid, bid_attr_name]) => {
-
-    // standard html tags
-    // https://www.w3schools.com/tags/
-    const html_tags = new Set([
-        "a", "abbr", "acronym", "address", "applet", "area", "article", "aside", "audio",
-        "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body", "br", "button",
-        "canvas", "caption", "center", "cite", "code", "col", "colgroup", "data", "datalist",
-        "dd", "del", "details", "dfn", "dialog", "dir", "div", "dl", "dt", "em", "embed",
-        "fieldset", "figcaption", "figure", "font", "footer", "form", "frame", "frameset",
-        "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "i",
-        "iframe", "img", "input", "ins", "kbd", "label", "legend", "li", "link", "main",
-        "map", "mark", "menu", "meta", "meter", "nav", "noframes", "noscript", "object",
-        "ol", "optgroup", "option", "output", "p", "param", "picture", "pre", "progress",
-        "q", "rp", "rt", "ruby", "s", "samp", "script", "search", "section", "select",
-        "small", "source", "span", "strike", "strong", "style", "sub", "summary", "sup",
-        "svg", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead",
-        "time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"
-    ]);
     const set_of_marks_tags = new Set([
         "input", "textarea", "select", "button", "a", "iframe", "video", "li", "td", "option"
     ]);
@@ -69,11 +51,6 @@ async ([parent_bid, bid_attr_name]) => {
             );
         }
         i++;
-        // we will mark only standard HTML tags
-        if (!elem.tagName || !html_tags.has(elem.tagName.toLowerCase())) {
-            // Skipping element
-            continue;  // stop and move on to the next element
-        }
         // Processing element
         // register intersection callback on element, and keep track of element for waiting later
         elem.setAttribute('browsergym_visibility_ratio', 0);
@@ -132,15 +109,11 @@ async ([parent_bid, bid_attr_name]) => {
         }
         all_bids.add(elem_global_bid);
 
-        // Hack: store custom data inside the aria-roledescription attribute (will be available in DOM and AXTree)
+        // Hack: store custom data inside ARIA attributes (will be available in DOM and AXTree)
         //  - elem_global_bid: global element identifier (unique over multiple frames)
         // TODO: add more data if needed (x, y coordinates, bounding box, is_visible, is_clickable etc.)
-        let original_content = "";
-        if (elem.hasAttribute("aria-roledescription")) {
-            original_content = elem.getAttribute("aria-roledescription");
-        }
-        let new_content = `${elem_global_bid}_${original_content}`
-        elem.setAttribute("aria-roledescription", new_content);
+        push_bid_to_attribute(elem_global_bid, elem, "aria-roledescription");
+        push_bid_to_attribute(elem_global_bid, elem, "aria-description");  // fallback for generic nodes
 
         // set-of-marks flag (He et al. 2024)
         // https://github.com/MinorJerry/WebVoyager/blob/main/utils.py
@@ -229,6 +202,15 @@ function whoCapturesCenterClick(element){
     }
 }
 
+function push_bid_to_attribute(bid, elem, attr){
+    let original_content = "";
+    if (elem.hasAttribute(attr)) {
+        original_content = elem.getAttribute(attr);
+    }
+    let new_content = `browsergym_id_${bid} ${original_content}`
+    elem.setAttribute(attr, new_content);
+}
+
 function elementFromPoint(x, y) {
     let dom = document;
     let last_elem = null;

diff --git a/browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js b/browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js
@@ -1,6 +1,6 @@
 /**
  * Go through all DOM elements in the frame (including shadowDOMs),
- * and cleanup previously stored data in the aria-roledescription attribute.
+ * and cleanup previously stored data in ARIA attributes.
  */
 () => {
     // get all DOM elements in the current frame (does not include elements in shadowDOMs)
@@ -18,23 +18,23 @@
             );
         }
         i++;
-        // Hack: remove custom data stored inside the aria-roledescription tag
+        // Hack: remove custom data stored in ARIA attributes
         //  - elem_global_id: global browsergym identifier
-        if (elem.hasAttribute("aria-roledescription")) {
-            let content = elem.getAttribute("aria-roledescription");
-            // TODO: handle more data if needed
-            let n_data_items = 1;  // bid
-            let post_data_index = 0;
-            for (let j = 0 ; j < n_data_items ; j++) {
-                post_data_index = content.indexOf("_", post_data_index) + 1;
-            }
-            original_content = content.substring(post_data_index);
-            if (original_content) {
-                elem.setAttribute("aria-roledescription", original_content);
-            }
-            else {
-                elem.removeAttribute("aria-roledescription");
-            }
+        pop_bid_from_attribute(elem, "aria-description");  // fallback for generic nodes
+        pop_bid_from_attribute(elem, "aria-roledescription");
+    }
+}
+
+function pop_bid_from_attribute(elem, attr) {
+    let bid_regex = /^browsergym_id[^\s]*\s/;
+    if (elem.hasAttribute(attr)) {
+        let content = elem.getAttribute(attr);
+        let original_content = content.replace(bid_regex, '');
+        if (original_content) {
+            elem.setAttribute(attr, original_content);
+        }
+        else {
+            elem.removeAttribute(attr);
         }
     }
 }
diff --git a/browsergym/core/src/browsergym/core/observation.py b/browsergym/core/src/browsergym/core/observation.py
@@ -130,24 +130,20 @@ def extract_screenshot(page: playwright.sync_api.Page):
     return img
 
 
-# TODO: handle more data items if needed
+# we could handle more data items here if needed
 __BID_EXPR = r"([a-z0-9]+)"
-__FLOAT_EXPR = r"([+-]?(?:[0-9]*[.])?[0-9]+)"
-__BOOL_EXPR = r"([01])"
-# bid, bbox_left, bbox_top, center_x, center_y, bbox_right, bbox_bottom, is_in_viewport
-__DATA_REGEXP = re.compile(__BID_EXPR + r"_" + r"(.*)")
+__DATA_REGEXP = re.compile(r"^browsergym_id_" + __BID_EXPR + r"\s?" + r"(.*)")
 
 
-def extract_data_items_from_aria(string):
+def extract_data_items_from_aria(string: str, with_warning: bool = True):
     """
-    Utility function to extract temporary data stored in the "aria-roledescription" attribute of a node
+    Utility function to extract temporary data stored in the ARIA attributes of a node
     """
 
     match = __DATA_REGEXP.fullmatch(string)
     if not match:
-        logger.warning(
-            f'Data items could not be extracted from "aria-roledescription" attribute: {string}'
-        )
+        if with_warning:
+            logger.warning(f"Failed to extract BrowserGym data from ARIA string: {repr(string)}")
         return [], string
 
     groups = match.groups()
@@ -171,7 +167,7 @@ def extract_dom_snapshot(
         computed_styles: whitelist of computed styles to return.
         include_dom_rects: whether to include DOM rectangles (offsetRects, clientRects, scrollRects) in the snapshot.
         include_paint_order: whether to include paint orders in the snapshot.
-        temp_data_cleanup: whether to clean up the temporary data stored in the "aria-roledescription" attribute.
+        temp_data_cleanup: whether to clean up the temporary data stored in the ARIA attributes.
 
     Returns:
         A document snapshot, including the full DOM tree of the root node (including iframes,
@@ -191,43 +187,50 @@ def extract_dom_snapshot(
     )
     cdp.detach()
 
-    # if requested, remove temporary data stored in the "aria-roledescription" attribute of each node
+    # if requested, remove temporary data stored in the ARIA attributes of each node
     if temp_data_cleanup:
-        try:
-            target_attr_name_id = dom_snapshot["strings"].index("aria-roledescription")
-        except ValueError:
-            target_attr_name_id = -1
-        # run the cleanup only if the "aria-roledescription" string is present
-        if target_attr_name_id > -1:
-            processed_string_ids = set()
-            for document in dom_snapshot["documents"]:
-                for node_attributes in document["nodes"]["attributes"]:
-                    i = 0
-                    # find the "aria-roledescription" attribute, if any
-                    for i in range(0, len(node_attributes), 2):
-                        attr_name_id = node_attributes[i]
-                        attr_value_id = node_attributes[i + 1]
-                        if attr_name_id == target_attr_name_id:
-                            attr_value = dom_snapshot["strings"][attr_value_id]
-                            # remove any data stored in the "aria-roledescription" attribute
-                            if attr_value_id not in processed_string_ids:
-                                _, new_attr_value = extract_data_items_from_aria(attr_value)
-                                dom_snapshot["strings"][
-                                    attr_value_id
-                                ] = new_attr_value  # update the string in the metadata
-                                processed_string_ids.add(
-                                    attr_value_id
-                                )  # mark string as processed (in case several "aria-roledescription" attributes share the same value string)
-                                attr_value = new_attr_value
-                            # remove "aria-roledescription" attribute (name and value) if empty
-                            if attr_value == "":
-                                del node_attributes[i : i + 2]
-                            # once "aria-roledescription" is found, exit the search
-                            break
+        pop_bids_from_attribute(dom_snapshot, "aria-roledescription")
+        pop_bids_from_attribute(dom_snapshot, "aria-description")
 
     return dom_snapshot
 
 
+def pop_bids_from_attribute(dom_snapshot, attr: str):
+    try:
+        target_attr_name_id = dom_snapshot["strings"].index(attr)
+    except ValueError:
+        target_attr_name_id = -1
+    # run the cleanup only if the target attribute string is present
+    if target_attr_name_id > -1:
+        processed_string_ids = set()
+        for document in dom_snapshot["documents"]:
+            for node_attributes in document["nodes"]["attributes"]:
+                i = 0
+                # find the target attribute, if any
+                for i in range(0, len(node_attributes), 2):
+                    attr_name_id = node_attributes[i]
+                    attr_value_id = node_attributes[i + 1]
+                    if attr_name_id == target_attr_name_id:
+                        attr_value = dom_snapshot["strings"][attr_value_id]
+                        # remove any data stored in the target attribute
+                        if attr_value_id not in processed_string_ids:
+                            _, new_attr_value = extract_data_items_from_aria(
+                                attr_value, with_warning=False
+                            )
+                            dom_snapshot["strings"][
+                                attr_value_id
+                            ] = new_attr_value  # update the string in the metadata
+                            processed_string_ids.add(
+                                attr_value_id
+                            )  # mark string as processed (in case several nodes share the same target attribute string value)
+                            attr_value = new_attr_value
+                        # remove target attribute (name and value) if empty
+                        if attr_value == "":
+                            del node_attributes[i : i + 2]
+                        # once target attribute is found, exit the search
+                        break
+
+
 def extract_dom_extra_properties(dom_snapshot):
     def to_string(idx):
         if idx == -1:
@@ -433,30 +436,34 @@ def extract_all_frame_axtrees(page: playwright.sync_api.Page):
 
     cdp.detach()
 
-    # extract browsergym properties (bids, coordinates, etc.) from the "roledescription" property ("aria-roledescription" attribute)
+    # extract browsergym data from ARIA attributes
     for ax_tree in frame_axtrees.values():
         for node in ax_tree["nodes"]:
-            # look for the "roledescription" property
+            data_items = []
+            # look for data in the node's "roledescription" property
             if "properties" in node:
                 for i, prop in enumerate(node["properties"]):
                     if prop["name"] == "roledescription":
                         data_items, new_value = extract_data_items_from_aria(prop["value"]["value"])
                         prop["value"]["value"] = new_value
-                        # remove the "roledescription" property if empty
+                        # remove the "roledescription" property if empty
                         if new_value == "":
                             del node["properties"][i]
-                        # add all extracted "browsergym" properties to the AXTree
-                        if data_items:
-                            (browsergym_id,) = data_items
-                            node["properties"].append(
-                                {
-                                    "name": "browsergym_id",
-                                    "value": {
-                                        "type": "string",
-                                        "value": browsergym_id,
-                                    },
-                                }
-                            )
+                        break
+            # look for data in the node's "description" (fallback plan)
+            if "description" in node:
+                data_items_bis, new_value = extract_data_items_from_aria(
+                    node["description"]["value"]
+                )
+                node["description"]["value"] = new_value
+                if new_value == "":
+                    del node["description"]
+                if not data_items:
+                    data_items = data_items_bis
+            # add the extracted "browsergym" data to the AXTree
+            if data_items:
+                (browsergym_id,) = data_items
+                node["browsergym_id"] = browsergym_id
     return frame_axtrees