From 3d7a97093850f1a6f6f9ee59a4ad4b6fb82eb011 Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Mon, 26 Aug 2024 16:32:01 -0400 Subject: [PATCH 1/3] more robust axtree remove_redundant_static_text --- browsergym/core/src/browsergym/utils/obs.py | 27 +++++++-------------- tests/core/test_observation.py | 1 - 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/browsergym/core/src/browsergym/utils/obs.py b/browsergym/core/src/browsergym/utils/obs.py index 25a4475a..adc552ea 100644 --- a/browsergym/core/src/browsergym/utils/obs.py +++ b/browsergym/core/src/browsergym/utils/obs.py @@ -181,19 +181,6 @@ def dfs(node_idx: int, parent_node_skipped: bool) -> str: return html -def _remove_redundant_static_text(ax_tree: str) -> str: - """Removes redundant `StaticText` from the accessibility tree""" - new_lines = [] - lines = ax_tree.split("\n") - for line in lines: - if line.strip().startswith("StaticText"): - content = line.split("StaticText")[1].strip().strip("'") - if content in "\n".join(new_lines[-3:]): - continue - new_lines.append(line) - return "\n".join(new_lines) - - def _get_coord_str(coord, decimals): if isinstance(coord, str): coord = list(map(float, ast.literal_eval(coord))) @@ -311,13 +298,14 @@ def flatten_axtree_to_str( for idx, node in enumerate(AX_tree["nodes"]): node_id_to_idx[node["nodeId"]] = idx - def dfs(node_idx: int, depth: int, parent_node_filtered: bool) -> str: + def dfs(node_idx: int, depth: int, parent_node_filtered: bool, parent_node_name: str) -> str: tree_str = "" node = AX_tree["nodes"][node_idx] indent = "\t" * depth skip_node = False filter_node = False node_role = node["role"]["value"] + node_name = "" if node_role in ignored_roles: skip_node = True @@ -359,6 +347,8 @@ def dfs(node_idx: int, depth: int, parent_node_filtered: bool) -> str: if node_role == "StaticText": if parent_node_filtered: skip_node = True + elif remove_redundant_static_text and node_name in parent_node_name: + skip_node = True else: filter_node, extra_attributes_to_print = _process_bid( bid, @@ -407,7 +397,10 @@ def dfs(node_idx: int, depth: int, parent_node_filtered: bool) -> str: # mark this to save some tokens child_depth = depth if skip_node else (depth + 1) child_str = dfs( - node_id_to_idx[child_node_id], child_depth, parent_node_filtered=filter_node + node_id_to_idx[child_node_id], + child_depth, + parent_node_filtered=filter_node, + parent_node_name=node_name, ) if child_str: if tree_str: @@ -416,9 +409,7 @@ def dfs(node_idx: int, depth: int, parent_node_filtered: bool) -> str: return tree_str - tree_str = dfs(0, 0, False) - if remove_redundant_static_text: - tree_str = _remove_redundant_static_text(tree_str) + tree_str = dfs(0, 0, False, "") return tree_str diff --git a/tests/core/test_observation.py b/tests/core/test_observation.py index cfc1f8b8..bf3c8fdd 100644 --- a/tests/core/test_observation.py +++ b/tests/core/test_observation.py @@ -11,7 +11,6 @@ import browsergym.core from browsergym.utils.obs import ( - _remove_redundant_static_text, flatten_axtree_to_str, flatten_dom_to_str, ) From 7358adc748fba5fa7b658d5f43f3e19b01a203d4 Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Tue, 27 Aug 2024 15:21:31 -0400 Subject: [PATCH 2/3] fallback AXTree bids from aria-description --- browsergym/core/src/browsergym/core/env.py | 2 +- .../core/javascript/frame_mark_elements.js | 21 +-- .../core/javascript/frame_unmark_elements.js | 34 ++--- .../core/src/browsergym/core/observation.py | 123 +++++++++--------- browsergym/core/src/browsergym/utils/obs.py | 28 ++-- 5 files changed, 114 insertions(+), 94 deletions(-) diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py index e9c8a517..ffdd4cca 100644 --- a/browsergym/core/src/browsergym/core/env.py +++ b/browsergym/core/src/browsergym/core/env.py @@ -491,7 +491,7 @@ def _get_obs(self): logger.warning( f"An error occured while extracting the dom and axtree. Retrying ({retries_left}/{EXTRACT_OBS_MAX_TRIES} tries left).\n{repr(e)}" ) - # post-extract cleanup (aria-roledescription attribute) + # post-extract cleanup (ARIA attributes) _post_extract(self.page) time.sleep(0.5) continue diff --git a/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js b/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js index edb5645a..96faa027 100644 --- a/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js +++ b/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js @@ -1,6 +1,6 @@ /** * Go through all DOM elements in the frame (including shadowDOMs), give them unique browsergym - * identifiers (bid), and store custom data in the aria-roledescription attribute. + * identifiers (bid), and store custom data in ARIA attributes. */ async ([parent_bid, bid_attr_name]) => { @@ -132,15 +132,11 @@ async ([parent_bid, bid_attr_name]) => { } all_bids.add(elem_global_bid); - // Hack: store custom data inside the aria-roledescription attribute (will be available in DOM and AXTree) + // Hack: store custom data inside ARIA attributes (will be available in DOM and AXTree) // - elem_global_bid: global element identifier (unique over multiple frames) // TODO: add more data if needed (x, y coordinates, bounding box, is_visible, is_clickable etc.) - let original_content = ""; - if (elem.hasAttribute("aria-roledescription")) { - original_content = elem.getAttribute("aria-roledescription"); - } - let new_content = `${elem_global_bid}_${original_content}` - elem.setAttribute("aria-roledescription", new_content); + push_bid_to_attribute(elem_global_bid, elem, "aria-roledescription"); + push_bid_to_attribute(elem_global_bid, elem, "aria-description"); // fallback for generic nodes // set-of-marks flag (He et al. 2024) // https://github.com/MinorJerry/WebVoyager/blob/main/utils.py @@ -229,6 +225,15 @@ function whoCapturesCenterClick(element){ } } +function push_bid_to_attribute(bid, elem, attr){ + let original_content = ""; + if (elem.hasAttribute(attr)) { + original_content = elem.getAttribute(attr); + } + let new_content = `browsergym_id_${bid} ${original_content}` + elem.setAttribute(attr, new_content); +} + function elementFromPoint(x, y) { let dom = document; let last_elem = null; diff --git a/browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js b/browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js index 578a47b9..4ea14d80 100644 --- a/browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js +++ b/browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js @@ -1,6 +1,6 @@ /** * Go through all DOM elements in the frame (including shadowDOMs), - * and cleanup previously stored data in the aria-roledescription attribute. + * and cleanup previously stored data in ARIA attributes. */ () => { // get all DOM elements in the current frame (does not include elements in shadowDOMs) @@ -18,23 +18,23 @@ ); } i++; - // Hack: remove custom data stored inside the aria-roledescription tag + // Hack: remove custom data stored in ARIA attributes // - elem_global_id: global browsergym identifier - if (elem.hasAttribute("aria-roledescription")) { - let content = elem.getAttribute("aria-roledescription"); - // TODO: handle more data if needed - let n_data_items = 1; // bid - let post_data_index = 0; - for (let j = 0 ; j < n_data_items ; j++) { - post_data_index = content.indexOf("_", post_data_index) + 1; - } - original_content = content.substring(post_data_index); - if (original_content) { - elem.setAttribute("aria-roledescription", original_content); - } - else { - elem.removeAttribute("aria-roledescription"); - } + pop_bid_from_attribute(elem, "aria-description"); + pop_bid_from_attribute(elem, "aria-roledescription"); // fallback for generic nodes + } +} + +function pop_bid_from_attribute(elem, attr) { + let bid_regex = /^browsergym_id[^\s]*\s/; + if (elem.hasAttribute(attr)) { + let content = elem.getAttribute(attr); + let original_content = content.replace(bid_regex, ''); + if (original_content) { + elem.setAttribute(attr, original_content); + } + else { + elem.removeAttribute(attr); } } } diff --git a/browsergym/core/src/browsergym/core/observation.py b/browsergym/core/src/browsergym/core/observation.py index efe2a10d..ae7ead0e 100644 --- a/browsergym/core/src/browsergym/core/observation.py +++ b/browsergym/core/src/browsergym/core/observation.py @@ -130,24 +130,20 @@ def extract_screenshot(page: playwright.sync_api.Page): return img -# TODO: handle more data items if needed +# we could handle more data items here if needed __BID_EXPR = r"([a-z0-9]+)" -__FLOAT_EXPR = r"([+-]?(?:[0-9]*[.])?[0-9]+)" -__BOOL_EXPR = r"([01])" -# bid, bbox_left, bbox_top, center_x, center_y, bbox_right, bbox_bottom, is_in_viewport -__DATA_REGEXP = re.compile(__BID_EXPR + r"_" + r"(.*)") +__DATA_REGEXP = re.compile(r"^browsergym_id_" + __BID_EXPR + r"\s?" + r"(.*)") -def extract_data_items_from_aria(string): +def extract_data_items_from_aria(string: str, with_warning: bool = True): """ - Utility function to extract temporary data stored in the "aria-roledescription" attribute of a node + Utility function to extract temporary data stored in the ARIA attributes of a node """ match = __DATA_REGEXP.fullmatch(string) if not match: - logger.warning( - f'Data items could not be extracted from "aria-roledescription" attribute: {string}' - ) + if with_warning: + logger.warning(f"Failed to extract BrowserGym data from ARIA string: {repr(string)}") return [], string groups = match.groups() @@ -171,7 +167,7 @@ def extract_dom_snapshot( computed_styles: whitelist of computed styles to return. include_dom_rects: whether to include DOM rectangles (offsetRects, clientRects, scrollRects) in the snapshot. include_paint_order: whether to include paint orders in the snapshot. - temp_data_cleanup: whether to clean up the temporary data stored in the "aria-roledescription" attribute. + temp_data_cleanup: whether to clean up the temporary data stored in the ARIA attributes. Returns: A document snapshot, including the full DOM tree of the root node (including iframes, @@ -191,43 +187,50 @@ def extract_dom_snapshot( ) cdp.detach() - # if requested, remove temporary data stored in the "aria-roledescription" attribute of each node + # if requested, remove temporary data stored in the ARIA attributes of each node if temp_data_cleanup: - try: - target_attr_name_id = dom_snapshot["strings"].index("aria-roledescription") - except ValueError: - target_attr_name_id = -1 - # run the cleanup only if the "aria-roledescription" string is present - if target_attr_name_id > -1: - processed_string_ids = set() - for document in dom_snapshot["documents"]: - for node_attributes in document["nodes"]["attributes"]: - i = 0 - # find the "aria-roledescription" attribute, if any - for i in range(0, len(node_attributes), 2): - attr_name_id = node_attributes[i] - attr_value_id = node_attributes[i + 1] - if attr_name_id == target_attr_name_id: - attr_value = dom_snapshot["strings"][attr_value_id] - # remove any data stored in the "aria-roledescription" attribute - if attr_value_id not in processed_string_ids: - _, new_attr_value = extract_data_items_from_aria(attr_value) - dom_snapshot["strings"][ - attr_value_id - ] = new_attr_value # update the string in the metadata - processed_string_ids.add( - attr_value_id - ) # mark string as processed (in case several "aria-roledescription" attributes share the same value string) - attr_value = new_attr_value - # remove "aria-roledescription" attribute (name and value) if empty - if attr_value == "": - del node_attributes[i : i + 2] - # once "aria-roledescription" is found, exit the search - break + pop_bids_from_attribute(dom_snapshot, "aria-roledescription") + pop_bids_from_attribute(dom_snapshot, "aria-description") return dom_snapshot +def pop_bids_from_attribute(dom_snapshot, attr: str): + try: + target_attr_name_id = dom_snapshot["strings"].index(attr) + except ValueError: + target_attr_name_id = -1 + # run the cleanup only if the target attribute string is present + if target_attr_name_id > -1: + processed_string_ids = set() + for document in dom_snapshot["documents"]: + for node_attributes in document["nodes"]["attributes"]: + i = 0 + # find the target attribute, if any + for i in range(0, len(node_attributes), 2): + attr_name_id = node_attributes[i] + attr_value_id = node_attributes[i + 1] + if attr_name_id == target_attr_name_id: + attr_value = dom_snapshot["strings"][attr_value_id] + # remove any data stored in the target attribute + if attr_value_id not in processed_string_ids: + _, new_attr_value = extract_data_items_from_aria( + attr_value, with_warning=False + ) + dom_snapshot["strings"][ + attr_value_id + ] = new_attr_value # update the string in the metadata + processed_string_ids.add( + attr_value_id + ) # mark string as processed (in case several nodes share the same target attribute string value) + attr_value = new_attr_value + # remove target attribute (name and value) if empty + if attr_value == "": + del node_attributes[i : i + 2] + # once target attribute is found, exit the search + break + + def extract_dom_extra_properties(dom_snapshot): def to_string(idx): if idx == -1: @@ -433,30 +436,34 @@ def extract_all_frame_axtrees(page: playwright.sync_api.Page): cdp.detach() - # extract browsergym properties (bids, coordinates, etc.) from the "roledescription" property ("aria-roledescription" attribute) + # extract browsergym data from ARIA attributes for ax_tree in frame_axtrees.values(): for node in ax_tree["nodes"]: - # look for the "roledescription" property + data_items = [] + # look for data in the node's "roledescription" property if "properties" in node: for i, prop in enumerate(node["properties"]): if prop["name"] == "roledescription": data_items, new_value = extract_data_items_from_aria(prop["value"]["value"]) prop["value"]["value"] = new_value - # remove the "roledescription" property if empty + # remove the "description" property if empty if new_value == "": del node["properties"][i] - # add all extracted "browsergym" properties to the AXTree - if data_items: - (browsergym_id,) = data_items - node["properties"].append( - { - "name": "browsergym_id", - "value": { - "type": "string", - "value": browsergym_id, - }, - } - ) + break + # look for data in the node's "description" (fallback plan) + if "description" in node: + data_items_bis, new_value = extract_data_items_from_aria( + node["description"]["value"] + ) + node["description"]["value"] = new_value + if new_value == "": + del node["description"] + if not data_items: + data_items = data_items_bis + # add the extracted "browsergym" data to the AXTree + if data_items: + (browsergym_id,) = data_items + node["browsergym_id"] = browsergym_id return frame_axtrees diff --git a/browsergym/core/src/browsergym/utils/obs.py b/browsergym/core/src/browsergym/utils/obs.py index adc552ea..1799cbb6 100644 --- a/browsergym/core/src/browsergym/utils/obs.py +++ b/browsergym/core/src/browsergym/utils/obs.py @@ -239,7 +239,7 @@ def _process_bid( skip_element = True if filter_visible_only: # element without bid have no visibility mark, they could be visible or non-visible - # TODO: we consider them as visible. Is this what we want? Now that duplicate bids are handles, should we mark all non-html elements? + # TODO we consider them as visible. Is this what we want? Now that duplicate bids are handled, should we mark all non-html elements? pass # keep elements without visible property # skip_element = True # filter elements without visible property @@ -283,6 +283,7 @@ def flatten_axtree_to_str( with_center_coords: bool = False, with_bounding_box_coords: bool = False, with_som: bool = False, + skip_generic: bool = True, filter_visible_only: bool = False, filter_with_bid_only: bool = False, filter_som_only: bool = False, @@ -302,8 +303,8 @@ def dfs(node_idx: int, depth: int, parent_node_filtered: bool, parent_node_name: tree_str = "" node = AX_tree["nodes"][node_idx] indent = "\t" * depth - skip_node = False - filter_node = False + skip_node = False # node will not be printed, with no effect on children nodes + filter_node = False # node will not be printed, possibly along with its children nodes node_role = node["role"]["value"] node_name = "" @@ -320,8 +321,11 @@ def dfs(node_idx: int, depth: int, parent_node_filtered: bool, parent_node_name: else: node_value = None + # extract bid + bid = node.get("browsergym_id", None) + + # extract node attributes attributes = [] - bid = None for property in node.get("properties", []): if not "value" in property: continue @@ -331,9 +335,7 @@ def dfs(node_idx: int, depth: int, parent_node_filtered: bool, parent_node_name: prop_name = property["name"] prop_value = property["value"]["value"] - if prop_name == "browsergym_id": - bid = prop_value - elif prop_name in ignored_properties: + if prop_name in ignored_properties: continue elif prop_name in ("required", "focused", "atomic"): if prop_value: @@ -341,7 +343,10 @@ def dfs(node_idx: int, depth: int, parent_node_filtered: bool, parent_node_name: else: attributes.append(f"{prop_name}={repr(prop_value)}") - if node_role == "generic" and not attributes: + if skip_generic and node_role == "generic" and not attributes: + skip_node = True + + if hide_all_children and parent_node_filtered: skip_node = True if node_role == "StaticText": @@ -365,14 +370,17 @@ def dfs(node_idx: int, depth: int, parent_node_filtered: bool, parent_node_name: ) # if either is True, skip the node - skip_node = skip_node or filter_node or (hide_all_children and parent_node_filtered) + skip_node = skip_node or filter_node # insert extra attributes before regular attributes attributes = extra_attributes_to_print + attributes # actually print the node string if not skip_node: - node_str = f"{node_role} {repr(node_name.strip())}" + if node_role == "generic" and not node_name: + node_str = f"{node_role}" + else: + node_str = f"{node_role} {repr(node_name.strip())}" if not ( bid is None From bed5482f1f035b7d230895da7805674aba510c7b Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Tue, 27 Aug 2024 16:33:27 -0400 Subject: [PATCH 3/3] mark all html elements (even non-standard tags) --- .../core/javascript/frame_mark_elements.js | 23 ---- tests/core/data/test_page_2.html | 2 +- tests/core/test_observation.py | 110 +++++++++--------- 3 files changed, 55 insertions(+), 80 deletions(-) diff --git a/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js b/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js index 96faa027..2362c7c9 100644 --- a/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js +++ b/browsergym/core/src/browsergym/core/javascript/frame_mark_elements.js @@ -3,24 +3,6 @@ * identifiers (bid), and store custom data in ARIA attributes. */ async ([parent_bid, bid_attr_name]) => { - - // standard html tags - // https://www.w3schools.com/tags/ - const html_tags = new Set([ - "a", "abbr", "acronym", "address", "applet", "area", "article", "aside", "audio", - "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body", "br", "button", - "canvas", "caption", "center", "cite", "code", "col", "colgroup", "data", "datalist", - "dd", "del", "details", "dfn", "dialog", "dir", "div", "dl", "dt", "em", "embed", - "fieldset", "figcaption", "figure", "font", "footer", "form", "frame", "frameset", - "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "i", - "iframe", "img", "input", "ins", "kbd", "label", "legend", "li", "link", "main", - "map", "mark", "menu", "meta", "meter", "nav", "noframes", "noscript", "object", - "ol", "optgroup", "option", "output", "p", "param", "picture", "pre", "progress", - "q", "rp", "rt", "ruby", "s", "samp", "script", "search", "section", "select", - "small", "source", "span", "strike", "strong", "style", "sub", "summary", "sup", - "svg", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead", - "time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr" - ]); const set_of_marks_tags = new Set([ "input", "textarea", "select", "button", "a", "iframe", "video", "li", "td", "option" ]); @@ -69,11 +51,6 @@ async ([parent_bid, bid_attr_name]) => { ); } i++; - // we will mark only standard HTML tags - if (!elem.tagName || !html_tags.has(elem.tagName.toLowerCase())) { - // Skipping element - continue; // stop and move on to the next element - } // Processing element // register intersection callback on element, and keep track of element for waiting later elem.setAttribute('browsergym_visibility_ratio', 0); diff --git a/tests/core/data/test_page_2.html b/tests/core/data/test_page_2.html index 250a8170..b3b2a5d6 100644 --- a/tests/core/data/test_page_2.html +++ b/tests/core/data/test_page_2.html @@ -29,7 +29,7 @@

Simple Form

- Text within in non-html tag + Text within a non-html tag

diff --git a/tests/core/test_observation.py b/tests/core/test_observation.py index bf3c8fdd..81b265f1 100644 --- a/tests/core/test_observation.py +++ b/tests/core/test_observation.py @@ -185,6 +185,10 @@ def test_simple_shadowdom(): elem_id = elem.get_attribute(BID_ATTR) assert elem_id is not None + # elem should not have an aria-description (it should have been cleaned) + aria_description = elem.get_attribute("aria-description") + assert aria_description is None + # elem should not have an aria-roledescription (it should have been cleaned) aria_roledescription = elem.get_attribute("aria-roledescription") assert aria_roledescription is None @@ -219,6 +223,10 @@ def test_nested_shadowdom(): elem_id = elem.get_attribute(BID_ATTR) assert elem_id is not None + # elem should not have an aria-description (it should have been cleaned) + aria_description = elem.get_attribute("aria-description") + assert aria_description is None + # elem should not have an aria-roledescription (it should have been cleaned) aria_roledescription = elem.get_attribute("aria-roledescription") assert aria_roledescription is None @@ -261,7 +269,7 @@ def test_dom_has_bids_no_aria(url): dom_node_names_without_bid = ["html", "#text", "#document", "#comment"] axtree_roles_without_bid = ["RootWebArea", "none", "generic", "StaticText"] - # 1. test the DOM snapshot for BID_ATTR and "aria-roledescription" + # 1. test the DOM snapshot for BID_ATTR, "aria-description" and "aria-roledescription" # check all HTML elements in the DOM for unique browsergym id dom = obs["dom_object"] @@ -281,41 +289,47 @@ def test_dom_has_bids_no_aria(url): # check that the "aria-roledescription" attribute is absent (this is specific to this test page) assert attr_name != "aria-roledescription" + # check that the "aria-description" attribute is absent (this is specific to this test page) + assert attr_name != "aria-description" + # extract the browsergym id from the BID_ATTR attribute if attr_name == BID_ATTR: bid = attr_value - bids.append(bid) j += 2 - # print(f"{dom['strings'][node_name_id]}: {bid}") - # check that all elements (with exceptions) have a browsergym id if node_name not in dom_node_names_without_bid: assert bid is not None + if bid is not None: + bids.append(bid) + # check that all browsergym ids are unique assert len(bids) == len(set(bids)) - # 2. test the AXTree for "browsergym_id" and "roledescription" properties + # 2. test the AXTree for "browsergym_id" and "description" properties axtree = obs["axtree_object"] bids = [] for node in axtree["nodes"]: - bid = None + bid = node.get("browsergym_id", None) + + # check that the "aria-roledescription" attribute is absent (this is specific to this test page) for property in node.get("properties", []): - # check that the "aria-roledescription" attribute is absent (this is specific to this test page) assert property["name"] != "roledescription" - # extract the browsergym id from the "browsergym_id" property - if property["name"] == "browsergym_id": - bid = property["value"]["value"] - bids.append(bid) - - # print(f"{node['role']['value']}: {bid}") + # check that the "aria-description" attribute is absent (this is specific to this test page) + assert "description" not in node - # check that all elements (with excepttions) have a browsergym id + # check that all elements (with exceptions) have a browsergym id if node["role"]["value"] not in axtree_roles_without_bid: assert bid is not None + if bid is not None: + bids.append(bid) + + # check that all browsergym ids are unique + assert len(bids) == len(set(bids)) + env.close() @@ -365,7 +379,7 @@ def test_dom_to_text(): assert 'clickable="" som="" type="submit" value="Submit" visible=""' in dom assert 'head bid="1">' in dom assert 'clickable="" for="email" visible=""' in dom - assert "Text within in non-html tag" in dom + assert "Text within a non-html tag" in dom assert "Text that should not be visible" in dom dom = flatten_dom_to_str( @@ -373,7 +387,7 @@ def test_dom_to_text(): ) assert 'for="email"' not in dom assert 'type="submit" value="Submit"' in dom - assert "Text within in non-html tag" not in dom + assert "Text within a non-html tag" not in dom assert "Text that should not be visible" not in dom dom = flatten_dom_to_str( @@ -383,7 +397,7 @@ def test_dom_to_text(): ) assert "