In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from selenium import webdriver

In [3]:
from src.driver.service import DriverService

In [4]:
driver = DriverService().get_driver()

In [5]:
url = "https://www.kayak.com/"
driver.get(url)

In [6]:
driver.execute_script("return document.readyState")

'complete'

In [7]:
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC

# WebDriverWait(driver, 20).until(
#     lambda d: d.execute_script("return document.readyState") == "complete"
# )

In [8]:
html_content = driver.page_source

In [9]:
# Save a pretty-printed snapshot of the page for offline inspection.
# The encoding is pinned to UTF-8 because the platform default codec
# (e.g. cp1252 on Windows) cannot represent every character a web page may contain.
with open("kayak.html", "w", encoding="utf-8") as f:
    f.write(BeautifulSoup(html_content, "lxml").prettify())

In [10]:
soup = BeautifulSoup(html_content, "lxml")

In [11]:
# driver.quit()

# Functions

In [12]:
def _quick_element_filter(element: Tag|NavigableString) -> bool:
    """
    Cheap static pre-filter applied before the expensive in-browser checks.

    Args:
        element: A parsed tag or text node.

    Returns:
        For text nodes, True when the text is not whitespace-only.
        For tags, False when an attribute marks the element as hidden or
        disabled (``aria-hidden="true"``, ``hidden``, ``disabled``,
        ``type="hidden"``, an inline ``display:none`` / ``visibility:hidden``
        style, or a ``hidden`` / ``invisible`` class); True otherwise.
    """
    if isinstance(element, NavigableString):
        return bool(element.strip())

    # Inline styles are written with arbitrary spacing and casing
    # ("display:none", "DISPLAY : none"), so strip all whitespace and
    # lower-case before matching.
    compact_style = ''.join((element.get('style') or '').split()).lower()

    # `class` is normally a list of names, but is a plain string when the
    # parser is configured without multi-valued attributes; split it so the
    # membership test below never degrades into a substring match.
    classes = element.get('class') or []
    if isinstance(classes, str):
        classes = classes.split()

    is_hidden = (
        element.get('aria-hidden') == 'true'
        or element.get('hidden') is not None
        or element.get('disabled') is not None
        or 'display:none' in compact_style
        or 'visibility:hidden' in compact_style
        or any(cls in ('hidden', 'invisible') for cls in classes)
        or element.get('type') == 'hidden'
    )
    return not is_hidden


def _is_element_accepted(element: Tag) -> bool:
    """Return False for tags on the deny list (svg, iframe, script, style, link, meta)."""
    rejected_tags = ('svg', 'iframe', 'script', 'style', 'link', 'meta')
    if element.name in rejected_tags:
        return False
    return True


def _is_interactive_element(element: Tag) -> bool:
    """
    Decide whether a tag counts as interactive.

    A tag qualifies when its name is an interactive HTML element, when its
    ``role`` attribute is one of the listed ARIA roles, or when it carries
    ``tabindex="0"``.
    """
    interactive_tags = {
        'a', 'button', 'details', 'embed', 'input', 'label', 'menu', 'menuitem',
        'object', 'select', 'textarea', 'summary', 'dialog',
    }
    if element.name in interactive_tags:
        return True

    interactive_roles = {
        'button', 'menu', 'menuitem', 'link', 'checkbox', 'radio', 'slider', 'tab',
        'tabpanel', 'textbox', 'combobox', 'grid', 'listbox', 'option', 'progressbar',
        'scrollbar', 'searchbox', 'switch', 'tree', 'treeitem', 'spinbutton', 'tooltip',
        'dialog', 'alertdialog', 'menuitemcheckbox', 'menuitemradio', 'list', 'listitem',
    }
    if element.get('role') in interactive_roles:
        return True

    return element.get('tabindex') == '0'


def _is_leaf_element(element: Tag) -> bool:
    """Return True when the tag has non-blank text and none of its direct children is a tag."""
    has_text = bool(element.get_text(strip=True))
    if not has_text:
        return False

    # Text nodes among the children are fine; any nested tag disqualifies.
    return not any(isinstance(node, Tag) for node in element.children)


def _generate_xpath(element: Tag | NavigableString) -> str:
    """
    Build a positional XPath (``//html[1]/body[1]/...``) for a parsed node.

    The path is purely positional rather than id-based because some pages
    (e.g. wiki pages) attach the same id to several elements.

    Args:
        element: A tag, or a text node (which resolves to its parent's path).

    Returns:
        The XPath string, or ``''`` when no path can be built: a text node
        without a parent, or a node that is not attached to a document.
    """
    if isinstance(element, NavigableString):
        return _generate_xpath(element.parent) if element.parent else ''

    parts = []
    current = element

    # Climb until the BeautifulSoup root ('[document]'). A detached subtree
    # ends at parent None instead, which previously raised AttributeError.
    while current is not None and current.name != '[document]':
        # XPath positions are 1-based and count only same-named siblings.
        position = len(current.find_previous_siblings(current.name)) + 1
        parts.append(f'{current.name}[{position}]')
        current = current.parent

    if current is None:
        # A path rooted in a detached subtree could match an unrelated
        # element once prefixed with '//', so report "no path" instead.
        return ''

    return '//' + '/'.join(reversed(parts)) if parts else ''

# Extract Content

In [13]:
# Only interactive elements should be given an index.
# Iterating over soup.body.contents directly causes problems; work on a copy instead.

In [14]:
# Nodes that survive the static filters, collected in document order.
candidate_elements: list[Tag | NavigableString] = []
# Work list consumed with pop(); it holds a reversed copy of <body>'s
# children so that the first child is popped first.
dom_queue = list(reversed(list(soup.body.children))) if soup.body else []

In [15]:
# Depth-first walk over <body>. Rejected nodes are skipped, never decomposed:
# removing them would shift the sibling positions that `_generate_xpath` and
# the JS `checkTextFront` lookup rely on.
while dom_queue:
    element = dom_queue.pop()

    if not _quick_element_filter(element):
        continue

    if not isinstance(element, Tag):
        # Text node: keep it when it carries non-blank text.
        if isinstance(element, NavigableString) and element.strip():
            candidate_elements.append(element)
        continue

    if not _is_element_accepted(element):
        continue

    # Push children in reverse so they are popped in document order.
    dom_queue.extend(reversed(list(element.children)))

    if _is_interactive_element(element):
        candidate_elements.append(element)

In [16]:
len(candidate_elements)

750

## JS visibility and top element checks

In [17]:
# Serializable description of every candidate, passed to the in-browser checks.
# `node_index` is the position of the node among its parent's children. It is
# located by identity (`is`), not equality: text nodes compare as plain strings
# and tags compare by content, so `list.index` would return the first *equal*
# sibling and point repeated texts/tags at the wrong child.
candidates = [
    {
        "xpath": _generate_xpath(c),
        "is_text": not isinstance(c, Tag),
        "node_index": next(
            position
            for position, sibling in enumerate(c.parent.children)
            if sibling is c
        ),
    }
    for c in candidate_elements
]

In [18]:
len(candidates)

750

In [19]:
candidates[0]

{'xpath': '//html[1]/body[1]/div[2]/nav[1]/a[1]',
 'is_text': False,
 'node_index': 0}

---

In [27]:
# Exploratory iteration 1: resolve each candidate's XPath in the browser and
# report True for every candidate (no filtering yet). The script receives the
# candidate list as arguments[0] and returns one boolean per candidate.
js_code = """
function checkVisibility(candidates) {
    const results = [];

    for (const candidate of candidates) {

        const elem = document.evaluate(
            candidate.xpath,
            document,
            null,
            XPathResult.FIRST_ORDERED_NODE_TYPE,
            null
        ).singleNodeValue;

        
        results.push(true)
    }
    return results;
}

const candidates = arguments[0];
return checkVisibility(candidates);
"""
mask = driver.execute_script(js_code, candidates)

In [28]:
len(mask), sum(mask)

(513, 513)

In [29]:
# Exploratory iteration 2: same as before, but resolved nodes are cached in a
# Map keyed by XPath so a repeated XPath is evaluated only once.
# Still reports True for every candidate.
js_code = """
function checkVisibility(candidates) {
    const results = [];
    const elementCache = new Map();

    for (const candidate of candidates) {

        const xpath = candidate.xpath
        let elem;

        if (elementCache.has(xpath)) {
            elem = elementCache.get(xpath);
            
        } else {
            elem = document.evaluate(
                xpath,
                document,
                null,
                XPathResult.FIRST_ORDERED_NODE_TYPE,
                null
            ).singleNodeValue;

            elementCache.set(xpath, elem);
        }

        results.push(true)

    }
    return results;
}

const candidates = arguments[0]
return checkVisibility(candidates)
"""
mask = driver.execute_script(js_code, candidates)

In [30]:
len(mask), sum(mask)

(513, 513)

In [31]:
# Exploratory iteration 3: adds real filtering. A candidate is reported False
# when its XPath resolves to no node, or when Element.checkVisibility()
# (with the opacity and visibility-CSS checks enabled) returns false.
js_code = """
function checkVisibility(candidates) {
    const results = [];
    const elementCache = new Map();

    for (const candidate of candidates) {

        const xpath = candidate.xpath
        let elem;

        if (elementCache.has(xpath)) {
            elem = elementCache.get(xpath);
            
        } else {
            elem = document.evaluate(
                xpath,
                document,
                null,
                XPathResult.FIRST_ORDERED_NODE_TYPE,
                null
            ).singleNodeValue;

            elementCache.set(xpath, elem);
        }

        if (!elem) {
            results.push(false);
            continue;
        }

        const isVisible = elem.checkVisibility({
            checkOpacity: true,
            checkVisibilityCSS: true
        })

        if (!isVisible) {
            results.push(false);
            continue;
        }

        results.push(true)
    }
    return results;
}

const candidates = arguments[0]
return checkVisibility(candidates)
"""
mask = driver.execute_script(js_code, candidates)

In [32]:
len(mask), sum(mask)

(513, 369)

In [None]:
# Exploratory iteration 4: on top of the visibility check, elements must be the
# top-most element at one of five sample points (checkTop) and text nodes must
# have a non-empty rect fully inside the viewport (checkTextFront).
# A missing child node is now reported as False: the live DOM can differ from
# the parsed snapshot, and Range.selectNodeContents(undefined) would throw and
# abort the whole script.
js_code = """
function checkTop(elem) {
    const rect = elem.getBoundingClientRect();

    const points = [
        {x: rect.left + rect.width * 0.2, y: rect.top + rect.height * 0.2},
        {x: rect.left + rect.width * 0.8, y: rect.top + rect.height * 0.2},
        {x: rect.left + rect.width * 0.2, y: rect.top + rect.height * 0.8},
        {x: rect.left + rect.width * 0.8, y: rect.top + rect.height * 0.8},
        {x: rect.left + rect.width / 2, y: rect.top + rect.height / 2}
    ];

    return points.some(point => {
        const topEl = document.elementFromPoint(point.x, point.y);
        return elem.contains(topEl);
    });
}

function checkTextFront(textNode) {
    const range = document.createRange();
    range.selectNodeContents(textNode);
    const rect = range.getBoundingClientRect();

    return (
        rect.width !== 0 &&
        rect.height !== 0 &&
        rect.top > 0 &&
        rect.bottom < window.innerHeight
    );
}

function checkVisibility(candidates) {
    const results = [];
    const elementCache = new Map();

    for (const candidate of candidates) {
        const xpath = candidate.xpath;
        let elem;

        if (elementCache.has(xpath)) {
            elem = elementCache.get(xpath);
        } else {
            elem = document.evaluate(
                xpath,
                document,
                null,
                XPathResult.FIRST_ORDERED_NODE_TYPE,
                null
            ).singleNodeValue;

            elementCache.set(xpath, elem);
        }

        if (!elem) {
            results.push(false);
            continue;
        }

        const isVisible = elem.checkVisibility({
            checkOpacity: true,
            checkVisibilityCSS: true
        })

        if (!isVisible) {
            results.push(false);
            continue;
        }

        if (candidate.is_text) {
            const textNode = elem.childNodes[candidate.node_index];

            // No child at that index in the live DOM: treat as not visible.
            if (!textNode) {
                results.push(false);
                continue;
            }

            const isFrontText = checkTextFront(textNode);

            if (!isFrontText) {
                results.push(false);
                continue;
            }
        } else {
            const isTop = checkTop(elem);

            if (!isTop) {
                results.push(false);
                continue;
            }
        }

        results.push(true)
    }
    return results;
}

const candidates = arguments[0]
return checkVisibility(candidates)
"""
mask = driver.execute_script(js_code, candidates)

In [1926]:
len(mask), sum(mask)

(17492, 345)

---

In [33]:
candidate_elements[0]

<a class="screen-reader-only screen-reader-only-focusable skip-to-content" data-hook="skip-to-content" href="#site-content" tabindex="0">Skip to content</a>

In [2322]:
# Exploratory iteration 5: additionally remembers the XPath of every accepted
# element and reports False for any later candidate with exactly the same XPath
# (which includes text nodes directly inside an accepted element).
# A missing child node is now reported as False instead of letting
# Range.selectNodeContents(undefined) throw and abort the whole script; the
# per-candidate console.log debugging call is commented out.
js_code = """
function checkTop(elem) {
    const rect = elem.getBoundingClientRect();

    const points = [
        {x: rect.left + rect.width * 0.2, y: rect.top + rect.height * 0.2},
        {x: rect.left + rect.width * 0.8, y: rect.top + rect.height * 0.2},
        {x: rect.left + rect.width * 0.2, y: rect.top + rect.height * 0.8},
        {x: rect.left + rect.width * 0.8, y: rect.top + rect.height * 0.8},
        {x: rect.left + rect.width / 2, y: rect.top + rect.height / 2}
    ];

    return points.some(point => {
        const topEl = document.elementFromPoint(point.x, point.y);
        return elem.contains(topEl);
    });
}

function checkTextFront(textNode) {
    const range = document.createRange();
    range.selectNodeContents(textNode);
    const rect = range.getBoundingClientRect();

    return (
        rect.width !== 0 &&
        rect.height !== 0 &&
        rect.top > 0 &&
        rect.bottom < window.innerHeight
    );
}

function checkVisibility(candidates) {
    const results = [];
    const elementCache = new Map();
    const accepted_elem_xpaths = new Set();

    for (const candidate of candidates) {
        const xpath = candidate.xpath;
        let elem;

        if (accepted_elem_xpaths.has(xpath)) {
            results.push(false);
            continue;
        }

        if (elementCache.has(xpath)) {
            elem = elementCache.get(xpath);
        } else {
            elem = document.evaluate(
                xpath,
                document,
                null,
                XPathResult.FIRST_ORDERED_NODE_TYPE,
                null
            ).singleNodeValue;

            elementCache.set(xpath, elem);
        }

        if (!elem) {
            results.push(false);
            continue;
        }

        const isVisible = elem.checkVisibility({
            checkOpacity: true,
            checkVisibilityCSS: true
        })

        if (!isVisible) {
            results.push(false);
            continue;
        }

        if (candidate.is_text) {
            const textNode = elem.childNodes[candidate.node_index];

            // No child at that index in the live DOM: treat as not visible.
            if (!textNode) {
                results.push(false);
                continue;
            }

            const isFrontText = checkTextFront(textNode);

            if (!isFrontText) {
                results.push(false);
                continue;
            }
        } else {
            const isTop = checkTop(elem);

            if (!isTop) {
                results.push(false);
                continue;
            }
        }

        results.push(true)

        if (!candidate.is_text) {
            accepted_elem_xpaths.add(xpath)
        }

        // console.log(elem)
    }
    return results;
}

const candidates = arguments[0]
return checkVisibility(candidates)
"""
mask = driver.execute_script(js_code, candidates)

In [2323]:
len(mask), sum(mask)

(352, 93)

In [31]:
# Exploratory iteration 6: text candidates are additionally reported False when
# their XPath starts with the XPath of an already accepted element, i.e. when
# the text lives anywhere inside an accepted element.
# A missing child node is now reported as False instead of letting
# Range.selectNodeContents(undefined) throw and abort the whole script; the
# per-candidate console.log debugging call is commented out.
js_code = """
function checkTop(elem) {
    const rect = elem.getBoundingClientRect();

    const points = [
        {x: rect.left + rect.width * 0.2, y: rect.top + rect.height * 0.2},
        {x: rect.left + rect.width * 0.8, y: rect.top + rect.height * 0.2},
        {x: rect.left + rect.width * 0.2, y: rect.top + rect.height * 0.8},
        {x: rect.left + rect.width * 0.8, y: rect.top + rect.height * 0.8},
        {x: rect.left + rect.width / 2, y: rect.top + rect.height / 2}
    ];

    return points.some(point => {
        const topEl = document.elementFromPoint(point.x, point.y);
        return elem.contains(topEl);
    });
}

function checkTextFront(textNode) {
    const range = document.createRange();
    range.selectNodeContents(textNode);
    const rect = range.getBoundingClientRect();

    return (
        rect.width !== 0 &&
        rect.height !== 0 &&
        rect.top > 0 &&
        rect.bottom < window.innerHeight
    );
}

function checkVisibility(candidates) {
    const results = [];
    const elementCache = new Map();
    const accepted_elem_xpaths = new Set();

    for (const candidate of candidates) {
        const xpath = candidate.xpath;
        let elem;

        if (candidate.is_text) {
            let skip = false;
            for (const accepted_xpath of accepted_elem_xpaths) {
                if (xpath.startsWith(accepted_xpath)) {
                    skip = true;
                    break;
                }
            }
            if (skip) {
                results.push(false);
                continue;
            }
        }

        if (accepted_elem_xpaths.has(xpath)) {
            results.push(false);
            continue;
        }

        if (elementCache.has(xpath)) {
            elem = elementCache.get(xpath);
        } else {
            elem = document.evaluate(
                xpath,
                document,
                null,
                XPathResult.FIRST_ORDERED_NODE_TYPE,
                null
            ).singleNodeValue;

            elementCache.set(xpath, elem);
        }

        if (!elem) {
            results.push(false);
            continue;
        }

        const isVisible = elem.checkVisibility({
            checkOpacity: true,
            checkVisibilityCSS: true
        })

        if (!isVisible) {
            results.push(false);
            continue;
        }

        if (candidate.is_text) {
            const textNode = elem.childNodes[candidate.node_index];

            // No child at that index in the live DOM: treat as not visible.
            if (!textNode) {
                results.push(false);
                continue;
            }

            const isFrontText = checkTextFront(textNode);

            if (!isFrontText) {
                results.push(false);
                continue;
            }
        } else {
            const isTop = checkTop(elem);

            if (!isTop) {
                results.push(false);
                continue;
            }
        }

        results.push(true)

        if (!candidate.is_text) {
            accepted_elem_xpaths.add(xpath)
        }

        // console.log(elem)
    }
    return results;
}

const candidates = arguments[0]
return checkVisibility(candidates)
"""
mask = driver.execute_script(js_code, candidates)

In [20]:
from typing import Any


def check_visibility_js(candidates: list[dict[str, Any]]) -> list[bool]:
    """
    Run the in-browser visibility checks and return one flag per candidate.

    Uses the notebook-level ``driver`` to execute the script in the currently
    loaded page.

    Args:
        candidates: Dicts with the keys ``xpath`` (XPath of the element, or of
            the parent element for a text node), ``is_text`` and ``node_index``
            (position of a text node among its parent's child nodes).

    Returns:
        A list aligned with ``candidates``. An entry is True only when the
        candidate passes every check below:

        * its XPath is non-empty and resolves to a node;
        * ``Element.checkVisibility()`` (opacity and visibility CSS included)
          is true;
        * elements: the element, or one of its descendants, is the top-most
          element at one of five sample points inside its bounding box;
        * text nodes: the text has a non-empty rect lying fully inside the
          viewport, and no already accepted element's XPath is a prefix of
          the text's XPath;
        * an element XPath that was already accepted is not accepted twice.
    """
    js_code = """
    function checkTop(elem) {
        const rect = elem.getBoundingClientRect();

        const points = [
            {x: rect.left + rect.width * 0.2, y: rect.top + rect.height * 0.2},
            {x: rect.left + rect.width * 0.8, y: rect.top + rect.height * 0.2},
            {x: rect.left + rect.width * 0.2, y: rect.top + rect.height * 0.8},
            {x: rect.left + rect.width * 0.8, y: rect.top + rect.height * 0.8},
            {x: rect.left + rect.width / 2, y: rect.top + rect.height / 2}
        ];

        return points.some(point => {
            const topEl = document.elementFromPoint(point.x, point.y);
            return elem.contains(topEl);
        });
    }

    function checkTextFront(textNode) {
        const range = document.createRange();
        range.selectNodeContents(textNode);
        const rect = range.getBoundingClientRect();

        return (
            rect.width !== 0 &&
            rect.height !== 0 &&
            rect.top > 0 &&
            rect.bottom < window.innerHeight
        );
    }

    function checkVisibility(candidates) {
        const results = [];
        const elementCache = new Map();
        const accepted_elem_xpaths = new Set();

        for (const candidate of candidates) {
            const xpath = candidate.xpath;
            let elem;

            // An empty XPath is a syntax error for document.evaluate and
            // would abort the whole script.
            if (!xpath) {
                results.push(false);
                continue;
            }

            if (candidate.is_text) {
                let skip = false;
                for (const accepted_xpath of accepted_elem_xpaths) {
                    if (xpath.startsWith(accepted_xpath)) {
                        skip = true;
                        break;
                    }
                }
                if (skip) {
                    results.push(false);
                    continue;
                }
            }

            if (accepted_elem_xpaths.has(xpath)) {
                results.push(false);
                continue;
            }

            if (elementCache.has(xpath)) {
                elem = elementCache.get(xpath);
            } else {
                elem = document.evaluate(
                    xpath,
                    document,
                    null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE,
                    null
                ).singleNodeValue;

                elementCache.set(xpath, elem);
            }

            if (!elem) {
                results.push(false);
                continue;
            }

            const isVisible = elem.checkVisibility({
                checkOpacity: true,
                checkVisibilityCSS: true
            })

            if (!isVisible) {
                results.push(false);
                continue;
            }

            if (candidate.is_text) {
                const textNode = elem.childNodes[candidate.node_index];

                // The live DOM can differ from the parsed snapshot. Without
                // this guard Range.selectNodeContents(undefined) throws and
                // no results are returned at all.
                if (!textNode) {
                    results.push(false);
                    continue;
                }

                const isFrontText = checkTextFront(textNode);

                if (!isFrontText) {
                    results.push(false);
                    continue;
                }
            } else {
                const isTop = checkTop(elem);

                if (!isTop) {
                    results.push(false);
                    continue;
                }
            }

            results.push(true)

            if (!candidate.is_text) {
                accepted_elem_xpaths.add(xpath)
            }

            // console.log(elem)
        }
        return results;
    }

    const candidates = arguments[0]
    return checkVisibility(candidates)
    """
    return driver.execute_script(js_code, candidates)

In [21]:
mask = check_visibility_js(candidates)

In [22]:
len(mask), sum(mask)

(750, 50)

In [23]:
candidate_elements[0]

<a href="#main">Skip to main content</a>

In [24]:
candidates[0]

{'xpath': '//html[1]/body[1]/div[2]/nav[1]/a[1]',
 'is_text': False,
 'node_index': 0}

In [25]:
# Keep only the candidates flagged by the browser, re-attaching the parsed node.
# strict=True makes a length mismatch fail loudly: a `mask` left over from an
# earlier run or another page would otherwise be zipped against the wrong
# candidates and silently truncated.
accepted = [
    {
        "xpath": c["xpath"],
        "is_text": c["is_text"],
        "node_index": c["node_index"],
        "element": e,
    }
    for e, c, included in zip(candidate_elements, candidates, mask, strict=True)
    if included
]

In [26]:
len(accepted)

50

In [27]:
accepted[0]

{'xpath': '//html[1]/body[1]/div[2]/div[1]/header[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]',
 'is_text': False,
 'node_index': 0,
 'element': <div aria-label="Open main navigation" class="ZGw- ZGw--mod-size-medium" role="button" tabindex="0"><svg height="20" role="presentation" viewbox="0 0 200 200" width="20" xmlns="http://www.w3.org/2000/svg"><path d="M20 153.333h160v-17.778H20v17.778zm0-44.444h160V91.111H20v17.778zm0-62.222v17.778h160V46.667H20z"></path></svg></div>}

In [28]:
def _cap_text_length(text: str, max_length: int = 150) -> str:
    if len(text) > max_length:
        half_length = max_length // 2
        return text[:half_length] + '...' + text[-half_length:]
    return text

In [29]:
from pydantic import BaseModel

In [30]:
class DomContentItem(BaseModel):
    """One entry of the flattened DOM listing: an element or a bare text node."""
    # Identifier of the entry; shown in the listing for clickable entries.
    index: int
    # Rendered content: plain text for text nodes, an HTML-like snippet for elements.
    text: str
    # Whether the entry is shown with its index (True) or with '_' (False).
    clickable: bool
    # Nesting depth; rendered as that many leading tabs in the listing.
    n_parents: int
    # NOTE(review): not set anywhere in the visible notebook; purpose unclear, confirm.
    addition: bool = False

In [31]:
def _get_essential_attributes(element: Tag) -> str:
    """
    Render the attributes worth showing as a ``name="value"`` string.

    An attribute is kept when its name is in the allow-list below or starts
    with ``aria-`` / ``data-`` (``class`` is deliberately left out). String
    values are cut to 50 characters; for list or tuple values each item is
    cut to 50 characters and the items are joined with spaces. Attributes
    are emitted in their original order, separated by single spaces.
    """
    kept_names = frozenset((
        'id',
        'href',
        'src',
        'readonly',
        'disabled',
        'checked',
        'selected',
        'role',
        'type',          # important for inputs and buttons
        'name',          # important for form elements
        'value',         # current value of form elements
        'placeholder',   # helps to understand the purpose of an input
        'title',         # additional descriptive text
        'alt',           # alternative text for images
        'for',           # label associations
        'autocomplete',  # form field behavior
    ))
    kept_prefixes = ('aria-', 'data-',)

    def _shorten(raw):
        # Values of other types are passed through untouched.
        if isinstance(raw, str):
            return raw[:50]
        if isinstance(raw, (list, tuple)):
            return ' '.join(str(part)[:50] for part in raw)
        return raw

    return ' '.join(
        f'{name}="{_shorten(raw)}"'
        for name, raw in element.attrs.items()
        if name in kept_names or name.startswith(kept_prefixes)
    )

In [32]:
# Distinct XPaths of all accepted entries; used below to derive nesting depth.
xpaths = {a["xpath"] for a in accepted}

In [33]:
output_items = []
selector_map = {}

for index, a in enumerate(accepted):
    is_text, elem, xpath = a["is_text"], a["element"], a["xpath"]

    # Nesting depth: how many *other* accepted XPaths occur inside this one.
    n_parents = sum(
        1 for elem_xpath in xpaths if elem_xpath != xpath and elem_xpath in xpath
    )

    if is_text:
        # Bare text node: shown as plain text, not clickable.
        text = _cap_text_length(elem.strip())
        item = DomContentItem(index=index, text=text, clickable=False, n_parents=n_parents)
    else:
        # Element: shown as <tag attrs>text</tag>, clickable.
        tag_name = elem.name
        text = _cap_text_length(elem.get_text(strip=True, separator=" | "))
        attributes = _get_essential_attributes(elem)
        elm_content = f'<{tag_name}{" " + attributes if attributes else ""}>{text}</{tag_name}>'
        item = DomContentItem(index=index, text=elm_content, clickable=True, n_parents=n_parents)

    output_items.append(item)
    selector_map[index] = xpath

In [35]:
class ProcessedDomContent(BaseModel):
    """Flattened DOM listing together with the index-to-XPath lookup table."""
    items: list[DomContentItem]
    selector_map: dict[int, str]

    def dom_items_to_string(self) -> str:
        """
        Render the items as text, one line per item.

        Each line is a label right-aligned to width 3 (the item's index when
        it is clickable, ``_`` otherwise), a colon, one tab per parent, and
        the item's text.
        """
        lines = []
        for item in self.items:
            label = item.index if item.clickable else '_'
            lines.append(f"{label:>3}:" + "\t" * item.n_parents + f"{item.text}\n")
        return "".join(lines)

In [36]:
content = ProcessedDomContent(items=output_items, selector_map=selector_map)

In [37]:
print(content.dom_items_to_string())

  0:<div role="button" aria-label="Open main navigation"></div>
  1:<a href="/" aria-label="Go to the kayak homepage"></a>
  2:<div role="button">Sign in</div>
  _:Flights
  _:Stays
  _:Cars
  _:Packages
  _:KAYAK.ai
  _:BETA
  _:Plan your trip
  _:Explore
  _:Flight Tracker
  _:Travel tips
  _:KAYAK for Business
  _:NEW
  _:Trips
  _:English
  _:$
  _:United States dollar
  _:Feedback
  _:Compare flight deals from 100s of sites
  _:.
 22:<li role="menuitem">Flights</li>
 23:	<a href="/flights" aria-label="Search for flights" aria-current="page"></a>
 24:<li role="menuitem">Stays</li>
 25:	<a href="/stays" aria-label="Search for stays" aria-current="false"></a>
 26:<li role="menuitem">Cars</li>
 27:	<a href="/cars" aria-label="Search for cars" aria-current="false"></a>
 28:<li role="menuitem">Packages</li>
 29:	<a href="/packages" aria-label="Search for packages" aria-current="false"></a>
 30:<li role="menuitem">KAYAK.ai</li>
 31:	<a href="#" aria-label="Go to KAYAK.ai" aria-current="f

In [38]:
driver.quit()

Open question: should leaf elements be excluded? They are not interactive and contribute only text, which can also be obtained from the text nodes without them.

Exclude text nodes whose ancestor element has already been accepted.

In [62]:
# check src
from src.dom.service import DomService

In [63]:
dom_service = DomService(driver)

In [64]:
current_state = dom_service.get_current_state()

In [68]:
print(current_state.dom_items_to_string())

  0:<a aria-label="Airbnb homepage" href="/"></a>
  1:<button id="search-block-tab-STAYS" data-testid="header-tab-search-block-tab-STAYS" name="refinement_paths[]" role="tab" aria-selected="true" aria-controls="search-tabpanel" type="button">Homes</button>
  2:<button id="search-block-tab-EXPERIENCES" data-testid="header-tab-search-block-tab-EXPERIENCES" name="refinement_paths[]" role="tab" aria-selected="false" aria-controls="search-tabpanel" type="button">Experiences</button>
  3:<div aria-labelledby="search-block-tab-STAYS" id="search-tabpanel" role="tabpanel" data-panel-bounds="true">Where | Check in | Add dates | Check out | Add dates | Who | Add guests | Search</div>
  4:	<label for="bigsearch-query-location-input">Where</label>
  5:		<input aria-autocomplete="none" autocomplete="off" id="bigsearch-query-location-input" name="query" type="search" aria-describedby="bigsearch-query-location-description" placeholder="Search destinations" data-testid="structured-search-input-field-qu