In [1]:
from typing import List, Dict, Optional

import pandas as pd
from lxml import html

# Primary call-to-action link nested inside an app element: an <a> whose class
# contains "btn-primary" and whose href starts with "http".
_INNER_LINK_XPATH = './/a[contains(@class, "btn-primary") and starts-with(@href, "http")]'

# Same kind of link, but the first one located after the app element in
# document order. `following::` is evaluated from the element itself; prefixing
# it with `.//` would re-run the axis for every descendant node and make the
# "nearest link" depend on how the merged result happens to be ordered.
_NEXT_LINK_XPATH = 'following::a[contains(@class, "btn-primary") and starts-with(@href, "http")][1]'


def _nearest_tab_index(elem) -> Optional[str]:
    """Return the closest ``data-tab-index`` found on ``elem`` or its ancestors, else None."""
    node = elem
    while node is not None:
        tab_index = node.attrib.get("data-tab-index")
        if tab_index is not None:
            return tab_index
        node = node.getparent()
    return None


def _find_external_link(elem) -> str:
    """Return the href of a primary link inside ``elem``, else of the next one after it, else ""."""
    matches = elem.xpath(_INNER_LINK_XPATH)
    if not matches:
        # Nothing nested inside the app element: fall back to the first
        # matching link that follows it in the document.
        matches = elem.xpath(_NEXT_LINK_XPATH)
    return matches[0].attrib["href"] if matches else ""


def parse_vizio_apps_lxml(html_path: str) -> List[Dict[str, Optional[str]]]:
    """Extract one record per ``data-app-name`` element from a saved HTML page.

    Args:
        html_path: Path to the HTML file; it is read as UTF-8 text.

    Returns:
        A list of dicts, in document order, each with the keys
        ``"data-app-name"``, ``"data-app-id"``, ``"data-bundle-id"``,
        ``"data-developer-url"`` (attribute values stripped of surrounding
        whitespace, ``""`` when the attribute is absent), ``"Category"``
        (text of the tab button whose ``data-tab-index`` equals the nearest
        ``data-tab-index`` on the element or an ancestor, ``""`` when there is
        no match) and ``"link"`` (href of a nearby primary link, ``""`` when
        none is found).

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
        Parsing errors raised by ``lxml.html.fromstring`` are propagated
        unchanged.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        content = f.read()

    tree = html.fromstring(content)

    # Step 1: map each tab button's data-tab-index to its visible label.
    categories = {}
    for btn in tree.xpath('//button[contains(@class, "v20-tabs__accordion")]'):
        button_index = btn.attrib.get("data-tab-index")
        if button_index is not None:
            categories[button_index] = btn.text_content().strip()

    # Step 2: build one record per element that carries a data-app-name.
    apps = []
    for elem in tree.xpath('//*[@data-app-name]'):
        tab_index = _nearest_tab_index(elem)
        apps.append({
            "data-app-name": elem.attrib.get("data-app-name", "").strip(),
            "data-app-id": elem.attrib.get("data-app-id", "").strip(),
            "data-bundle-id": elem.attrib.get("data-bundle-id", "").strip(),
            "data-developer-url": elem.attrib.get("data-developer-url", "").strip(),
            # An element outside any tab (tab_index is None) gets an empty category.
            "Category": categories.get(tab_index, ""),
            "link": _find_external_link(elem),
        })

    return apps


In [3]:
# Parse the saved page; the relative path "vizio.html" is resolved against the
# kernel's current working directory.
apps_list = parse_vizio_apps_lxml("vizio.html")
# Bare last expression so the notebook displays the parsed records.
apps_list

[{'data-app-name': 'watchfree',
  'data-app-id': 'vizio.watchfree',
  'data-bundle-id': '',
  'data-developer-url': '',
  'Category': '',
  'link': 'https://www.vizio.com/en/watchfreeplus'},
 {'data-app-name': 'netflix',
  'data-app-id': 'vizio.netflix',
  'data-bundle-id': 'vizio.netflix',
  'data-developer-url': 'https://www.netflix.com/',
  'Category': '',
  'link': 'https://www.netflix.com/'},
 {'data-app-name': 'prime video',
  'data-app-id': 'vizio.amazon',
  'data-bundle-id': 'vizio.primevideo',
  'data-developer-url': 'https://www.primevideo.com/',
  'Category': '',
  'link': 'https://www.amazon.com/Amazon-Video/b/?&node=2858778011&ref=dvm_us_gc_wd_vizio_websitepv'},
 {'data-app-name': 'starz',
  'data-app-id': 'vizio.starz',
  'data-bundle-id': 'vizio.starz',
  'data-developer-url': 'https://www.starz.com/',
  'Category': '',
  'link': 'https://www.starz.com/vizio'},
 {'data-app-name': 'disney',
  'data-app-id': 'vizio.disney',
  'data-bundle-id': 'vizio.disneyplus',
  'data-d

In [4]:
# One row per parsed app record; columns come from the dict keys.
# NOTE(review): in the recorded output below every displayed row has an empty
# "Category" — the data-tab-index lookup may not match this page's markup; verify.
df = pd.DataFrame(apps_list)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,data-app-name,data-app-id,data-bundle-id,data-developer-url,Category,link
0,watchfree,vizio.watchfree,,,,https://www.vizio.com/en/watchfreeplus
1,netflix,vizio.netflix,vizio.netflix,https://www.netflix.com/,,https://www.netflix.com/
2,prime video,vizio.amazon,vizio.primevideo,https://www.primevideo.com/,,https://www.amazon.com/Amazon-Video/b/?&node=2...
3,starz,vizio.starz,vizio.starz,https://www.starz.com/,,https://www.starz.com/vizio
4,disney,vizio.disney,vizio.disneyplus,https://www.disneyplus.com/,,https://www.disneyplus.com/start?cid=DSS-OFFDE...
...,...,...,...,...,...,...
573,peliculasgratis,vizio.peliculasgratis,vizio.peliculasgratisplus,https://www.ott.studio/,,https://www.ott.studio/
574,bingekorea,vizio.bingekorea,vizio.bingekorea,https://its-newid.net/,,https://paro-vizio.its-newid.net/
575,telemundo,vizio.telemundo,,,,https://www.telemundo.com/
576,amasiantv,vizio.amasiantv,vizio.amasiantv,https://amasian.tv/live/amasian-tv-movies-and-...,,https://amasian.tv/live/amasian-tv-movies-and-...


In [6]:
# Persist the table as Parquet without writing the index.
# NOTE(review): the file name mentions "philips" although the rows were parsed
# from vizio.html — confirm this is the intended output name.
df.to_parquet("philips_vizio_appstoday.parquet",index=False)