In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import undetected_chromedriver as uc
import selenium.webdriver.support.expected_conditions as EC
from selenium_stealth import stealth
from selenium.webdriver.support.ui import WebDriverWait
import time

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
# options.add_argument("--headless")

options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)

url = "https://groq.com/pricing/"
driver.get(url)

try:
    accept_button = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//*[text()='Accept' or text()='Accept all']"), )
    )
    accept_button.click()
except:
    print("No accept button found")

time.sleep(3)

html = driver.page_source

with open("temp/page.html", "w", encoding="utf-8") as f:
    f.write(html)

screenshot_path = "temp/screenshot.png"
status = driver.save_screenshot(screenshot_path)

driver.quit()

In [3]:
from bs4 import BeautifulSoup, Comment, Tag
import re

In [4]:
def cleanup_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    initial_size = len(str(soup))

    # Remove comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove unwanted tags completely
    unwanted_tags = ["style", "script", "font", "link", "meta"]
    for tag in unwanted_tags:
        for element in soup.find_all(tag):
            element.decompose()

    # Remove nested wrapper tags
    def is_wrapper_tag(tag):
        children = tag.contents 
        return (
            len(children) == 1 
            and isinstance(children[0], Tag)
            and not tag.get("id")
            and not tag.get("class")
        )
    for tag in soup.find_all(["div", "span"]):
        if is_wrapper_tag(tag):
            tag.replace_with(tag.contents[0])  

    # Add counter for class enumeration
    class_counter = 0

    # Remove all attributes except allowed ones
    allowed_attrs = [
        "id",
        "name",
        "href",
        "alt"
    ]

    for tag in soup.find_all(True):
        attrs = dict(tag.attrs)

        for attr in attrs:
            if attr == "class":
                tag["c"] = str(class_counter)
                class_counter += 1
                del tag[attr]
                
            elif attr not in allowed_attrs:
                del tag[attr]   
    
    cleaned_html = str(soup)

    # Remove multiple spaces and newlines
    cleaned_html = " ".join(cleaned_html.split())

    # Remove empty tags
    empty_tags_pattern = r"<[^/>][^>]*>\s*</[^>]+>"
    cleaned_html = re.sub(empty_tags_pattern, "", cleaned_html)

    final_size = len(cleaned_html)
    print(f"Size reduced by {100 - (final_size * 100//initial_size)}%")

    return cleaned_html

In [5]:
cleaned_html = cleanup_html(html)

Size reduced by 75%


In [6]:
def save_formatted_html(html_content, output_file_name):
    soup = BeautifulSoup(html_content, "html.parser")
    formatted_html = soup.prettify()
    with open("temp/"+output_file_name, "w", encoding="utf-8") as f:
        f.write(formatted_html)

In [7]:
save_formatted_html(cleaned_html, "cleaned_html.html")

In [9]:
from tokencost import calculate_prompt_cost, count_string_tokens

In [10]:
model = "gpt-4o"

print(
    "Pure html cost: ", f"${calculate_prompt_cost(html, model):,.2f}",
    "Tokens: ", f"{count_string_tokens(html, model):,}",
)

print(
    "Cleaned html cost: ", f"${calculate_prompt_cost(cleaned_html, model):,.2f}",
    "Tokens: ", f"{count_string_tokens(cleaned_html, model):,}",
)

Pure html cost:  $0.13 Tokens:  53,396
Cleaned html cost:  $0.04 Tokens:  14,964
