In [2]:
url = 'https://www.nj.gov/state/elections/vote.shtml'

In [4]:
from bs4 import BeautifulSoup
import requests

def get_html(url):
    """
    Get the HTML content of a webpage.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception(f"Failed to load page: {response.status_code}")


In [11]:
def parse_html(html):
    """
    Parse the HTML content and extract the voting information.
    """
    soup = BeautifulSoup(html, 'html.parser')

    main = soup.find('main')
    if not main:
        print("Main section not found")

    # Find the relevant sections in the HTML
    sections = main.find_all('div', class_='card')

    voting_info = []
    for section in sections:
        title = section.find('h3').text.strip()
        description = section.find('p').text.strip()
        voting_info.append({
            'title': title,
            'description': description
        })
    return voting_info

In [47]:
#parse_html(get_html(url))

In [16]:
import re

In [35]:
import tiktoken

def get_token_count(text):
    """
    Get the token count of a given text.
    """
    encoding = tiktoken.encoding_for_model("gpt-4o")
    tokens = encoding.encode(text)
    return len(tokens)

In [28]:
def estimate_claude_tokens(text): 
    words = re.findall(r'\w+', text)
    return int(len(words) * 1.3)

def tokens_to_words(tokens):
    return int(tokens / 1.3)

In [29]:
def chunk_html_text(url, max_tokens = 5000): 
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Focus on the main content area only
    main = soup.find('main')
    if not main:
        main = soup.body  # Fallback if no <main> tag
        #print(main)

    raw_chunks = []

    # Go through direct child elements inside <main> that might represent sections
    for elem in main.find_all(['section', 'div', 'article'], recursive=False):
        if any(keyword in (elem.get('class') or []) + [elem.get('id') or ''] 
               for keyword in ['nav', 'navbar', 'footer', 'sidebar']):
            continue

        text = elem.get_text(separator=' ', strip=True)
        if text:
            raw_chunks.append(text)

    # Remove duplicates or overlapping text chunks
    seen = set()
    unique_chunks = []
    for chunk in raw_chunks:
        if chunk not in seen:
            seen.add(chunk)
            unique_chunks.append(chunk)

    # Now enforce max_tokens per chunk
    final_chunks = []

    for chunk in unique_chunks:
        token_count = estimate_claude_tokens(chunk)
        if token_count <= max_tokens:
            final_chunks.append(chunk)
            print('correct size')
        else:
            print('Chunk too large, splitting further...')

            max_words = tokens_to_words(max_tokens)
            words = chunk.split()

            # separate the words into smaller chunks of size max_tokens
            smaller_chunks = [words[i:i + max_words] for i in range(0, len(words), max_words)]
            # Join the smaller chunks back into strings
            smaller_chunks = [' '.join(chunk) for chunk in smaller_chunks]
            print('Big chunks')
            print(smaller_chunks)
            # Append the smaller chunks to the final list
            final_chunks.extend(smaller_chunks)

            # temp_chunk = ""
            # temp_tokens = 0

            # for word in words:
            #     word_tokens = estimate_claude_tokens(word)
            #     if temp_tokens + word_tokens > max_tokens:
            #         final_chunks.append(temp_chunk.strip())
            #         temp_chunk = word + " "
            #         temp_tokens = word_tokens
            #     else:
            #         temp_chunk += word + " "
            #         temp_tokens += word_tokens

            # if temp_chunk:
            #     final_chunks.append(temp_chunk.strip())

    # Print out the final results
    # print(f"\nTotal Chunks: {len(final_chunks)}\n")
    # for i, chunk in enumerate(final_chunks):
    #     print(f"Chunk{i+1} chunk: {chunk} ")
    for i, chunk in enumerate(final_chunks):
        print(f'Chunk {i+1} size: {estimate_claude_tokens(chunk)} tokens')
    return final_chunks

In [30]:
chunk_html_text(url)

correct size
correct size
correct size
correct size
correct size
correct size
correct size
correct size
Chunk too large, splitting further...
Big chunks
correct size
Chunk 1 size: 149 tokens
Chunk 2 size: 27 tokens
Chunk 3 size: 239 tokens
Chunk 4 size: 14 tokens
Chunk 5 size: 248 tokens
Chunk 6 size: 904 tokens
Chunk 7 size: 44 tokens
Chunk 8 size: 22 tokens
Chunk 9 size: 5907 tokens
Chunk 10 size: 5346 tokens
Chunk 11 size: 10 tokens


['Official Site of The State of New Jersey Governor Phil Murphy • Lt. Governor Tahesha Way NJ.gov Services Agencies FAQs Translate close The State of NJ site may contain optional links, information, services and/or content from other websites operated by third parties that are provided as a convenience, such as Google™ Translate. Google™ Translate is an online service for which the user pays nothing to obtain a purported language translation. The user is on notice that neither the State of NJ site nor its operators review any of the services, information and/or content from anything that may be linked to the State of NJ site for any reason. - Read Full Disclaimer Search close',
 'The Hon. Tahesha Way, Lt. Governor and Secretary of State New Jersey Voter Information Portal Department of State, Division of Elections',
 'NJ Voter Information Portal DOS DOE NJ Voter Information Portal NJ Voter Information Portal 3 Ways To Vote Vote-By-Mail Early Voting Polling Location Secure Drop Box Loca

In [41]:
def chunk_html_text_tiktoken(url, max_tokens = 5000): 
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Focus on the main content area only
    main = soup.find('main')
    if not main:
        main = soup.body  # Fallback if no <main> tag
        #print(main)

    raw_chunks = []

    # Go through direct child elements inside <main> that might represent sections
    for elem in main.find_all(['section', 'div', 'article'], recursive=False):
        if any(keyword in (elem.get('class') or []) + [elem.get('id') or ''] 
               for keyword in ['nav', 'navbar', 'footer', 'sidebar']):
            continue

        text = elem.get_text(separator=' ', strip=True)
        if text:
            raw_chunks.append(text)

    # Remove duplicates or overlapping text chunks
    seen = set()
    unique_chunks = []
    for chunk in raw_chunks:
        if chunk not in seen:
            seen.add(chunk)
            unique_chunks.append(chunk)

    # Now enforce max_tokens per chunk
    final_chunks = []

    for chunk in unique_chunks:
        token_count = get_token_count(chunk)
        if token_count <= max_tokens:
            final_chunks.append(chunk)
            print('correct size')
        else:
            print('Chunk too large, splitting further...')

            tokens = tiktoken.encoding_for_model("gpt-4o").encode(chunk)

            smaller_chunks = [tokens[i:i + max_tokens] for i in range(0, len(tokens), max_tokens)]
            # Join the smaller chunks back into strings
            smaller_chunks = [tiktoken.encoding_for_model("gpt-4o").decode(chunk) for chunk in smaller_chunks]
            # Append the smaller chunks to the final list
            final_chunks.extend(smaller_chunks)
            print('Reduced chunks')

            # temp_chunk = ""
            # temp_tokens = 0

            # for word in words:
            #     word_tokens = estimate_claude_tokens(word)
            #     if temp_tokens + word_tokens > max_tokens:
            #         final_chunks.append(temp_chunk.strip())
            #         temp_chunk = word + " "
            #         temp_tokens = word_tokens
            #     else:
            #         temp_chunk += word + " "
            #         temp_tokens += word_tokens

            # if temp_chunk:
            #     final_chunks.append(temp_chunk.strip())

    # Print out the final results
    # print(f"\nTotal Chunks: {len(final_chunks)}\n")
    # for i, chunk in enumerate(final_chunks):
    #     print(f"Chunk{i+1} chunk: {chunk} ")
    for i, chunk in enumerate(final_chunks):
        print(f'Chunk {i+1} size: {get_token_count(chunk)} tokens')
    return final_chunks

In [42]:
chunk_html_text_tiktoken(url)

correct size
correct size
correct size
correct size
correct size
correct size
correct size
correct size
Chunk too large, splitting further...
Reduced chunks
correct size
Chunk 1 size: 128 tokens
Chunk 2 size: 27 tokens
Chunk 3 size: 224 tokens
Chunk 4 size: 23 tokens
Chunk 5 size: 253 tokens
Chunk 6 size: 862 tokens
Chunk 7 size: 40 tokens
Chunk 8 size: 18 tokens
Chunk 9 size: 5000 tokens
Chunk 10 size: 5000 tokens
Chunk 11 size: 3028 tokens
Chunk 12 size: 8 tokens


['Official Site of The State of New Jersey Governor Phil Murphy • Lt. Governor Tahesha Way NJ.gov Services Agencies FAQs Translate close The State of NJ site may contain optional links, information, services and/or content from other websites operated by third parties that are provided as a convenience, such as Google™ Translate. Google™ Translate is an online service for which the user pays nothing to obtain a purported language translation. The user is on notice that neither the State of NJ site nor its operators review any of the services, information and/or content from anything that may be linked to the State of NJ site for any reason. - Read Full Disclaimer Search close',
 'The Hon. Tahesha Way, Lt. Governor and Secretary of State New Jersey Voter Information Portal Department of State, Division of Elections',
 'NJ Voter Information Portal DOS DOE NJ Voter Information Portal NJ Voter Information Portal 3 Ways To Vote Vote-By-Mail Early Voting Polling Location Secure Drop Box Loca

In [43]:
import requests
from bs4 import BeautifulSoup
import tiktoken

# tiktoken setup (using the same tokenizer as your model, e.g., gpt-4o uses 'cl100k_base')
tokenizer = tiktoken.get_encoding('cl100k_base')

def num_tokens(text):
    """Calculate the number of tokens in a string using tiktoken."""
    return len(tokenizer.encode(text))

def chunk_element(element, max_tokens=5000):
    """Recursively split an element if its text exceeds max_tokens."""
    text = element.get_text(separator=' ', strip=True)
    if num_tokens(text) <= max_tokens:
        return [text]
    
    # If it's too big and has children, try to split further
    chunks = []
    for child in element.find_all(['section', 'div', 'article'], recursive=False):
        child_text = child.get_text(separator=' ', strip=True)
        if child_text and num_tokens(child_text) <= max_tokens:
            chunks.append(child_text)
        elif child_text:
            # Recursively split again if child is still too big
            chunks.extend(chunk_element(child, max_tokens))
    
    # If no children are found, fallback: just forcibly split the text
    if not chunks and text:
        chunks = force_split_text(text, max_tokens)
    
    return chunks

def force_split_text(text, max_tokens):
    """Fallback: split a huge block of text into hard chunks if no more structure."""
    tokens = tokenizer.encode(text)
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        sub_tokens = tokens[i:i+max_tokens]
        sub_text = tokenizer.decode(sub_tokens)
        chunks.append(sub_text)
    return chunks


In [44]:
def chunk_html(url, max_tokens=5000):
    """Download HTML and chunk it by token size using structural recursion."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    main = soup.find('main')
    if not main:
        main = soup.body  # fallback
    
    chunks = []
    # Only start from direct children
    for elem in main.find_all(['section', 'div', 'article'], recursive=False):
        text = elem.get_text(separator=' ', strip=True)
        if not text:
            continue
        
        if num_tokens(text) <= max_tokens:
            chunks.append(text)
        else:
            # Go deeper recursively
            chunks.extend(chunk_element(elem, max_tokens))
    
    return chunks


In [46]:
# Example usage
chunks = chunk_html(url, max_tokens=5000)
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1} ({num_tokens(chunk)} tokens)")

Chunk 1 (129 tokens)
Chunk 2 (27 tokens)
Chunk 3 (223 tokens)
Chunk 4 (24 tokens)
Chunk 5 (254 tokens)
Chunk 6 (864 tokens)
Chunk 7 (40 tokens)
Chunk 8 (18 tokens)
Chunk 9 (55 tokens)
Chunk 10 (2377 tokens)
Chunk 11 (5000 tokens)
Chunk 12 (5000 tokens)
Chunk 13 (603 tokens)
Chunk 14 (9 tokens)
