In [2]:
import os
from PIL import Image, ImageDraw, ImageFont
from bs4 import BeautifulSoup
import requests

In [3]:

# TODO
# - make JSON dump
# - add clickable field
# - support all input types as classes
# - classes for specific elements: image would have caption, URL, etc.;
#   button would have text, URL, etc.
# - support for article and section
# - use clip to determine caption tag for nearby images

# Page to process. Exactly one assignment is active; the others are kept as
# ready-made alternatives.
# url = 'https://www.wsj.com'
# url = 'https://en.wikipedia.org/wiki/Machine_learning'
url = 'https://github.com/openai'
# url = 'https://www.amazon.com/'

# Size of the browser viewport and of the final per-viewport images (pixels)
viewportHeight = 720
viewportWidth = 1280

# Categories of semantic content. Each name maps to its position in the tuple
# below (TEXT=0 ... NAV=15); the same number is used as an index into COLORS.
LABELS = {
    name: index
    for index, name in enumerate((
        'TEXT',     # 0
        'CODE',     # 1
        'LINK',     # 2
        'IMAGE',    # 3
        'VIDEO',    # 4
        'AUDIO',    # 5
        'BUTTON',   # 6
        'INPUT',    # 7
        'FORM',     # 8
        'QUOTE',    # 9
        'CUSTOM',   # 10
        'ICON',     # 11
        'HEADER',   # 12
        'SUBMIT',   # 13
        'FOOTER',   # 14
        'NAV',      # 15
    ))
}

# Drawing palette, indexed by label id (more entries than labels are defined).
COLORS = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
    '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',
    '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000',
    '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080',
]

# Caption fonts already loaded, keyed by point size, so the font file is read
# once instead of once per rectangle.
_LABEL_FONT_CACHE = {}


def _load_label_font(size=12):
    """Return the caption font for ``size``, loading it on first use only."""
    if size not in _LABEL_FONT_CACHE:
        try:
            font = ImageFont.truetype('arial.ttf', size)
        except OSError:
            # truetype() raises OSError when the font file cannot be read,
            # e.g. on systems that do not ship Arial. Fall back to Pillow's
            # built-in font so annotation still works.
            font = ImageFont.load_default()
        _LABEL_FONT_CACHE[size] = font
    return _LABEL_FONT_CACHE[size]


def draw_rect(draw, x, y, w, h, text, color):
    """Draw one annotated box: tinted fill, 2px outline and optional caption.

    Args:
        draw: Drawing context exposing ``rectangle`` and ``text`` (a Pillow
            ``ImageDraw`` object in this notebook).
        x, y: Top-left corner of the box.
        w, h: Width and height of the box.
        text: Caption drawn at the top-left corner; nothing is drawn when it
            is empty or ``None``.
        color: ``'#rrggbb'`` colour string. ``'40'`` is appended to it to form
            the ``'#rrggbbaa'`` fill colour.
    """
    box = (x, y, x + w, y + h)

    # Fill: same colour with an alpha byte of 0x40 appended
    draw.rectangle(box, fill=color + '40')

    # Stroke
    draw.rectangle(box, outline=color, width=2)

    # Caption
    if text:
        draw.text((x, y), text, fill=color, font=_load_label_font(12))

def save_image(image, save_path):
    """Write ``image`` to ``save_path`` by delegating to the image's ``save`` method."""
    image.save(save_path)

def clip_viewport(image, viewport_height, index, save=False, save_path=''):
    """Crop the ``index``-th full-width horizontal band out of ``image``.

    Args:
        image: Source image; must expose ``width`` and ``crop``.
        viewport_height: Height of one band.
        index: Zero-based band number, counted from the top of the image.
        save: When true, the cropped band is also written to ``save_path``.
        save_path: Destination used only when ``save`` is true.

    Returns:
        The cropped band.
    """
    top = viewport_height * index
    bottom = viewport_height * (index + 1)
    box = (0, top, image.width, bottom)
    viewport_image = image.crop(box)

    if not save:
        return viewport_image

    save_image(viewport_image, save_path)
    return viewport_image

def draw_segments(image, segment_groups, save=False, save_path=''):
    """Draw a labelled box on ``image`` for every segment, in place.

    Args:
        image: Full-page image to annotate; it is modified in place.
        segment_groups: One list of segment dicts per viewport, top to bottom.
            Each dict needs ``'label'`` (a ``LABELS`` value) and ``'bbox'``
            (``x``/``y``/``width``/``height``, with ``y`` relative to the top
            of its viewport).
        save: When true, the annotated image is written to ``save_path``.
        save_path: Destination used only when ``save`` is true.

    Segments labelled ``CUSTOM`` are skipped.
    """
    # draw_rect fills with an '#rrggbbaa' colour. Pillow only blends that alpha
    # into the picture when an RGB image is drawn on in 'RGBA' mode; with the
    # default mode the alpha is dropped and the fill hides the screenshot.
    # Other image modes must be drawn in their own mode, so they keep the default.
    if image.mode == 'RGB':
        draw = ImageDraw.Draw(image, 'RGBA')
    else:
        draw = ImageDraw.Draw(image)

    # Label id -> name lookup, built once instead of once per segment.
    label_names = list(LABELS)
    custom_label = LABELS['CUSTOM']

    # Vertical shift that turns viewport-relative y into full-page y.
    offset = 0
    for segments in segment_groups:
        for segment in segments:
            label = segment['label']
            if label == custom_label:
                continue
            bbox = segment['bbox']
            draw_rect(
                draw,
                bbox['x'],
                bbox['y'] + offset,
                bbox['width'],
                bbox['height'],
                label_names[label],
                COLORS[label],
            )
        offset += viewportHeight

    if save:
        save_image(image, save_path)

def get_segments(html):
    """Parse ``html`` and return its content segments grouped by viewport.

    The document body is walked breadth-first. An element becomes a segment
    when it is atomic (``leaf_nodes``), composite (``leaf_nodes_composite``:
    recorded and still descended into) or childless with non-blank text.
    The collected segments are de-duplicated by bounding box, sorted top to
    bottom and bucketed into consecutive viewports of ``viewportHeight``.

    Args:
        html: HTML markup of the page, handed to BeautifulSoup's
            ``'html.parser'``.

    Returns:
        A list with one entry per viewport, top to bottom. Each entry is a
        list of dicts with the keys ``'label'``, ``'bbox'`` and
        ``'description'``; ``bbox['y']`` is relative to the top of the
        segment's own viewport. A viewport without segments is an empty list,
        so list indices stay aligned with viewport indices. A page with no
        segments at all yields ``[[]]``.

    Note:
        Bounding-box, label and description extraction are still stubs. Until
        they are implemented every segment has the same all-zero box, so
        de-duplication keeps only the first segment found.
    """
    from collections import deque  # only needed for the traversal queue below

    soup = BeautifulSoup(html, 'html.parser')

    class Segment:
        """One piece of page content: its element, box, label and description."""

        def __init__(self, el, offset=0):
            self.el = el
            self.bbox = self.get_bounding_box(el)
            self.label = self.get_label(el)
            self.description = self.get_description(el)

            if offset != 0:
                self.bbox['y'] += offset

        def get_bounding_box(self, el):
            # TODO: Implement the logic to get the bounding box of the element
            # A fresh dict per call, so each segment can shift its own 'y'.
            return {'x': 0, 'y': 0, 'width': 0, 'height': 0}

        def get_label(self, el):
            # TODO: Implement the logic to classify the semantic content of the element
            return LABELS['TEXT']

        def get_description(self, el):
            # TODO: Implement the logic to get the description of the element
            return ''

        def serialize(self):
            """Return the segment as a plain dict (the element itself is left out)."""
            return {
                'label': self.label,
                'bbox': self.bbox,
                'description': self.description
            }

    class Segments:
        """Ordered collection of ``Segment`` objects with chainable helpers."""

        def __init__(self, segments=None):
            self.segments = segments or []

        def add(self, segment):
            self.segments.append(segment)
            return self

        def sort(self):
            """Sort in place by the top edge of the bounding box."""
            self.segments.sort(key=lambda x: x.bbox['y'])
            return self

        def __iter__(self):
            return iter(self.segments)

        def uniquify(self):
            """Return a new collection keeping the first segment per distinct box."""
            unique_segments = []
            seen = set()
            for segment in self.segments:
                bbox = segment.bbox
                key = (bbox['x'], bbox['y'], bbox['width'], bbox['height'])
                if key not in seen:
                    unique_segments.append(segment)
                    seen.add(key)
            return Segments(unique_segments)

        def serialize(self):
            return [segment.serialize() for segment in self.segments]

    class SegmentGroups:
        """Buckets a y-sorted ``Segments`` collection into viewport-sized groups."""

        def __init__(self, segments=None):
            self.groups = []
            self.segments = segments or Segments()

        def group(self, viewport_height):
            """Assign every segment to the viewport containing its top edge.

            Expects ``self.segments`` to be sorted by ``bbox['y']``. Each
            segment's ``bbox['y']`` is rewritten in place to be relative to
            the top of its viewport.

            Raises:
                ValueError: if ``viewport_height`` is not positive.
            """
            if viewport_height <= 0:
                raise ValueError('viewport_height must be positive')

            group = Segments()
            offset = 0

            for segment in self.segments:
                # Close as many viewports as it takes to reach this segment.
                # It may lie several viewports further down, and the segment
                # that triggers the switch must still be stored afterwards.
                while segment.bbox['y'] >= offset + viewport_height:
                    self.groups.append(group)
                    group = Segments()
                    offset += viewport_height

                segment.bbox['y'] -= offset
                group.add(segment)

            # The viewport being filled when the input ran out.
            self.groups.append(group)
            return self

        def serialize(self):
            return [group.serialize() for group in self.groups]

    # Types of DOM nodes
    invisible_nodes = ['head', 'meta', 'style', 'noscript', 'script', 'template', 'center', 'data', 'embed', 'bdi']
    skipped_nodes = ['iframe', 'br', 'b', 'i', 'strong', 'em', 'legend']
    leaf_nodes = ['svg', 'img', 'pre', 'code', 'textarea', 'input', 'blockquote']
    leaf_nodes_composite = ['table', 'ul', 'ol', 'dl', 'p', 'button', 'form', 'footer', 'nav']

    # Not used yet; presumably meant for the background-image TODO below.
    min_image_area = 800

    # Breadth-first traversal of DOM tree. A document without <body> simply
    # yields no segments instead of failing on ``None``.
    els = deque([soup.body] if soup.body is not None else [])
    leaves = Segments()
    while els:
        el = els.popleft()

        # Text node: attribute the text to its parent element.
        # NOTE(review): the queue is only fed from find_all(recursive=False),
        # which appears to yield tags only, so this branch may never run - confirm.
        if el.name is None and el.string and el.string.strip():
            leaves.add(Segment(el.parent))
            continue

        if el.name in invisible_nodes or el.name in skipped_nodes:
            continue

        # TODO: Implement the logic to check element visibility based on CSS properties

        # Atomic elements that we want to also double click into
        if el.name in leaf_nodes_composite:
            leaves.add(Segment(el))

        # Atomic elements
        if el.name in leaf_nodes:
            leaves.add(Segment(el))
            continue

        children = el.find_all(recursive=False)

        # Other leaf elements: background images and text
        if not children:
            if el.string and el.string.strip():
                leaves.add(Segment(el))

            # TODO: Implement the logic to check for background images

            continue

        els.extend(children)

    unique_segments = leaves.uniquify().sort()

    segment_groups = SegmentGroups(unique_segments).group(viewportHeight)

    return segment_groups.serialize()

In [ ]:
# Get the html
# The timeout stops the cell from hanging indefinitely on an unresponsive
# server. raise_for_status() turns a 4xx/5xx response into an exception so an
# HTTP error page is never segmented as if it were the requested page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_code = response.text



In [0]:

# Output directory derived from the URL: '://', '/' and '.' each become '_'
img_path_base = 'screenshots/{}/'.format(
    url.replace('://', '_').replace('/', '_').replace('.', '_')
)

# Make sure the directory is there (no error when it already exists)
os.makedirs(img_path_base, exist_ok=True)

# Segment the page; expects the page HTML in `html_code`
segment_groups = get_segments(html_code)

# Write the full-page screenshot to disk
# NOTE(review): `full_screenshot` is not assigned anywhere in the visible
# notebook. It has to exist (an object with a .save(path) method) before this
# cell runs, otherwise this line raises NameError - confirm where it comes from.
img_path = f'{img_path_base}full.png'
full_screenshot.save(img_path)

# Two copies of the screenshot: one left untouched, one to draw on
img_full = Image.open(img_path)
img_annotated = img_full.copy()

# Annotate the copy with the segment boxes and save it
img_path_full_annotated = f'{img_path_base}full_annotated.png'
draw_segments(img_annotated, segment_groups, save=True, save_path=img_path_full_annotated)

# Cut both images into viewport-high slices, one pair of files per segment group
for index in range(len(segment_groups)):
    img_path = f'{img_path_base}{index}.png'
    img_path_annotated = f'{img_path_base}{index}_annotated.png'

    clip_viewport(img_full, viewportHeight, index, save=True, save_path=img_path)
    clip_viewport(img_annotated, viewportHeight, index, save=True, save_path=img_path_annotated)