In [5]:
import random
from bs4 import BeautifulSoup
import re
import random

# Seed Python's global random generator with a fixed value so that the
# random choices made further down repeat from one run to the next.
random.seed(3)

"""
TODO:
1. add support for images, font size, alignment, and margins
2. add a limit on the depth of the recursive generator
3. ...
"""

# Context-free grammar used to generate random HTML documents.
# Each key is a non-terminal. Its value is a list of alternative productions,
# and each production is a list of symbols. A symbol that is not itself a key
# of CFG is a terminal and is emitted as-is by generate_random_string.
CFG = {
    "document": [['<html>', 'head', 'body', '</html>']],
    "head": [['<head>', 'meta', '</head>']],
    "meta": [['<meta charset="UTF-8">']],
    "body": [['<body>', 'non-empty content', '</body>']],
    # "content": [['paragraph'], ['link'], ['div'], []],
    # Either a single div or nothing at all (the empty production).
    "content/empty": [['div'], []],
    "non-empty content": [['paragraph'], ['div']],
    # A div is optionally preceded by a sibling div ('content/empty') and
    # always has non-empty content. Every div carries the literal
    # placeholder id "ID".
    "div": [['content/empty', '<div id="ID">', 'non-empty content', '</div>']],
    # One or more consecutive <p> elements (right-recursive alternative).
    "paragraph": [['<p>', 'text', '</p>'], ['<p>', 'text', '</p>', "paragraph"]],
    # NOTE(review): "link" (and therefore "URL") is not referenced by any
    # other production in this dict, so it is never generated from "document".
    "link": [['<a href="', 'URL', '">', 'text', '</a>']],
    "text": [['SWORD']],
    "URL": [['https://www.google.com/']],
    # "test" followed by zero or more repetitions of " test".
    "SWORD": [['test', 'EWORD']],
    "EWORD": [[' test'], [' test', 'EWORD'], []]
    # "CHAR": [[chr(i)] for i in range(97, 123)] + [[chr(i)] for i in range(65, 91)] + [[str(i)] for i in range(10)]
}

def generate_random_string(symbol):
    """Expand ``symbol`` into a random string according to ``CFG``.

    A symbol that is not a key of ``CFG`` is treated as a terminal and is
    returned unchanged. Otherwise one of its productions is picked with
    ``random.choice`` and the symbols of that production are expanded from
    left to right; the expansions are concatenated and returned.
    """
    if symbol in CFG:
        chosen = random.choice(CFG[symbol])
        # map() is consumed by join() in order, so the expansion (and hence
        # the sequence of random draws) proceeds left to right.
        return ''.join(map(generate_random_string, chosen))
    return symbol

def update_ids(html_string):
    """Number every ``id="ID"`` placeholder in ``html_string``.

    Placeholders are numbered 0, 1, 2, ... in the order they appear, each
    becoming ``id="ID<n>"``. When the placeholder's tag is immediately
    followed by ``<p>``, the number and a space are also inserted at the
    start of that paragraph (``id="ID3"><p>3 ...``) so the div is labelled
    with its own id in the rendered page.

    The whole string is processed in a single regex pass instead of
    rescanning it once per placeholder. A string without placeholders is
    returned unchanged.
    """
    next_id = 0

    def _number(match):
        # Called once per placeholder, left to right, by re.sub.
        nonlocal next_id
        current = next_id
        next_id += 1
        numbered = f'id="ID{current}"'
        if match.group(1):
            # The div opens directly onto a paragraph: label that paragraph.
            return f'{numbered}><p>{current} '
        return numbered

    # The optional group captures '><p>' only when it directly follows the
    # placeholder, i.e. when the div's first child is a paragraph.
    return re.sub(r'id="ID"(><p>)?', _number, html_string)

def generate_css(html_string):
    """Inject a ``<style>`` element right after the UTF-8 ``<meta>`` tag.

    The style sheet always contains the margin/padding reset for ``p``,
    ``html`` and ``body``. In addition, for n = 0, 1, 2, ... as long as
    ``id="ID<n>"`` occurs in ``html_string``, a ``#ID<n>`` rule with a
    background colour is appended; colours are taken from a fixed palette
    and repeat once the palette is exhausted.

    Returns the modified HTML string. If the ``<meta charset="UTF-8">`` tag
    is absent the input is returned unchanged.
    """
    # Fixed palette; ids beyond its length wrap around to the start.
    palette = (
        '#FF5733', '#FFBD33', '#DBFF33', '#75FF33',
        '#33FF57', '#33FFBD', '#33DBFF', '#3375FF',
        '#5733FF', '#BD33FF', '#FF33DB', '#FF3375',
    )
    meta_tag = '<meta charset="UTF-8">'
    base_rules = 'p { margin: 0; } html, body { margin: 0; padding: 0; }'

    rules = []
    index = 0
    # Stops at the first id number that does not occur in the document.
    while f'id="ID{index}"' in html_string:
        colour = palette[index % len(palette)]
        rules.append('#ID%d { background-color: %s; }' % (index, colour))
        index += 1

    style_block = '<style>' + base_rules + ''.join(rules) + '</style>'
    return html_string.replace(meta_tag, meta_tag + style_block)


def extract_tree_structure(html_string):
    """Return the nesting of id-carrying ``<div>`` elements as a dict.

    The result maps a parent id to the list of ids of its direct child
    divs, in document order. Divs that are direct children of ``<body>``
    are listed under the key ``"Root"``. Only parents that have at least
    one child div get an entry, so a document without divs (or without a
    ``<body>``) yields an empty dict.

    The walk descends only through ``<div>`` elements that have an ``id``
    attribute; any other node ends the descent on that branch.
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    tree_structure = {}

    def recurse(element, parent_id):
        # Text nodes have name None, so they are rejected here before
        # has_attr is ever consulted.
        if element.name != 'div' or not element.has_attr('id'):
            return
        child_id = element['id']
        # An entry is created only at the moment a child is recorded, so no
        # value in the dict is ever an empty list.
        tree_structure.setdefault(parent_id, []).append(child_id)
        for child in element.children:
            recurse(child, child_id)

    body = soup.body
    if body is None:
        # No <body> element was parsed: there is nothing to walk.
        return tree_structure

    for child in body.children:
        recurse(child, "Root")

    return tree_structure


def find_block(text, child_id):
    """Return the CSS rule for ``#<child_id>`` found in ``text``.

    The rule is matched as ``#<child_id> { ... }`` with a single space
    before the opening brace, up to the first closing brace (non-greedy,
    and allowed to span lines). The required space means ``ID1`` does not
    match the rule for ``ID10``.

    ``child_id`` is escaped before being placed in the pattern so that
    characters with a regex meaning (for example ``.``) are matched
    literally.

    Returns the matched rule text, or ``None`` when no such rule exists.
    """
    pattern = re.compile(rf'(#{re.escape(child_id)} \{{.*?\}})', re.DOTALL)
    match = pattern.search(text)
    return match.group(1) if match else None


def modify_html(html_string):
    """Add width (and optionally float) declarations to every div's CSS rule.

    The document is parsed once to discover the div hierarchy; all edits
    are then applied textually to ``html_string``. Top-level divs under
    ``<body>`` get ``width: 100%``. For a div with more than one direct
    child div, a coin flip (``random.random() < 0.5``) decides whether the
    children share the row: if so each child gets an equal percentage
    width plus ``float: left``, and a ``clear: both`` div is appended
    inside the parent; otherwise the children get ``width: 100%``.

    Returns the modified HTML string.
    """
    soup = BeautifulSoup(html_string, 'html.parser')

    def process_element(html_string, element, current_width):
        # Extend this div's existing "#<id> { ... }" rule in the style sheet.
        rule = find_block(html_string, element.get("id"))
        declaration = f"width: {current_width}%; }}"
        if current_width != 100:
            # Anything narrower than the full row is laid out side by side.
            declaration = f"width: {current_width}%; float: left; }}"
        html_string = html_string.replace(rule, rule.replace("}", declaration))

        child_divs = element.find_all('div', recursive=False)
        children_width = 100
        # random.random() is only drawn when there are at least two child
        # divs (short-circuit evaluation keeps the draw order unchanged).
        if element.name == 'div' and len(child_divs) > 1 and random.random() < 0.5:
            children_width = 100 / len(child_divs)

            # Drop the trailing "</div>" (6 characters) and re-close the
            # element after a clearing div so the floats are contained.
            markup = str(element)
            html_string = html_string.replace(
                markup,
                markup[:-6] + "<div style=\"clear: both;\"></div></div>",
            )

        for child in child_divs:
            html_string = process_element(html_string, child, children_width)

        return html_string

    for top_level in soup.body.find_all('div', recursive=False):
        html_string = process_element(html_string, top_level, 100)

    return html_string


def beautify_html(html_str):
    """Parse ``html_str`` with ``html.parser`` and return its ``prettify()`` output."""
    return BeautifulSoup(html_str, 'html.parser').prettify()


# Generate ten random pages. For each one, print its index, its div tree and
# the final (un-prettified) HTML, and keep the prettified HTML in code_list.
code_list = []
for i in range(10):
    print(i)

    # Grammar sample -> numbered div ids -> per-id background colours.
    html_code = generate_random_string("document")
    html_code = update_ids(html_code)
    html_code = generate_css(html_code)

    tree_structure = extract_tree_structure(html_code)
    print(tree_structure)

    # Add width/float layout declarations to the generated rules.
    html_code = modify_html(html_code)
    print(html_code)

    code_list.append(beautify_html(html_code))

0
{}
<html><head><meta charset="UTF-8"><style>p { margin: 0; } html, body { margin: 0; padding: 0; }</style></head><body><p>test</p></body></html>
1
{'Root': ['ID0', 'ID1'], 'ID1': ['ID2'], 'ID2': ['ID3']}
<html><head><meta charset="UTF-8"><style>p { margin: 0; } html, body { margin: 0; padding: 0; }#ID0 { background-color: #FF5733; width: 100%; }#ID1 { background-color: #FFBD33; width: 100%; }#ID2 { background-color: #DBFF33; width: 100%; }#ID3 { background-color: #75FF33; width: 100%; }</style></head><body><div id="ID0"><p>0 test test test</p></div><div id="ID1"><div id="ID2"><div id="ID3"><p>3 test test</p><p>test</p><p>test</p><p>test</p><p>test test</p></div></div></div></body></html>
2
{'Root': ['ID0', 'ID2'], 'ID0': ['ID1'], 'ID2': ['ID3', 'ID4'], 'ID4': ['ID5']}
<html><head><meta charset="UTF-8"><style>p { margin: 0; } html, body { margin: 0; padding: 0; }#ID0 { background-color: #FF5733; width: 100%; }#ID1 { background-color: #FFBD33; width: 100%; }#ID2 { background-color: #DB

In [6]:
# Save one of the generated pages (index 5 of code_list) to disk.
# The encoding is passed explicitly so the bytes written agree with the
# <meta charset="UTF-8"> declared in the page, whatever the platform's
# default text encoding is.
with open('test.html', 'w', encoding='utf-8') as file:
    # Write the HTML content to the file
    file.write(code_list[5])