In [6]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import random
import os

class AllTextOverlapGenerator:
    """Generate HTML/CSS variants in which text elements are overlaid with copies.

    Every element matched by ``TEXT_SELECTORS`` that has non-empty text is
    wrapped in a relatively positioned ``<span>`` container, and a random
    number of absolutely positioned ``<span>`` copies of its text are inserted
    next to it.  Matching CSS rules are appended to the stylesheet.

    Randomness comes from the module-level ``random`` functions, so results
    are reproducible only if the caller seeds ``random`` beforehand.
    """

    # CSS selectors whose matches are treated as text elements.
    TEXT_SELECTORS = (
        '.time', '.headline', '.label-text', '.title', '.date',
        '.label-text11', '.label-text12',
    )

    def __init__(self):
        # Pixel offsets used by the "normal" overlap variant.
        self.offsets = [
            {'x': 1, 'y': 1}, {'x': 2, 'y': 0}, {'x': 0, 'y': 2},
            {'x': -1, 'y': 1}, {'x': 1, 'y': -1}, {'x': 3, 'y': 1},
            {'x': 0, 'y': 3}, {'x': 2, 'y': 2}, {'x': -2, 'y': 0},
            {'x': 4, 'y': 0}, {'x': 0, 'y': 4}, {'x': -3, 'y': -1}
        ]

        # Text colours an overlay may be given (both variant kinds).
        self.colors = [
            '#ff0000', '#00ff00', '#0000ff', '#ff00ff', '#ffff00',
            '#00ffff', '#ffa500', '#800080', '#ffc0cb', '#90ee90'
        ]

    def find_text_elements(self, html_content):
        """Return the text-bearing elements of *html_content*.

        Args:
            html_content: HTML source as a string.

        Returns:
            A list of dicts with keys ``'element'`` (the parsed tag),
            ``'text'`` (its stripped text), ``'selector'`` (the selector that
            matched) and ``'index'`` (position among that selector's matches).
            An element matching several selectors appears once per selector.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        return self._scan_text_elements(soup)

    def _scan_text_elements(self, soup):
        """Collect non-empty text elements from an already parsed *soup*."""
        text_elements = []
        for selector in self.TEXT_SELECTORS:
            for idx, element in enumerate(soup.select(selector)):
                text_content = element.get_text(strip=True)
                if text_content:
                    text_elements.append({
                        'element': element,
                        'text': text_content,
                        'selector': selector,
                        'index': idx
                    })
        return text_elements

    @staticmethod
    def _css_rule(selector, declarations):
        """Format one CSS rule; every declaration is marked ``!important``."""
        body = "".join(f"    {declaration} !important;\n" for declaration in declarations)
        return f"\n{selector} {{\n{body}}}\n"

    def _normal_overlap_rule(self, overlap_id, overlap_idx):
        """Build the CSS rule for one overlay of the normal variant."""
        offset = random.choice(self.offsets)
        color = random.choice(self.colors)
        opacity = random.uniform(0.6, 0.95)
        return self._css_rule(f"#{overlap_id}", [
            "position: absolute",
            # Each successive overlay is pushed one more pixel down/right.
            f"top: {offset['y'] + overlap_idx}px",
            f"left: {offset['x'] + overlap_idx}px",
            f"z-index: {100 + overlap_idx}",
            f"color: {color}",
            f"opacity: {opacity}",
            "font-weight: bold",
            "pointer-events: none",
            "white-space: nowrap",
        ])

    def _extreme_overlap_rule(self, overlap_id, overlap_idx):
        """Build the CSS rule for one overlay of the extreme variant."""
        offset_x = random.randint(-5, 5)
        offset_y = random.randint(-5, 5)
        color = random.choice(self.colors)
        opacity = random.uniform(0.4, 0.9)
        font_weight = random.choice(['normal', 'bold', '600', '700'])
        font_size_mod = random.uniform(0.9, 1.2)
        return self._css_rule(f"#{overlap_id}", [
            "position: absolute",
            f"top: {offset_y}px",
            f"left: {offset_x}px",
            f"z-index: {200 + overlap_idx}",
            f"color: {color}",
            f"opacity: {opacity}",
            f"font-weight: {font_weight}",
            f"font-size: {font_size_mod}em",
            "pointer-events: none",
            "white-space: nowrap",
            "text-shadow: 1px 1px 2px rgba(0,0,0,0.3)",
        ])

    def _build_variant(self, html_content, css_content, variant_id,
                       prefix, count_range, rule_for):
        """Wrap every text element and add overlays; shared by both variants.

        Args:
            html_content: HTML source as a string.
            css_content: Stylesheet text the new rules are appended to.
            variant_id: Value embedded in every generated element id.
            prefix: String prepended to generated ids ('' or 'extreme-').
            count_range: ``(low, high)`` inclusive bounds for the number of
                overlays per element.
            rule_for: Callable ``(overlap_id, overlap_idx) -> str`` returning
                the CSS rule for one overlay.

        Returns:
            ``(html, css)``; the inputs are returned untouched when no text
            element is found.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        # Scan the same tree that gets modified so each hit is the exact tag
        # to wrap; re-locating by text would confuse elements with equal text.
        text_elements = self._scan_text_elements(soup)

        if not text_elements:
            return html_content, css_content

        css_parts = []
        wrapped = set()

        for elem_idx, element_info in enumerate(text_elements):
            target_element = element_info['element']

            # An element matching several selectors must be wrapped only once.
            if id(target_element) in wrapped:
                continue
            wrapped.add(id(target_element))

            container = soup.new_tag('span', id=f"{prefix}container-{variant_id}-{elem_idx}")
            container['style'] = "position: relative; display: inline-block;"

            # NOTE: this replaces any id the element already carried.
            target_element['id'] = f"{prefix}original-{variant_id}-{elem_idx}"
            target_element.wrap(container)

            num_overlaps = random.randint(*count_range)

            for overlap_idx in range(num_overlaps):
                overlap_id = f"{prefix}overlap-{variant_id}-{elem_idx}-{overlap_idx}"
                css_parts.append(rule_for(overlap_id, overlap_idx))

                overlap_element = soup.new_tag('span', id=overlap_id)
                overlap_element.string = element_info['text']
                # Inserted directly after the original, so later overlays end
                # up before earlier ones in document order.
                target_element.insert_after(overlap_element)

        css_parts.append(self._css_rule(f'[id^="{prefix}container-{variant_id}-"]', [
            "position: relative",
            "display: inline-block",
        ]))
        css_parts.append(self._css_rule(f'[id^="{prefix}original-{variant_id}-"]', [
            "position: relative",
            "z-index: 1",
        ]))

        updated_css = css_content + "\n\n" + "".join(css_parts)

        return str(soup), updated_css

    def make_overlap_variant(self, html_content, css_content, variant_id):
        """Create a normal variant: 1-5 bold overlays per text element.

        Returns:
            ``(html, css)`` strings for the variant.
        """
        return self._build_variant(
            html_content, css_content, variant_id,
            prefix='', count_range=(1, 5), rule_for=self._normal_overlap_rule,
        )

    def make_extreme_variant(self, html_content, css_content, variant_id):
        """Create an extreme variant: 3-8 overlays with random size and weight.

        Returns:
            ``(html, css)`` strings for the variant.
        """
        return self._build_variant(
            html_content, css_content, variant_id,
            prefix='extreme-', count_range=(3, 8), rule_for=self._extreme_overlap_rule,
        )

    def create_variants(self, html_content, css_content, num_variants=200):
        """Build *num_variants* variants; every fourth one is 'extreme'.

        Returns:
            A list of dicts.  Successful entries hold ``'id'``, ``'type'``
            (``'extreme'`` or ``'normal'``), ``'html'``, ``'css'`` and
            ``'status': 'success'``.  If building a variant raises, the entry
            holds ``'id'``, ``'status': 'failed'`` and ``'error'`` instead.
        """
        variants = []

        for variant_id in tqdm(range(num_variants), desc="Creating variants"):
            try:
                if variant_id % 4 == 0:
                    variant_html, variant_css = self.make_extreme_variant(html_content, css_content, variant_id)
                    overlap_type = 'extreme'
                else:
                    variant_html, variant_css = self.make_overlap_variant(html_content, css_content, variant_id)
                    overlap_type = 'normal'
            except Exception as e:
                # One broken variant should not abort the whole batch; the
                # error is recorded so the caller can inspect it.
                variants.append({
                    'id': variant_id,
                    'status': 'failed',
                    'error': str(e)
                })
            else:
                variants.append({
                    'id': variant_id,
                    'type': overlap_type,
                    'html': variant_html,
                    'css': variant_css,
                    'status': 'success'
                })

        return variants

    @staticmethod
    def _extract_body(html):
        """Return the markup between ``<body ...>`` and the last ``</body>``.

        Handles a ``<body>`` tag that carries attributes.  When the opening
        tag is missing the content starts at the beginning of *html*; when the
        closing tag is missing it runs to the end.
        """
        start = 0
        tag_at = html.find('<body')
        while tag_at != -1:
            delimiter = html[tag_at + 5:tag_at + 6]
            # Require '>' or whitespace so e.g. "<bodyx" is not taken for <body>.
            if delimiter == '>' or delimiter.isspace():
                tag_end = html.find('>', tag_at)
                if tag_end != -1:
                    start = tag_end + 1
                break
            tag_at = html.find('<body', tag_at + 1)

        end = html.rfind('</body>')
        if end < start:
            end = len(html)
        return html[start:end]

    def save_files(self, variants, output_dir="all_text_overlap_variants"):
        """Write every successful variant as a standalone HTML file.

        Each file embeds the variant's CSS in a ``<style>`` element and is
        named ``all_overlap_<id>_<type>.html`` inside *output_dir*, which is
        created if needed.  Entries whose status is not ``'success'`` are
        skipped.
        """
        os.makedirs(output_dir, exist_ok=True)

        for variant in tqdm(variants, desc="Saving files"):
            if variant['status'] != 'success':
                continue

            html_template = "\n".join([
                "<!DOCTYPE html>",
                "<html>",
                "<head>",
                '    <meta charset="utf-8">',
                '    <meta name="viewport" content="initial-scale=1, width=device-width">',
                f"    <title>All Text Overlap Variant {variant['id']} ({variant['type']})</title>",
                "    <style>",
                variant['css'],
                "    </style>",
                "</head>",
                "<body>",
                self._extract_body(variant['html']),
                "</body>",
                "</html>",
            ])

            filename = os.path.join(output_dir, f"all_overlap_{variant['id']:03d}_{variant['type']}.html")
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(html_template)

def main(
    html_path='/Users/promachowdhury/whatBreaksIt/m3-dataset/seeds/variants_1/index.html',
    css_path='/Users/promachowdhury/whatBreaksIt/m3-dataset/seeds/variants_1/index.css',
    num_variants=200,
    output_dir="all_text_overlap_variants",
):
    """Read a seed HTML/CSS pair, generate overlap variants and save them.

    Args:
        html_path: Path of the seed HTML file.
        css_path: Path of the seed CSS file.
        num_variants: Number of variants to generate.
        output_dir: Directory the variant files are written to.

    Returns:
        None.  If either input file does not exist, a message is printed and
        nothing is generated.
    """
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        with open(css_path, 'r', encoding='utf-8') as f:
            css_content = f.read()
    except FileNotFoundError as e:
        print(f"File not found: {e}")
        return

    generator = AllTextOverlapGenerator()

    variants = generator.create_variants(html_content, css_content, num_variants=num_variants)
    generator.save_files(variants, output_dir=output_dir)

    successful = [v for v in variants if v['status'] == 'success']
    print(f"\nDone: {len(successful)} variants created")

    # Failures are only recorded inside the variant list; surface them here so
    # they do not go unnoticed.
    failed_count = len(variants) - len(successful)
    if failed_count:
        print(f"{failed_count} variants failed")


if __name__ == "__main__":
    main()

Creating variants: 100%|██████████| 200/200 [00:13<00:00, 14.60it/s]
Saving files: 100%|██████████| 200/200 [00:00<00:00, 3532.00it/s]


Done: 200 variants created



