In [9]:
import logging
import os
import re
from typing import Dict, List, Tuple

from bs4 import BeautifulSoup
from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction
from deep_translator import GoogleTranslator
from tqdm import tqdm

class SimpleHTMLTranslator:
    """Create translated copies of an HTML page, one per configured language.

    For each language the visible text nodes of the page are sent through
    ``GoogleTranslator``, numbers are shielded from translation with
    placeholders, a ``lang-<code>`` class is added to the
    ``div.exampleslibrary-mobile`` container, and language-specific font CSS
    is appended to the stylesheet.
    """

    # Root directory that holds one sub-folder of font files per language.
    DEFAULT_FONT_ROOT = '/Users/promachowdhury/fonts'

    # Text whose parent is one of these tags is never translated.
    _SKIPPED_PARENT_TAGS = ('script', 'style', 'meta', 'title')

    # Font file extension (lower-case) -> CSS ``format()`` hint.
    _FONT_FORMATS = {
        '.woff2': 'woff2',
        '.woff': 'woff',
        '.ttf': 'truetype',
        '.otf': 'opentype',
    }

    # Language codes that get right-to-left layout rules.
    _RTL_LANG_CODES = frozenset({'ar', 'he', 'fa', 'ur'})

    _BODY_OPEN_RE = re.compile(r'<body\b[^>]*>', re.IGNORECASE)
    _BODY_CLOSE_RE = re.compile(r'</body\s*>', re.IGNORECASE)

    # Placeholder as it may come back after translation: different case,
    # inner spaces, or a lost underscore.
    _MANGLED_PLACEHOLDER_RE = re.compile(
        r'_{1,2}\s*NUM\s*(\d+)\s*_{1,2}', re.IGNORECASE
    )

    _log = logging.getLogger(__name__)

    def __init__(self, font_root: str = DEFAULT_FONT_ROOT):
        """Set up the language table, font folders and the number pattern.

        Args:
            font_root: Directory containing one font folder per language name
                (``<font_root>/<language>/``). Defaults to the location that
                was previously hard-coded.
        """
        self.languages = {
            'korean': 'ko',
            'japanese': 'ja',
            'hindi': 'hi',
            'greek': 'el',
            'thai': 'th',
            'arabic': 'ar'
        }

        self.font_root = font_root
        root = font_root.rstrip('/')
        self.font_folders = {
            language: f"{root}/{language}/" for language in self.languages
        }

        # Integers and dotted/colon-separated groups such as 3.14 or 10:30.
        self.number_pattern = re.compile(r'\b\d+(?:[.:]\d+)*\b')

    def _iter_translatable_strings(self, soup):
        """Yield ``(node, stripped_text)`` for every text node worth translating.

        Skipped: comments, doctype/declarations and processing instructions,
        text shorter than two characters, pure digit strings, and text whose
        parent tag is listed in ``_SKIPPED_PARENT_TAGS``.
        """
        for node in soup.find_all(string=True):
            # These are string subclasses too, but they are markup, not page text.
            if isinstance(node, (Comment, Declaration, Doctype, ProcessingInstruction)):
                continue

            text = node.strip()
            if len(text) <= 1 or text.isdigit():
                continue
            if node.parent.name in self._SKIPPED_PARENT_TAGS:
                continue

            yield node, text

    def extract_all_text(self, html_content: str) -> List[Dict]:
        """Parse ``html_content`` and describe every translatable text node.

        Returns:
            A list of dicts with the keys ``element`` (the text node),
            ``original_text`` (its stripped text), ``parent_tag`` and
            ``parent_class`` (the parent's class list, ``[]`` when absent).
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        return [
            {
                'element': node,
                'original_text': text,
                'parent_tag': node.parent.name,
                'parent_class': node.parent.get('class', [])
            }
            for node, text in self._iter_translatable_strings(soup)
        ]

    def preserve_numbers(self, text: str) -> Tuple[str, Dict]:
        """Replace each number in ``text`` with a ``__NUM<i>__`` placeholder.

        The substitution happens at the exact match positions, so digits that
        are part of a word (``a12``) or of an earlier placeholder are never
        touched.

        Returns:
            ``(text_with_placeholders, numbers_map)`` where ``numbers_map``
            maps each placeholder to the number it replaced.
        """
        numbers_map: Dict[str, str] = {}

        def _mask(match):
            placeholder = f"__NUM{len(numbers_map)}__"
            numbers_map[placeholder] = match.group()
            return placeholder

        return self.number_pattern.sub(_mask, text), numbers_map

    def restore_numbers(self, translated_text: str, numbers_map: Dict) -> str:
        """Put the original numbers back in place of their placeholders.

        Exact placeholders are replaced first. Remaining look-alikes (changed
        case, inserted spaces, a dropped underscore) are then matched
        leniently, on the assumption that a translation service may alter the
        placeholder slightly; unknown indices are left untouched.
        """
        if not numbers_map:
            return translated_text

        for placeholder, number in numbers_map.items():
            translated_text = translated_text.replace(placeholder, number)

        def _unmask(match):
            return numbers_map.get(f"__NUM{match.group(1)}__", match.group(0))

        return self._MANGLED_PLACEHOLDER_RE.sub(_unmask, translated_text)

    def translate_text(self, text: str, target_lang: str) -> str:
        """Translate ``text`` into ``target_lang`` while keeping numbers intact.

        Text that consists of numbers plus fewer than two other characters is
        returned unchanged without calling the translator. Any failure is
        logged as a warning and the original ``text`` is returned, so one bad
        string never aborts a whole page.
        """
        try:
            text_with_placeholders, numbers_map = self.preserve_numbers(text)

            clean_text = re.sub(r'__NUM\d+__', '', text_with_placeholders).strip()
            if len(clean_text) < 2:
                return text

            translator = GoogleTranslator(source='auto', target=target_lang)
            translated = translator.translate(text_with_placeholders)

            # An empty/None result carries nothing to restore; keep the source text.
            if not translated:
                return text

            return self.restore_numbers(translated, numbers_map)

        except Exception as exc:
            self._log.warning(
                "Translation to %r failed for %r: %s", target_lang, text, exc
            )
            return text

    @staticmethod
    def _rtl_css(lang_code: str) -> str:
        """Return the right-to-left layout rules for ``.lang-<lang_code>``."""
        return f"""
.lang-{lang_code} {{
    direction: rtl;
    text-align: right;
}}

.lang-{lang_code} .exampleslibrary-mobile {{
    direction: rtl;
}}
"""

    @classmethod
    def _font_format(cls, file_name: str):
        """Return the CSS format for ``file_name`` or ``None`` if it is not a font."""
        lowered = file_name.lower()
        for extension, css_format in cls._FONT_FORMATS.items():
            if lowered.endswith(extension):
                return css_format
        return None

    def get_font_css(self, language: str) -> str:
        """Build ``@font-face`` rules from the font files of ``language``.

        Font files are taken in sorted order so the generated family names are
        stable between runs. When the folder is missing, unreadable or holds
        no font files, the result of ``get_fallback_fonts`` is returned.

        Raises:
            KeyError: if ``language`` is not in ``self.languages``.
        """
        font_folder = self.font_folders.get(language, self.font_root)

        try:
            font_files = sorted(
                name for name in os.listdir(font_folder)
                if self._font_format(name) is not None
            )
        except OSError:
            # Missing folder, not a directory, or no permission to list it.
            return self.get_fallback_fonts(language)

        if not font_files:
            return self.get_fallback_fonts(language)

        css_parts = []
        family_names = []

        for index, font_file in enumerate(font_files, start=1):
            font_name = f"{language}-font-{index}"
            font_path = os.path.join(font_folder, font_file)
            font_format = self._font_format(font_file)
            family_names.append(f"'{font_name}'")

            css_parts.append(f"""
@font-face {{
    font-family: '{font_name}';
    src: url('{font_path}') format('{font_format}');
    font-display: swap;
}}
""")

        font_family_list = ', '.join(family_names)
        lang_code = self.languages[language]

        css_parts.append(f"""
.lang-{lang_code} {{
    font-family: {font_family_list}, sans-serif;
}}

.lang-{lang_code} * {{
    font-family: {font_family_list}, sans-serif;
}}
""")

        if lang_code in self._RTL_LANG_CODES:
            css_parts.append(self._rtl_css(lang_code))

        return ''.join(css_parts)

    def get_fallback_fonts(self, language: str) -> str:
        """Return CSS that selects system fonts for ``language``.

        Used when no font files are available. Languages without an entry in
        the table get ``font-family: sans-serif;``.

        Raises:
            KeyError: if ``language`` is not in ``self.languages``.
        """
        lang_code = self.languages[language]

        fallback_fonts = {
            'ko': 'font-family: "Malgun Gothic", "Apple SD Gothic Neo", sans-serif;',
            'ja': 'font-family: "Yu Gothic", "Hiragino Sans", sans-serif;',
            'hi': 'font-family: "Mangal", "Lohit Devanagari", sans-serif;',
            'el': 'font-family: "Times New Roman", serif;',
            'th': 'font-family: "Leelawadee UI", "Tahoma", sans-serif;',
            'ar': 'font-family: "Tahoma", "Microsoft Sans Serif", sans-serif; direction: rtl; text-align: right;'
        }

        font_rule = fallback_fonts.get(lang_code, 'font-family: sans-serif;')

        css = f"""
.lang-{lang_code} {{
    {font_rule}
}}

.lang-{lang_code} * {{
    {font_rule}
}}
"""

        if lang_code in self._RTL_LANG_CODES:
            css += self._rtl_css(lang_code)

        return css

    def create_language_variant(self, html_content: str, css_content: str,
                              language: str) -> Tuple[str, str]:
        """Translate ``html_content`` into ``language`` and extend the CSS.

        The page is parsed once and each text node is replaced in place, so
        repeated strings and strings that also occur in skipped tags
        (``<title>``, ``<script>``, ...) cannot be mixed up. Whitespace around
        the original text is kept, which preserves spacing between inline
        elements.

        Returns:
            ``(translated_html, css_content + font_css)``.

        Raises:
            KeyError: if ``language`` is not in ``self.languages``.
        """
        lang_code = self.languages[language]

        soup = BeautifulSoup(html_content, 'html.parser')

        # Materialise the list first: nodes are replaced while looping.
        targets = list(self._iter_translatable_strings(soup))

        for node, original_text in tqdm(targets, desc=f"Translating {language}"):
            translated_text = self.translate_text(original_text, lang_code)
            if translated_text == original_text:
                continue

            raw = str(node)
            leading = raw[:len(raw) - len(raw.lstrip())]
            trailing = raw[len(raw.rstrip()):]
            node.replace_with(leading + translated_text + trailing)

        main_container = soup.find('div', class_='exampleslibrary-mobile')
        if main_container:
            existing_classes = main_container.get('class', [])
            main_container['class'] = existing_classes + [f'lang-{lang_code}']

        font_css = self.get_font_css(language)
        updated_css = css_content + "\n\n" + font_css

        return str(soup), updated_css

    def generate_all_variants(self, html_content: str, css_content: str) -> Dict:
        """Create a variant for every configured language.

        Returns:
            A dict keyed by language name. Successful entries contain
            ``html``, ``css``, ``lang_code`` and ``status == 'success'``;
            failed entries contain ``status == 'failed'`` and ``error``.
        """
        variants = {}

        for language in tqdm(self.languages, desc="Languages"):
            try:
                variant_html, variant_css = self.create_language_variant(
                    html_content, css_content, language
                )
            except Exception as exc:
                # Keep going with the other languages, but leave a trace.
                self._log.warning("Variant %r failed: %s", language, exc)
                variants[language] = {
                    'status': 'failed',
                    'error': str(exc)
                }
                continue

            variants[language] = {
                'html': variant_html,
                'css': variant_css,
                'lang_code': self.languages[language],
                'status': 'success'
            }

        return variants

    @classmethod
    def _extract_body_content(cls, html: str) -> str:
        """Return the markup between ``<body ...>`` and ``</body>``.

        Works for a body tag with attributes. Without an opening tag the
        content starts at the beginning of ``html``; without a closing tag it
        runs to the end, so a bare fragment is returned unchanged.
        """
        opening = cls._BODY_OPEN_RE.search(html)
        start = opening.end() if opening else 0
        closing = cls._BODY_CLOSE_RE.search(html, start)
        end = closing.start() if closing else len(html)
        return html[start:end]

    def save_variants(self, variants: Dict, output_dir: str = "translated_variants"):
        """Write every successful variant to ``<output_dir>/index_<language>.html``.

        The stylesheet is inlined into a ``<style>`` element and the body
        content of the translated page is placed inside a fresh HTML shell.
        Variants whose status is not ``'success'`` are skipped.
        """
        os.makedirs(output_dir, exist_ok=True)

        for language, variant_data in tqdm(variants.items(), desc="Saving"):
            if variant_data['status'] != 'success':
                continue

            lang_code = variant_data['lang_code']
            css_text = variant_data['css']
            body_html = self._extract_body_content(variant_data['html'])

            html_template = f"""<!DOCTYPE html>
<html lang="{lang_code}">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="initial-scale=1, width=device-width">
    <title>Mobile App - {language.title()}</title>
    <style>
{css_text}
    </style>
</head>
<body>
{body_html}
</body>
</html>"""

            filename = os.path.join(output_dir, f"index_{language}.html")
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(html_template)

def main(html_path: str = '/Users/promachowdhury/whatBreaksIt/m3-dataset/seeds/variants_1/index.html',
         css_path: str = '/Users/promachowdhury/whatBreaksIt/m3-dataset/seeds/variants_1/index.css',
         output_dir: str = "translated_variants"):
    """Translate one HTML/CSS pair into every configured language and save the results.

    Args:
        html_path: Page to translate. Defaults to the seed page used so far.
        css_path: Stylesheet that belongs to the page.
        output_dir: Folder that receives the ``index_<language>.html`` files.

    Returns:
        None. A missing input file is reported on stdout and nothing is written.
    """
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        with open(css_path, 'r', encoding='utf-8') as f:
            css_content = f.read()
    except FileNotFoundError as e:
        print(f"File not found: {e}")
        return

    translator = SimpleHTMLTranslator()

    variants = translator.generate_all_variants(html_content, css_content)

    translator.save_variants(variants, output_dir)

    successful = [lang for lang, data in variants.items() if data['status'] == 'success']
    print(f"Created {len(successful)} language variants: {', '.join(successful)}")

    # Failed languages are recorded in `variants`; report them instead of dropping them.
    for lang, data in variants.items():
        if data['status'] != 'success':
            print(f"Failed to create {lang} variant: {data.get('error')}")

if __name__ == "__main__":
    main()

Translating korean: 100%|██████████| 41/41 [00:12<00:00,  3.23it/s]
Translating japanese: 100%|██████████| 41/41 [00:11<00:00,  3.68it/s]
Translating hindi: 100%|██████████| 41/41 [00:12<00:00,  3.40it/s]
Translating greek: 100%|██████████| 41/41 [00:12<00:00,  3.24it/s]
Translating thai: 100%|██████████| 41/41 [00:11<00:00,  3.47it/s]
Translating arabic: 100%|██████████| 41/41 [00:11<00:00,  3.50it/s]
Languages: 100%|██████████| 6/6 [01:12<00:00, 12.03s/it]
Saving: 100%|██████████| 6/6 [00:00<00:00, 1456.27it/s]

Created 6 language variants: korean, japanese, hindi, greek, thai, arabic





In [2]:
!pip install deep-translator


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
