# Cell 1 🌍 Universal Translator v1.3
NOTES HERE

## Cell 2 🔧 Setup & Installation {#setup}
Run these cells once to set up your environment

In [1]:
# Cell 3 Install required packages
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install ruff deep-translator pytesseract pillow

# Report the interpreter version and echo the package list.
# NOTE(review): nothing below imports or checks the packages; the success
# messages are printed unconditionally, even if the install step failed.
import sys
print(f"‚úÖ Python version: {sys.version}")
print("‚úÖ All packages installed successfully!")
print("üì¶ Installed: ruff, deep-translator, pytesseract, pillow")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
‚úÖ Python version: 3.12.1 (main, Jul 10 2025, 11:57:50) [GCC 13.3.0]
‚úÖ All packages installed successfully!
üì¶ Installed: ruff, deep-translator, pytesseract, pillow


## Cell 4 🔧 Code Quality Check
### Ruff Linting & PEP 8 Validation
Run this cell after installation to check and auto-fix code style issues

In [5]:
# Cell 5 - Ruff Code Quality Check & Fix
# Lints translator_v1.3.ipynb with Ruff in three passes (report, auto-fix,
# report again) and then prints a one-line verdict based on Ruff's exit code.

# Imports at the TOP (fixes the E402 error)
import os
import subprocess

# Clean up any old config files
# Looks in the current directory and its parent; missing files are skipped.
for file in ['ruff_settings.txt', '../ruff_settings.txt']:
    if os.path.exists(file):
        os.remove(file)
        print(f"üóëÔ∏è Cleaned up {file}")

print("üîç RUFF CODE QUALITY CHECK FOR V1.3")
print("=" * 50)

# First, check what we have
# Lines starting with "!" are IPython shell escapes, not Python statements.
print("üìä Initial check:")
!ruff check translator_v1.3.ipynb --statistics

print("\n" + "=" * 50)
print("üîß Auto-fixing safe issues...")
# NOTE(review): --fix rewrites translator_v1.3.ipynb on disk. If that file is
# this notebook, the open editor copy may be stale afterwards - confirm/reload.
!ruff check translator_v1.3.ipynb --fix

print("\n" + "=" * 50)
print("üìã Final status:")
!ruff check translator_v1.3.ipynb --statistics

# Show success or what's left (subprocess already imported at top)
# Ruff is run a fourth time here only to obtain its exit code; the captured
# stdout/stderr in `result` is never printed.
result = subprocess.run(['ruff', 'check', 'translator_v1.3.ipynb'], 
                       capture_output=True, text=True)
if result.returncode == 0:
    print("\nüéâ SUCCESS! All checks passed!")
else:
    # Any non-zero exit code lands here; the "line length" hint is a guess,
    # not something derived from Ruff's output.
    print("\nüí° Some style issues remain (usually line length)")
    print("These don't affect functionality")

üîç RUFF CODE QUALITY CHECK FOR V1.3
üìä Initial check:
[1m2[0m	[1;31mF541[0m	[[36m*[0m] f-string-missing-placeholders
Found 2 errors.
[[36m*[0m] 2 fixable with the `--fix` option.



üîß Auto-fixing safe issues...
Found 2 errors (2 fixed, 0 remaining).

üìã Final status:

üéâ SUCCESS! All checks passed!


## Cell 6 💻 Main Implementation {#implementation}
### UniversalTranslator Class - Test Ready
PEP 8 compliant implementation with comprehensive documentation

In [3]:
# Standard library imports
import os
import re
from enum import Enum
from typing import Dict

# Third-party imports
import pytesseract
from deep_translator import GoogleTranslator
from PIL import Image, ImageEnhance, ImageFilter

"""
Universal Translator Module v1.3
PEP 8 compliant implementation for image text extraction and translation
Now with Enum support for better type safety
"""

# Module information
# NOTE: the triple-quoted description above comes after the imports, so Python
# evaluates it as a plain expression statement rather than a module docstring.
__version__ = "1.3"
__author__ = "Victor"
__date__ = "November 2, 2025"

# Confirm the cell ran by echoing the version and author defined above.
print(f"üìö Universal Translator Module v{__version__} loaded")
print(f"üë§ Author: {__author__}")

üìö Universal Translator Module v1.3 loaded
üë§ Author: Victor


In [4]:
class Language(Enum):
    """
    Enumeration of supported languages with their Tesseract codes.

    Each member's value is the string passed to Tesseract as its ``lang``
    argument, so a member can be handed straight to the OCR call via
    ``member.value``.
    """
    ENGLISH = 'eng'
    CHINESE = 'chi_sim+chi_tra'  # Both simplified and traditional
    JAPANESE = 'jpn'
    KOREAN = 'kor'
    HINDI = 'hin'

    @classmethod
    def list_supported(cls) -> list:
        """
        Return the supported language names.

        Returns:
            list: Lower-cased member names in definition order,
                e.g. ``['english', 'chinese', ...]``.
        """
        return [lang.name.lower() for lang in cls]

    @classmethod
    def from_string(cls, lang_str: str) -> 'Language':
        """
        Convert a language name to its Language member.

        Matching is case-insensitive and ignores surrounding whitespace.

        Args:
            lang_str (str): Language name such as ``'english'`` or
                ``' Korean '``.

        Returns:
            Language: The matching enum member.

        Raises:
            TypeError: If ``lang_str`` is not a string.
            ValueError: If no member has that name.
        """
        if not isinstance(lang_str, str):
            raise TypeError(
                f"Language name must be a string, got {type(lang_str).__name__}"
            )

        # Look the name up among the enum members only; hasattr() would also
        # probe unrelated class attributes.
        member = cls.__members__.get(lang_str.strip().upper())
        if member is None:
            raise ValueError(f"Unsupported language: {lang_str}")
        return member


class UniversalTranslator:
    """
    Extract text from images with Tesseract OCR and translate it to English.

    The pipeline run by ``process`` is: enhance the image, OCR it using the
    Tesseract code stored on the ``Language`` member, apply language-specific
    text corrections, then translate non-English text with
    ``GoogleTranslator``.

    Attributes:
        supported_languages (list): List of Language enum members.
    """

    # Image pre-processing parameters used by enhance_image()
    IMAGE_SCALE_FACTOR = 3          # multiplier applied to width and height
    CONTRAST_ENHANCEMENT = 2.5      # factor passed to ImageEnhance.Contrast
    BRIGHTNESS_ENHANCEMENT = 1.2    # factor passed to ImageEnhance.Brightness

    def __init__(self) -> None:
        """
        Initialize the UniversalTranslator.

        Stores every Language member in ``supported_languages`` and prints
        an initialization banner.
        """
        self.supported_languages = list(Language)
        self._setup_complete()

    def _setup_complete(self) -> None:
        """Print initialization confirmation and the supported languages."""
        names = ', '.join(
            lang.name.lower() for lang in self.supported_languages
        )
        print("‚úÖ Universal Translator v1.3 initialized!")
        print(f"üìö Supported languages: {names}")

    def enhance_image(self, image_path: str) -> str:
        """
        Enhance image quality for better OCR results.

        The image is converted to grayscale, upscaled, contrast- and
        brightness-adjusted, sharpened twice, and saved next to the original
        with an ``enhanced_`` prefix on the file name.

        Args:
            image_path (str): Path to the input image file.

        Returns:
            str: Path to the enhanced image file (same directory as the
                input, file name prefixed with ``enhanced_``).

        Raises:
            FileNotFoundError: If the image file doesn't exist.
            IOError: If the image cannot be processed.
        """
        try:
            # Open inside a context manager so the source file handle is
            # closed; convert() gives us an independent grayscale copy.
            with Image.open(image_path) as source:
                img = source.convert('L')

            # Upscale image for better OCR accuracy
            width, height = img.size
            new_size = (
                width * self.IMAGE_SCALE_FACTOR,
                height * self.IMAGE_SCALE_FACTOR
            )
            img = img.resize(new_size, Image.Resampling.LANCZOS)

            # Apply contrast enhancement
            contrast_enhancer = ImageEnhance.Contrast(img)
            img = contrast_enhancer.enhance(self.CONTRAST_ENHANCEMENT)

            # Apply brightness enhancement
            brightness_enhancer = ImageEnhance.Brightness(img)
            img = brightness_enhancer.enhance(self.BRIGHTNESS_ENHANCEMENT)

            # Apply sharpening filters
            for _ in range(2):
                img = img.filter(ImageFilter.SHARPEN)

            # Prefix the file name only. Prefixing the whole path would turn
            # "images/a.png" into "enhanced_images/a.png", a directory that
            # normally does not exist.
            directory, filename = os.path.split(image_path)
            enhanced_path = os.path.join(directory, f"enhanced_{filename}")
            img.save(enhanced_path)

            print(f"‚úÖ Image enhanced: {enhanced_path}")
            return enhanced_path

        except FileNotFoundError as e:
            error_msg = f"‚ùå Image file not found: {image_path}"
            print(error_msg)
            raise FileNotFoundError(error_msg) from e
        except Exception as e:
            # Every other failure is reported to callers as IOError, with
            # the original exception chained.
            error_msg = f"‚ùå Error processing image: {str(e)}"
            print(error_msg)
            raise IOError(error_msg) from e

    def _fix_english_text(self, text: str) -> str:
        """
        Apply English-specific text corrections.

        Runs three passes in order - literal replacements, regex patterns,
        then space-delimited word fixes - and finally collapses all runs of
        whitespace (including newlines) into single spaces.

        Args:
            text (str): Raw text to be corrected.

        Returns:
            str: Corrected text, or "" when ``text`` is empty.
        """
        if not text:
            return ""

        # Dictionary of known OCR errors and corrections
        direct_fixes = {
            'Helloworld': 'Hello World',
            'HelloWorld': 'Hello World',
            'Thisisa': 'This is a',
            'This isa': 'This is a',
            'toour': 'to our',
            'aboutour': 'about our',
            'GRANDOPENING': 'GRAND OPENING',
            'SO OFF': '50% OFF',
            'SOOFF': '50% OFF',
            'Pythonm': 'Python',
        }

        # Apply direct replacements (before the regexes, which would
        # otherwise split e.g. 'HelloWorld' first)
        for incorrect, correct in direct_fixes.items():
            text = text.replace(incorrect, correct)

        # Pattern-based corrections
        patterns = [
            (r'\bisa\b', 'is a'),             # standalone "isa"
            (r'([a-z])([A-Z])', r'\1 \2'),    # split lower->Upper boundary
            (r'([a-zA-Z])(\d)', r'\1 \2'),    # split letter->digit boundary
            (r'(\d)([a-zA-Z])', r'\1 \2'),    # split digit->letter boundary
        ]

        for pattern, replacement in patterns:
            text = re.sub(pattern, replacement, text)

        # Fix common OCR errors (only when surrounded by single spaces)
        common_errors = {
            ' tbe ': ' the ',
            ' amd ': ' and ',
            ' isa ': ' is a '
        }

        for error, correction in common_errors.items():
            text = text.replace(error, correction)

        # Clean up extra whitespace
        text = ' '.join(text.split())

        return text

    def fix_text(self, text: str, language: Language) -> str:
        """
        Apply language-specific text corrections.

        Only English has real corrections today; every other language is
        returned unchanged.

        Args:
            text (str): Raw text extracted from OCR.
            language (Language): Language enum member.

        Returns:
            str: Corrected text, or "" when ``text`` is empty.
        """
        if not text:
            return ""

        if language == Language.ENGLISH:
            return self._fix_english_text(text)

        # TODO: Implement fixes for other languages
        # Placeholder for future implementations
        language_fixers = {
            Language.CHINESE: lambda t: t,   # Future: _fix_chinese_text
            Language.JAPANESE: lambda t: t,  # Future: _fix_japanese_text
            Language.KOREAN: lambda t: t,    # Future: _fix_korean_text
            Language.HINDI: lambda t: t      # Future: _fix_hindi_text
        }

        # Unknown keys fall back to the identity function as well
        fixer = language_fixers.get(language, lambda t: t)
        return fixer(text)

    def _get_ocr_config(self, image_path: str) -> str:
        """
        Determine optimal OCR configuration based on image type.

        The image type is guessed from keywords in the lower-cased path.
        The whole path is searched, so directory names can match too, and
        the first keyword found (in the order listed below) wins.

        Args:
            image_path (str): Path to the image file.

        Returns:
            str: Tesseract configuration string.
        """
        image_lower = image_path.lower()

        # Configuration based on image type
        configs = {
            'document': r'--oem 3 --psm 6',   # Uniform text block
            'sign': r'--oem 3 --psm 11',      # Sparse text
            'screenshot': r'--oem 3 --psm 3',  # Automatic
            'default': r'--oem 3 --psm 3'      # Automatic
        }

        for key, config in configs.items():
            if key in image_lower:
                return config

        return configs['default']

    def process(
        self,
        image_path: str,
        language: Language = Language.ENGLISH
    ) -> Dict[str, str]:
        """
        Process an image to extract and optionally translate text.

        Args:
            image_path (str): Path to the image file.
            language (Language): Source language enum member.
                                Defaults to Language.ENGLISH.

        Returns:
            Dict[str, str]: Dictionary containing:
                - 'original': Raw extracted text
                - 'fixed': Corrected text
                - 'translated': English translation (identical to 'fixed'
                  for English input or when no text was found)
                - 'language': Source language name (lower case)

        Raises:
            TypeError: If language is not a Language enum member.
            FileNotFoundError: If image file doesn't exist.
            IOError: If the image cannot be enhanced.
        """
        # Validate language is enum
        if not isinstance(language, Language):
            raise TypeError(
                "‚ùå Language must be a Language enum member. "
                "Use: Language.ENGLISH, Language.CHINESE, etc."
            )

        print(f"üîç Processing image: {image_path}")
        print(f"üåê Language: {language.name.lower()}")

        try:
            # Step 1: Enhance image
            enhanced_path = self.enhance_image(image_path)

            # Step 2: Extract text with OCR
            lang_code = language.value  # Get Tesseract code from enum
            # The config is chosen from the original path, not the
            # enhanced one.
            config = self._get_ocr_config(image_path)

            print(f"üîß Using Tesseract config: {config}")
            raw_text = pytesseract.image_to_string(
                enhanced_path,
                lang=lang_code,
                config=config
            )

            # Step 3: Apply text corrections
            fixed_text = self.fix_text(raw_text, language)

            # Step 4: Translate if necessary
            if language != Language.ENGLISH and fixed_text:
                print("üåç Translating to English...")
                translator = GoogleTranslator(source='auto', target='en')
                translated_text = translator.translate(fixed_text)
            else:
                translated_text = fixed_text

            result = {
                'original': raw_text,
                'fixed': fixed_text,
                'translated': translated_text,
                'language': language.name.lower()
            }

            print("‚úÖ Processing complete!")
            return result

        except Exception as e:
            # Log and re-raise unchanged so callers see the original error
            print(f"‚ùå Error processing image: {str(e)}")
            raise


# Print a start-up banner, then create the shared translator instance
print(
    "\n" + "=" * 50,
    "üöÄ Initializing Universal Translator v1.3...",
    "=" * 50,
    sep="\n",
)
translator = UniversalTranslator()


üöÄ Initializing Universal Translator v1.3...
‚úÖ Universal Translator v1.3 initialized!
üìö Supported languages: english, chinese, japanese, korean, hindi


## 🧪 Testing & Examples {#testing}
Test the translator with sample images

In [None]:
# Test cell - Examples and demonstrations

def test_translator():
    """
    Print instructions for manually testing the translator.

    The function itself only prints two lines. The triple-quoted block in
    the body is an inert example: it does nothing until it is uncommented
    and pointed at a real image file.
    """

    # Example usage (uncomment and modify as needed)
    # process() only accepts Language members, so the example passes
    # Language.ENGLISH rather than the string 'english'.
    """
    # Test with English text
    result = translator.process('english_test.png', Language.ENGLISH)
    
    print("üìÑ Test Results:")
    print("-" * 40)
    print(f"Original text: {result['original'][:100]}...")
    print(f"Fixed text: {result['fixed'][:100]}...")
    print(f"Language: {result['language']}")
    """

    print("üìù Test function ready!")
    print("Uncomment the code above and add your test image path")

# Call test function
test_translator()

# Quick test of core functions
# UniversalTranslator has no `language_codes` attribute; the Tesseract codes
# live on the Language members, so build the name -> code mapping from them.
language_codes = {
    lang.name.lower(): lang.value
    for lang in translator.supported_languages
}
print("\nüìã Core Functions Check:")
print(f"‚úÖ Supported languages: {translator.supported_languages}")
print(f"‚úÖ Language codes: {language_codes}")

## 📚 Development Notes {#notes}

### ✅ Completed Features:
- Add notes here

### 🔄 Future Improvements:
- Add notes here

### 📖 Change Log:
- Add notes here

### 🐛 Known Issues:
- Add notes here

### 📚 References:
- Add notes here