# Burmese OCR using Tesseract

This notebook runs OCR on Burmese (Myanmar) text in image and PDF files using the Tesseract OCR engine. It is written for Google Colab, which it uses for file upload and download.

In [None]:
# Install required packages
# System packages come from apt; presumably tesseract-ocr is the OCR engine
# behind pytesseract and poppler-utils the PDF backend behind pdf2image
# (TODO confirm against those packages' install notes).
!apt-get update
!apt-get install -y tesseract-ocr poppler-utils
# Python packages imported by the later cells (Pillow provides `PIL`).
!pip install pytesseract Pillow pdf2image

# Install Burmese language data
# Download the `mya` trained model and move it into Tesseract's tessdata
# directory so that `-l mya` can be resolved.
# NOTE(review): the destination path is hard-coded to the 4.00 tessdata
# layout; confirm it matches the Tesseract version apt actually installs.
!wget https://github.com/tesseract-ocr/tessdata/raw/main/mya.traineddata
!mv mya.traineddata /usr/share/tesseract-ocr/4.00/tessdata/

In [None]:
import os
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
from google.colab import files
import tempfile

# Configure Tesseract
# Extra command-line flags handed to Tesseract through pytesseract's
# `config` argument (see process_image):
#   --oem 3      OCR engine mode 3 (Tesseract's default: use what is available)
#   --psm 6      page segmentation mode 6 (assume a single uniform block of text)
#   -l mya+eng   recognise Burmese and English together
custom_config = r'--oem 3 --psm 6 -l mya+eng'

In [None]:
def process_image(image_path):
    """Run Tesseract OCR on one image file and return the cleaned-up text.

    Args:
        image_path: Path of an image file that Pillow can open.

    Returns:
        The recognised text with the notebook's character corrections
        applied, blank lines removed, and leading/trailing whitespace
        stripped.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes (callers delete the file right afterwards).
    with Image.open(image_path) as image:
        # The language is selected by custom_config (`-l mya+eng`); passing
        # lang='mya' as well would add a second, contradictory `-l` flag to
        # the Tesseract command line.
        text = pytesseract.image_to_string(image, config=custom_config)
    # Apply the same text corrections as in desktop version
    text = text.replace('|', 'I').replace('၀', '0').replace('သ်', 'ာ')
    # A single replace() pass leaves blank lines behind when three or more
    # newlines are adjacent, so repeat until none remain.
    while '\n\n' in text:
        text = text.replace('\n\n', '\n')
    return text.strip()

def process_pdf(pdf_path, dpi=300):
    """Convert a PDF to page images, OCR each page, and return Markdown text.

    Each page is rendered with ``convert_from_path``, written to a temporary
    PNG file, passed to ``process_image``, and wrapped in a ``## Page N``
    section containing a fenced ``burmese`` block.

    Args:
        pdf_path: Path of the PDF file to process.
        dpi: Rendering resolution handed to ``convert_from_path``. Defaults
            to 300, the value that used to be hard-coded.

    Returns:
        The per-page Markdown sections joined with newlines.

    Raises:
        Exception: Any error raised during conversion or OCR is printed and
            then re-raised unchanged.
    """
    try:
        # Convert PDF to images with higher DPI for better quality
        pages = convert_from_path(pdf_path, dpi=dpi)
        text_results = []

        print(f"Processing {len(pages)} pages...")

        for i, page in enumerate(pages):
            print(f"Processing page {i+1}/{len(pages)}")
            # Reserve a unique path, then close the handle straight away so
            # the page is written to a file that nothing else holds open.
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                tmp_path = tmp.name
            try:
                # Save page as PNG
                page.save(tmp_path, 'PNG')
                # Process the page
                text = process_image(tmp_path)
                text_results.append(f"\n## Page {i+1}\n\n```burmese\n{text}\n```\n")
            finally:
                # Clean up the temporary file even when saving or OCR fails,
                # so a bad page does not leave PNGs behind.
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)

        return '\n'.join(text_results)
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        raise

In [None]:
# File upload widget
# Ask the user for files, OCR each one, offer the result as a Markdown
# download, and remove the working files afterwards.
print("Upload your PDF or image file (supported formats: PDF, PNG, JPG, JPEG)")
uploaded = files.upload()

for filename, content in uploaded.items():
    print(f"\nProcessing {filename}...")

    # Decide the output name before the try block: the cleanup in `finally`
    # refers to it, and it must exist even when processing fails before any
    # output is written (otherwise `finally` raises NameError and the loop
    # stops at the first bad file).
    output_filename = f"{os.path.splitext(filename)[0]}_extracted.md"

    # Save uploaded file
    with open(filename, 'wb') as f:
        f.write(content)

    try:
        # Process based on file type
        if filename.lower().endswith('.pdf'):
            text = process_pdf(filename)
        else:
            text = process_image(filename)

        print("\nExtracted Text:")
        print(text)

        # Save output to a markdown file
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(f"# OCR Results - {filename}\n\n{text}")

        # Provide download link
        files.download(output_filename)

    except Exception as e:
        # Report the failure and carry on with the remaining uploads.
        print(f"Error processing {filename}: {str(e)}")
    finally:
        # Clean up
        if os.path.exists(filename):
            os.remove(filename)
        if os.path.exists(output_filename):
            os.remove(output_filename)