# Burmese OCR using Tesseract

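This notebook extracts Burmese text from uploaded images and PDF files using the Tesseract OCR engine. It is written for Google Colab: it installs system packages with `apt-get` and relies on Colab's file-upload widget.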

In [None]:
# Install required packages
!apt-get install -y tesseract-ocr
!pip install pytesseract Pillow pdf2image

# Install Burmese language data
!wget https://github.com/tesseract-ocr/tessdata/raw/main/mya.traineddata
!mv mya.traineddata /usr/share/tesseract-ocr/4.00/tessdata/

In [None]:
import os
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
from google.colab import files
import tempfile

In [None]:
def process_image(image_path, lang='mya'):
    """Run Tesseract OCR on a single image file and return the extracted text.

    Args:
        image_path: Path to an image file that PIL can open.
        lang: Tesseract language code handed to pytesseract. Defaults to
            'mya' (Burmese), so existing callers are unaffected.

    Returns:
        The text produced by pytesseract.image_to_string for the image.
    """
    # Open in a context manager so the underlying file handle is released
    # as soon as OCR finishes, even if pytesseract raises.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image, lang=lang)

def process_pdf(pdf_path):
    """Convert a PDF to page images and OCR each page.

    Args:
        pdf_path: Path to the PDF file to convert with convert_from_path.

    Returns:
        A single string containing one "Page N:" section per page
        (N starting at 1), each followed by the text process_image
        returned for that page. Sections are joined with newlines; an
        empty page list yields an empty string.
    """
    pages = convert_from_path(pdf_path)
    text_results = []

    # Render pages into a temporary directory: it is removed with everything
    # in it when the block exits, so page images never leak even if saving
    # or OCR raises part-way through the document.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for page_number, page in enumerate(pages, start=1):
            page_path = os.path.join(tmp_dir, f"page_{page_number}.png")
            page.save(page_path, 'PNG')
            text = process_image(page_path)
            text_results.append(f"Page {page_number}:\n{text}\n")

    return '\n'.join(text_results)

In [None]:
# File upload widget
print("Upload your PDF or image file (supported formats: PDF, PNG, JPG, JPEG)")
uploaded = files.upload()

# Each entry maps an uploaded file name to its raw bytes.
for filename, content in uploaded.items():
    print(f"\nProcessing {filename}...")

    try:
        # Save uploaded file
        with open(filename, 'wb') as f:
            f.write(content)

        # Process based on file type: PDFs are rasterized page by page,
        # anything else is handed to the image OCR path as-is.
        if filename.lower().endswith('.pdf'):
            text = process_pdf(filename)
        else:
            text = process_image(filename)

        print("\nExtracted Text:")
        print(text)
    finally:
        # Clean up even when processing fails, so a bad upload does not
        # linger in the working directory.
        if os.path.exists(filename):
            os.remove(filename)