In [None]:
# !pip install -q pytesseract pdf2image pillow pypdf

In [None]:
import sys
from pathlib import Path

# Make the CUBO package importable: the root is taken to be the parent of the
# current working directory, and it is prepended to sys.path only once.
cubo_root = Path(".").resolve().parent
cubo_root_entry = str(cubo_root)
already_on_path = cubo_root_entry in sys.path
if not already_on_path:
    sys.path.insert(0, cubo_root_entry)

print(f"CUBO root: {cubo_root}")

| Format | Method | Notes |
|--------|--------|-------|
| PDF (text) | pypdf | Fastest, preserves formatting |

In [None]:
import subprocess

def check_tesseract():
    """Check whether the Tesseract OCR binary can be run from PATH.

    Runs ``tesseract --version`` and reports the first line of its banner as
    the version string.

    Returns:
        tuple: ``(True, version_line)`` when the command could be started and
        exited with status 0, otherwise ``(False, None)``.
    """
    try:
        result = subprocess.run(['tesseract', '--version'],
                                capture_output=True, text=True)
    except OSError:
        # Covers FileNotFoundError (binary missing) as well as
        # PermissionError (binary present but not executable).
        return False, None

    if result.returncode != 0:
        # The binary started but could not report a version: treat as unusable.
        return False, None

    # Fall back to stderr when stdout is empty (some Tesseract releases print
    # the version banner there).
    banner = (result.stdout or '').strip() or (result.stderr or '').strip()
    version = banner.splitlines()[0] if banner else ''
    return True, version

available, version = check_tesseract()
tesseract_status = '‚úÖ ' + version if available else '‚ùå Not installed'
print(f"Tesseract: {tesseract_status}")

# Per-platform install commands are shown only when `available` is false.
if not available:
    print("\nInstall Tesseract:")
    for install_hint in (
        "  Windows: choco install tesseract",
        "  macOS: brew install tesseract",
        "  Linux: apt-get install tesseract-ocr",
    ):
        print(install_hint)

## 2️⃣ Using the PDF OCR Processor

CUBO's `PDFOCRProcessor` handles both text and scanned PDFs:

In [None]:
try:
    from cubo.processing.pdf_ocr_processor import PDFOCRProcessor

    # Default-constructed processor; its two status attributes are echoed below.
    # Everything stays inside the try so an ImportError raised while
    # constructing or inspecting the processor is handled the same way.
    processor = PDFOCRProcessor()
    print("‚úÖ PDFOCRProcessor loaded")
    print(f"   OCR Enabled: {processor.ocr_enabled}")
    print(f"   Tesseract Available: {processor.tesseract_available}")
except ImportError as import_error:
    # None is the sentinel the PDF-processing cell checks before running.
    processor = None
    print(f"‚ùå Could not import PDFOCRProcessor: {import_error}")

## 3️⃣ Process a PDF Document

In [None]:
# Create a sample text-based PDF for testing
def create_sample_pdf(output_path: str):
    """Write a one-page, text-based test PDF and return its path.

    reportlab is imported lazily, inside the function, so that a missing
    installation raises ImportError at call time (where the caller can catch
    it) instead of aborting the whole cell at import time.

    Args:
        output_path: Destination file path for the generated PDF.

    Returns:
        The ``output_path`` argument, unchanged.

    Raises:
        ImportError: If reportlab is not installed.
    """
    from reportlab.lib.pagesizes import letter
    from reportlab.pdfgen import canvas

    # (x, y, text) triples handed to drawString, in drawing order.
    text_lines = (
        (100, 750, "CUBO Test Document"),
        (100, 730, "This is a sample PDF for OCR testing."),
        (100, 710, "Page 1 of 1"),
        (100, 670, "Key Information:"),
        (120, 650, "- Invoice Number: INV-2024-001"),
        (120, 630, "- Date: November 30, 2024"),
        (120, 610, "- Amount: $1,234.56"),
    )

    pdf = canvas.Canvas(output_path, pagesize=letter)
    for x, y, line in text_lines:
        pdf.drawString(x, y, line)
    pdf.save()
    return output_path

try:
    test_pdf = create_sample_pdf("test_document.pdf")
except ImportError:
    # No PDF could be generated; None is what the processing cell tests for.
    print("‚ÑπÔ∏è Install reportlab to create test PDFs: pip install reportlab")
    test_pdf = None
else:
    print(f"‚úÖ Created test PDF: {test_pdf}")

In [None]:
# Process the PDF, but only when a processor was loaded and the sample file
# actually exists on disk.
pdf_ready = processor and test_pdf and Path(test_pdf).exists()
if pdf_ready:
    result = processor.process(test_pdf)

    print("üìÑ Processing Result:")
    page_count = result.get('page_count', 'unknown')
    print(f"   Pages: {page_count}")
    extraction_method = result.get('extraction_method', 'unknown')
    print(f"   Method: {extraction_method}")

    # Show at most the first 500 characters of the extracted text.
    print(f"\n   Text Preview:")
    text = result.get('text', '')[:500]
    print(f"   {text}...")

## 4️⃣ OCR on Images

Process images directly using OCR:

In [None]:
from PIL import Image, ImageDraw, ImageFont

def create_sample_image(output_path: str):
    """Draw four lines of black text on a white 400x200 RGB image and save it.

    Args:
        output_path: File path the image is written to.

    Returns:
        The ``output_path`` argument, unchanged.
    """
    sample = Image.new('RGB', (400, 200), color='white')
    pen = ImageDraw.Draw(sample)

    # (x, y) position paired with the text drawn there, in drawing order.
    text_rows = (
        ((20, 20), "CUBO OCR Test"),
        ((20, 60), "Contract ID: CNT-2024-999"),
        ((20, 100), "Effective Date: January 1, 2025"),
        ((20, 140), "Value: EUR 50,000.00"),
    )
    for position, caption in text_rows:
        pen.text(position, caption, fill='black')

    sample.save(output_path)
    return output_path

test_image = create_sample_image("test_image.png")
print(f"‚úÖ Created test image: {test_image}")

# Render the generated file inline; IPython's Image is aliased to IPImage so
# it does not shadow the PIL Image class used elsewhere in the notebook.
from IPython.display import display, Image as IPImage
image_preview = IPImage(filename=test_image)
display(image_preview)

In [None]:
# OCR the image
if available:  # Tesseract binary was detected earlier in the notebook
    try:
        import pytesseract
    except ImportError:
        # The binary alone is not enough: the Python wrapper is a separate
        # package, and its install line at the top of the notebook is commented out.
        print("pytesseract is not installed; run: pip install pytesseract")
    else:
        from PIL import Image

        # The context manager releases the file handle as soon as OCR is done,
        # so the cleanup cell can delete the image afterwards.
        with Image.open(test_image) as img:
            text = pytesseract.image_to_string(img)

        print("üìù OCR Result:")
        print(text)
else:
    print("‚ö†Ô∏è Tesseract not available for OCR")

## 5️⃣ Dolphin Vision Integration

For complex documents, CUBO can use the Dolphin vision-language model:

In [None]:
try:
    from cubo.models.dolphin_processor import DolphinProcessor

    dolphin = DolphinProcessor()
    dolphin_available = dolphin.is_available()
    dolphin_status = '‚úÖ Available' if dolphin_available else '‚ùå Not loaded'
    print(f"Dolphin Vision: {dolphin_status}")

    # Point at the download script when is_available() reported False.
    if not dolphin_available:
        for hint in (
            "\nTo enable Dolphin vision:",
            "  python download_dolphin.py --download",
        ):
            print(hint)
except ImportError:
    # The processor module could not be imported: record it as unavailable.
    dolphin_available = False
    print("‚ÑπÔ∏è Dolphin processor not available")

## 6️⃣ Best Practices for Document Processing

### Supported Formats
| Format | Method | Notes |
|--------|--------|-------|
| PDF (text) | pypdf | Fastest, preserves formatting |
| PDF (scanned) | Tesseract OCR | Requires Tesseract |
| Images (PNG/JPG) | Tesseract OCR | Good quality needed |
| DOCX | python-docx | Full text extraction |

### Tips
1. **High-quality scans**: 300 DPI minimum for OCR
2. **Preprocessing**: CUBO auto-adjusts contrast and rotation
3. **Language packs**: Install Tesseract language packs for non-English
4. **Batch processing**: Use `/api/ingest` for multiple files

In [None]:
# Cleanup: delete the sample files this notebook generated, skipping any that
# were never created.
import os

generated_files = ("test_document.pdf", "test_image.png")
for leftover in filter(os.path.exists, generated_files):
    os.remove(leftover)
print("‚úÖ Cleaned up test files")

## 🎯 Next Steps

- Try processing your own documents
- Configure OCR settings in `config.json`
- See [API Documentation](../docs/API_INTEGRATION.md) for batch processing