In [None]:
import os
import sys
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import json

# Add project root directory to path
sys.path.append('..')

# Import project modules
from models.extractor import DocumentExtractor
from utils.preprocess import process_image_for_ocr
from utils.visualize import highlight_regions, format_extracted_data


In [None]:
# Build the extractor that the cells below share; it is backed by Tesseract OCR.
tesseract_extractor = DocumentExtractor(
    ocr_engine='tesseract',
)

# Alternative backend: uncomment to create an EasyOCR-based extractor as well.
# easyocr_extractor = DocumentExtractor(ocr_engine='easyocr')


In [None]:
# Path to sample invoice
invoice_path = '../data/examples/sample_invoice.jpg'

# Invoice demo: show the image, run extraction, then plot detected text regions.
if os.path.exists(invoice_path):
    # Load the image and normalize it to 3-channel RGB. JPEG files may also be
    # grayscale ('L') or CMYK; np.array() would then produce a 2-D or 4-channel
    # array, which the cv2.COLOR_BGR2RGB conversion used for display below
    # either rejects or misinterprets. convert() returns an independent,
    # fully loaded copy, so the file can be closed right away.
    with Image.open(invoice_path) as opened_invoice:
        invoice_image = opened_invoice.convert('RGB')

    # Display original image
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(invoice_image)
    ax.set_title('Sample Invoice')
    ax.axis('off')
    plt.show()

    # Extract information from invoice
    invoice_info = tesseract_extractor.extract_info(invoice_image, 'invoice')

    # Print extracted information
    print("Extracted Information from Invoice:")
    print(json.dumps(invoice_info, indent=4))

    # Detect text regions on the NumPy view of the image (RGB channel order,
    # because the array comes from a PIL RGB image).
    invoice_np = np.array(invoice_image)
    text_regions = tesseract_extractor.get_text_regions(invoice_np)

    # Highlight text regions
    highlighted = highlight_regions(invoice_np, text_regions)

    # Display highlighted image.
    # NOTE(review): the conversion below treats `highlighted` as BGR, while the
    # array handed to highlight_regions is RGB. If highlight_regions preserves
    # channel order, red and blue are swapped in this plot — confirm against
    # utils.visualize before changing.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(cv2.cvtColor(highlighted, cv2.COLOR_BGR2RGB))
    ax.set_title('Detected Text Regions')
    ax.axis('off')
    plt.show()
else:
    print(f"Sample invoice not found at '{invoice_path}'. Please add a sample invoice for testing.")


In [None]:
# Location of the example resume image
resume_path = '../data/examples/sample_resume.jpg'

# Resume demo: show the image and print the fields the extractor returns.
if not os.path.exists(resume_path):
    # No sample available: explain how to enable this demo
    print(f"Sample resume not found at '{resume_path}'. Please add a sample resume for testing.")
else:
    # Open the resume with PIL
    resume_image = Image.open(resume_path)

    # Show the unprocessed image without axis ticks
    resume_fig, resume_ax = plt.subplots(figsize=(10, 10))
    resume_ax.imshow(resume_image)
    resume_ax.set_title('Sample Resume')
    resume_ax.axis('off')
    plt.show()

    # Run the extractor using the 'resume' document type
    resume_info = tesseract_extractor.extract_info(resume_image, 'resume')

    # Report the extracted fields as indented JSON
    resume_report = json.dumps(resume_info, indent=4)
    print("Extracted Information from Resume:")
    print(resume_report)


In [None]:
# Location of the example receipt image
receipt_path = '../data/examples/sample_receipt.jpg'

# Receipt demo: show the image and print the fields the extractor returns.
if not os.path.exists(receipt_path):
    # No sample available: explain how to enable this demo
    print(f"Sample receipt not found at '{receipt_path}'. Please add a sample receipt for testing.")
else:
    # Open the receipt with PIL
    receipt_image = Image.open(receipt_path)

    # Show the unprocessed image without axis ticks
    receipt_fig, receipt_ax = plt.subplots(figsize=(10, 10))
    receipt_ax.imshow(receipt_image)
    receipt_ax.set_title('Sample Receipt')
    receipt_ax.axis('off')
    plt.show()

    # Run the extractor using the 'receipt' document type
    receipt_info = tesseract_extractor.extract_info(receipt_image, 'receipt')

    # Report the extracted fields as indented JSON
    receipt_report = json.dumps(receipt_info, indent=4)
    print("Extracted Information from Receipt:")
    print(receipt_report)


In [None]:
# Custom regex extraction rules for invoices. Each pattern has a single capture
# group around the value of interest.
invoice_rules = {
    'invoice_number': {
        'type': 'regex',
        # The captured token must contain at least one digit. Without that
        # requirement "Invoice Date: 01/02/2023" yields "D" and
        # "Invoice No: 123" yields "N" as the invoice number. An optional
        # "Number" / "No." / "#" label is accepted before the value.
        'pattern': r'Invoice\s*(?:Number|No\.?|#)?\s*:?\s*([A-Z0-9-]*\d[A-Z0-9-]*)',
        'first_match_only': True
    },
    'date': {
        'type': 'regex',
        'pattern': r'Date:?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
        'first_match_only': True
    },
    'total_amount': {
        'type': 'regex',
        # The lookbehinds skip "SubTotal" / "Sub Total" lines so the first
        # match is not the subtotal. The first alternative accepts amounts
        # with thousands separators such as "1,234.56", which the plain
        # \d+\.\d{2} form cannot match.
        'pattern': r'(?<![A-Za-z])(?<!Sub\s)Total:?\s*\$?(\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2})',
        'first_match_only': True
    }
}

# Test custom extraction rules if invoice image exists.
# This relies on `invoice_image`, which the invoice demo cell creates under
# the same file-existence condition; run that cell first.
if os.path.exists(invoice_path):
    # Extract information using custom rules
    custom_info = tesseract_extractor.custom_extract(invoice_image, 'invoice', invoice_rules)

    # Print extracted information
    print("Information extracted with custom rules:")
    print(json.dumps(custom_info, indent=4))

    # Add extraction rules to extractor's configuration.
    # Only 'regex' rules are registered; any other rule type is skipped.
    for field_name, rule in invoice_rules.items():
        if rule['type'] == 'regex':
            tesseract_extractor.add_field_extraction_rule(
                doc_type='invoice',
                field_name=field_name,
                rule_type='regex',
                pattern=rule['pattern'],
                first_match_only=rule.get('first_match_only', True)
            )

    # Save extraction configuration
    tesseract_extractor.save_config('../models/extraction_config.json')
    print("Extraction configuration saved.")
