In [None]:
import boto3
import io
import pytesseract
from PIL import Image, ImageDraw, ImageColor
from typing import Tuple, Dict, List, Union

# Get Document Layout

In [1]:
def get_coordinates(box: Dict[str, float], width: int, height: int, extra_width: float = 0.005) -> Tuple[float, float, float, float]:
    """
    Convert a relative bounding box into pixel coordinates, padded horizontally.

    The box values are treated as fractions of the image size. The left edge is
    moved left by `extra_width` and the horizontal span is grown by twice that
    amount, so the padding is applied on both sides. No vertical padding is added
    and the result is not clamped to the image bounds.

    Parameters:
    - box (dict): Mapping with 'Left', 'Top', 'Width' and 'Height' entries.
    - width (int): Image width, used to scale the horizontal values.
    - height (int): Image height, used to scale the vertical values.
    - extra_width (float): Horizontal padding, in the same relative units as `box`.

    Returns:
    tuple: (X1, Y1, X2, Y2) — left, top, right and bottom of the padded box.
    """
    left_edge = width * (box['Left'] - extra_width)
    top_edge = height * box['Top']

    # Spans are computed separately, then offset from the top-left corner.
    padded_span = width * (box['Width'] + 2 * extra_width)
    vertical_span = height * box['Height']

    return left_edge, top_edge, left_edge + padded_span, top_edge + vertical_span


def process_text_analysis(s3_connection, client, bucket: str, document: str, show_image: bool = True)\
             -> Dict[str, Union[str, List[Dict[str, Union[str, float, Tuple[float, float, float, float]]]]]]:
    """
    Detect the layout of a document image and optionally draw the detected regions.

    The image is fetched from S3 and sent to `client.analyze_document` with the
    "LAYOUT" feature. Every returned block whose 'BlockType' appears in the colour
    palette below is recorded, and (when `show_image` is true) outlined on the
    image. This function does not run OCR; it only collects layout regions.

    Parameters:
    - s3_connection: Object exposing `Object(bucket, key).get()` (e.g. a boto3 S3 resource).
    - client: Object exposing `analyze_document(...)` (e.g. a boto3 Textract client).
    - bucket (str): The S3 bucket containing the document.
    - document (str): The key of the document inside the bucket.
    - show_image (bool): Whether to draw the bounding boxes and display the image.

    Returns:
    dict: {"bucket": ..., "document": ..., "layout": [...]} where each layout entry
    holds "id", "layout_type", "coordinates" (X1, Y1, X2, Y2 from
    `get_coordinates`), "block_color", and "reading_order" (1-based position among
    the recorded layout blocks, in the order the response listed them).
    """
    # NOTE(review): "LAYOUT_TEXT" is a colour name while the rest are RGB tuples, and
    # it is visually close to LAYOUT_LIST's "#ffff33" — left unchanged because the
    # value is returned to callers as "block_color".
    layout_color_palette: Dict[str, Union[Tuple[int, int, int], str]] = {
        "LAYOUT_TITLE": ImageColor.getrgb("#e41a1c"),
        "LAYOUT_HEADER": ImageColor.getrgb("#377eb8"),
        "LAYOUT_FOOTER": ImageColor.getrgb("#4daf4a"),
        "LAYOUT_SECTION_HEADER": ImageColor.getrgb("#984ea3"),
        "LAYOUT_PAGE_NUMBER": ImageColor.getrgb("#ff7f00"),
        "LAYOUT_LIST": ImageColor.getrgb("#ffff33"),
        "LAYOUT_FIGURE": ImageColor.getrgb("#a65628"),
        "LAYOUT_TABLE": ImageColor.getrgb("#f781bf"),
        "LAYOUT_KEY_VALUE": ImageColor.getrgb("#999999"),
        "LAYOUT_TEXT": "yellow"
    }

    # Get the document from S3
    s3_object = s3_connection.Object(bucket, document)
    s3_response = s3_object.get()

    stream = io.BytesIO(s3_response['Body'].read())
    image = Image.open(stream)

    # Analyze the document; the same bytes that back the image are sent to Textract
    image_binary = stream.getvalue()
    response = client.analyze_document(Document={'Bytes': image_binary}, FeatureTypes=["LAYOUT"])

    blocks = response['Blocks']
    width, height = image.size

    print('Detecting Document Layout')
    layout_information: List[Dict[str, Union[str, float, Tuple[float, float, float, float]]]] = []

    # One drawing context is enough for the whole image; skip it entirely when
    # nothing will be drawn.
    draw = ImageDraw.Draw(image) if show_image else None

    for block in blocks:
        block_type = block['BlockType']
        block_color = layout_color_palette.get(block_type)
        if block_color is None:
            # Not a layout block type we track (no palette entry for it).
            continue

        img_coords = get_coordinates(block['Geometry']['BoundingBox'], width, height)

        if draw is not None:
            draw.rectangle(img_coords, outline=block_color, width=3)

        layout_information.append({
            "id": block['Id'],
            "layout_type": block_type,
            "coordinates": img_coords,
            "block_color": block_color,
            "reading_order": len(layout_information) + 1
        })

    # Display the image
    if show_image:
        image.show()

    return {"bucket": bucket, "document": document, "layout": layout_information}


In [2]:
# AWS handles: one session from the named local profile, shared by S3 and Textract.
session = boto3.Session(profile_name='tobi')
s3_connection = session.resource('s3')
client = session.client('textract', region_name='us-east-1')

# Where the page image lives in S3.
bucket = 'hifeyinc-cluster'
document = 'projects/newspaper-ocr/data/jpeg/december_1994/PM_News_December_16__1994_Pg_8.jpeg'

# Run layout detection; show_image is left at its default.
docu_layout = process_text_analysis(
    s3_connection=s3_connection,
    client=client,
    bucket=bucket,
    document=document,
)

Detecting Document Layout


In [3]:
# Inspect the fourth recorded layout entry (list index 3).
docu_layout["layout"][3]

{'id': '0e55e44a-9d5c-41ab-a83c-3c3ecb76af81',
 'layout_type': 'LAYOUT_TITLE',
 'coordinates': (522.9276955270767,
  102.64209349825978,
  797.3626277750731,
  285.9271633885801),
 'block_color': (228, 26, 28),
 'reading_order': 4}

# Perform OCR

In [4]:
def perform_ocr(img: Image.Image, img_coordinates: Tuple[float, float, float, float]) -> str:
    """
    Run OCR on one rectangular region of an image.

    Parameters:
    - img (PIL.Image.Image): Image to read the region from.
    - img_coordinates (tuple): Crop box as (X1, Y1, X2, Y2), passed to `img.crop`.

    Returns:
    str: The result of `pytesseract.image_to_string` for the cropped region.
    """
    # Crop to the bounding box and hand the region straight to pytesseract.
    return pytesseract.image_to_string(img.crop(img_coordinates))