In [3]:
# Detects text in a document stored in an S3 bucket. Displays a polygon box around detected text, including angled text.
import boto3
import io
from PIL import Image, ImageDraw

# Displays information about a block returned by text detection and text analysis
def DisplayBlockInformation(block):
    """Print a human-readable summary of one text-detection/analysis block.

    Args:
        block: Dict describing the block. 'Id', 'BlockType' and 'Geometry'
            (with 'BoundingBox' and 'Polygon') are read unconditionally.
            'Text', 'Confidence', 'Relationships' and 'Page' are printed
            only when present. CELL blocks must also carry 'ColumnIndex',
            'RowIndex', 'ColumnSpan' and 'RowSpan'; KEY_VALUE_SET blocks
            must carry a non-empty 'EntityTypes' list.

    Returns:
        None. Everything is written to standard output.

    Raises:
        KeyError: If a key that is read unconditionally is missing.
    """
    print('Id: {}'.format(block['Id']))
    if 'Text' in block:
        print('    Detected: ' + block['Text'])
    print('    Type: ' + block['BlockType'])

    if 'Confidence' in block:
        print('    Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")

    if block['BlockType'] == 'CELL':
        print("    Cell information")
        print("        Column: " + str(block['ColumnIndex']))
        print("        Row: " + str(block['RowIndex']))
        print("        ColumnSpan: " + str(block['ColumnSpan']))
        print("        RowSpan: " + str(block['RowSpan']))

    if 'Relationships' in block:
        print('    Relationships: {}'.format(block['Relationships']))
    print('    Geometry: ')
    print('        Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
    print('        Polygon: {}'.format(block['Geometry']['Polygon']))

    if block['BlockType'] == "KEY_VALUE_SET":
        print('    Entity Type: ' + block['EntityTypes'][0])
    if 'Page' in block:
        # The page number is numeric, so it must be converted before being
        # concatenated to a string (a bare '+' raises TypeError).
        print('Page: ' + str(block['Page']))
    print()

def process_text_detection(bucket, document, region):
    """Detect text in an S3-hosted image and display it with annotations.

    Downloads the image from S3 so it can be drawn on, runs Textract's
    ``detect_document_text`` against the same S3 object, prints details of
    every returned block, overlays markers for WORD and LINE blocks, and
    finally shows the annotated image with ``image.show()``.

    Args:
        bucket: Name of the S3 bucket that holds the document.
        document: Key of the image object inside the bucket.
        region: AWS region name used to create the Textract client.

    Returns:
        The number of blocks in the Textract response.
    """
    # Get the document from S3
    s3_connection = boto3.resource('s3')
    s3_object = s3_connection.Object(bucket, document)
    s3_response = s3_object.get()

    stream = io.BytesIO(s3_response['Body'].read())
    image = Image.open(stream)

    # Detect text in the document
    client = boto3.client('textract', region_name=region)

    # Process using the S3 object. To process using image bytes instead,
    # pass Document={'Bytes': stream.getvalue()}.
    response = client.detect_document_text(
        Document={'S3Object': {'Bucket': bucket, 'Name': document}})

    # Get the text blocks
    blocks = response['Blocks']
    width, height = image.size
    print('Detected Document Text')

    # One drawing context serves every block; it does not depend on the
    # loop variable, so it is created once rather than per iteration.
    draw = ImageDraw.Draw(image)

    # Print each block and draw the bounding polygon of detected lines/words
    for block in blocks:
        block_type = block['BlockType']
        print('Type: ' + block_type)
        if block_type != 'PAGE':
            # Guard optional keys the same way DisplayBlockInformation does,
            # so a block without them is reported instead of raising KeyError.
            if 'Text' in block:
                print('Detected: ' + block['Text'])
            if 'Confidence' in block:
                print('Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")

        print('Id: {}'.format(block['Id']))
        if 'Relationships' in block:
            print('Relationships: {}'.format(block['Relationships']))
        polygon = block['Geometry']['Polygon']
        print('Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
        print('Polygon: {}'.format(polygon))
        print()

        if block_type not in ('WORD', 'LINE'):
            continue

        # Polygon coordinates are scaled by the image size to get pixels.
        points = [(width * point['X'], height * point['Y'])
                  for point in polygon]

        if block_type == 'WORD':
            # Draw WORD - Green - start of word, red - end of word
            draw.line([points[0], points[3]], fill='green', width=2)
            draw.line([points[1], points[2]], fill='red', width=2)
        else:
            # Draw box around entire LINE
            draw.polygon(points, outline='black')

    # Display the image
    image.show()

    return len(blocks)

def main():
    """Run text detection on the sample S3 image and print the block count."""
    # Sample S3 location plus the region handed to process_text_detection.
    s3_bucket, image_key, aws_region = (
        'tejasbucket123',
        'gz16-stk-software-examples-documentation-as-pdf-file.jpg',
        'ap-south-1',
    )

    total_blocks = process_text_detection(s3_bucket, image_key, aws_region)
    print("Blocks detected: " + str(total_blocks))
    
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



Detected Document Text
Type: PAGE
Id: 2f04385d-d7ab-47e0-9022-8f5844b71479
Relationships: [{'Type': 'CHILD', 'Ids': ['180c3d1d-f60d-4c6c-a202-788f88df3d07', '897b1dfe-1232-4ec6-b952-4c12a61dbc28', '13a74dab-1d45-4b6d-8dcf-39916cce22db', '4b89a309-e2dd-470b-88c4-a159414f6a68', 'fb54e19a-eb26-431a-80be-155fa6acf9b1', 'b6f682e3-f4d2-43f6-b9f5-cd0dbeacae48', '563ba87c-e8cf-40ee-ab8b-55730e4dd7a5', 'c1a5cc79-a102-4b54-8ce7-ef1d32c3885f', 'ccba45b4-0ac3-4c22-aa71-3b444300443e', '1c6e7920-08fe-4b48-aa8d-72d059628442', '0523baea-244c-4159-8be9-66ce6cfdeba8', 'b12978e2-2244-435f-a37b-f6214b07a3ab', '4a3744db-957e-4162-8618-20d7f9faba9e', '810f0140-f4cc-4d19-9de4-c0874ff9dc97', '5701baec-2c05-4b86-96b6-ba7e2823735e', '368cabc3-2258-41a4-ba8b-03ed8018f033', 'a81d7e2d-881e-4203-81c3-9110d361e62c', '31601fc1-1b1d-4a4b-99de-8664ee18fd4e', 'bb1c6242-c0a0-42f5-9ab8-b002af0656d1', '8083b9d8-8f0e-4c07-92e8-ef236d2f2185', 'e2a38268-e06a-44a7-bd6e-b69590c827eb', '1ed352db-ff9f-4561-a50e-9fe46077dd96', '20

Blocks detected: 568
