In [None]:
!pip install amazon-textract-response-parser

In [None]:

import boto3
from IPython.display import Image, display
from trp import Document
from PIL import Image as PImage, ImageDraw
import time
from IPython.display import IFrame

# In this section, we will take a deep dive into the Amazon Textract APIs and their features.
Amazon Textract includes simple, easy-to-use APIs that can analyze image files and PDF files.
Amazon Textract APIs can be classified into synchronous APIs for real time processing and asynchronous APIs for batch processing.
We will take a deep dive into each:
•	Synchronous APIs (real-time processing use cases)
•	Asynchronous APIs (batch processing use cases)
Synchronous APIs (real-time processing use cases): there are two APIs that can help with real-time analysis:
                         Detect Document Text API
                         Analyze Document API


In [None]:
# Current AWS Region of the default boto3 session. Use this to choose the corresponding S3 bucket with sample content

mySession = boto3.session.Session()
awsRegion = mySession.region_name

In [None]:
# S3 bucket that holds the sample documents. Download the sample documents and create an Amazon S3 bucket,
# then replace the placeholder below with that bucket's name.

s3BucketName = "enter-your-s3-bucket-name"

In [None]:
# Amazon S3 client
# NOTE(review): a later cell rebinds `s3` to boto3.resource('s3'); the
# s3.Bucket(...).upload_file calls further down rely on that resource, not on this client.
s3 = boto3.client('s3')

# Amazon Textract client (used by the detect/analyze/start/get calls in this notebook)
textract = boto3.client('textract')

In [None]:
# 1. Detect text from a local image with the DetectDocumentText API

https://docs.aws.amazon.com/textract/latest/dg/API_DetectDocumentText.html

In [None]:
# Document: local sample invoice image, sent to Textract as raw bytes in the cells below
documentName = "sample-invoice.png"

In [None]:
# Preview the local file inline before sending it to Textract
display(Image(filename=documentName))

In [None]:
# Load the local image into memory as raw bytes
with open(documentName, 'rb') as imageFile:
    imageBytes = bytearray(imageFile.read())

# Synchronous DetectDocumentText call; the image travels inline in the request
response = textract.detect_document_text(
    Document={'Bytes': imageBytes},
)


In [None]:
import json

# Pretty-print the raw Textract response with sorted keys so its structure is easy to scan
print(json.dumps(response, sort_keys=True, indent=4))


# 2. Detect text from S3 object

https://docs.aws.amazon.com/textract/latest/dg/API_DetectDocumentText.html

## Lines and Words of Text - JSON Structure

https://docs.aws.amazon.com/textract/latest/dg/API_BoundingBox.html

https://docs.aws.amazon.com/textract/latest/dg/text-location.html

https://docs.aws.amazon.com/textract/latest/dg/how-it-works-lines-words.html

In [None]:
#  Reading order

In [None]:
# Document: two-column sample image used to demonstrate reading order
documentName = "two-column-image.jpeg"

In [None]:
# Preview the local two-column image inline
display(Image(filename=documentName))

In [None]:
import boto3

# Upload the local file to the sample bucket, using the file name as the object key.
# `s3` is bound to an S3 *resource* here; the later upload cells reuse it.
s3 = boto3.resource('s3')
s3.Bucket(s3BucketName).upload_file(Filename=documentName, Key=documentName)

In [None]:
# Call Amazon Textract on the object stored in S3 (bucket + key instead of inline bytes)
response = textract.detect_document_text(
    Document={'S3Object': {'Bucket': s3BucketName, 'Name': documentName}}
)

print(response)

In [None]:
# Use the Textract response parser (trp) to walk each page's lines in reading order
doc = Document(response)
for page in doc.pages:
    for line in page.getLinesInReadingOrder():
        # Index 1 of each entry is what gets printed (presumably the line text; confirm against trp)
        print(line[1])

# Analyze Document API for Tables and Forms (Key/Value Pairs)

https://docs.aws.amazon.com/textract/latest/dg/API_AnalyzeDocument.html

In [None]:
# Document: sample invoice image, reused here for form and table analysis
documentName = "sample-invoice.png"

In [None]:
# Preview the local invoice image inline
display(Image(filename=documentName))

In [None]:

# Upload the invoice image to the sample bucket, using the file name as the object key
s3.Bucket(s3BucketName).upload_file(Filename=documentName, Key=documentName)

In [None]:
# Call Amazon Textract, requesting both the FORMS and TABLES feature types
response = textract.analyze_document(
    Document={'S3Object': {'Bucket': s3BucketName, 'Name': documentName}},
    FeatureTypes=["FORMS", "TABLES"],
)

In [None]:


# Parse the AnalyzeDocument response and explore the form fields on each page
doc = Document(response)

for page in doc.pages:
    # List every key/value field found on the page
    print("Fields:")
    for field in page.form.fields:
        print("Key: {}, Value: {}".format(field.key, field.value))

    # Look up a single field by key; print it only if the lookup returned something
    print("\nGet Field by Key:")
    key = "Phone Number:"
    field = page.form.getFieldByKey(key)
    if field:
        print("Key: {}, Value: {}".format(field.key, field.value))

    # Search for fields using the term "address"
    print("\nSearch Fields:")
    key = "address"
    fields = page.form.searchFieldsByKey(key)
    for field in fields:
        print("Key: {}, Value: {}".format(field.key, field.value))

In [None]:
doc = Document(response)

# Print every table cell together with its row and column index
for page in doc.pages:
    for table in page.tables:
        for r, row in enumerate(table.rows):
            for c, cell in enumerate(row.cells):
                print("Table[{}][{}] = {}".format(r, c, cell.text))

# 3. PDF Processing (Asynchronous APIs)

https://docs.aws.amazon.com/textract/latest/dg/API_StartDocumentTextDetection.html
https://docs.aws.amazon.com/textract/latest/dg/API_GetDocumentTextDetection.html
https://docs.aws.amazon.com/textract/latest/dg/API_StartDocumentAnalysis.html
https://docs.aws.amazon.com/textract/latest/dg/API_GetDocumentAnalysis.html

In [None]:
def startJob(s3BucketName, objectName):
    """Start an asynchronous Textract text-detection job for an object in S3.

    Args:
        s3BucketName: Name of the bucket that holds the document.
        objectName: Key of the document inside that bucket.

    Returns:
        The "JobId" value from the start_document_text_detection response.
    """
    documentLocation = {
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': objectName,
        }
    }
    started = textract.start_document_text_detection(DocumentLocation=documentLocation)
    return started["JobId"]

def isJobComplete(jobId):
    """Poll a text-detection job until it is no longer "IN_PROGRESS".

    The job is queried every 5 seconds and each observed status is printed.

    Args:
        jobId: Identifier returned when the job was started.

    Returns:
        The first "JobStatus" value that is not "IN_PROGRESS". This is a
        status string, not a boolean, so it is truthy whatever the outcome.
    """
    while True:
        response = textract.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
        if status != "IN_PROGRESS":
            return status
        # Still running: wait before asking again
        time.sleep(5)

def getJobResults(jobId):
    """Fetch every result set of a text-detection job, following pagination.

    Args:
        jobId: Identifier of the job whose results are requested.

    Returns:
        A list of the raw get_document_text_detection responses, in the
        order they were received (one entry per result set).
    """
    pages = []
    request = {"JobId": jobId}

    while True:
        response = textract.get_document_text_detection(**request)
        pages.append(response)
        print("Resultset page recieved: {}".format(len(pages)))

        # A missing or empty NextToken means there is nothing more to fetch
        nextToken = response.get('NextToken')
        if not nextToken:
            return pages
        request["NextToken"] = nextToken

In [None]:
# Document: PDF processed through the asynchronous job functions defined above
documentName = "job-application-form.pdf"

In [None]:

# Upload the PDF to the sample bucket, using the file name as the object key
s3.Bucket(s3BucketName).upload_file(Filename=documentName, Key=documentName)

In [None]:
# Start the asynchronous job, then block until Textract reports a final status.
jobId = startJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))

# isJobComplete returns the final status *string*, which is truthy even for
# "FAILED", so compare it explicitly instead of using it as a boolean.
jobStatus = isJobComplete(jobId)
if jobStatus not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
    # Fail loudly here rather than parsing a result set that has no usable blocks.
    raise RuntimeError(
        "Textract job {} ended with status {}".format(jobId, jobStatus))

# Collect every paginated result set and parse them together.
response = getJobResults(jobId)
doc = Document(response)


In [None]:

# Print the detected text of every page, following the parser's reading order
for page in doc.pages:
    for line in page.getLinesInReadingOrder():
        # Index 1 of each entry is what gets printed (presumably the line text; confirm against trp)
        print(line[1])

# Clean Up

Delete the sample documents and the S3 bucket when you are finished: https://docs.aws.amazon.com/AmazonS3/latest/userguide/delete-objects.html