In [None]:

import pandas as pd
import webbrowser, os
import json
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
from sagemaker.s3 import S3Uploader, S3Downloader
import uuid
import time
import io
from io import BytesIO
import sys
import csv
from pprint import pprint
from IPython.display import Image, display
from PIL import Image as PImage, ImageDraw

# IAM execution role returned by SageMaker for this notebook; print its ARN for reference.
role = get_execution_role()
print("RoleArn: {}".format(role))

# SageMaker session handle.
sess = sagemaker.Session()

# S3 settings -- replace the placeholder with the name of your own bucket.
s3BucketName = "<enter your bucket name>"
prefix = 'chapter5'
s3 = boto3.client('s3')

In [None]:

# boto3 clients for the three AWS services used below:
# Comprehend (entity detection), Textract (text extraction) and Kendra (search index).
comprehend = boto3.client('comprehend')
textract = boto3.client('textract')
kendra = boto3.client('kendra')

In [None]:
# Local file name of the document to process; it is also used as its S3 object key.
documentName = 'resume_Sample.pdf'

In [None]:

# Upload the local document to the bucket root, using the file name as the object key.
s3 = boto3.resource('s3')
s3.Bucket(s3BucketName).upload_file(Filename=documentName, Key=documentName)

In [None]:
def startJob(s3BucketName, objectName):
    """Start an asynchronous Textract text-detection job for an S3 object.

    Args:
        s3BucketName: Name of the bucket that holds the document.
        objectName: Key of the document inside the bucket.

    Returns:
        The "JobId" value from the start_document_text_detection response.
    """
    document_location = {
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': objectName
        }
    }
    started = textract.start_document_text_detection(DocumentLocation=document_location)
    return started["JobId"]

def isJobComplete(jobId):
    """Poll a Textract text-detection job until it leaves the IN_PROGRESS state.

    The status is printed after every poll, and the function waits five
    seconds between polls while the job is still running.

    Args:
        jobId: Identifier returned when the job was started.

    Returns:
        The final "JobStatus" string. Note that this is a non-empty string
        for every outcome, so callers must compare it rather than test its
        truthiness.
    """
    while True:
        status = textract.get_document_text_detection(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(status))
        if status != "IN_PROGRESS":
            return status
        time.sleep(5)

def getJobResults(jobId):
    """Fetch every result page of a Textract text-detection job.

    Pages are requested one at a time, following the "NextToken" value of
    each response until a response carries no (or an empty) token.

    Args:
        jobId: Identifier of the job whose results are fetched.

    Returns:
        A list of the raw response dicts, in the order they were received.
    """
    pages = []
    request = {"JobId": jobId}
    while True:
        page = textract.get_document_text_detection(**request)
        pages.append(page)
        print("Resultset page recieved: {}".format(len(pages)))

        token = page.get('NextToken')
        if not token:
            break
        # Ask for the page that follows the one just received.
        request["NextToken"] = token

    return pages

In [None]:


# Start the Textract job, wait for it to finish, then download all result pages.
jobId = startJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))

# isJobComplete returns the final status string, which is truthy even when the
# job has FAILED, so the status must be compared explicitly before results are read.
jobStatus = isJobComplete(jobId)
if jobStatus not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
    raise RuntimeError(
        "Textract job {} ended with status {}".format(jobId, jobStatus)
    )
response = getJobResults(jobId)

#print(response)




In [None]:
# Concatenate the text of every LINE block from all result pages,
# terminating each detected line with a newline, then show the result.
text = "".join(
    block["Text"] + "\n"
    for page in response
    for block in page["Blocks"]
    if block["BlockType"] == "LINE"
)
print(text)

# Call Amazon Comprehend

In [None]:
# Detect entities in the extracted text, treating it as English.
entities = comprehend.detect_entities(LanguageCode='en', Text=text)


In [None]:
# Pretty-print the raw Comprehend response with keys in sorted order.
print(json.dumps(entities, indent=4, sort_keys=True))

# Create Kendra Index 
Go to the Kendra console at https://console.aws.amazon.com/kendra/home?region=us-east-1#indexes/create
and create an index by following the book's instructions; in that case, skip the API-based index creation below.
 
Alternatively, create an IAM role for Kendra and supply its ARN as `RoleArn` in the `create_index` call below. See:

https://docs.aws.amazon.com/kendra/latest/dg/deploying.html

In [None]:
# Run this code only once: every run creates a new index.
#response = kendra.create_index(
#    Name='Search',
#    Edition='DEVELOPER_EDITION',
#    RoleArn='<enter your role arn>')
#print(response)


Copy the index ID from the console, or run the code above to create an index and read the ID from its response. Either way, paste the 36-character index ID into `Id` below.

In [None]:
# Custom index attributes, one per entity category:
# (attribute name, facetable, searchable). Every attribute is displayable
# and holds a list of strings.
entity_attribute_specs = [
    ('ORGANIZATION', True, True),
    ('PERSON', False, True),
    ('DATE', False, True),
    ('COMMERCIAL_ITEM', True, False),
    ('OTHER', True, True),
    ('QUANTITY', True, True),
    ('TITLE', False, True),
]

# Register the attributes on the index in a single update call.
response = kendra.update_index(
    Id="<enter kendra index id>",
    DocumentMetadataConfigurationUpdates=[
        {
            'Name': attribute_name,
            'Type': 'STRING_LIST_VALUE',
            'Search': {
                'Facetable': facetable,
                'Searchable': searchable,
                'Displayable': True
            }
        }
        for attribute_name, facetable, searchable in entity_attribute_specs
    ])

print(response)

In [None]:
# Entity categories that are collected from the Comprehend output and
# written to the Kendra metadata file.
categories = [
    "ORGANIZATION",
    "PERSON",
    "DATE",
    "COMMERCIAL_ITEM",
    "OTHER",
    "TITLE",
    "QUANTITY",
]

In [None]:
# category -> set of entity strings in their original casing
entity_data = {}
# category -> list of upper-cased entity strings already seen
category_text = {}
# category -> {upper-cased entity string: number of occurrences}
text_frequency = {}
# Top-level Kendra metadata object; receives the "Attributes" entry
attributes = {}
# category -> list of entity strings selected for the metadata file
metadata = {}

In [None]:
# Give every category an empty container in each of the tracking dicts.
for category in categories:
    entity_data[category] = set()
    category_text[category] = []
    text_frequency[category] = {}

In [None]:
# Collect the distinct entity strings per category and count how often each occurs.
for e in entities["Entities"]:
    entity_type = e["Type"]
    # Comprehend also reports entity types (e.g. LOCATION, EVENT) that are not
    # listed in `categories`; skip them instead of failing with a KeyError.
    if entity_type not in category_text:
        continue

    entity_text = e["Text"]
    # Compare in upper case so that the same text written in different cases
    # is treated as one entity.
    normalized_text = entity_text.upper()

    if normalized_text in category_text[entity_type]:
        # Already seen: only bump the frequency used later to rank the entities.
        text_frequency[entity_type][normalized_text] += 1
    elif entity_text.isprintable() and "\"" not in entity_text:
        # First occurrence of a clean string: keep the original casing for the
        # Kendra custom attribute and start its frequency count.
        entity_data[entity_type].add(entity_text)
        category_text[entity_type].append(normalized_text)
        text_frequency[entity_type][normalized_text] = 1

print(entity_data)

In [None]:
# Fill the metadata dict with, per category, the most frequent entity strings.
elimit = 10
for et in categories:
    # Rank the upper-cased strings by frequency, highest first, and keep at most elimit.
    ranked = sorted(text_frequency[et].items(), key=lambda pair: pair[1], reverse=True)
    top_texts = [name for name, _count in ranked[:elimit]]
    # Keep the original-case strings whose upper-cased form made the cut.
    metadata[et] = [value for value in entity_data[et] if value.upper() in top_texts]

metadata["_source_uri"] = documentName
attributes["Attributes"] = metadata
print(json.dumps(attributes, indent=4, sort_keys=True))

In [None]:
# Write the attribute payload to a local JSON file.
with open("metadata.json", "w") as metadata_file:
    json.dump(attributes, metadata_file)

In [None]:
# Upload the metadata file to the meta/ folder of the bucket.
s3 = boto3.client('s3')
prefix = 'meta/'
# The object key is derived from documentName so that the metadata file keeps
# matching the document if a different document is processed:
# "<prefix><document file name>.metadata.json".
metadataKey = prefix + documentName + ".metadata.json"
# upload_file reads the local file by path, so no open file handle is needed.
s3.upload_file("metadata.json", s3BucketName, metadataKey)
print("Uploaded to Amazon S3 meta folder")

# Run Kendra Sync in AWS Console

# Clean UP

# Delete the Amazon S3 Data source and the Kendra Index 
https://docs.aws.amazon.com/kendra/latest/dg/delete-data-source.html