Load the needed libraries

In [None]:
import boto3
import io
from io import BytesIO, StringIO
import sys
import webbrowser, os
import json
import sys
from pprint import pprint
import pandas as pd
import math
from PIL import Image, ImageDraw, ImageFont
import re

---
Textract returns its output as JSON: a list of blocks whose nested `Relationships` entries describe how the blocks relate to each other. The following functions are used to obtain the key-value pairs and the table relationships.

In [None]:
def DisplayBlockInformation(block):
    """Print a human-readable summary of one Textract block.

    Args:
        block: A block dict. 'Id', 'BlockType' and 'Geometry' (with
            'BoundingBox' and 'Polygon') are read unconditionally; 'Text',
            'Confidence', 'Relationships' and 'Page' are printed only when
            present. CELL blocks must also carry 'ColumnIndex', 'RowIndex',
            'ColumnSpan' and 'RowSpan'; KEY_VALUE_SET blocks must carry
            'EntityTypes'.

    Raises:
        KeyError: If one of the required keys listed above is missing.
    """
    print('Id: {}'.format(block['Id']))
    if 'Text' in block:
        print('    Detected: ' + block['Text'])
    print('    Type: ' + block['BlockType'])

    if 'Confidence' in block:
        print('    Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")

    if block['BlockType'] == 'CELL':
        print("    Cell information")
        print("        Column:" + str(block['ColumnIndex']))
        print("        Row:" + str(block['RowIndex']))
        print("        Column Span:" + str(block['ColumnSpan']))
        # Report the row span from its own field (previously ColumnSpan was printed here).
        print("        RowSpan:" + str(block['RowSpan']))

    if 'Relationships' in block:
        print('    Relationships: {}'.format(block['Relationships']))
    print('    Geometry: ')
    print('        Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
    print('        Polygon: {}'.format(block['Geometry']['Polygon']))

    if block['BlockType'] == "KEY_VALUE_SET":
        print('    Entity Type: ' + block['EntityTypes'][0])
    if 'Page' in block:
        # format() instead of '+' so a numeric page value does not raise TypeError.
        print('Page: {}'.format(block['Page']))
    print()
    

def DrawBoundingBox(draw,box,width,height,boxColor):
    """Outline a bounding box on a drawing surface.

    The box's 'Left'/'Width' values are multiplied by ``width`` and its
    'Top'/'Height' values by ``height`` to obtain the rectangle corners,
    which are then passed to ``draw.rectangle`` with ``boxColor`` as outline.
    """
    x0 = width * box['Left']
    y0 = height * box['Top']
    x1 = x0 + (width * box['Width'])
    y1 = y0 + (height * box['Height'])
    draw.rectangle([x0, y0, x1, y1], outline=boxColor)


def get_kv_relationship(key_map, value_map, block_map):
    """Build a ``{key text: value text}`` dict from the key blocks.

    For every block in ``key_map`` the linked value block is looked up with
    ``find_value_block`` and both blocks are turned into text with
    ``get_text``. Keys that produce the same text overwrite each other, so
    the last one wins.
    """
    pairs = {}
    for key_block in key_map.values():
        linked_value = find_value_block(key_block, value_map)
        key_text = get_text(key_block, block_map)
        pairs[key_text] = get_text(linked_value, block_map)
    return pairs


def find_value_block(key_block, value_map):
    """Return the value block linked to ``key_block``.

    Args:
        key_block: A key block dict; its optional 'Relationships' list is
            scanned for entries whose 'Type' is 'VALUE'.
        value_map: Mapping of block Id to value block.

    Returns:
        The block from ``value_map`` for the last Id listed in a VALUE
        relationship. When the key block has no 'Relationships' entry or no
        VALUE relationship, an empty dict is returned, which ``get_text``
        turns into an empty string.

    Raises:
        KeyError: If a referenced Id is not present in ``value_map``.
    """
    # Start from an empty block so a key without a value no longer raises
    # UnboundLocalError at the return statement.
    value_block = {}
    for relationship in key_block.get('Relationships', []):
        if relationship['Type'] == 'VALUE':
            for value_id in relationship['Ids']:
                value_block = value_map[value_id]
    return value_block


def get_text(result, blocks_map):
    """Concatenate the text of the WORD children of ``result``.

    Every Id listed in a CHILD relationship is looked up in ``blocks_map``;
    children whose 'BlockType' is 'WORD' contribute their 'Text' followed by
    a single space (so a non-empty result ends with a space). A block
    without 'Relationships' yields an empty string.
    """
    if 'Relationships' not in result:
        return ''

    pieces = []
    for relationship in result['Relationships']:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_map[child_id]
            if child['BlockType'] == 'WORD':
                pieces.append(child['Text'] + ' ')
    return ''.join(pieces)


def print_kvs(kvs):
    """Print each entry of ``kvs`` on its own line as ``key : value``."""
    for entry in kvs.items():
        label, content = entry
        print(label, ":", content)


def search_value(kvs, search_key):
    """Return the value of the first key that matches ``search_key``.

    ``search_key`` is used as a regular expression and matched
    case-insensitively anywhere in each key, in the dict's iteration order.
    Returns None when no key matches.
    """
    matching_values = (
        value
        for key, value in kvs.items()
        if re.search(search_key, key, re.IGNORECASE)
    )
    return next(matching_values, None)


def get_rows_columns_map(table_result, blocks_map):
    """Arrange the text of a table's cells as ``{row index: {column index: text}}``.

    Each Id in the table's CHILD relationships is looked up in
    ``blocks_map``; blocks whose 'BlockType' is 'CELL' are placed in the
    result under their 'RowIndex' and 'ColumnIndex', with the cell text
    obtained from ``get_text``.
    """
    grid = {}
    for relationship in table_result['Relationships']:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            candidate = blocks_map[child_id]
            if candidate['BlockType'] != 'CELL':
                continue
            row_index = candidate['RowIndex']
            col_index = candidate['ColumnIndex']
            # setdefault creates the row dict the first time a row is seen.
            grid.setdefault(row_index, {})[col_index] = get_text(candidate, blocks_map)
    return grid


def get_text(result, blocks_map):
    """Join the 'Text' of every WORD child of ``result``, each followed by a space.

    Returns an empty string when ``result`` has no 'Relationships' entry.
    """
    # NOTE(review): this cell defines get_text twice with the same logic; this
    # later definition is the one that stays bound after the cell runs.
    if 'Relationships' not in result:
        return ''

    child_ids = (
        child_id
        for relationship in result['Relationships']
        if relationship['Type'] == 'CHILD'
        for child_id in relationship['Ids']
    )
    children = (blocks_map[child_id] for child_id in child_ids)
    return ''.join(
        child['Text'] + ' ' for child in children if child['BlockType'] == 'WORD'
    )


def generate_table_csv(table_result, blocks_map):
    """Render one Textract table as CSV text.

    Args:
        table_result: The TABLE block to render.
        blocks_map: Mapping of block Id to block, used to resolve the table's
            cells and their words.

    Returns:
        One CSV line per table row (rows and columns in ascending index
        order, trailing whitespace stripped from every cell), followed by
        three blank lines. Cells containing commas, quotes or newlines are
        quoted so they stay in a single column when the text is parsed.
    """
    # Imported locally on purpose: the notebook binds the global name `csv`
    # to the string returned by this function, which would shadow a
    # module-level `import csv` on the next call.
    import csv

    rows = get_rows_columns_map(table_result, blocks_map)

    buffer = StringIO()
    writer = csv.writer(buffer, lineterminator='\n')
    for row_index in sorted(rows):
        cols = rows[row_index]
        writer.writerow([cols[col_index].rstrip() for col_index in sorted(cols)])

    return buffer.getvalue() + '\n\n\n'


---
Use Textract to analyze the `CV.png` document.

In [None]:
# Read the sample document into an in-memory buffer so the same bytes can be
# opened as an image and sent to Textract.
with open('CV.png', 'rb') as file: 
    stream = io.BytesIO(file.read())

image=Image.open(stream)

# Show the document before any bounding boxes are drawn on it.
display(image)

# Get Textract client object
client = boto3.client('textract', region_name='us-east-1')
                     
# Send the raw image bytes and request both table and form extraction;
# `response` is consumed by the cells below.
image_binary = stream.getvalue()
response = client.analyze_document(Document={'Bytes': image_binary}, FeatureTypes=["TABLES", "FORMS"])

---
List all the Key/Value pairs identified in the document.

The image displayed below will have a red bounding box around each key and a green bounding box around each value. Similarly, the tables will have blue bounding boxes.

In [None]:
# Get the text blocks
blocks = response['Blocks']
width, height = image.size
# A single drawing context is reused for every shape drawn onto the image below.
draw = ImageDraw.Draw(image)

# Obtain Key-Value map information: index every block by Id and split the
# KEY_VALUE_SET blocks into keys and values.
key_map = {}
value_map = {}
block_map = {}
for block in blocks:
    block_id = block['Id']
    block_map[block_id] = block
    if block['BlockType'] == "KEY_VALUE_SET":
        if 'KEY' in block['EntityTypes']:
            key_map[block_id] = block
        else:
            value_map[block_id] = block

# Get Key Value relationship
kvs = get_kv_relationship(key_map, value_map, block_map)
print("\n\n== FOUND KEY : VALUE pairs ===\n")
print_kvs(kvs)
print('')

# Annotate the image: keys in red, values in green, tables in blue, and
# cells with a yellow bounding box plus a blue polygon outline.
for block in blocks:

#     DisplayBlockInformation(block)

    block_type = block['BlockType']
    if block_type == "KEY_VALUE_SET":
        # Same KEY test as in the mapping loop above, so both loops agree.
        box_color = 'red' if 'KEY' in block['EntityTypes'] else 'green'
        DrawBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, box_color)
    elif block_type == 'TABLE':
        DrawBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, 'blue')
    elif block_type == 'CELL':
        DrawBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, 'yellow')

        # Scale the polygon points by the image size before drawing them.
        points = [
            (width * point['X'], height * point['Y'])
            for point in block['Geometry']['Polygon']
        ]
        draw.polygon(points, outline='blue')

# Display the image
display(image)

---
Now let's try to lookup the values based on the key.

For example, search for `Mobile`, `Nationality`, etc. When you are done, enter **n** to exit 

In [None]:
# Keep prompting until the user answers exactly "n"; any other answer starts a lookup.
while True:
    answer = input('\n Do you want to search a value for a key? (enter "n" for exit) ')
    if answer == 'n':
        break
    search_key = input('\n Enter a search key:')
    print('The value is:', search_value(kvs, search_key))

---
Now let's display the tables obtained from the document.

In [None]:
# Get the text blocks
blocks = response['Blocks']

# Index every block by Id and collect the TABLE blocks.
blocks_map = {}
table_blocks = []
for block in blocks:
    blocks_map[block['Id']] = block
    if block['BlockType'] == "TABLE":
        table_blocks.append(block)

# Created before the check so `tables` exists even when no table is found.
tables = {}
if len(table_blocks) <= 0:
    print("NO Table FOUND")
else:
    # Tables are stored under 'table_1', 'table_2', ... in detection order.
    for index, table in enumerate(table_blocks):
        table_csv = generate_table_csv(table, blocks_map)
        tables['table_' + str(index + 1)] = pd.read_csv(StringIO(table_csv))

# Show every table that was detected instead of assuming exactly three.
for table_number in range(1, len(tables) + 1):
    print('Data from Table {}'.format(table_number))
    display(tables['table_' + str(table_number)])

Now let's filter the table and display results based on the condition.

1. Display the jobs the candidate started after 2005; the filter is strict, so a job starting in 2005 itself is excluded. (`tables['table_2'][tables['table_2']['From'] > 2005]`)
2. Display the education for which the candidate holds a diploma. (`tables['table_1'][tables['table_1']['Degree'] == 'Yes']`)


In [None]:
print('Filtering data from specific table')
# Keep only the rows of the second table whose 'From' value is greater than 2005.
jobs_table = tables['table_2']
started_after_2005 = jobs_table['From'] > 2005
display(jobs_table[started_after_2005])