"""
Install the boto3 package (the AWS SDK for Python) from the conda-forge channel:  
  
conda create -n aws-cloud  
conda activate aws-cloud  
conda install -c conda-forge boto3 ipykernel jupyterlab notebook python=3.12.0  
To register the environment as a new kernel in JupyterLab:  
python -m ipykernel install --user --name=aws-cloud  
  
repo: https://github.com/TechExpertTutorials/aws-textract-solution  
  
Set up credentials:  

Credential setup instructions: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html
  
Add the following to ~/.aws/credentials:  
  
[default]  
aws_access_key_id = YOUR_ACCESS_KEY_ID  
aws_secret_access_key = YOUR_SECRET_ACCESS_KEY  

"""


In [1]:
import boto3
from textract_functions import get_lines, get_kv_map

In [2]:
# Create the Textract client. No keys are passed here, so boto3 looks up
# credentials and region itself (environment variables, ~/.aws/credentials,
# ~/.aws/config, ...).
# If no shared credentials file is configured, the client can instead be built
# with explicit arguments -- never commit real keys to the notebook:
# client = boto3.client('textract',
#                       region_name='<aws-region>',
#                       aws_access_key_id='<Your access key here>',
#                       aws_secret_access_key='<Your secret access key here>')
client = boto3.client('textract')

In [8]:
# Load the sample image from disk as raw bytes.
with open('dl2.png', 'rb') as file:
    img_test = file.read()

# The file is already closed at this point; only the bytes read above are
# needed to build the mutable copy that is sent to Textract.
bytes_test = bytearray(img_test)

AWS Text Extraction Hierarchy:  
Page -> Line -> Word

In [9]:
# Submit the image bytes to Textract's AnalyzeDocument operation, asking for
# table analysis, and keep the full response dict for the following cells.
response = client.analyze_document(
    Document={'Bytes': bytes_test},
    FeatureTypes=['TABLES'],
)

# Dump the raw response (document metadata plus every detected block).
print(response)

{'DocumentMetadata': {'Pages': 1}, 'Blocks': [{'BlockType': 'PAGE', 'Geometry': {'BoundingBox': {'Width': 0.9902695417404175, 'Height': 0.9867692589759827, 'Left': 0.0, 'Top': 0.004162239376455545}, 'Polygon': [{'X': 0.0, 'Y': 0.004162239376455545}, {'X': 0.9902695417404175, 'Y': 0.008060899563133717}, {'X': 0.9895803332328796, 'Y': 0.990931510925293}, {'X': 0.0, 'Y': 0.9899323582649231}]}, 'Id': '52fe0f7c-d0b6-41bc-8891-fa244b654f84', 'Relationships': [{'Type': 'CHILD', 'Ids': ['a5961c45-c25d-4dd9-805d-1a08c94dd8a6', '996cb337-2184-41b5-a88c-4eb0cca4440c', '49187ec2-d813-4e74-900d-78597f97ebea', 'd462d963-1006-439b-85c7-9f25c628cc72', '9a6f3146-03b7-40ac-8110-7a94870f9493', '961fdc9b-a0ae-45eb-8497-3228dd08c9bd', 'e7f02e8c-ba8c-41ca-93d9-2013c60e0eec', 'e3a20614-845e-40fb-83c4-b6f7266824ba', '5e592b9e-480e-456c-9e07-1f052875447f', '4620eafe-bd73-4ee8-a616-e6f6197993b0', '6849f9f8-69e4-4402-b8b4-fd18c2c2773e', '1c208e78-1313-4aaa-9f52-9aedd94ae7fd', 'dceec441-8572-4c00-884e-9ad5da64b69

In [10]:
# Collect the text of every WORD block, one word per output line.
blocks = response['Blocks']
word_texts = []
for block in blocks:
    # Skip everything that is not a single detected word.
    if block['BlockType'] != 'WORD':
        continue
    word_texts.append(block['Text'] + "\n")

text = "".join(word_texts)
print(text)

California
USA
DRIVER
LICENSE
I1234568
CLASS
C
DL
EXP
01/01/2024
END
NONE
LNCARDHOLDER
on
FNIMA
SAMPLE
2570
24TH
STREET
ANYTOWN,
CA
95818
DOB
08/31/1977
RSTR
NONE
08311977
DONOR
VETERAN
IMa
Cardholder
SEX
X
HAIR
BRN
EYES
BRN
HGT
5'-05"
WGT
125
lb
ISS
DD
00/00/0000NNNAN/ANFD/YY
01/01/2019



In [11]:
# print(get_kv_map(blocks))

In [12]:
# get_lines is imported from the local textract_functions module; its result is
# unpacked into three values. `lines` and `confidences` are later indexed with
# the same keys.
# NOTE(review): `blocks` is rebound to the helper's second return value, so
# re-running this cell passes that value (not response['Blocks']) back into
# get_lines -- confirm this is intended / harmless.
lines, blocks, confidences = get_lines(blocks)

In [13]:
# Report each detected line with its confidence rounded to a whole number.
# `lines` and `confidences` are looked up with the same key.
for line in lines:
    line_text = lines[line]
    confidence_pct = round(confidences[line])
    print(f"Text: {line_text}  --  Confidence: {confidence_pct} pct")

Text: California  --  Confidence: 100 pct
Text: USA  --  Confidence: 85 pct
Text: DRIVER LICENSE  --  Confidence: 100 pct
Text: I1234568  --  Confidence: 51 pct
Text: CLASS C  --  Confidence: 87 pct
Text: DL  --  Confidence: 100 pct
Text: EXP  --  Confidence: 100 pct
Text: 01/01/2024  --  Confidence: 100 pct
Text: END NONE  --  Confidence: 100 pct
Text: LNCARDHOLDER  --  Confidence: 93 pct
Text: on  --  Confidence: 15 pct
Text: FNIMA SAMPLE  --  Confidence: 99 pct
Text: 2570 24TH STREET  --  Confidence: 100 pct
Text: ANYTOWN, CA 95818  --  Confidence: 88 pct
Text: DOB 08/31/1977  --  Confidence: 100 pct
Text: RSTR NONE  --  Confidence: 100 pct
Text: 08311977  --  Confidence: 100 pct
Text: DONOR  --  Confidence: 93 pct
Text: VETERAN  --  Confidence: 100 pct
Text: IMa Cardholder  --  Confidence: 61 pct
Text: SEX X  --  Confidence: 91 pct
Text: HAIR BRN  --  Confidence: 100 pct
Text: EYES BRN  --  Confidence: 100 pct
Text: HGT 5'-05"  --  Confidence: 98 pct
Text: WGT 125 lb  --  Confidenc