# Using Textract to get a confidence score

In [9]:
import base64
import json
import re
import time
from io import BytesIO

import boto3
import pandas as pd
import requests
from botocore.config import Config
from botocore.exceptions import ClientError
from IPython.display import display, Image

# Retry policy for AWS calls: up to 10 attempts using botocore's adaptive mode.
my_config = Config(retries={'max_attempts': 10, 'mode': 'adaptive'})

# Shared Bedrock runtime client used by every model invocation below. The
# retry policy is passed here so that it actually takes effect; previously
# my_config was constructed but never handed to any client.
runtime = boto3.client("bedrock-runtime", config=my_config)

# Module-level S3 client (the functions below also build their own).
s3_client = boto3.client('s3')

In [134]:
def display_image(bucket_name, object_key):
    """Fetch an object from S3 and render it inline as an image.

    Args:
        bucket_name: Name of the S3 bucket that holds the image.
        object_key: Key of the image object inside the bucket.
    """
    s3 = boto3.client('s3')
    s3_object = s3.get_object(Bucket=bucket_name, Key=object_key)
    # Read the whole body so IPython receives the raw image bytes.
    display(Image(data=s3_object['Body'].read()))

def get_textract_op(bucket, key):
    """Run asynchronous Textract text detection on an S3 document.

    Starts a text-detection job, polls every 5 seconds until the job is no
    longer in progress, then gathers every result page into one response.

    Args:
        bucket: Name of the S3 bucket containing the document.
        key: Key of the document object inside the bucket.

    Returns:
        The ``get_document_text_detection`` response dict, with ``'Blocks'``
        holding the blocks from all result pages.

    Raises:
        RuntimeError: If Textract reports the job as ``FAILED``.
    """
    print(f"Starting Textract for {key} in {bucket}")
    textract = boto3.client('textract')
    start_response = textract.start_document_text_detection(
        DocumentLocation={'S3Object': {'Bucket': bucket, 'Name': key}}
    )
    job_id = start_response['JobId']

    # Poll until the job leaves IN_PROGRESS. Testing for "not in progress"
    # (rather than only SUCCEEDED/FAILED) keeps a PARTIAL_SUCCESS job from
    # looping forever.
    while True:
        response = textract.get_document_text_detection(JobId=job_id)
        job_status = response['JobStatus']
        if job_status != 'IN_PROGRESS':
            break
        time.sleep(5)  # Wait before checking again

    if job_status == 'FAILED':
        # Fail loudly here instead of handing back a response with no
        # 'Blocks', which would only surface later as an opaque KeyError.
        status_message = response.get('StatusMessage', 'no status message returned')
        raise RuntimeError(
            f"Textract job {job_id} failed for {key} in {bucket}: {status_message}"
        )
    if job_status != 'SUCCEEDED':
        print(f"Textract job {job_id} finished with status {job_status}")

    # Results can be paginated: follow NextToken so no blocks are dropped.
    blocks = list(response.get('Blocks', []))
    next_token = response.get('NextToken')
    while next_token:
        page = textract.get_document_text_detection(
            JobId=job_id, NextToken=next_token
        )
        blocks.extend(page.get('Blocks', []))
        next_token = page.get('NextToken')
    response['Blocks'] = blocks
    # The token is stale once every page has been merged.
    response.pop('NextToken', None)

    print('Received Textract response')
    return response



def refine_json(json_data):
    """Summarise the LINE blocks of a Textract response as plain text.

    Args:
        json_data: Textract response dict. It must contain a ``'Blocks'``
            list; each block needs ``'BlockType'``, and LINE blocks also
            need ``'Text'`` and ``'Confidence'``.

    Returns:
        One ``Text: <text>, Accuracy_score: <confidence>`` entry per LINE
        block, in block order, joined by newlines. Returns an empty string
        when there are no LINE blocks.
    """
    # Iterate the blocks directly; only LINE blocks are read for their text
    # and confidence, so other block types may lack those keys.
    line_summaries = [
        f"Text: {block['Text']}, Accuracy_score: {block['Confidence']}"
        for block in json_data['Blocks']
        if block['BlockType'] == 'LINE'
    ]
    return '\n'.join(line_summaries)


def get_claude_op(bucket_name,object_key):
    """Ask Claude 3 Sonnet (via Bedrock) to extract envelope addresses from an image.

    The image is downloaded from S3 through a presigned URL, base64-encoded
    and sent together with an extraction prompt that requests a JSON answer
    containing the sender and recipient addresses.

    Args:
        bucket_name: Name of the S3 bucket that holds the envelope image.
        object_key: Key of the image object. The request declares the media
            type as ``image/png``.

    Returns:
        A ``(text, usage)`` tuple: the text of the first content item in the
        model response, and the response's ``usage`` entry.

    Raises:
        requests.HTTPError: If downloading the image returns an HTTP error
            status.
    """
    prompt = '''
            <role>
            ROLE :  You are tasked with extracting the sender and receiver's addresses from the provided context of an envelope. This involves analyzing the text to identify postal addresses based on the sequence of zip codes and specific clues within the text.
            </role>
            
            <instruction>
            Instructions for Extracting Sender and Receiver Addresses:
            Begin with a Detailed Analysis: Start by examining the text from the beginning, noting all postal addresses and associated zip codes as they appear.
            
            Sender Address Cue: Sender address will be usually be at the top right hand corner. Try to capture all the deatils like sender name, address, city, state and zipcode. try to keep the sender name as it is and do not skip or manipulate any line.
            
            Receiver Address Cue: Receiver address will be in the bottom. Try to capture all the deatils like sender name, address, city, state and zipcode. Try to keep the receiver name as it is and do not skip or manipulate any line. 'Attn' can be a part of receiver name
            
            </instruction>
            
            
            <note>
            NOTE :
            i don't need this line and any line similar to this Based on the provided context --> "I have identified the sender and receiver addresses as follows:"
            Please adhere to the following output format:
            do not provide explanation for your answer\
            Do not add anything extra apart from JSON output not even a single line\
            Do not add any extra line before or after the JSON output\
            make you add quotes " as mentioned in the output format
            </note>
            
            <output format>
            {
              "Sender Address": {
                "SENDER NAME": "{sender name}",
                "SENDER ADDRESS": "{sender address}",
                "SENDER CITY": "{city}",
                "SENDER STATE": "{state}",
                "SENDER ZIP": "{zip}"
              },
              "Recipient Address": {
                "RECIPIENT NAME": "{recipient name}",
                "RECIPIENT ADDRESS": "{recipient address}",
                "RECIPIENT CITY": "{city}",
                "RECIPIENT STATE": "{state}",
                "RECIPIENT ZIP": "{zip}"
              }
            }
            
            </output format>
            
            '''

    s3_client = boto3.client('s3')
    image_path = s3_client.generate_presigned_url(
        'get_object',
        Params={'Bucket': bucket_name, 'Key': object_key},
        ExpiresIn=900,
    )

    # Download the image once. Fail on an HTTP error instead of base64-encoding
    # an error body and sending it to the model as if it were the image.
    image_response = requests.get(image_path, timeout=60)
    image_response.raise_for_status()
    encoded_image = base64.b64encode(image_response.content).decode("utf8")

    body = json.dumps(
        {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 1000,
            "temperature": 0,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": encoded_image,
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ],
        }
    )

    model_response = runtime.invoke_model(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        body=body
    )
    result = json.loads(model_response.get("body").read())

    usage = result['usage']
    answer_text = result['content'][0]['text']
    return answer_text, usage



def call_text_claude(prompt):
    """Send a text-only prompt to Claude 3 Sonnet on Bedrock.

    Args:
        prompt: Text placed in a single user message.

    Returns:
        A ``(text, usage)`` tuple: the text of the first content item in the
        model response, and the response's ``usage`` entry.
    """
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
    payload = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1000,
        "temperature": 0,
        "messages": [user_message],
    }

    raw_reply = runtime.invoke_model(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        body=json.dumps(payload),
    )
    # The reply body exposes read(); decode it into a dict.
    parsed = json.loads(raw_reply.get("body").read())

    reply_text = parsed['content'][0]['text']
    token_usage = parsed['usage']
    return reply_text, token_usage


def get_string_match_per(textract_json_op, claude_json_op):
    """Ask Claude to score how well the extracted addresses match the Textract text.

    Builds a prompt containing both texts and requests a JSON answer with a
    "Confidence Score" and an "Explanation".

    Args:
        textract_json_op: Textract text summary (a string) to compare against.
        claude_json_op: Address extraction text (a string) produced by Claude.

    Returns:
        The ``(text, usage)`` tuple returned by ``call_text_claude``.
    """
    # The <instruction> sentence had lost its referent ("address mentioned in
    # <blank> and claude output"); it now names the textract output, in line
    # with the <query> section above it.
    prompt = '''
    <query>
    Here is the textract output:
    ''' + textract_json_op + '''
    
    And here is the Claude output:
    ''' + claude_json_op + '''
    Tell me confidence score of claude output when it is matched with textract output
    </query>

    
    <instruction>
    Confidence score will be text match between address mentioned in textract output and claude output
    make sure you match sender and receiver address and then provoide the confidenc score
    </instruction>
    
    <note>
    Make sure you match word by word correctly for all the values of claude 
    There are 10 values of claude to checked
    </note>
    
    <output format>
    {
      "Confidence Score": "{Confidence Score in percentage}",
      "Explanation": "{Explanation for confidence score}"
    }
    </output format>

    '''
    return call_text_claude(prompt)


def get_final_op(textract_json_op, claude_json_op, explnation, bucket_name ,object_key):
    """Ask Claude to correct its address extraction using the Textract text.

    Sends the envelope image together with the Textract text, the earlier
    Claude output and the low-confidence explanation, and requests a JSON
    answer with corrected addresses, a string-match score and an explanation.

    Args:
        textract_json_op: Textract text summary (a string).
        claude_json_op: The initial address extraction text from Claude.
        explnation: Text describing why the confidence score is low.
        bucket_name: Name of the S3 bucket that holds the envelope image.
        object_key: Key of the image object. The request declares the media
            type as ``image/png``.

    Returns:
        A ``(text, usage)`` tuple: the text of the first content item in the
        model response, and the response's ``usage`` entry.

    Raises:
        requests.HTTPError: If downloading the image returns an HTTP error
            status.
    """
    # The final tag of the prompt is a closing </output format>; it used to be
    # a second opening tag, which left the section unterminated.
    prompt = '''
    <query>
    Claude output confidence is low
    Your task is to improve the confidence score of claude output by replacing the textract value into claude output.
    I know the reason why confidence score is low, I have provided the reason in the below text.
    Make use you use reason to improve the confidence score
    you need to replace the correct text from textract output to claude output to improve string match percentage 

    HERE IS TEXTRACT OUTPUT
    '''+textract_json_op+'''

    HERE IS CLAUDE OUTPUT
    '''+claude_json_op+'''

    AND HERE IS REASON FOR LOW CONFIDENCE
    '''+explnation+'''

    Follow json format to provide the output
    </query>
    
    
    <instruction>
    - spelling of Name, address, state,city, zip, of sender/receiver might have typo error in claude output try to find similar correct text in textract
    - Refer improved claude output as 'NEW CLAUDE OUTPUT' in explnation
    </instruction>

    
    <think>
    you need understand to remove word like 'from' in sender's name
    </think>

    <output format>
    {
      "Sender Address": {
        "SENDER NAME": "{sender name}",
        "SENDER ADDRESS": "{sender address}",
        "SENDER CITY": "{city}",
        "SENDER STATE": "{state}",
        "SENDER ZIP": "{zip}"
      },
      "Recipient Address": {
        "RECIPIENT NAME": "{recipient name}",
        "RECIPIENT ADDRESS": "{recipient address}",
        "RECIPIENT CITY": "{city}",
        "RECIPIENT STATE": "{state}",
        "RECIPIENT ZIP": "{zip}"
      },
      "confidenc score": {
        "STRING MATCH": "{ string match % between NEW CLAUDE OUTPUT  and textract output}"
      },
      "Explain": {
        "Explanation": "{ provide explnation on confidence score, also tell which value is affecting the accuracy }"
      }
    }

    </output format>
    
    '''

    s3_client = boto3.client('s3')
    image_path = s3_client.generate_presigned_url(
        'get_object',
        Params={'Bucket': bucket_name, 'Key': object_key},
        ExpiresIn=900,
    )

    # Download the image once. Fail on an HTTP error instead of base64-encoding
    # an error body and sending it to the model as if it were the image.
    image_response = requests.get(image_path, timeout=60)
    image_response.raise_for_status()
    encoded_image = base64.b64encode(image_response.content).decode("utf8")

    body = json.dumps(
        {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 1000,
            "temperature": 0,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": encoded_image,
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ],
        }
    )

    model_response = runtime.invoke_model(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        body=body
    )
    result = json.loads(model_response.get("body").read())

    usage = result['usage']
    answer_text = result['content'][0]['text']
    return answer_text, usage


In [135]:
def pass_png(bucket, key):
    """Run the whole address-extraction pipeline for one image and print a report.

    Steps: Textract text detection, initial address extraction by Claude,
    confidence scoring against the Textract text, and a final corrected
    extraction. The image, all intermediate outputs and the summed token
    usage are then displayed or printed. Nothing is returned.

    Args:
        bucket: Name of the S3 bucket that holds the image.
        key: Key of the image object inside the bucket.
    """
    textract_json_op = refine_json(get_textract_op(bucket, key))

    # First prompt: extract the sender and recipient addresses from the image.
    claude_json_op, extraction_usage = get_claude_op(bucket, key)

    # Second prompt: score the extraction against the Textract text.
    string_match_per, scoring_usage = get_string_match_per(
        textract_json_op, claude_json_op
    )

    # Third prompt: produce the corrected final output.
    final_op, correction_usage = get_final_op(
        textract_json_op, claude_json_op, string_match_per, bucket, key
    )

    usage_records = (extraction_usage, scoring_usage, correction_usage)

    display_image(bucket, key)
    print('*** TEXTRACT TEXT ****\n', textract_json_op, '\n')
    print('*** INITIAL CLAUDE OUTPUT ****\n', claude_json_op, '\n')
    print('*** INITIAL CONFIDENCE ***', string_match_per, '\n')
    print('*** FINAL OUTPUT ***', final_op, '\n\n')

    # Add up token usage across the three model calls.
    total_input_tokens = sum(record['input_tokens'] for record in usage_records)
    total_output_tokens = sum(record['output_tokens'] for record in usage_records)
    print('*** TOTAL INPUT TOKEN ***', total_input_tokens, '\n\n')
    print('*** TOTAL OUTPUT TOKEN ***', total_output_tokens, '\n\n')

    

In [None]:
# S3 location of the image to process. NOTE(review): these look like
# placeholder values; replace them with a real bucket and key before running.
bucket, key = 'bucket_name', 'folder_path/file_name.png'
pass_png(bucket, key)