# Table Extraction with Anthropic's Claude Sonnet Model 
This notebook demonstrates how we might perform OCR table extraction with the Claude Sonnet model.

Before starting, you should check that you have set your API key using an environmental variable `ANTHROPIC_API_KEY` or you will be prompted to set it. See [the Anthropic documentation](https://docs.anthropic.com/en/docs/quickstart) for details.

In [1]:
# Import all libraries used in this notebook
import base64
import getpass
import io
import os

import anthropic
from PIL import Image


In [2]:
API_KEY_VAR = "ANTHROPIC_API_KEY"
if not API_KEY_VAR in os.environ:
    print("API key was not set. Enter it in the prompt.")
    os.environ[API_KEY_VAR] = getpass.getpass("Submit your Claude API key")
    print("API key should now be set.")
else:
    print("API key was already set.")


API key was not set. Enter it in the prompt.
API key should now be set.


In [3]:
# Choose the image you will use for testing
ORIGINAL_TEST_IMG_PATH = "../data/MSF_data/Sheet 01 (1).jpg"

# A temporary file for the resized image that will be sent
TEMP_IMG_PATH = "tmp.jpg"

# Check that constant values for estimating cost are accurate
# For estimating costs, the cost per million tokens in dollars
ANTHROPIC_MODEL = "claude-3-5-sonnet-20240620"
COST_PER_MILL_INPUT_TOKS = 3.0
COST_PER_MILL_OUTPUT_TOKS = 15.0

OUPUT_SIZE_LIMIT = 1024 # Max desired output tokens
MAXIMUM_PIXEL_LENGTH = 1568 # Maximum pixel edge length allowed by Claude
MAXIMUM_BYTES = 5242880 # Maximum image upload size allowed by Claude

In [4]:
def estimate_img_tokens(pixel_width, pixel_height):
    return (pixel_height * pixel_width)/750

def estimate_total_cost_by_tokens(estimated_num_tokens, cost_per_million_tokens):
    cost_per_token = cost_per_million_tokens * 1e-9
    return estimated_num_tokens * cost_per_token


In [6]:
# This reads in the test image, resizes and encodes it appropriately for Claude.
# It also prints out Anthropic cost estimates for working with similar images.
with Image.open(ORIGINAL_TEST_IMG_PATH) as test_img:
    print("Image pixel size:", test_img.size)
    img_to_send = test_img.copy()
    img_to_send.thumbnail((MAXIMUM_PIXEL_LENGTH, MAXIMUM_PIXEL_LENGTH))
    img_as_bytes = io.BytesIO()
    img_to_send.save(img_as_bytes, "jpeg")
    encoded_img = base64.b64encode(img_as_bytes.getvalue()).decode("utf-8")

byte_size = encoded_img.__sizeof__()

print("Image size in bytes:", byte_size)
current_max_pixel_length = MAXIMUM_PIXEL_LENGTH
while byte_size > MAXIMUM_BYTES:
    quality_scale = MAXIMUM_BYTES/byte_size
    print("Resizing image at", quality_scale, "percent")
    current_max_pixel_length = int(quality_scale * current_max_pixel_length)
    img_to_send.thumbnail((current_max_pixel_length, current_max_pixel_length))   
    img_to_send.save(TEMP_IMG_PATH)
    img_as_bytes = io.BytesIO()
    img_to_send.save(img_as_bytes, "jpeg")
    encoded_img = base64.b64encode(img_as_bytes.getvalue()).decode("utf-8")
    byte_size = encoded_img.__sizeof__()    
    print("Resized Image pixel size:", img_to_send.size)
    print("Resized Image size in bytes:", byte_size)
    

input_cost = estimate_total_cost_by_tokens(estimate_img_tokens(*img_to_send.size), COST_PER_MILL_INPUT_TOKS)
print("Estimated input cost for this image:", input_cost)
output_cost = estimate_total_cost_by_tokens(OUPUT_SIZE_LIMIT, COST_PER_MILL_OUTPUT_TOKS)
print("Estimated output cost for this image:", output_cost)
print("Total estimated cost for 1000 images:", (input_cost + output_cost)*1000)

    

Image pixel size: (3000, 3869)
Image size in bytes: 246725
Estimated input cost for this image: 7.626752000000002e-06
Estimated output cost for this image: 1.5360000000000002e-05
Total estimated cost for 1000 images: 0.022986752000000003


In [7]:
anthropic_client = anthropic.Anthropic()


message = anthropic_client.messages.create(
    model=ANTHROPIC_MODEL, 
    max_tokens=OUPUT_SIZE_LIMIT,
    messages = [
            {
            "role":"user", 
            "content":[
                {
                    "type":"image",
                    "source": {
                        "type": "base64", 
                        "media_type": "image/jpeg",
                        "data": encoded_img,
                    },
                },
                {
                    "type":"text",
                    "text":"Identify the tables in the image and return them as a json object, where headers and row labels are keys in the json. Return only correctly formatted json. "
                },
            ],
        }
    ]
)

In [8]:
print(message)

Message(id='msg_01EV5AiHwwZYx5qbDsngUdsY', content=[TextBlock(text='Here\'s the JSON representation of the tables in the image:\n\n{\n  "Total consultations": {\n    "Total outpatient SRH consultations": "682+1"\n  },\n  "ANC - diagnostics": {\n    "Pop1: Resident": {\n      "First ANC visit this pregnancy": "65",\n      "Follow-up ANC visit": "178",\n      "Syphilis number tested": "65",\n      "Syphilis number positive": "",\n      "HIV number tested (First test in ANC)": "",\n      "HIV number positive (First test in ANC)": "",\n      "HIV number tested (later visit)": "",\n      "HIV number positive (later visit)": "",\n      "Hep B number tested": "66",\n      "Hep B number positive": "",\n      "Haemoglobin number tested": "65",\n      "Haemoglobin number anaemia": "18",\n      "Haemoglobin number severe anaemia": ""\n    },\n    "Pop2: Displaced": {\n      "First ANC visit this pregnancy": "05",\n      "Follow-up ANC visit": "08",\n      "Syphilis number tested": "05",\n      "S