In [1]:
import os
from dotenv import load_dotenv
import boto3
import json
import base64
import threading
from typing import List

from aws_summary_template import SUMMARY_TEMPLATE

# Load settings from a local .env file (python-dotenv) before any os.getenv()
# lookups; presumably this is where the AWS credentials read below live.
load_dotenv()

True

In [2]:
# Region the Bedrock runtime client talks to.
AWS_REGION_NAME = 'us-west-2'

# Credentials come from the process environment; either value is None when
# the corresponding variable is not set.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Runtime client used for the `converse` calls made further down.
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime.html
bedrock = boto3.client(
    'bedrock-runtime',
    region_name=AWS_REGION_NAME,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [3]:
# Troubleshooting: to list the available foundation models, use a separate client with service_name 'bedrock' instead of 'bedrock-runtime'.
# https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock.html

# client = boto3.client(
#     service_name='bedrock',
#     aws_access_key_id=aws_access_key_id,
#     aws_secret_access_key=aws_secret_access_key,
#     region_name=AWS_REGION_NAME
# )

# summ = client.list_foundation_models()['modelSummaries']
# [model for model in summ if 'Sonnet' in model['modelName']]

In [4]:
# [m for m in dir(bedrock) if not m.startswith('_')]
# help(bedrock.converse)

In [5]:
# PDF that is sent to the model as a document attachment.
filename = "IT Consulting in the US.pdf"

# Read the raw bytes inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(filename, 'rb') as pdf_file:
    content = pdf_file.read()

# Base64 form of the same bytes.
# NOTE(review): nothing in the visible cells reads `encoded` (the request uses
# the raw `content` bytes); kept in case a later cell depends on it.
encoded = base64.b64encode(content)

# Instruction text sent to the model together with the PDF.
prompt = """
You are an expert in extracting market and financial data from documents.
Extract essential data from text in the enclosed document.

Return the result in JSON format. Do not use non-JSON tags such as <property> or <UNKNOWN>.
Use only simple keys with units, such as "historical_revenue_growth_percentage" or "establishments_count" or "revenue_dollars".
"""

# Alternative prompt lines, currently unused:
# Return the result in JSON format. Do not use non-JSON tags or values, such as <property> or <UNKNOWN>.
# historical_revenue_growth
# Do not use text in graphics, only use plain text.

# Split paragraphs into sentences preceded by the subject. For example:
# "Revenue Growth: Revenue has grown at a CAGR of 2.8% to $692.9 billion over the past five years."


# Single user turn: the prompt text followed by the PDF bytes as a document block.
initial_message = {
    "role": "user",
    "content": [
        {"text": prompt},
        {"document": {"format": "pdf", "name": 'document', "source": {"bytes": content}}},
    ],
}


def response_to_template(
    template: dict,
    result: dict,
    model_id: str = "anthropic.claude-3-sonnet-20240229-v1:0",
) -> dict:
    """Have the Bedrock model fill in one tool template and merge the answer.

    Sends the module-level ``initial_message`` through ``bedrock.converse`` with
    ``template`` registered as the only tool, and forces the model to call that
    tool. The input of the resulting tool call is treated as the extracted data.

    Args:
        template: Tool specification passed as ``toolSpec``. Its ``"name"`` entry
            selects the forced tool; ``"summarize_document"`` is used when the
            template has no name.
        result: Dict updated in place with the extracted fields.
        model_id: Bedrock model identifier. Defaults to the Claude 3 Sonnet model
            this notebook was written against (an alternative that was tried:
            ``"meta.llama3-1-405b-instruct-v1:0"``).

    Returns:
        The extracted fields. When the tool input is wrapped in a top-level
        ``"properties"`` key, the wrapped value is returned instead.

    Raises:
        KeyError: If the response is missing the expected structure or contains
            no ``toolUse`` content block.
    """
    # Force the tool that is actually registered in this request; a hard-coded
    # name would be rejected for any template named differently.
    tool_name = template.get("name", "summarize_document")

    response = bedrock.converse(
        modelId=model_id,
        messages=[initial_message],
        inferenceConfig={
            "temperature": 0
        },
        toolConfig={
            "tools": [{"toolSpec": template}],
            "toolChoice": {
                "tool": {
                    "name": tool_name
                }
            }
        }
    )

    # Look the tool call up by kind rather than by position, so a leading
    # non-tool block (e.g. text) does not break the extraction.
    content_blocks = response['output']['message']['content']
    tool_use = next(
        (block['toolUse'] for block in content_blocks if 'toolUse' in block),
        None,
    )
    if tool_use is None:
        raise KeyError(f"no toolUse block in model response for tool {tool_name!r}")

    core_response = tool_use['input']
    # The tool input sometimes arrives nested under "properties"; unwrap it so
    # the merged result stays flat.
    if 'properties' in core_response:
        core_response = core_response['properties']
    result.update(core_response)
    return core_response

In [6]:
def result_from_threads() -> dict:
    """Populate the separate templates concurrently and merge the results.

    Runs ``response_to_template`` once per entry of ``SUMMARY_TEMPLATE``, each
    call on its own worker thread, and merges the returned field dicts into a
    single dict.

    Each worker gets its own scratch dict, and the merge happens on the calling
    thread in ``SUMMARY_TEMPLATE`` order. When two templates produce the same
    key, the later template wins regardless of which request finished first.

    Returns:
        The merged fields of all templates; an empty dict when there are no
        templates.

    Raises:
        Exception: Whatever a ``response_to_template`` call raised. With bare
            threads a failed request was lost and a partial result returned.
    """
    # Function-scope import keeps this cell runnable on its own.
    from concurrent.futures import ThreadPoolExecutor

    templates = list(SUMMARY_TEMPLATE)
    total: dict = {}
    if not templates:
        # ThreadPoolExecutor rejects max_workers=0, and there is nothing to do.
        return total

    def fill(template: dict) -> dict:
        # The per-call scratch dict is discarded; only the return value is merged.
        return response_to_template(template, {})

    # One worker per template, matching the original one-thread-per-template fan-out.
    with ThreadPoolExecutor(max_workers=len(templates)) as pool:
        # map() yields results in input order and re-raises worker exceptions here.
        for fields in pool.map(fill, templates):
            total.update(fields)

    return total

In [7]:
# Run the extraction for every template in SUMMARY_TEMPLATE (each one calls
# bedrock.converse) and keep the merged fields for the cells below.
total = result_from_threads()

In [8]:
# Pretty-print the merged extraction result as indented JSON.
print(json.dumps(total, indent=2))

{
  "properties": {
    "competitive_landscape": "The IT Consulting industry is highly competitive, with many small players and low barriers to entry. The five largest firms account for just over 5% of industry revenue. Competition is based on technical expertise, quality of service, value-added services, and price. Maintaining client relationships and a strong reputation is crucial for obtaining new business through referrals.",
    "costs_and_operations": "The industry has low capital requirements but is highly labor-intensive, requiring ongoing investment in skilled workers, R&D, and technology to remain competitive. Wages are the largest expense at 43% of revenue in 2023. Profit margins are relatively steady at around 6.4% of revenue.",
    "major_players": [
      "Dell Technologies Inc.",
      "Tata Consultancy Services Ltd",
      "Booz Allen Hamilton Inc.",
      "Accenture Plc"
    ],
    "related_international_industries": [
      "Computer System Design Services in Australi