In [1]:
import os
from dotenv import load_dotenv
import boto3
import json
import base64

from aws_summary_template import SUMMARY_TEMPLATE_1, SUMMARY_TEMPLATE_2

# Load settings from a local .env file before the credentials are read via
# os.getenv in the next cell (presumably where the AWS keys live; confirm).
load_dotenv()

True

In [2]:
# AWS region the Bedrock runtime client connects to.
AWS_REGION_NAME = 'us-west-2'

# Credentials are read from the environment; each value is None when unset.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
# Temporary (STS / SSO) credentials also carry a session token. Because the key
# pair is passed explicitly below, the token has to be forwarded explicitly too,
# otherwise such credentials are rejected.
aws_session_token = os.getenv('AWS_SESSION_TOKEN')

# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime.html
bedrock = boto3.client(
    service_name='bedrock-runtime',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
    region_name=AWS_REGION_NAME,
)

In [3]:
# Troubleshooting: to list the available foundation models, use a separate client created with service_name 'bedrock', not 'bedrock-runtime'.
# https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock.html

# client = boto3.client(
#     service_name='bedrock',
#     aws_access_key_id=aws_access_key_id,
#     aws_secret_access_key=aws_secret_access_key,
#     region_name=AWS_REGION_NAME
# )

# summ = client.list_foundation_models()['modelSummaries']
# [model for model in summ if 'Sonnet' in model['modelName']]

In [4]:
# [m for m in dir(bedrock) if not m.startswith('_')]
# help(bedrock.converse)

In [5]:
# Report to summarize; the path is relative to the current working directory.
filename = "IT Consulting in the US.pdf"

# Read the PDF as raw bytes. The context manager closes the file handle as soon
# as the read finishes instead of leaving it open until garbage collection.
with open(filename, 'rb') as pdf_file:
    content = pdf_file.read()

# Base64 form of the same bytes. The message built in this cell sends the raw
# `content` bytes, not this value; it is kept as a notebook-level name.
encoded = base64.b64encode(content)

# Instruction text sent to the model together with the PDF. It asks for JSON
# output whose keys are simple and carry their unit in the name.
prompt = """
You are an expert in extracting market and financial data from documents.
Extract essential data from text in the enclosed document.

Return the result in JSON format. Do not use non-JSON tags, such as <property>.
Use only simple keys with units, such as "historical_revenue_growth_percentage" or "establishments_count" or "revenue_dollars".
"""

# Unused prompt variants kept for reference (none of these lines are part of `prompt` above):
# Return the result in JSON format. Do not use non-JSON tags or values, such as <property> or <UNKNOWN>.
# historical_revenue_growth
# Do not use text in graphics, only use plain text.

# Split paragraphs into sentences preceded by the subject. For example:
# "Revenue Growth: Revenue has grown at a CAGR of 2.8% to $692.9 billion over the past five years."


# The two content blocks of the user turn: the instruction text, then the PDF
# passed as raw bytes.
text_block = {"text": prompt}
document_block = {
    "document": {
        "format": "pdf",
        "name": 'document',
        "source": {"bytes": content},
    }
}

# The user message handed to the model: prompt first, document second.
initial_message = {
    "role": "user",
    "content": [text_block, document_block],
}


def response_to_template(
    template: dict,
    model_id: str = "anthropic.claude-3-sonnet-20240229-v1:0",
) -> dict:
    """Have the model fill in ``template`` from the attached document.

    Sends ``initial_message`` through the notebook-level ``bedrock`` client's
    ``converse`` call, offering ``template`` as the only tool and forcing the
    model to use it, so the answer comes back as structured tool input.

    Args:
        template: Dict passed as the tool's ``toolSpec``. Its ``"name"`` entry
            is also used as the forced tool choice, so the two cannot drift
            apart.
        model_id: Model identifier passed as ``modelId``. The default is the
            model the notebook originally hard-coded; pass e.g.
            ``"meta.llama3-1-405b-instruct-v1:0"`` to try the alternative that
            was previously kept as a commented-out line.

    Returns:
        The ``input`` value of the first ``toolUse`` block in the reply.

    Raises:
        KeyError: If ``template`` has no ``"name"`` entry, or if the reply
            contains no ``toolUse`` block.
    """
    tool_name = template["name"]

    response = bedrock.converse(
        modelId=model_id,
        messages=[initial_message],
        inferenceConfig={"temperature": 0},
        toolConfig={
            "tools": [{"toolSpec": template}],
            "toolChoice": {"tool": {"name": tool_name}},
        },
    )

    # Do not assume the tool call is the first content block: scan for it so a
    # leading text block does not break the lookup.
    for block in response["output"]["message"]["content"]:
        if "toolUse" in block:
            return block["toolUse"]["input"]

    raise KeyError(
        f"no toolUse block in model reply (stopReason={response.get('stopReason')!r})"
    )

In [6]:
def _parse_json_if_possible(value):
    """Return ``value`` decoded from JSON if it is a JSON string, else unchanged."""
    if not isinstance(value, str):
        return value
    try:
        return json.loads(value)
    except json.JSONDecodeError:
        # Ordinary text rather than JSON: keep the string as it is. Only the
        # decode failure is caught, so unrelated errors are no longer hidden.
        return value


# raw_response = response_to_template(SUMMARY_TEMPLATE_1)
raw_response = response_to_template(SUMMARY_TEMPLATE_2)

# Some top-level values may arrive as JSON-encoded strings (presumably nested
# fields the model serialized itself; confirm). Decode those into a new dict
# rather than rewriting the response while iterating over it.
core_response = {
    key: _parse_json_if_possible(value) for key, value in raw_response.items()
}

print(json.dumps(core_response, indent=2))

{
  "key_trends": [
    "The industry has experienced a shift towards cloud computing and data analytics",
    "Major players have looked to gain an advantage through mergers and acquisitions",
    "The pandemic and expansion of business IT budgets increased demand for services",
    "Continued innovation and shifting technological trends will boost future demand"
  ],
  "market_segmentation": [
    {
      "segment": "Financial services companies",
      "percentage": 20.8
    },
    {
      "segment": "Communications, media and technology companies",
      "percentage": 15.0
    },
    {
      "segment": "Manufacturing and retail companies",
      "percentage": 22.2
    },
    {
      "segment": "Healthcare companies",
      "percentage": 12.9
    },
    {
      "segment": "Public sector and nonprofit organizations",
      "percentage": 18.8
    },
    {
      "segment": "Other sectors",
      "percentage": 10.3
    }
  ],
  "products_and_services": [
    {
      "product_or_service"

In [7]:
# Top-level sections present in the extracted summary.
list(core_response)

['key_trends',
 'market_segmentation',
 'products_and_services',
 'related_international_industries',
 'supply_chain']

In [10]:
# Show the 'supply_chain' section of the extracted summary.
core_response['supply_chain']

{'tier_1_suppliers': ['Software Publishing in the US',
  'Computer & Packaged Software Wholesaling in the US',
  'Internet Service Providers in the US'],
 'tier_2_suppliers': ['Computer Manufacturing in the US',
  'Computer Peripheral Manufacturing in the US',
  'Communication Equipment Manufacturing in the US'],
 'tier_1_buyers': ['Retail Trade in the US',
  'Hospitals in the US',
  'Finance and Insurance in the US',
  'Healthcare and Social Assistance in the US',
  'Manufacturing in the US']}