In [1]:
import os
from dotenv import load_dotenv
import boto3
import json
import base64
import threading
from typing import List

from Templates.ibis_aws_summary_template import IBIS_SUMMARY_TEMPLATE

load_dotenv()

True

In [2]:
# Region in which the Bedrock runtime client is created.
AWS_REGION_NAME = 'us-west-2'

# Credentials are read from the process environment (load_dotenv() has
# already run in the import cell); a missing variable yields None.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Runtime client used for the `converse` calls further down.
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime.html
bedrock = boto3.client(
    service_name='bedrock-runtime',
    region_name=AWS_REGION_NAME,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [3]:
# Troubleshooting: to list the available foundation models, use a separate client with service_name 'bedrock' rather than 'bedrock-runtime'.
# https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock.html

# client = boto3.client(
#     service_name='bedrock',
#     aws_access_key_id=aws_access_key_id,
#     aws_secret_access_key=aws_secret_access_key,
#     region_name=AWS_REGION_NAME
# )

# summ = client.list_foundation_models()['modelSummaries']
# [model for model in summ if 'Sonnet' in model['modelName']]

In [4]:
# [m for m in dir(bedrock) if not m.startswith('_')]
# help(bedrock.converse)

In [5]:
# Candidate source reports; only the last assignment ever took effect, so the
# inactive ones are kept as comments. Uncomment exactly one to switch report.
# filename = "IndustrySource/IBIS/IT Consulting in the US.pdf"
# filename = 'IndustrySource/IBIS/Furniture Stores in the US.pdf'
filename = 'IndustrySource/IBIS/Household Furniture Manufacturing in the US.pdf'

# Read the PDF through a context manager so the handle is closed immediately
# instead of lingering for the lifetime of the kernel.
with open(filename, 'rb') as pdf_file:
    content = pdf_file.read()

# Base64 copy of the PDF. The request below sends the raw bytes in `content`;
# `encoded` is not referenced by the cells shown in this notebook.
encoded = base64.b64encode(content)

# Output path mirrors the input path. Note that str.replace touches every
# occurrence of 'Source' and '.pdf' in the path, not only the folder/extension.
out_filename = filename.replace('Source', 'Summary').replace('.pdf', '.json')

# Instruction text sent to the model together with the document.
prompt = """
You are an expert in extracting market and financial data from documents.
Extract essential data from text in the enclosed document.

Return the result in JSON format. Do not use non-JSON tags such as <property> or <UNKNOWN>.
Use only simple keys with units, such as "historical_revenue_growth_percentage" or "establishments_count" or "revenue_dollars".
"""

# Alternative prompt lines kept for reference (not sent to the model):
# Return the result in JSON format. Do not use non-JSON tags or values, such as <property> or <UNKNOWN>.
# historical_revenue_growth
# Do not use text in graphics, only use plain text.

# Split paragraphs into sentences preceded by the subject. For example:
# "Revenue Growth: Revenue has grown at a CAGR of 2.8% to $692.9 billion over the past five years."


# Single user turn for the Converse API: the prompt text followed by the PDF
# attached as a raw-bytes document block.
initial_message = {
    "role": "user",
    "content": [
        {
            "text": prompt,
        },
        {
            "document": {
                "format": "pdf",
                "name": 'document',
                "source": {
                    "bytes": content
                }
            }
        }
    ],
}


def response_to_template(template: dict, result: dict) -> dict:
    """Have the model fill one tool template from the document and merge the answer.

    Sends the module-level ``initial_message`` through ``bedrock.converse``
    with ``template`` as the only tool and forces the model to call it, so the
    reply arrives as structured tool input rather than free text.

    Args:
        template: Tool specification passed as ``toolSpec``. Its ``name`` is
            used for the forced tool choice; ``"summarize_document"`` is used
            when the key is absent.
        result: Accumulator dictionary, updated in place with the extracted
            fields.

    Returns:
        The extracted fields for this template; the same entries are merged
        into ``result``.

    Raises:
        KeyError: If the reply contains no ``toolUse`` content block.
    """
    # Force the tool that was actually supplied instead of a fixed literal, so
    # a template with a different name does not produce an invalid request.
    tool_name = template.get("name", "summarize_document")

    response = bedrock.converse(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        # modelId="meta.llama3-1-405b-instruct-v1:0",
        messages=[initial_message],
        inferenceConfig={
            "temperature": 0
        },
        toolConfig={
            "tools": [{
                "toolSpec": template
            }],
            "toolChoice": {
                "tool": {
                    "name": tool_name
                }
            }
        }
    )

    # Take the first toolUse block rather than assuming it sits at index 0.
    content_blocks = response['output']['message']['content']
    tool_use = next(
        (block['toolUse'] for block in content_blocks if 'toolUse' in block),
        None,
    )
    if tool_use is None:
        raise KeyError('toolUse')

    core_response = tool_use['input']
    # Some replies wrap the fields in a 'properties' object; unwrap it.
    if 'properties' in core_response:
        core_response = core_response['properties']

    # Values that look like serialized JSON objects/arrays are decoded in
    # place. startswith/endswith are safe on empty strings, where indexing
    # v[0] would raise IndexError.
    for key, value in core_response.items():
        if not isinstance(value, str):
            continue
        if not (value.startswith(('{', '[')) and value.endswith((']', '}'))):
            continue
        try:
            core_response[key] = json.loads(value)
        except (ValueError, RecursionError):
            # Best effort: keep the original string when it is not valid JSON.
            pass

    result.update(core_response)
    return core_response

In [6]:
def ibis_industry_summary() -> dict:
    """Fill every IBIS template concurrently and return the combined fields.

    One worker thread per entry of ``IBIS_SUMMARY_TEMPLATE`` runs
    ``response_to_template``; all workers write into the same dictionary,
    which is returned once every worker has finished.
    """
    merged = {}

    # One worker per template, all sharing the `merged` accumulator.
    workers: List[threading.Thread] = [
        threading.Thread(target=response_to_template, args=(spec, merged))
        for spec in IBIS_SUMMARY_TEMPLATE
    ]

    for worker in workers:
        worker.start()

    # Block until every request has completed before handing back the result.
    for worker in workers:
        worker.join()

    return merged

In [7]:
# Run every template extraction (one Bedrock request per template) and keep the merged fields.
ibis_summary = ibis_industry_summary()

In [8]:
# Record written to disk: fixed source metadata, the industry name and update
# date lifted to the top level, and the full set of extracted fields.
out = {
    'source': 'IBIS',
    'type': 'Industry research',
    'subtype': 'Industry at a Glance',

    'industry_name': ibis_summary['industry_name'],
    'last_updated': ibis_summary['last_updated'],
    'industry_summary': ibis_summary,
}

# Make sure the target folder exists, then write through a context manager so
# the file is flushed and closed as soon as the dump finishes.
out_dir = os.path.dirname(out_filename)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
with open(out_filename, 'w') as out_file:
    json.dump(out, out_file, indent=2)