# Rely on LLMs to extract info from docs

The extracted results are saved as JSON for future reuse.

In [1]:
import os
from dotenv import load_dotenv
import boto3
import json
import dateparser
from typing import List
from glob import glob

from Templates.ibis_aws_summary_template import IBIS_SUMMARY_TEMPLATE

from Templates.aws_basic_info_template import TEMPLATE as BASIC_TEMPLATE
from Templates.aws_templates_common import build_aws_template

load_dotenv()

True

In [2]:
# Region and credentials for the Bedrock client; the credentials are read
# from environment variables.
AWS_REGION_NAME = 'us-west-2'
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Client for the 'bedrock-runtime' service, used below for converse() calls.
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime.html
bedrock = boto3.client(
    'bedrock-runtime',
    region_name=AWS_REGION_NAME,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [3]:
# trouble-shooting: use a different client with service_name 'bedrock', not 'bedrock-runtime'
# https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock.html

# client = boto3.client(
#     service_name='bedrock',
#     aws_access_key_id=aws_access_key_id,
#     aws_secret_access_key=aws_secret_access_key,
#     region_name=AWS_REGION_NAME
# )

# summ = client.list_foundation_models()['modelSummaries']
# [model for model in summ if 'Sonnet' in model['modelName']]

In [4]:
# [m for m in dir(bedrock) if not m.startswith('_')]
# help(bedrock.converse)

In [5]:
def get_raw_pdf_part(filename: str) -> dict:
    """Wrap the raw bytes of a PDF file in a ``document`` content part.

    Passing the raw PDF bytes works best and parses quickly.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A dict of the form
        ``{"document": {"format": "pdf", "name": "document",
        "source": {"bytes": <file bytes>}}}``, suitable for appending to a
        message's ``content`` list.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Use a context manager so the handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(filename, 'rb') as pdf_file:
        content = pdf_file.read()

    return {
        "document": {
            "format": "pdf",
            "name": 'document',
            "source": {
                "bytes": content
            }
        }
    }


def response_to_template(template: dict, filename: str) -> dict:
    """Ask the Bedrock model to fill one tool template from a PDF document.

    Sends a single user message (extraction prompt plus the raw PDF) to
    ``bedrock.converse`` with ``template`` as the only tool and the tool
    choice forced to the tool named ``info_extract``, then returns the tool
    input produced by the model.

    Args:
        template: Tool specification placed under ``toolSpec``.
            NOTE(review): presumably its ``name`` is ``info_extract`` to
            match the forced tool choice -- confirm against the template
            builder.
        filename: Path of the PDF document to attach to the request.

    Returns:
        The ``toolUse`` input of the first content item in the response. If
        it contains a ``properties`` key, that nested value is returned
        instead. String values that look like JSON objects/arrays are decoded
        in place; strings that fail to decode are left unchanged.

    Raises:
        OSError: If the PDF cannot be read.
        KeyError, IndexError: If the response does not have the expected
            ``output.message.content[0].toolUse.input`` shape.
    """

    prompt = """
    You are an expert in extracting market and financial data from documents.
    Extract essential data from text in the enclosed document.

    Return the result in JSON format. Do not use non-JSON tags such as <property> or <UNKNOWN>.
    Use only simple keys with units, such as "historical_revenue_growth_percentage" or "establishments_count" or "revenue_dollars".
    """

    initial_message = {
        "role": "user",
        "content": [
            {
                "text": prompt,
            },
        ],
    }

    initial_message['content'].append(get_raw_pdf_part(filename))

    tool_list = [{
        "toolSpec": template
    }]
    response = bedrock.converse(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        # modelId="meta.llama3-1-405b-instruct-v1:0",
        messages=[initial_message],
        inferenceConfig={
            "temperature": 0
        },
        toolConfig={
            "tools": tool_list,
            "toolChoice": {
                "tool": {
                    "name": "info_extract"
                }
            }
        }
    )
    core_response = response['output']['message']['content'][0]['toolUse']['input']
    # Some responses nest the extracted values under a 'properties' key.
    if 'properties' in core_response:
        core_response = core_response['properties']

    for key, value in core_response.items():
        # startswith/endswith are safe on empty strings, where indexing
        # value[0] would raise IndexError.
        if (
            isinstance(value, str)
            and value.startswith(('{', '['))
            and value.endswith((']', '}'))
        ):
            try:
                # Only existing keys are reassigned, so the dict size does
                # not change during iteration.
                core_response[key] = json.loads(value)
            except ValueError:
                # Best effort: not valid JSON after all, keep the raw string.
                # (json.JSONDecodeError is a ValueError subclass.)
                pass

    return core_response

In [6]:
def info_from_doc_template(filename: str, template_parts: List[dict]) -> dict:
    """Fill every template built from ``template_parts`` and merge the answers.

    Each template returned by ``build_aws_template`` is sent to
    ``response_to_template`` together with ``filename``. The resulting dicts
    are folded into one; when two results share a key, the later one wins.
    """
    merged = {}
    for template in build_aws_template(template_parts):
        merged.update(response_to_template(template, filename))
    return merged

In [11]:
# filenames = glob('IndustrySource/Misc/*')
# Documents to process in this run; commented-out entries are skipped.
filenames = [
    'IndustrySource/Misc/51 Information in the US Industry Report copy.pdf',
    # 'IndustrySource/Misc/United_States_Healthcare_Repor copy.pdf'
]
# Collects one merged extraction dict per processed file, in input order.
out = []

# Only the first five entries are processed.
for filename in filenames[:5]:
    basic_info = info_from_doc_template(filename, BASIC_TEMPLATE)
    out.append(basic_info)

In [8]:
out[0]

{'title': 'United States Healthcare Report',
 'source': 'BMI',
 'last_updated': 'April 2024'}

In [9]:
filenames[0]

'IndustrySource/Misc/United_States_Healthcare_Repor copy.pdf'

In [10]:
# NOTE(review): presumably a deliberate ZeroDivisionError to stop "Run All"
# before the cells below -- confirm before removing.
0/0

ZeroDivisionError: division by zero

In [7]:
def parse_save_file(filename: str) -> None:
    """Extract an IBIS industry summary from a document and save it as JSON.

    The output path is derived from ``filename`` by replacing ``'Source'``
    with ``'Summary'`` and ``'.pdf'`` with ``'.json'``.

    Args:
        filename: Path of the source PDF report.

    Raises:
        ValueError: If the summary's ``last_updated`` text cannot be parsed
            as a date.
        KeyError: If the summary lacks ``last_updated`` or ``industry_name``.
        OSError: If the output file cannot be written.
    """
    # NOTE(review): ibis_industry_summary is not defined in the visible part
    # of this notebook -- confirm it is available before running this cell.
    ibis_summary = ibis_industry_summary(filename)

    raw_last_updated = ibis_summary['last_updated']
    parsed = dateparser.parse(raw_last_updated)
    if parsed is None:
        # dateparser returns None for unparseable input; fail with a clear
        # message rather than an AttributeError on None.
        raise ValueError(
            f'Could not parse last_updated value: {raw_last_updated!r}'
        )

    # isoformat() already uses 'T' as the date/time separator. Append the UTC
    # offset only for naive datetimes; aware ones already carry an offset and
    # a second one would produce an invalid timestamp.
    last_updated = parsed.isoformat()
    if parsed.utcoffset() is None:
        last_updated += '+00:00'

    out = {
        'title': f'{ibis_summary["industry_name"]} Industry Report',

        'category': 'Industry research',
        'subcategory': 'IBIS Industry at a Glance',
        'tags': ['test', 'industry report', 'IBIS'],

        'last_updated': last_updated,

        'summary': ibis_summary,
    }

    out_filename = filename.replace('Source', 'Summary').replace('.pdf', '.json')
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(out_filename, 'w') as out_file:
        json.dump(out, out_file, indent=2)

In [8]:
# IBIS reports to summarise and save as JSON.
filenames = [
    'IndustrySource/IBIS/Household Furniture Manufacturing in the US.pdf',
    'IndustrySource/IBIS/Furniture Stores in the US.pdf',
    "IndustrySource/IBIS/IT Consulting in the US.pdf",
]
for filename in filenames:
    # parse_save_file takes only the file path; passing the unsupported
    # from_text keyword raised TypeError.
    parse_save_file(filename)