# Rely on LLMs to extract info from docs

The extracted info is saved as JSON for future reuse.

In [1]:
import os
import boto3
import json
import dateparser
from glob import glob
from rag.basic_retrieval import file_id
from cachier import cachier
from typing import List

from Templates.ibis_aws_summary_template import TEMPLATE as IBIS_SUMMARY_TEMPLATE
from Templates.aws_basic_info_template import TEMPLATE as BASIC_TEMPLATE
from Templates.aws_sections_template import TEMPLATE as SECTIONS_TEMPLATE
from Templates.aws_markdown_template import TEMPLATE as MARKDOWN_TEMPLATE
from Templates.aws_templates_common import build_aws_template

from loading_utils import extract_pages, number_of_pages

from IPython.display import Markdown

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Region and credentials for the Bedrock runtime client. The keys are read from
# the process environment (load_dotenv() is called in the import cell).
AWS_REGION_NAME = 'us-west-2'
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# API reference for the runtime client:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime.html
bedrock = boto3.client(
    'bedrock-runtime',
    region_name=AWS_REGION_NAME,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [3]:
# trouble-shooting: use a different client with service_name 'bedrock', not 'bedrock-runtime'
# https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock.html

# client = boto3.client(
#     service_name='bedrock',
#     aws_access_key_id=aws_access_key_id,
#     aws_secret_access_key=aws_secret_access_key,
#     region_name=AWS_REGION_NAME
# )

# summ = client.list_foundation_models()['modelSummaries']
# [model for model in summ if 'Sonnet' in model['modelName']]

In [4]:
# [m for m in dir(bedrock) if not m.startswith('_')]
# help(bedrock.converse)

In [5]:
# Instruction text sent alongside the PDF when extracting key/value data.
# Passed as `prompt` by extract_basic_info() and ibis_industry_summary().
INFO_EXTRACTION_PROMPT = """
You are an expert in extracting market and financial data from documents.
Extract essential data from text in the enclosed document.

Return the result in JSON format. Do not use non-JSON tags such as <property> or <UNKNOWN>.
Use only simple keys with units, such as "historical_revenue_growth_percentage" or "establishments_count" or "revenue_dollars".
"""

In [6]:
# Instruction text sent alongside the PDF when extracting its header structure.
# Passed as `prompt` by extract_section_info().
# NOTE(review): the text contains a duplicated word ("the the"); it is left
# untouched here because editing it changes what the model receives.
STRUCTURE_EXTRACTION_PROMPT = """
You are an expert in extracting logical structure from PDF documents.

A PDF document is generally divided into chapters, sections, subsections, and such.

A chapter or section title has larger font size, more visible color, and is often boldfaced.
It starts on a new line and does not mix with the rest of the text in terms of size or color.

Return the the result in JSON format as a list of headers of different levels.
Use the following header attributes: name, level, page number, and line number.

Make sure to scan the document from start to finish and extract the headers.
"""

In [7]:
# Instruction text sent alongside the PDF pages when converting them to Markdown.
# Passed as `prompt` by extract_markdown().
# NOTE(review): the text contains a duplicated word ("documents documents"); it
# is left untouched here because editing it changes what the model receives.
MARKDOWN_PROMPT = """
You are an expert in converting PDF documents and their parts to a Markdown representation.
In particular, most of the documents documents are incomplete in that they don't have the title
and can start on any page in the middle.

You pay special attention to sections and paragraphs.

Depending on the level of the section, you use an appropriate number of hash signs (#) to mark their headers.

You also keep paragraphs together and do not insert new lines in the middle of paragraphs.
"""

In [8]:
def get_raw_pdf_part(filename: str) -> dict:
    """Wrap the raw bytes of a PDF file in a ``document`` content part.

    The file is read in binary mode and embedded unchanged under
    ``document.source.bytes``; the format is fixed to ``pdf`` and the name to
    ``document``. (Original author's note: this works best and parses quickly.)
    """
    with open(filename, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()

    source = {"bytes": pdf_bytes}
    document = {"format": "pdf", "name": 'document', "source": source}
    return {"document": document}


def response_to_template(filename: str, template: dict, prompt: str) -> dict:
    """Ask the Bedrock model to fill in one tool template from a PDF.

    Sends `prompt` together with the raw PDF at `filename` in a single user
    message, forces the model to call the ``info_extract`` tool described by
    `template`, and returns the tool input the model produced.

    Args:
        filename: Path of the PDF to send.
        template: Tool specification passed as ``toolSpec``. The forced tool
            name is ``info_extract``, so the spec is presumably expected to
            carry that name.
        prompt: Instruction text placed before the document.

    Returns:
        The tool input dict. If it has a top-level ``properties`` key, the
        value under that key is returned instead. String values that look like
        JSON objects or arrays are decoded in place; strings that fail to
        decode are left unchanged.

    Raises:
        KeyError: If the reply contains no ``toolUse`` content block.
    """
    user_message = {
        "role": "user",
        "content": [
            {"text": prompt},
            get_raw_pdf_part(filename),
        ],
    }

    response = bedrock.converse(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        # modelId="meta.llama3-1-405b-instruct-v1:0",
        messages=[user_message],
        inferenceConfig={"temperature": 0},
        toolConfig={
            "tools": [{"toolSpec": template}],
            "toolChoice": {"tool": {"name": "info_extract"}},
        },
    )

    # Do not assume the tool call is the first content block: take the first
    # block that actually carries a toolUse entry.
    content_blocks = response['output']['message']['content']
    tool_use = next(
        (block['toolUse'] for block in content_blocks if 'toolUse' in block),
        None,
    )
    if tool_use is None:
        raise KeyError('toolUse')

    core_response = tool_use['input']
    if 'properties' in core_response:
        # Some replies nest the payload under the schema's "properties" key.
        core_response = core_response['properties']

    for key, value in core_response.items():
        if not isinstance(value, str):
            continue
        # startswith/endswith are safe on "" (indexing value[0] raised
        # IndexError for empty strings).
        if value.startswith(('{', '[')) and value.endswith(('}', ']')):
            try:
                core_response[key] = json.loads(value)
            except (ValueError, RecursionError):
                # Best effort: keep the original string when it is not JSON.
                pass

    return core_response

In [9]:
# def filename_template_hash(filename: str, template: dict):
def filename_template_hash(*args):
    """Build a cache key from the file's id and the template's name.

    The second positional argument is read as a mapping holding the
    ``filename`` and ``template`` entries of the call being hashed.
    """
    # TODO: figure out why it works this way
    # NOTE(review): presumably cachier calls hash_func(args, kwds), which would
    # explain why the keyword arguments arrive as args[1] — confirm.
    call_kwargs = args[1]
    filename, template = call_kwargs['filename'], call_kwargs['template']
    return file_id(filename), template['name']


# @cachier(hash_func=filename_template_hash)
def info_from_doc_template(filename: str, template: dict, prompt: str) -> dict:
    """Fill every part of a template from a document and merge the answers.

    ``template['data']`` is expanded by ``build_aws_template`` into the tool
    specs to query; each one is sent with the document through
    ``response_to_template``. A part whose request raises is reported on
    stdout and skipped. The remaining results are merged in order, so a later
    part overwrites an earlier one when keys clash.
    """
    tool_specs = build_aws_template(template['data'])

    answers = []
    for tool_spec in tool_specs:
        try:
            answer = response_to_template(filename, tool_spec, prompt)
        except Exception as err:
            print("EXCEPTION IN GETTING RESPONSE")
            print(str(err))
        else:
            answers.append(answer)

    merged = {}
    for answer in answers:
        merged.update(answer)
    return merged

In [10]:
def extract_basic_info(filename: str) -> dict:
    """Fill BASIC_TEMPLATE from the opening pages of a document.

    ``extract_pages`` is called with ``last_page=10`` and the filename it
    yields is what gets sent to the model, using INFO_EXTRACTION_PROMPT.
    """
    with extract_pages(filename, last_page=10) as excerpt_path:
        basic_info = info_from_doc_template(
            filename=excerpt_path,
            template=BASIC_TEMPLATE,
            prompt=INFO_EXTRACTION_PROMPT,
        )
    return basic_info

In [11]:
def extract_section_info(filename: str) -> dict:
    """Fill SECTIONS_TEMPLATE from pages 10-20 of a document.

    Uses STRUCTURE_EXTRACTION_PROMPT on the excerpt yielded by
    ``extract_pages``. Original author's note: not working well.
    """
    with extract_pages(filename, first_page=10, last_page=20) as excerpt_path:
        section_info = info_from_doc_template(
            filename=excerpt_path,
            template=SECTIONS_TEMPLATE,
            prompt=STRUCTURE_EXTRACTION_PROMPT,
        )
    return section_info

In [12]:
def extract_markdown(filename: str, first_page: int, last_page: int) -> dict:
    """Fill MARKDOWN_TEMPLATE from the given page range of a document.

    The range is forwarded unchanged to ``extract_pages`` and the excerpt it
    yields is sent with MARKDOWN_PROMPT. Original author's note: works for a
    few pages.
    """
    page_range = {'first_page': first_page, 'last_page': last_page}
    with extract_pages(filename, **page_range) as excerpt_path:
        markdown_info = info_from_doc_template(
            filename=excerpt_path,
            template=MARKDOWN_TEMPLATE,
            prompt=MARKDOWN_PROMPT,
        )
    return markdown_info

In [13]:
def ibis_industry_summary(filename: str) -> dict:
    """Fill IBIS_SUMMARY_TEMPLATE from the whole PDF (no page extraction)."""
    summary = info_from_doc_template(
        filename=filename,
        template=IBIS_SUMMARY_TEMPLATE,
        prompt=INFO_EXTRACTION_PROMPT,
    )
    return summary

In [14]:
# Hand-picked input PDFs; commented-out entries are alternatives kept for
# quick switching.
# filenames = glob('IndustrySource/Misc/*')

filenames = [
    # 'IndustrySource/Misc/Global Markets for Advanced Aerospace Materials.pdf',
    # 'IndustrySource/Misc/51 Information in the US Industry Report copy.pdf',
    # 'IndustrySource/Misc/United_States_Healthcare_Repor copy.pdf',
    'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf',
]

In [15]:
# Rebinds `filenames` to every PDF in the HVAC folder, replacing the list
# assigned in the previous cell.
filenames = glob('IndustrySource/HVAC/*.pdf')

In [16]:
# Convert each PDF to Markdown, `step` pages per request, and store the
# results next to the source file as "<filename>.json".
step = 1
for filename in filenames:
    print("PROCESSING", filename)
    markdowns = []  # reset per file: one extract_markdown() result per chunk
    num_pages = number_of_pages(filename)
    for page in range(0, num_pages, step):
        print("  PAGE", page)
        # NOTE(review): with step > 1 the end page of the last chunk can exceed
        # the last page index — confirm extract_pages tolerates that.
        markdowns.append(extract_markdown(filename, page, page+step-1))
        # The JSON file is rewritten after every chunk, so the results
        # gathered so far are on disk if the run is interrupted.
        with open(f'{filename}.json', 'w') as out_file:
            json.dump(markdowns, out_file)

PROCESSING IndustrySource/HVAC/HVAC Equipment Manufacturing.pdf
  PAGE 0
  PAGE 1
  PAGE 2
  PAGE 3
  PAGE 4
  PAGE 5
  PAGE 6
  PAGE 7
  PAGE 8
  PAGE 9
  PAGE 10
  PAGE 11
  PAGE 12
  PAGE 13
  PAGE 14
  PAGE 15
  PAGE 16
PROCESSING IndustrySource/HVAC/Heating & Air Conditioning Equipment Manufacturing in the US.pdf
  PAGE 0
  PAGE 1
  PAGE 2
  PAGE 3
  PAGE 4
  PAGE 5
  PAGE 6
  PAGE 7
  PAGE 8
  PAGE 9
  PAGE 10
  PAGE 11
  PAGE 12
  PAGE 13
  PAGE 14
  PAGE 15
  PAGE 16
  PAGE 17
  PAGE 18
  PAGE 19
  PAGE 20
  PAGE 21
  PAGE 22
  PAGE 23
  PAGE 24
  PAGE 25
  PAGE 26
  PAGE 27
  PAGE 28
  PAGE 29
  PAGE 30
  PAGE 31
  PAGE 32
  PAGE 33
  PAGE 34
  PAGE 35
  PAGE 36
  PAGE 37
  PAGE 38
  PAGE 39
  PAGE 40
  PAGE 41
  PAGE 42
  PAGE 43
  PAGE 44
PROCESSING IndustrySource/HVAC/heating-ventilation-and-air-conditioning-industry-worldwide.pdf
  PAGE 0
  PAGE 1
  PAGE 2
  PAGE 3
  PAGE 4
  PAGE 5
  PAGE 6
  PAGE 7
  PAGE 8
  PAGE 9
  PAGE 10
  PAGE 11
  PAGE 12
  PAGE 13
  PAGE 14
  PAG

In [17]:
# NOTE(review): presumably a deliberate error to stop "Run All" at this cell — confirm.
0/0

ZeroDivisionError: division by zero

In [None]:
# Render the Markdown of the chunk at index 4 from `markdowns` (which holds
# the results of the most recently processed file).
display(Markdown(markdowns[4]['markdown']))

In [None]:
# Extract Markdown for pages 12-14 of up to five files; the bare name shows
# the resulting list as the cell output.
markdown = [extract_markdown(filename, 12, 14) for filename in filenames[:5]]
markdown

In [None]:
# Render the Markdown extracted from the first file in `markdown`.
display(Markdown(markdown[0]['markdown']))

In [None]:
# NOTE(review): presumably a deliberate error to stop "Run All" at this cell — confirm.
0/0

In [None]:
# Run section-structure extraction on up to five files; the bare name shows
# the resulting list as the cell output.
out_sections = [extract_section_info(filename) for filename in filenames[:5]]
out_sections

"""
[{'sections': [{'name': 'Introduction',
    'level': 1,
    'page_number': 1,
    'line_number': 1},
   {'name': 'Study Goals and Objectives',
    'level': 2,
    'page_number': 1,
    'line_number': 6},
   {'name': 'Reasons for Doing This Study',
    'level': 2,
    'page_number': 2,
    'line_number': 16},
   {'name': 'Scope of Report',
    'level': 2,
    'page_number': 3,
    'line_number': 28},
   {'name': "What's New in This Update?",
    'level': 2,
    'page_number': 5,
    'line_number': 51},
   {'name': 'Research Methodology',
    'level': 2,
    'page_number': 5,
    'line_number': 67},
   {'name': 'Information Sources',
    'level': 2,
    'page_number': 7,
    'line_number': 100},
   {'name': 'Geographic Breakdown',
    'level': 2,
"""

In [14]:
# out_basic = [extract_basic_info(filename) for filename in filenames[:5]]
# out_basic

In [None]:
# NOTE(review): presumably a deliberate error to stop "Run All" at this cell — confirm.
0/0

In [None]:
# Run the full IBIS summary extraction on up to five files; the bare name
# shows the resulting list as the cell output.
out_full = [ibis_industry_summary(filename) for filename in filenames[:5]]
out_full

In [12]:
def parse_save_file(filename: str):
    """Summarise an IBIS industry report and save the result as JSON.

    The summary comes from ``ibis_industry_summary``. Its ``last_updated``
    text is parsed with ``dateparser`` and stored as an ISO-8601 timestamp in
    UTC: a date without timezone information is labelled as UTC, a
    timezone-aware one is converted to UTC. The output path is derived from
    `filename` by replacing ``Source`` with ``Summary`` and ``.pdf`` with
    ``.json``; its directory is created when missing.

    Args:
        filename: Path of the source PDF.

    Raises:
        KeyError: If the summary lacks ``last_updated`` or ``industry_name``.
        ValueError: If the ``last_updated`` text cannot be parsed as a date.
    """
    from datetime import timezone

    ibis_summary = ibis_industry_summary(filename)

    raw_last_updated = ibis_summary['last_updated']
    parsed = dateparser.parse(raw_last_updated)
    if parsed is None:
        # dateparser returns None instead of raising on unparseable text.
        raise ValueError(
            f'Cannot parse last_updated date: {raw_last_updated!r}'
        )
    # Attach the offset through tzinfo rather than appending '+00:00' by hand,
    # which produced a double offset for timezone-aware dates.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    else:
        parsed = parsed.astimezone(timezone.utc)
    last_updated = parsed.isoformat()

    out = {
        'title': f'{ibis_summary["industry_name"]} Industry Report',

        'category': 'Industry research',
        'subcategory': 'IBIS Industry at a Glance',
        'tags': ['test', 'industry report', 'IBIS'],

        'last_updated': last_updated,

        'summary': ibis_summary,
    }

    out_filename = filename.replace('Source', 'Summary').replace('.pdf', '.json')
    if out_filename == filename:
        # Neither replacement matched: never overwrite the source document.
        out_filename = f'{filename}.json'

    out_dir = os.path.dirname(out_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_filename, 'w') as f:
        json.dump(out, f, indent=2)

In [13]:
# Summarise a fixed set of IBIS reports; parse_save_file() writes one JSON
# file per report.
filenames = [
    'IndustrySource/IBIS/Household Furniture Manufacturing in the US.pdf',
    'IndustrySource/IBIS/Furniture Stores in the US.pdf',
    "IndustrySource/IBIS/IT Consulting in the US.pdf",
]
for filename in filenames:
    parse_save_file(filename)

In [None]:
def headers_and_subheaders(filename: str) -> List[dict]:
    """Send the PDF at `filename` to the Bedrock model with a structure-extraction prompt.

    The prompt asks for sections, subsections, and subsubsections as a JSON
    list of dictionaries with the fields "name", "level", and "page". The
    request is made without a tool configuration (it is commented out below).
    """

    prompt = """
You are an expert in inferring document structure in terms of sections, subsections, etc. from PDF documents.

Extract from the given document sections, subsections, and subsubsections.

Return the result as a JSON list, with each element being a dictionary with the fields: "name", "level", "page"

Return the result in JSON format. Do not use non-JSON tags such as <property> or <UNKNOWN>.
Use only simple keys with units, such as "historical_revenue_growth_percentage" or "establishments_count" or "revenue_dollars".
"""

    # Single user message: the instruction text followed by the raw PDF part.
    initial_message = {
        "role": "user",
        "content": [
            {
                "text": prompt,
            },
        ],
    }

    initial_message['content'].append(get_raw_pdf_part(filename))

    # tool_list = [{
    #     "toolSpec": template
    # }]
    response = bedrock.converse(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        # modelId="meta.llama3-1-405b-instruct-v1:0",
        messages=[initial_message],
        inferenceConfig={
            "temperature": 0
        },
        # toolConfig={
        #     "tools": tool_list,
        #     "toolChoice": {
        #         "tool": {
        #             "name": "info_extract"
        #         }
        #     }
        # }
    )
    # NOTE(review): no toolConfig is passed above, so the reply presumably has
    # no 'toolUse' content block and this lookup would raise KeyError —
    # confirm, or read the 'text' block instead.
    core_response = response['output']['message']['content'][0]['toolUse']['input']
