In [1]:
import os
import boto3
import json
import dateparser
from glob import glob
# from rag.basic_retrieval import file_id
# from cachier import cachier
from typing import List

from Templates.ibis_aws_summary_template_all import TEMPLATE as IBIS_SUMMARY_TEMPLATE

from loading_utils import get_initial_pages

from IPython.display import Markdown

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
import pandas as pd

import anthropic
import nest_asyncio
nest_asyncio.apply()
from dotenv import load_dotenv
from pdf2image import convert_from_path
import base64
import requests
from llama_index.core import Document

In [3]:

# --- Run configuration -------------------------------------------------------
use_braintrust_dataset = True

# COMPARISON_FILE = 'claude-3-5-sonnet-20240620_qa.csv'
COMPARISON_FILE = "major_questions.csv"

# Alternative source reports (swap PDF_LOCATION to switch documents):
# PDF_LOCATION = 'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf'
# PDF_LOCATION = 'IndustrySource/Misc/Aerospace Ceramics copy.pdf'
# DOC_ID = 'aerospace-ceramics'
PDF_LOCATION = 'IndustrySource/Misc/3D Printer Manufacturing in the US.pdf'
# DOC_ID is the file name up to its first '.', lower-cased, with spaces turned into dashes.
DOC_ID = PDF_LOCATION.split('/')[-1].split('.')[0].lower().replace(' ', '-')
# DOC_ID = 'ibis-healthcare-social-assistance'
MODEL_ID = 'gpt-4o-mini'
QUESTION_COL = 'question'
RESPONSE_COL = 'rag_model_response'
NUM_QUESTIONS = -1
PARSER = "claude" # "claude" or "llama-parse"
CHUNK_SIZE = 600
SPLITTER = "tree"
TOP_K = 3

# All artefacts for this document are written below OUTPUT_FOLDER.
OUTPUT_FOLDER = f'./rag_outputs/{DOC_ID}'
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
OUTPUT_FILE = f'{OUTPUT_FOLDER}/output_{MODEL_ID}_{PARSER}_{CHUNK_SIZE}_{SPLITTER}_{TOP_K}.csv'


In [4]:
# Region used for the Bedrock runtime client (previously tried: 'us-west-2').
AWS_REGION_NAME = 'us-east-1'

# Credentials are read from the process environment; load_dotenv() ran in the first cell.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# Shared Bedrock runtime client used by every model call in this notebook.
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime.html
bedrock = boto3.client(
    'bedrock-runtime',
    region_name=AWS_REGION_NAME,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [5]:
# Instruction text passed as `main_prompt` to response_to_text by get_page_numbers_for_sections.
# It lists the six report sections and asks the model to return, via the tool, the page on
# which each section starts (0 when unsure). Titles are supplied one per line as "<Page N> title".
# NOTE(review): the text contains the typo "jusdgement"; it is left untouched here because
# editing the prompt changes the model input.
PAGE_TITLE_PROMPT = """ You are expert in determining the page numbers where a specific section begins from the list of titles and corresponding page numbers.
There are 6 sections in the document: ["Industry at a Glance", "Supply Chain", "Competitive Landscape", "Costs & Operations", "Questions for Owners", "Datatables & Glossary"]
Section 1: Industry at a Glance contains subsections such as Key Statistics, Executive Summary, Current Performance, Future Outlook, Industry Definition, Industry Impact, SWOT Analysis, Key Trends.
Section 2: Supply Chain contains subsections such as External drivers, Supply Chain, Similar Industries, Related International Industries, Products & Services, Demand Determinants, Market Segmentation, Business Locations
Section 3: Competitive Landscape contains subsections such as Basis of Competition, Barriers to Entry, Market Share Concentration, Industry Globalization
Section 4: Costs & Operations contains subsections such as Cost Structure, Capital Intensity, Revenue Volatility, Regulation & Policy, Industry Assistance
Section 5: Questions for Owners contains some questions and answers
Section 6: Datatables & Glossary contains some tables for Industry Data and glossary of industry terms.
You will be given a list of titles parsed from pdf and the corresponding page number where each title was parsed from. Each title begins in a new line with a page number as prefix enclosed in angle brackets <>.
Use the given information and your best jusdgement to determine the page number where each section begins.
Use the tool to output the page numbers in json format. If you are not sure about any section, output 0.
"""

In [6]:
from pypdf import PdfReader
import time

def get_number_of_pages(filename: str) -> int:
    """Return how many pages the PDF stored at ``filename`` contains."""
    with open(filename, 'rb') as pdf_stream:
        page_total = len(PdfReader(pdf_stream).pages)
    return page_total

In [7]:
def response_to_text(content_text: str, template: dict, main_prompt: str, system_prompt: str, final_prompt: str, tool_name: str="info_extract") -> tuple:
    """Send plain text to the Bedrock model and return its forced tool-call output.

    The user message is built from ``main_prompt``, then ``content_text``, then
    ``final_prompt`` (when it is not None). The model is forced to call the tool
    named ``tool_name``, whose spec is ``template``.

    Args:
        content_text: Text payload the model should analyse.
        template: Tool specification placed under ``toolSpec``.
        main_prompt: Instruction text placed first in the user message.
        system_prompt: Optional system prompt; sent only when non-empty.
        final_prompt: Optional trailing instruction; skipped when None.
        tool_name: Name of the tool the model must call.

    Returns:
        ``(core_response, response)``: the tool input produced by the model
        (unwrapped from a ``'properties'`` key when present, with JSON-looking
        string values decoded) and the raw ``converse`` response.
    """
    content_blocks = [{"text": main_prompt}, {"text": content_text}]
    if final_prompt is not None:
        content_blocks.append({"text": final_prompt})
    initial_message = {"role": "user", "content": content_blocks}

    request_kwargs = {
        # Alternatives tried: "anthropic.claude-3-sonnet-20240229-v1:0",
        # "meta.llama3-1-405b-instruct-v1:0".
        "modelId": "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "messages": [initial_message],
        "inferenceConfig": {"temperature": 0},
        "toolConfig": {
            "tools": [{"toolSpec": template}],
            "toolChoice": {"tool": {"name": tool_name}},
        },
    }
    # The Converse API takes the system prompt as a list of text blocks; the key is
    # omitted entirely for None/"" so callers that pass None keep working.
    if system_prompt:
        request_kwargs["system"] = [{"text": system_prompt}]

    response = bedrock.converse(**request_kwargs)
    core_response = response['output']['message']['content'][0]['toolUse']['input']
    if 'properties' in core_response:
        core_response = core_response['properties']

    # Some values come back as JSON encoded inside a string; decode those in place.
    # startswith/endswith are safe on "" (the previous v[0] indexing raised IndexError).
    for key, value in core_response.items():
        if isinstance(value, str) and value.startswith(('{', '[')) and value.endswith(('}', ']')):
            try:
                core_response[key] = json.loads(value)
            except (ValueError, RecursionError):
                # Best effort: keep the original string when it is not valid JSON.
                pass

    return core_response, response

In [8]:
# Report sections in document order. The page-range cell derives the `page_mapping` keys from
# these titles by lower-casing them and replacing ' & ' and spaces with underscores.
sections_names = ["Industry at a Glance", "Supply Chain", "Competitive Landscape", "Costs & Operations", "Questions for Owners", "Datatables & Glossary"]
import unstructured
from unstructured.partition.pdf import partition_pdf

def sample_sentences_from_each_page(pdf_path):
    """Collect the Title/Header texts that ``partition_pdf`` finds on each page.

    Every page is written to its own file by ``get_initial_pages``, partitioned,
    and the file is deleted again, even when partitioning fails.

    Args:
        pdf_path: Path of the PDF to scan.

    Returns:
        A list with one entry per page, in page order; each entry is the list of
        text strings of that page's ``Title`` and ``Header`` elements.
    """
    heading_types = (
        unstructured.documents.elements.Title,
        unstructured.documents.elements.Header,
    )
    all_titles = []
    for page_index in range(get_number_of_pages(pdf_path)):
        page_pdf = get_initial_pages(pdf_path, pmin=page_index, pmax=page_index + 1)
        try:
            elements = partition_pdf(filename=page_pdf)
            page_titles = [element.text for element in elements if isinstance(element, heading_types)]
        finally:
            # Always remove the single-page file so a parsing error cannot leak it.
            os.remove(page_pdf)
        all_titles.append(page_titles)
    return all_titles
    

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from Templates.aws_templates_common import build_aws_template
from Templates.aws_section_page_number_template import TEMPLATE as page_number_template

def get_page_numbers_for_sections(pdf_path):
    """Ask the model on which page each report section of ``pdf_path`` starts.

    Builds the "page_number_inference" tool spec, gathers the per-page titles,
    prefixes each with its 1-based page number, and sends the joined list to
    ``response_to_text`` together with ``PAGE_TITLE_PROMPT``.

    Returns:
        The ``(response, response_raw)`` pair produced by ``response_to_text``.
    """
    tool_spec = build_aws_template(page_number_template['data'], tool_name="page_number_inference")[0]
    titles_per_page = sample_sentences_from_each_page(pdf_path)
    # Flatten to one "<Page N> title" line per title, keeping page order.
    all_titles_with_page = [
        f"<Page {page_no}> {title}"
        for page_no, titles in enumerate(titles_per_page, start=1)
        for title in titles
    ]
    print(all_titles_with_page)
    parsed, raw = response_to_text(
        "\n".join(all_titles_with_page),
        tool_spec,
        PAGE_TITLE_PROMPT,
        None,
        None,
        tool_name="page_number_inference",
    )
    return parsed, raw

In [10]:
# Infer the start page of each report section. This partitions every page of the PDF
# and then makes one Bedrock call, so it is comparatively slow.
page_mapping, page_response_raw = get_page_numbers_for_sections(PDF_LOCATION)

['<Page 1> 8/19/24, 9:19 PM', '<Page 1> 3D Printer Manufacturing in the US', '<Page 1> Industry Research (/ibisworld) > 3D Printer Manufacturing in the US', '<Page 1> Industry at a Glance', '<Page 1> K E Y S TAT I S T I C S', '<Page 1> P', '<Page 1> 742 $ million', '<Page 1> P R O F I T', '<Page 1> P', '<Page 1> P R O F I T M A R G I N', '<Page 2> 8/19/24, 9:19 PM', '<Page 2> R', '<Page 2> 4,578 $ million', '<Page 2> R E V E N U E', '<Page 2> E', '<Page 2> 165 Units', '<Page 2> E N T E R P R I S E S', '<Page 2> E', '<Page 2> 174 Units', '<Page 2> E', '<Page 2> 5,481 Units', '<Page 2> E M P L O Y M E N T', '<Page 2> 3D Printer Manufacturing in the US', '<Page 3> 8/19/24, 9:19 PM', '<Page 3> W', '<Page 3> 459 $ million', '<Page 3> W A G E S', '<Page 3> I', '<Page 3> 1,275 $ million', '<Page 3> I V A', '<Page 3> I', '<Page 3> 3,153 $ million', '<Page 3> I M P O R T S', '<Page 3> E', '<Page 3> 1,313 $ million', '<Page 3> E X P O R T S', '<Page 3> 3D Printer Manufacturing in the US', '<Page

In [11]:
import pickle
# Persist the section -> start-page mapping together with the raw Bedrock response.
# The context manager closes the file handle, which the bare open() call never did.
with open(f"{OUTPUT_FOLDER}/page_mappings.pkl", "wb") as mapping_file:
    pickle.dump((page_mapping, page_response_raw), mapping_file)

In [12]:
# Start page inferred for each section; per PAGE_TITLE_PROMPT, 0 means the model was unsure.
page_mapping

{'industry_at_a_glance': 1,
 'supply_chain': 12,
 'competitive_landscape': 30,
 'costs_operations': 34,
 'questions_for_owners': 39,
 'datatables_glossary': 43}

In [48]:
def _section_key(section_name: str) -> str:
    """Turn a section title into its `page_mapping` key (e.g. 'Costs & Operations' -> 'costs_operations')."""
    return section_name.lower().replace(' & ', '_').replace(' ', '_')


def _build_page_ranges(section_names, page_starts, total_pages):
    """Convert per-section start pages into ``(section, first_page, last_page)`` tuples.

    A start page of 0 means "unknown" and is replaced as follows:
      * first_page falls back to the nearest earlier section with a known start, else 1;
      * last_page is the start of the nearest later section with a known start, else
        ``total_pages``.
    last_page is always capped at ``total_pages``, including when it comes from a
    fallback section (the inline loop only capped the immediate next section).

    Args:
        section_names: Section titles in document order.
        page_starts: Mapping from section key (see ``_section_key``) to start page.
        total_pages: Number of pages in the PDF.

    Returns:
        List of ``(section_title, first_page, last_page)`` in section order.
    """
    starts = [page_starts[_section_key(name)] for name in section_names]
    ranges = []
    for idx, name in enumerate(section_names):
        first_page = next((page for page in reversed(starts[:idx + 1]) if page != 0), 1)
        last_page = next((page for page in starts[idx + 1:] if page != 0), total_pages)
        ranges.append((name, first_page, min(last_page, total_pages)))
    return ranges


total_pages = get_number_of_pages(PDF_LOCATION)
page_mapping_list = _build_page_ranges(sections_names, page_mapping, total_pages)
page_mapping_list
    

[('Industry at a Glance', 1, 12),
 ('Supply Chain', 12, 30),
 ('Competitive Landscape', 30, 34),
 ('Costs & Operations', 34, 39),
 ('Questions for Owners', 39, 43),
 ('Datatables & Glossary', 43, 50)]

In [49]:
def get_raw_pdf_part(filename: str) -> dict:
    """Wrap the raw bytes of ``filename`` in a PDF "document" message block.

    The original author notes that this form works best and parses quickly.
    """
    with open(filename, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    document_block = {
        "format": "pdf",
        "name": 'document',
        "source": {"bytes": pdf_bytes},
    }
    return {"document": document_block}


def response_to_template(filename: str, template: dict, prompt: str, tool_name:str="info_extract") -> tuple:
    """Send a PDF plus a prompt to the Bedrock model and return its forced tool-call output.

    The user message holds ``prompt`` followed by the PDF as a document block
    (see ``get_raw_pdf_part``). The model is forced to call the tool named
    ``tool_name``, whose spec is ``template``.

    Args:
        filename: Path of the PDF to attach.
        template: Tool specification placed under ``toolSpec``.
        prompt: Instruction text for the model.
        tool_name: Name of the tool the model must call. This used to be ignored
            in favour of a hard-coded "info_extract"; the default keeps that value.

    Returns:
        ``(core_response, response)``: the tool input produced by the model
        (unwrapped from a ``'properties'`` key when present, with JSON-looking
        string values decoded) and the raw ``converse`` response.
    """
    initial_message = {
        "role": "user",
        "content": [{"text": prompt}, get_raw_pdf_part(filename)],
    }

    response = bedrock.converse(
        # Alternatives tried: "anthropic.claude-3-sonnet-20240229-v1:0",
        # "meta.llama3-1-405b-instruct-v1:0".
        modelId="anthropic.claude-3-5-sonnet-20240620-v1:0",
        messages=[initial_message],
        inferenceConfig={"temperature": 0},
        toolConfig={
            "tools": [{"toolSpec": template}],
            "toolChoice": {"tool": {"name": tool_name}},
        },
    )
    core_response = response['output']['message']['content'][0]['toolUse']['input']
    if 'properties' in core_response:
        core_response = core_response['properties']

    # Some values come back as JSON encoded inside a string; decode those in place.
    # startswith/endswith are safe on "" (the previous v[0] indexing raised IndexError).
    for key, value in core_response.items():
        if isinstance(value, str) and value.startswith(('{', '[')) and value.endswith(('}', ']')):
            try:
                core_response[key] = json.loads(value)
            except (ValueError, RecursionError):
                # Best effort: keep the original string when it is not valid JSON.
                pass

    return core_response, response

In [51]:
# Instruction text passed as the prompt for every per-section extraction call
# (see extract_info_for_all_sections). It tells the model to answer through the tool, to use
# the sentinel value 101 wherever a numeric value is missing from the text, and to keep
# titles within 20 words and descriptions within 50 words.
# NOTE(review): the text contains the duplicated word "then then"; it is left untouched here
# because editing the prompt changes the model input.
INFO_EXTRACTION_PROMPT = """
You are an expert in extracting market and financial data from documents.
Use the given tool to extract essential data from the pdf content taken from market research report of an industry. Do not make any assumptions or add any information that is not present in the text.
Some data may be present in the form of tables, graphs, plots or images. You should interpret such components and then then extract the data that is asked for.

Return the result in JSON format. Do not use non-JSON tags. If some numeric data is not present in the text, simply output the number 101 as an answer where numeric data is expected.
For titles and names, limit the output to 20 words. For descriptions and key points, limit the output to 50 words.
"""

In [54]:

def extract_info_for_section(template, main_prompt, start_page, end_page, max_retries=5, retry_delay=60):
    """Run one templated extraction over pages ``start_page``..``end_page`` of ``PDF_LOCATION``.

    The page range is written to its own file by ``get_initial_pages`` (``start_page`` is
    1-based, hence ``pmin=start_page-1``), sent through ``response_to_template``, and the
    file is removed afterwards.

    Args:
        template: Tool specification forwarded to ``response_to_template``.
        main_prompt: Prompt text forwarded to ``response_to_template``.
        start_page: First page of the section (1-based).
        end_page: Passed to ``get_initial_pages`` as ``pmax``.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        Whatever ``response_to_template`` returns, or None when every attempt failed.
    """
    page_filename = get_initial_pages(PDF_LOCATION, pmin=start_page-1, pmax=end_page)
    result = None
    try:
        for attempt in range(1, max_retries + 1):
            try:
                result = response_to_template(page_filename, template, main_prompt)
                break
            except Exception as e:
                # Deliberately broad: any failed call (e.g. throttling) is retried.
                print(f"Error: {e}")
                if attempt == max_retries:
                    # No point in sleeping when there is no further attempt.
                    print(f"Failed {attempt} times. Giving up on this section.")
                else:
                    print(f"Failed {attempt} times. Sleeping for {retry_delay} seconds.")
                    time.sleep(retry_delay)
    finally:
        # Remove the page-range file even if the loop is interrupted.
        os.remove(page_filename)
    return result

def extract_info_for_all_sections(page_mapping_list, full_templates, delay_seconds=60):
    """Run the templated extraction for every section that has a template.

    Sections and templates are paired by position; sections beyond the number of
    templates are skipped. A pause of ``delay_seconds`` is inserted between
    consecutive sections (not after the last one).

    Args:
        page_mapping_list: Iterable of ``(section_name, start_page, end_page)`` tuples.
        full_templates: Tool specifications, one per section, in the same order.
        delay_seconds: Seconds to wait between two sections.

    Returns:
        List with one ``extract_info_for_section`` result per processed section;
        an entry is None when that section's extraction failed.
    """
    results = []
    # zip stops at the shorter input, so sections without a template are never started.
    for idx, (elem, template) in enumerate(zip(page_mapping_list, full_templates)):
        if idx > 0:
            print(f"Sleeping for {delay_seconds} seconds.")
            time.sleep(delay_seconds)
        section_name, start_page, end_page = elem
        print(f"Extracting for section {elem}")
        result = extract_info_for_section(template, INFO_EXTRACTION_PROMPT, start_page, end_page)
        results.append(result)
        if result is None:
            print(f"Extraction failed for section {section_name}")
        else:
            print(f"Extracted for section {section_name}")
    return results


In [55]:
# Build the extraction tool specs from the IBIS summary template and run the extraction.
# extract_info_for_all_sections pairs page_mapping_list[i] with full_templates[i].
from Templates.ibis_aws_summary_template_all import TEMPLATE as IBIS_SUMMARY_TEMPLATE
template_parts = IBIS_SUMMARY_TEMPLATE['data']
full_templates = build_aws_template(template_parts)
# print(full_templates)
# Slow cell: expect roughly a minute of waiting per section, plus more for each failed attempt.
results = extract_info_for_all_sections(page_mapping_list, full_templates)

Extracting for section ('Industry at a Glance', 1, 12)
Extracted for section Industry at a Glance
Sleeping for 60 seconds.
Extracting for section ('Supply Chain', 12, 30)
Error: An error occurred (ThrottlingException) when calling the Converse operation (reached max retries: 4): Too many requests, please wait before trying again. You have sent too many requests.  Wait before trying again.
Failed 1 times. Sleeping for 60 seconds.
Error: An error occurred (ThrottlingException) when calling the Converse operation (reached max retries: 4): Too many requests, please wait before trying again. You have sent too many requests.  Wait before trying again.
Failed 2 times. Sleeping for 60 seconds.
Error: An error occurred (ThrottlingException) when calling the Converse operation (reached max retries: 4): Too many requests, please wait before trying again. You have sent too many requests.  Wait before trying again.
Failed 3 times. Sleeping for 60 seconds.
Error: An error occurred (ThrottlingExcepti

In [58]:
# Spot-check one section. Each entry of `results` is what extract_info_for_section returned:
# the (parsed, raw_response) pair from response_to_template, or None if every retry failed.
# Index 3 lines up with 'Costs & Operations' in sections_names.
results[3]

({'cost_structure_breakdown': [{'cost_type': 'Purchases',
    'cost_type_percentage': 101},
   {'cost_type': 'Wages', 'cost_type_percentage': 101},
   {'cost_type': 'Profit', 'cost_type_percentage': 101},
   {'cost_type': 'Depreciation', 'cost_type_percentage': 101},
   {'cost_type': 'Marketing', 'cost_type_percentage': 101},
   {'cost_type': 'Rent', 'cost_type_percentage': 101},
   {'cost_type': 'Utilities', 'cost_type_percentage': 101},
   {'cost_type': 'Other', 'cost_type_percentage': 101}],
  'cost_factors': [{'cost_factor_title': 'Profit influenced by demand',
    'cost_factor_description': 'High innovation, increasing market acceptance, and rising demand improve economies of scale, boosting profitability.'},
   {'cost_factor_title': 'Purchase costs fluctuate',
    'cost_factor_description': 'Commodity prices and supply chain efficiencies affect purchase costs. Long-term contracts and strategic sourcing mitigate fluctuations.'},
   {'cost_factor_title': 'Wages change with labor ma

In [39]:
import pickle

# Persist the per-section extraction results.
# The context manager closes the file handle, which the bare open() call never did.
with open(f"{OUTPUT_FOLDER}/section_summaries_v0.pkl", "wb") as summaries_file:
    pickle.dump(results, summaries_file)

In [20]:
from Templates.build_markdown_report import build_markdown_report_func
# NOTE(review): `section_summaries` is not assigned in any visible cell, so this line fails on
# a fresh kernel. Presumably it is derived from `results` (pickled as section_summaries_v0.pkl);
# confirm the shape build_markdown_report_func expects and define it explicitly before this call.
report_md = build_markdown_report_func(section_summaries)

In [21]:
print(report_md)
# Save the markdown report. The encoding is pinned so non-ASCII characters in the report
# are written identically regardless of the platform's default encoding.
with open(f"{OUTPUT_FOLDER}/summary_report.md", "w", encoding="utf-8") as file:
    file.write(report_md)


    # 3D Printer Manufacturing in the US
    Last Updated: 2024-08-17

    ## Industry Overview
    ### Key Statistics
    - **Profit**
        - **Annual Profit:** $742.0M
        - **Historical CAGR of Profit:** 19.6% (2005 - 2024)
    - **Profit Margins**
        - **Profit Margins:** 16.2%
        - **Historical CAGR of Profit Margins:** 2.3% (2005 - 2024)
    - **Revenue**
        - **Annual Revenue:** $4.578B
        - **Historical CAGR of Revenue:** 17.0% (2005 - 2024)
        - **Projected CAGR of Revenue:** 18.5% (2024 - 2030)
    - **Enterprises:** 165
    - **Establishments:** 174
    - **Employees:** 5481
    - **Wages:** $459.0M
    - **Industry Value Added (IVA):** $1.275B
    - **Imports:** $3.153B
    - **Exports:** $1.313B

    ### Executive Summary
    The 3D printer manufacturing industry has seen significant advancements and cost reductions, making the technology more accessible. Commercial and desktop 3D printers are now available at various price points. Enhanced

In [None]:
from unstructured.partition.pdf import partition_pdf
# Scratch cell: partition a single page of the PDF and estimate the size of its text.
# NOTE(review): `file_path` is not assigned in any visible cell — presumably it should be
# PDF_LOCATION; confirm and define it before running this cell on a fresh kernel.
print(file_path)
pages_filename = get_initial_pages(file_path, pmin=42, pmax=43)
try:
    elements = partition_pdf(filename=pages_filename, pages=[1])  # Unstructured's page index starts from 1
finally:
    # Remove the extracted page file, as the other get_initial_pages call sites do.
    os.remove(pages_filename)

# Concatenate every element's text, each followed by a blank line.
el_text = "".join(el.to_dict()['text'] + "\n\n" for el in elements)

# NOTE(review): the divisor 6 is unexplained — presumably a rough characters-per-word
# (or per-token) estimate; confirm and name it as a constant.
print(len(el_text)/6)