# Rely on LLMs to extract information from documents

The extracted results are saved as JSON for future reuse.

In [1]:
import os
import boto3
import json
import dateparser
from glob import glob
# from rag.basic_retrieval import file_id
# from cachier import cachier
from typing import List


from Templates.aws_markdown_template import TEMPLATE as MARKDOWN_TEMPLATE
from Templates.aws_templates_common import build_aws_template

from loading_utils import get_initial_pages

from IPython.display import Markdown

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Source PDF to process; swap in the commented line to target another report.
# PDF_LOCATION = 'IndustrySource/Misc/3D Printer Manufacturing in the US.pdf'
PDF_LOCATION = 'IndustrySource/Misc/HVAC%20Service%20Franchises%20in%20the%20US.pdf'
# Document id: lower-cased file name up to its first '.', spaces turned into '-'.
# NOTE(review): the active file name is URL-encoded ('%20'), so the space
# replacement is a no-op for it — confirm whether the name should be decoded.
DOC_ID = PDF_LOCATION.split('/')[-1].split('.')[0].lower().replace(' ', '-')
OUTPUT_FOLDER = f'./rag_outputs/{DOC_ID}'
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [3]:
# Region used for the Bedrock runtime client.
# AWS_REGION_NAME = 'us-west-2'
AWS_REGION_NAME = 'us-east-1'

# Credentials are taken from the process environment.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Client used for every `converse` call in this notebook.
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime.html
bedrock = boto3.client(
    'bedrock-runtime',
    region_name=AWS_REGION_NAME,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [4]:
# Instruction text sent ahead of the PDF pages when asking the model to
# convert them to markdown (see extract_markdown / sequentially_process_pdf).
MARKDOWN_PROMPT = """
We are sequentially converting a pdf document page by page to markdown format. You are an expert in converting the given PDF content to a Markdown representation.

Follow these instructions to complete the task:
- Infer the headings and subheadings of the given content with their levels from the appearance and semantic context. Generally, the larger font size, more visible color, and boldface indicate a lower level of the heading. For example, level 1 headings are expected to be more prominent than level 2 headings.
- Depending on the level of the section, you use an appropriate number of hash signs (#) to mark their headers in markdown format. # for level 1, ## for level 2, ### for level 3 and so on.
- The provided content may start from any page of the document. So the heading at first does not necessarily mean a level 1 heading.
- Do not insert any new content. Just convert the existing content to Markdown format while keeping the structure.
- Text is converted as it is. If the text is present in conflicting format that can be confusing, please interpret the text correctly and convert it to markdown.
- For visuals such as graphs, plots and figures, interpret them, be objective and explain the interpretation of data in detail with numbers and use that explanation in place of the visuals. Explanation provided should capture all the data insights that can be inferred from the figure. Use your best judgement to interpret the visuals.
- For tables, convert them to markdown table format without any explanation.
- The output should be in markdown format. Do not modify any content. 
Next is the pdf content:\n\n
"""

In [5]:
def get_raw_pdf_part(filename: str) -> dict:
    """Read *filename* as bytes and wrap it in a ``document`` content part.

    The returned dict has a single ``"document"`` key describing a PDF named
    ``'document'`` whose ``source.bytes`` is the raw file content.
    """
    with open(filename, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()

    source = {"bytes": pdf_bytes}
    document = {"format": "pdf", "name": 'document', "source": source}
    return {"document": document}


def response_to_template(filename: str, template: dict, prompt: str) -> dict:
    """Ask the Bedrock model to fill *template* from the PDF at *filename*.

    The request contains *prompt* followed by the raw PDF, and forces the
    model to answer through the ``info_extract`` tool whose spec is
    *template*.

    Args:
        filename: Path of the PDF to attach to the request.
        template: Tool spec passed as ``toolSpec``.
        prompt: Instruction text placed before the document.

    Returns:
        The tool input produced by the model (its ``properties`` entry when
        present). String values that look like JSON objects or arrays are
        decoded in place when they parse; otherwise they are left as strings.
    """
    initial_message = {
        "role": "user",
        "content": [
            {"text": prompt},
            get_raw_pdf_part(filename),
        ],
    }

    tool_list = [{
        "toolSpec": template
    }]
    model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"
    response = bedrock.converse(
        modelId=model_id,
        # modelId="meta.llama3-1-405b-instruct-v1:0",
        messages=[initial_message],
        inferenceConfig={
            "temperature": 0
        },
        toolConfig={
            "tools": tool_list,
            "toolChoice": {
                "tool": {
                    "name": "info_extract"
                }
            }
        }
    )
    core_response = response['output']['message']['content'][0]['toolUse']['input']
    if 'properties' in core_response:
        core_response = core_response['properties']
    for k, v in core_response.items():
        # startswith/endswith are safe on empty strings, unlike v[0] / v[-1].
        if isinstance(v, str) and v.startswith(('{', '[')) and v.endswith((']', '}')):
            try:
                core_response[k] = json.loads(v)
            except (ValueError, RecursionError):
                # Best effort: keep the raw string when it is not valid JSON.
                pass

    return core_response

In [6]:

# @cachier(hash_func=filename_template_hash)
def info_from_doc_template(filename: str, template: dict, prompt: str, **kwargs) -> dict:
    """Fill every template part for one document and merge the answers.

    ``template['data']`` is expanded with ``build_aws_template``; each
    resulting part is sent through ``response_to_template`` and the returned
    dicts are merged in order, so later parts overwrite duplicate keys.
    """
    merged = {}
    for part in build_aws_template(template['data']):
        merged.update(response_to_template(filename, part, prompt, **kwargs))
    return merged

In [7]:
def extract_markdown(filename: str, first_page: int, last_page: int) -> dict:
    """Convert pages ``first_page``..``last_page`` of a PDF to markdown.

    The page range is written to a temporary file with ``get_initial_pages``
    (``pmax`` is passed as ``last_page + 1``), sent through
    ``info_from_doc_template`` with the markdown template and prompt, and the
    temporary file is removed afterwards.

    Returns:
        The merged dict returned by ``info_from_doc_template``.
    """
    pages_filename = get_initial_pages(filename, pmin=first_page, pmax=last_page+1)
    try:
        return info_from_doc_template(filename=pages_filename, template=MARKDOWN_TEMPLATE, prompt=MARKDOWN_PROMPT)
    finally:
        # Remove the temporary page file even when the extraction raises.
        os.remove(pages_filename)

In [8]:
from pypdf import PdfReader
import time

def get_number_of_pages(filename: str) -> int:
    """Return the number of pages in the PDF stored at *filename*."""
    with open(filename, 'rb') as pdf_file:
        page_count = len(PdfReader(pdf_file).pages)
    return page_count

def sequentially_process_pdf(filename, np=1, max_attempts=3, retry_delay=60, success_delay=5):
    """Convert a PDF to markdown in consecutive chunks of *np* pages.

    Each chunk is written to a temporary file with ``get_initial_pages`` and
    sent through ``info_from_doc_template``. A failing chunk is retried up to
    *max_attempts* times; if every attempt fails, a ``{'markdown':
    "**skipped**"}`` placeholder is stored for it instead.

    Args:
        filename: Path of the source PDF.
        np: Number of pages per chunk.
        max_attempts: Attempts per chunk before it is skipped.
        retry_delay: Seconds to wait before retrying a failed chunk.
        success_delay: Seconds to wait after a successful chunk.

    Returns:
        ``(results, skipped_pages)``: one result dict per chunk, and the
        1-based first page number of every skipped chunk.
    """
    total_pages = get_number_of_pages(filename)

    results = []
    skipped_pages = []
    for i in range(0, total_pages, np):
        print(f"Processing pages {i} to {i+np}")
        pages_filename = get_initial_pages(filename, pmin=i, pmax=i+np)
        result = None
        try:
            for attempt in range(1, max_attempts + 1):
                try:
                    result = info_from_doc_template(filename=pages_filename, template=MARKDOWN_TEMPLATE, prompt=MARKDOWN_PROMPT)
                except Exception as exc:
                    # Exception (not a bare except) so Ctrl-C still aborts the run.
                    print(f"Error processing page {i+1}: {exc}")
                    if attempt < max_attempts:
                        # Wait only when another attempt will actually follow.
                        print(f"Retrying in {retry_delay} seconds.")
                        time.sleep(retry_delay)
                else:
                    time.sleep(success_delay)
                    break
            else:
                # Every attempt failed: record a placeholder and move on.
                result = {
                    'markdown': "**skipped**",
                }
                print(f"Failed to process page {i+1} after {max_attempts} attempts.")
                skipped_pages.append(i+1)
        finally:
            # Always remove the temporary chunk file, even on interruption.
            os.remove(pages_filename)

        results.append(result)
    print(f" Had to skip pages: {len(skipped_pages)}")
    return results, skipped_pages

        

    

In [9]:
# Convert the configured PDF one page per request (np=1). `results` holds one
# dict per chunk; `skipped_pages` lists the pages that were given up on.
file_path = PDF_LOCATION

results, skipped_pages = sequentially_process_pdf(file_path, np=1)

Processing pages 0 to 1
Processing pages 1 to 2
Error processing page 2
Retrying in 60 seconds.
Processing pages 2 to 3
Error processing page 3
Retrying in 60 seconds.
Processing pages 3 to 4
Error processing page 4
Retrying in 60 seconds.
Processing pages 4 to 5
Error processing page 5
Retrying in 60 seconds.
Processing pages 5 to 6
Error processing page 6
Retrying in 60 seconds.
Processing pages 6 to 7
Error processing page 7
Retrying in 60 seconds.
Processing pages 7 to 8
Error processing page 8
Retrying in 60 seconds.
Processing pages 8 to 9
Error processing page 9
Retrying in 60 seconds.
Processing pages 9 to 10
Error processing page 10
Retrying in 60 seconds.
Processing pages 10 to 11
Error processing page 11
Retrying in 60 seconds.
Processing pages 11 to 12
Error processing page 12
Retrying in 60 seconds.
Processing pages 12 to 13
Error processing page 13
Retrying in 60 seconds.
Processing pages 13 to 14
Error processing page 14
Retrying in 60 seconds.
Processing pages 14 to 15


In [None]:
# Inspect the markdown produced for the first chunk.
print(results[0]['markdown'])

In [12]:
from llama_index.core import Document
import pickle
model = "anthropic.claude-3-5-sonnet-20240620-v1:0"

# The pickle file name embeds the model id; phase 2 rebuilds the same path.
output_pickle_path = os.path.join(OUTPUT_FOLDER, f"_{model}_pages.pkl")

# One Document per entry of `results`, numbered from 1.
# NOTE(review): "page_number" is the chunk index, which equals the page
# number only when the PDF was processed with np=1 — confirm.
documents = [
    Document(text=page_result['markdown'], metadata={"page_number": page_number})
    for page_number, page_result in enumerate(results, start=1)
]
parsed_result = {"parsed_documents": documents, "skipped_pages": skipped_pages}
# Context manager so the file is flushed and closed deterministically.
with open(output_pickle_path, 'wb') as f:
    pickle.dump(parsed_result, f)






In [None]:
# filenames = [
#     'IndustrySource/Misc/3D Printer Manufacturing in the US.pdf'
# ]
# markdown = [extract_markdown(filename, 45, 48) for filename in filenames[:5]]
# markdown

# Phase 2: Load documents and post-process them to correct the heading structure

In [None]:
from llama_index.core import Document
import pickle

PDF_LOCATION = 'IndustrySource/Misc/3D Printer Manufacturing in the US.pdf'
DOC_ID = PDF_LOCATION.split('/')[-1].split('.')[0].lower().replace(' ', '-')
OUTPUT_FOLDER = f'./rag_outputs/{DOC_ID}'


model = "anthropic.claude-3-5-sonnet-20240620-v1:0"

output_pickle_path = os.path.join(OUTPUT_FOLDER, f"_{model}_pages.pkl")

# Load the pages pickled by phase 1 when the file exists.
# NOTE: pickle can execute arbitrary code; only load files this notebook wrote.
if os.path.exists(output_pickle_path):
    with open(output_pickle_path, 'rb') as f:
        parsed_result = pickle.load(f)
elif 'parsed_result' not in globals():
    # Nothing on disk and nothing in memory: fail with a clear message
    # instead of a NameError on the next line.
    raise FileNotFoundError(
        f"No parsed pages found at {output_pickle_path}; run phase 1 for this document first."
    )

documents = parsed_result['parsed_documents']

In [None]:
docs = [document.text for document in documents]
# all_sentences[p] is the list of lines (split on "\n") of page p's markdown.
all_sentences = [page_text.split("\n") for page_text in docs]


In [None]:

# Collect every line that starts with '#' as a heading, remembering its
# 1-based page number and 1-based line number within that page.
headings = [
    {"text": sentence, "page": page_idx + 1, "sentence_number": line_idx + 1}
    for page_idx, sentences in enumerate(all_sentences)
    for line_idx, sentence in enumerate(sentences)
    if sentence.startswith("#")
]

print(len(headings))
headings[:2]

In [None]:
# Prompt pieces for the heading-restructuring request: HEADINGS_PROMPT explains
# the task and the "<page, line> heading" line format, SYSTEM_MESSAGE is sent as
# the system prompt, and FINAL_MESSAGE is appended after the heading list.
HEADINGS_PROMPT = ''' You will be given a text containing headings and subheadings parsed from the market research report of a single industry in markdown format. 
However, the issue is that these subheadings were parsed one page at a time. So this makes it possible that the structure of the document is not preserved.
The heading 2 in the document might be parsed as heading 1 if it is the first heading on the page. Similarly, all the following subheadings on the same page might be parsed as different heading levels.
However, the order in wich the headings appear is preserved from top to bottom of each page. So you don't have to worry about the order of the headings.
Other issue is that parsed headings/ subheadings may accidentally include date or page number or report title that might have been present as a header or footer in the document. Some headings might be repeated as well so you need to remove the repetitions.
Your task is to identify the correct structure of the document by taking into account the semantic meanings of the headings and subheadings.
Your input will be a text file with each line starting with <page_number, line_number> of the heading/ subheading followed by the heading or subheading text starting with # or ## or ### or #### or ##### denoting heading 1, heading 2, heading 3, heading 4 and heading 5 respectively.
Your output will be a text file with each line containing the same <page_number, line_number> as the one in input followed by the heading or subheading text with the correct heading level denoted by # or ## or ### or #### or #####. 
You must not change the page number and line number of the respective heading or subheading. You may entrirely remove the heading or subheading if it is a repetition or report title or date that was incorrectly included as a heading. Otherwise, you should correct the heading level if you think it was parsed incorrectly.
To make it easier for you, here are the expected major sections (level 1) of the document: Industry at a Glance, Supply Chain, Competitive Landscape, Costs & Operations, Questions for Owners, Datatables & Glossary
'''

SYSTEM_MESSAGE = "You are an expert in identifying the correct structure of a document by taking into account the page numbers, line numbers and the semantic meanings of the headings and subheadings."

FINAL_MESSAGE = "Please identify the correct structure of the document by taking into account the page numbers, line numbers and the semantic meanings of the headings and subheadings. Output the headings with the correct heading level denoted by # or ## or ### or #### or #####."

In [None]:
# One "<page, line> heading" line per collected heading, each newline-terminated.
input_text = "".join(
    f"<{heading['page']}, {heading['sentence_number']}> {heading['text']}\n"
    for heading in headings
)
input_text

In [None]:
import anthropic

from llama_index.core import Document

client = anthropic.Anthropic()
model = "claude-3-5-sonnet-20240620"

# The user turn is three text parts: task description, the heading list, and
# the closing instruction.
user_content = [
    {"type": "text", "text": part}
    for part in (HEADINGS_PROMPT, input_text, FINAL_MESSAGE)
]

response = client.messages.create(
    model=model,
    max_tokens=5000,
    system=SYSTEM_MESSAGE,
    messages=[{"role": "user", "content": user_content}],
)

In [None]:
output = response.content[0].text
# Keep only the lines that (after leading whitespace) start with '<',
# i.e. those carrying a "<page, line>" prefix.
output_list = [
    candidate
    for candidate in output.split("\n")
    if candidate.lstrip().startswith("<")
]
print(len(output_list))
output_list[:50]

In [None]:
import uuid

class Node:
    """A heading (or a chunk of its text) in the document's heading tree."""

    def __init__(self, heading, level, page, line_number, refined_output_idx):
        # What this node represents and where it was found.
        self.heading = heading
        self.level = level
        self.page = page
        self.line_number = line_number
        # Index of the heading in the refined heading list.
        self.refined_output_idx = refined_output_idx

        # Tree links; self_index is this node's position in parent.children.
        self.parent = None
        self.children = []
        self.self_index = None

        # Text payload, filled in after the tree is built.
        self.text = None
        self.text_size = 0
        self.embedding_text = None

        # Flags set when an oversized node is split into chunk children.
        self.is_partial_node = False
        self.is_partial_node_parent = False

        # Identifier: location, level and the first 3 characters of a uuid4.
        short_uid = str(uuid.uuid4())[:3]
        self.id_ = f"{page}_{line_number}_{level}_{short_uid}"

    def add_child(self, child):
        """Append *child* and record its parent and position."""
        child.parent = self
        child.self_index = len(self.children)
        self.children.append(child)

    def __repr__(self):
        location = (self.page, self.line_number)
        return f"self index: {self.self_index} level: {self.level} loc: <{location}> content: {self.heading}"

    # str() and repr() produce the same text.
    __str__ = __repr__


In [None]:
# make a tree structure of the headings and subheadings from output_list such as level 1 is parent of level 2 and so on

def get_level(heading):
    """Return the number of leading '#' characters in *heading*.

    Leading whitespace is ignored; a heading without '#' has level 0.
    """
    text = heading.lstrip()
    return len(text) - len(text.lstrip("#"))


def get_fields_from_line(text):
    """Parse one ``"<page, line> ## Heading"`` line.

    Returns:
        ``(page, line_number, heading, level)``. Lines that do not start with
        ``<``, or whose prefix is not two comma-separated integers followed by
        ``"> "``, yield ``(None, None, None, 0)`` so the caller's level-0
        check skips them.
    """
    not_a_heading = (None, None, None, 0)
    line = text.lstrip()
    if not line.startswith("<"):
        return not_a_heading

    # Split on the FIRST "> " only, so a heading that itself contains "> "
    # is kept whole instead of being truncated.
    prefix, separator, heading = line[1:].partition("> ")
    if not separator:
        return not_a_heading

    parts = prefix.split(",")
    try:
        page, line_number = int(parts[0]), int(parts[1])
    except (ValueError, IndexError):
        # e.g. a stray "<tag> ..." line emitted by the model.
        return not_a_heading
    return page, line_number, heading, get_level(heading)


root = Node(heading="", level=0, page=0, line_number=0, refined_output_idx=-1)
parent = root
# Build the tree: each heading is attached under the closest preceding heading
# with a strictly smaller level (the level-0 root catches everything else).
# TODO: take page, line_number from the input instead of output to avoid any issues

refined_output = []
for raw_line in output_list:
    page, line_number, heading, level = get_fields_from_line(raw_line)
    if level == 0:
        # Not a heading line.
        continue
    # Climb until the current node is shallower than the new heading; every
    # child is deeper than its parent, so this stops at the right ancestor.
    while parent.level >= level:
        parent = parent.parent
    child = Node(heading, level, page, line_number, len(refined_output))
    parent.add_child(child)
    parent = child
    refined_output.append((page, line_number, heading, level))



In [None]:
def get_node_text(node, sentences_by_page=None, heading_index=None):
    """Return the body text that belongs to *node*'s heading.

    The text runs from the line after the heading up to (not including) the
    next heading in the refined heading list, or to the end of the document
    for the last heading. Lines, including those on either side of a page
    boundary, are joined with newlines.

    Args:
        node: Node with ``text``, ``level``, ``page``, ``line_number`` and
            ``refined_output_idx``; page and line numbers are 1-based.
        sentences_by_page: Per-page lists of lines. Defaults to the
            module-level ``all_sentences``.
        heading_index: ``(page, line_number, heading, level)`` tuples in
            document order. Defaults to the module-level ``refined_output``.

    Returns:
        ``node.text`` if already set, ``""`` for the level-0 root, otherwise
        the collected text.
    """
    if node.text is not None:
        return node.text
    if node.level == 0:
        return ""

    pages = all_sentences if sentences_by_page is None else sentences_by_page
    entries = refined_output if heading_index is None else heading_index

    page, line_number = node.page, node.line_number
    next_idx = node.refined_output_idx + 1
    if next_idx < len(entries):
        next_page, next_line_number = entries[next_idx][:2]
    else:
        # Last heading: act as if the next heading opened a page just past
        # the end, so every remaining page (including the last) is collected.
        next_page, next_line_number = len(pages) + 1, 1

    if page == next_page:
        lines = pages[page-1][line_number:next_line_number-1]
    else:
        lines = list(pages[page-1][line_number:])
        for p in range(page+1, next_page):
            lines.extend(pages[p-1])
        if next_page <= len(pages):
            lines.extend(pages[next_page-1][:next_line_number-1])
    return "\n".join(lines)

In [None]:
# split text to chunk sizes less than 1024

def split_text_to_chunk_size(text, target_size=1024):
    """Split *text* on line boundaries into chunks of fewer than *target_size* words.

    Words are counted per line as ``len(line.split(" "))``. Lines are never
    broken, so a single line with *target_size* or more words becomes a chunk
    of its own. Every line in a chunk is terminated with a newline.

    Returns:
        The list of chunk strings; no chunk is empty.
    """
    chunks = []
    current_lines = []
    current_len = 0
    for sentence in text.split("\n"):
        sentence_len = len(sentence.split(" "))
        # Flush only when something has been collected, so an oversized first
        # line does not produce an empty leading chunk.
        if current_lines and current_len + sentence_len >= target_size:
            chunks.append("".join(current_lines))
            current_lines = []
            current_len = 0
        current_lines.append(sentence + "\n")
        current_len += sentence_len
    if current_lines:
        chunks.append("".join(current_lines))
    return chunks

In [None]:
# let's traverse the tree and see if we need to split some nodes into multiple nodes due to text size more than selected chunk size

# do pre-order traversal 



def set_text_and_split_necessary_nodes(node, splitter_func, chunk_size=500):
    """Fill in node texts (pre-order) and split oversized nodes into chunks.

    A node whose text has more than *chunk_size* space-separated words is
    emptied and marked ``is_partial_node_parent``; its text is distributed
    over new chunk children (``is_partial_node``) produced by
    *splitter_func*, and its original children are re-attached under the
    last chunk. Chunk nodes are never split again: the splitter does not
    break lines, so re-splitting a chunk that is one long line would recurse
    without end.

    Args:
        node: Root of the (sub)tree to process; modified in place.
        splitter_func: Callable ``(text, target_size=...) -> list[str]``.
        chunk_size: Word limit above which a node is split.
    """
    if node.text is None:
        text = get_node_text(node)
        text_size = len(text.split(" "))
    else:
        text = node.text
        text_size = node.text_size

    if text_size > chunk_size and not node.is_partial_node:
        print("Splitting node: ", node.refined_output_idx)
        text_chunks = splitter_func(text, target_size=chunk_size)
        # The original node keeps its heading but hands its text to the chunks.
        node.text = ""
        node.is_partial_node_parent = True
        node.text_size = 0
        # Detach the existing children; they move under the last chunk below.
        original_children = node.children
        node.children = []
        print("Node children: ", len(original_children))
        print("Splitting into: ", len(text_chunks))

        last_index = len(text_chunks) - 1
        for i, chunk_text in enumerate(text_chunks):
            child = Node(heading="", level=node.level, page=node.page, line_number=node.line_number, refined_output_idx=node.refined_output_idx)
            child.text = chunk_text
            child.text_size = len(chunk_text.split(" "))
            node.add_child(child)
            child.is_partial_node = True
            if i == last_index:
                for original_child in original_children:
                    child.add_child(original_child)
    else:
        node.text = text
        node.text_size = text_size

    for child in node.children:
        set_text_and_split_necessary_nodes(child, splitter_func, chunk_size)
    return




In [None]:
# let's process the tree and split the nodes with text size more than 500 words into multiple nodes
# Walk the whole tree from the root, filling in node texts and splitting any
# node with more than 500 words using the line-based splitter.
size_based_splitter = split_text_to_chunk_size
set_text_and_split_necessary_nodes(root, splitter_func=size_based_splitter, chunk_size=500)

In [None]:
tree_structure_path = os.path.join(OUTPUT_FOLDER, "tree_structure.pkl")
# Save the tree structure; the context manager closes the file even on error.
with open(tree_structure_path, "wb") as tree_file:
    pickle.dump(root, tree_file)

# Phase 3: Extract info from respective section

In [None]:
# Location of the pickled tree structure that this phase loads.
import pickle
tree_structure_path = os.path.join(OUTPUT_FOLDER, f"tree_structure.pkl")
def load_tree_structure(input_path):
    """Load and return the object pickled at *input_path*.

    NOTE: unpickling can execute arbitrary code; only load files that this
    notebook produced.
    """
    with open(input_path, "rb") as file:
        # Read from the opened file, not from the module-level path string.
        return pickle.load(file)

In [None]:
# only for debug purpose
# print the tree structure to see there is no loop in the tree and count the nodes

def print_tree_structure(node):
    """Print *node* and all its descendants (pre-order); return the node count."""
    print(node)
    return 1 + sum(print_tree_structure(descendant) for descendant in node.children)


In [None]:
# Reload the saved tree and show the root's children as a quick sanity check.
tree_structure_path = os.path.join(OUTPUT_FOLDER, f"tree_structure.pkl")
root = load_tree_structure(tree_structure_path)
print(root.children)


In [None]:
# Instruction text sent ahead of each section's text when extracting
# structured data; it tells the model to answer 101 for missing numeric values.
INFO_EXTRACTION_PROMPT = """
You are an expert in extracting market and financial data from documents.
Use the given tool to extract essential data from text in the enclosed document. Do not make any assumptions or add any information that is not present in the text.

Return the result in JSON format. Do not use non-JSON tags. If some numeric data is not present in the text, simply output the number 101 as an answer where numeric data is expected.
For titles and names, limit the output to 20 words. For descriptions and key points, limit the output to 50 words.
"""

In [None]:
def response_to_text(content_text: str, template: dict, main_prompt: str, system_prompt: str, final_prompt: str) -> tuple:
    """Ask the Bedrock model to fill *template* from plain text.

    The user message is *main_prompt*, then *content_text*, then
    *final_prompt* when given. The model is forced to answer through the
    ``info_extract`` tool whose spec is *template*.

    Args:
        content_text: Text to extract information from.
        template: Tool spec passed as ``toolSpec``.
        main_prompt: Instruction text placed before the content.
        system_prompt: Optional system prompt; sent only when not None.
        final_prompt: Optional text appended after the content.

    Returns:
        ``(core_response, response)``: the tool input produced by the model
        (its ``properties`` entry when present, with JSON-looking string
        values decoded when they parse) and the raw ``converse`` response.
    """
    content = [{"text": main_prompt}, {"text": content_text}]
    if final_prompt is not None:
        content.append({"text": final_prompt})
    initial_message = {"role": "user", "content": content}

    tool_list = [{
        "toolSpec": template
    }]
    # model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
    model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"
    request = {
        "modelId": model_id,
        "messages": [initial_message],
        "inferenceConfig": {
            "temperature": 0,
        },
        "toolConfig": {
            "tools": tool_list,
            "toolChoice": {
                "tool": {
                    "name": "info_extract"
                }
            }
        },
    }
    if system_prompt is not None:
        # The Converse API takes the system prompt as a list of text blocks.
        request["system"] = [{"text": system_prompt}]
    response = bedrock.converse(**request)

    core_response = response['output']['message']['content'][0]['toolUse']['input']
    if 'properties' in core_response:
        core_response = core_response['properties']
    for k, v in core_response.items():
        # startswith/endswith are safe on empty strings, unlike v[0] / v[-1].
        if isinstance(v, str) and v.startswith(('{', '[')) and v.endswith((']', '}')):
            try:
                core_response[k] = json.loads(v)
            except (ValueError, RecursionError):
                # Best effort: keep the raw string when it is not valid JSON.
                pass

    return core_response, response

In [None]:
def get_section_text(node):
    """Return the heading and text of *node* and all descendants, depth-first.

    Every heading is followed by a newline; a node's text (when non-empty) is
    added after its heading, also newline-terminated.
    """
    pieces = [node.heading, "\n"]
    if node.text:
        pieces.extend((node.text, "\n"))
    pieces.extend(get_section_text(descendant) for descendant in node.children)
    return "".join(pieces)

In [None]:
# Expand the summary template's 'data' entry into the list of tool specs
# used for per-section extraction.
from Templates.ibis_aws_summary_template_all import TEMPLATE as IBIS_SUMMARY_TEMPLATE
template_parts = IBIS_SUMMARY_TEMPLATE['data']
full_templates = build_aws_template(template_parts)

In [None]:
# Inspect the second generated tool spec.
full_templates[1]

In [None]:
    

def extract_info_for_section(node, template, main_prompt, system_prompt, final_prompt, max_attempts=5, retry_delay=60):
    """Extract structured info for one section, retrying on errors.

    The section text is the concatenation produced by ``get_section_text``.

    Args:
        node: Root node of the section.
        template: Tool spec for this section.
        main_prompt, system_prompt, final_prompt: Forwarded to
            ``response_to_text``.
        max_attempts: Number of attempts before giving up.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The value returned by ``response_to_text``, or ``None`` when every
        attempt failed.
    """
    section_text = get_section_text(node)
    for attempt in range(1, max_attempts + 1):
        try:
            return response_to_text(section_text, template, main_prompt, system_prompt, final_prompt)
        except Exception as e:
            print(f"Error: {e}")
            if attempt < max_attempts:
                # Wait only when another attempt will actually follow.
                print(f"Failed {attempt} times. Sleeping for {retry_delay} seconds.")
                time.sleep(retry_delay)
            else:
                print(f"Failed {attempt} times. Giving up.")
    return None

def extract_info_for_all_sections(root, full_templates, sleep_seconds=60):
    """Extract structured info for each top-level section of the tree.

    The i-th child of *root* is paired with the i-th template; children
    beyond the number of templates are ignored.

    Args:
        root: Root node of the heading tree.
        full_templates: Tool specs, one per section, in section order.
        sleep_seconds: Pause between sections; 0 disables it.

    Returns:
        ``(section_summaries, raw_responses)``, one entry per processed
        section.

    Raises:
        RuntimeError: If extraction for a section fails after all retries.
    """
    section_summaries = []
    raw_responses = []

    pairs = list(zip(root.children, full_templates))
    for idx, (child, template) in enumerate(pairs):
        result = extract_info_for_section(child, template, INFO_EXTRACTION_PROMPT, None, None)
        if result is None:
            # Fail with a clear message instead of a TypeError on unpacking.
            raise RuntimeError(f"Could not extract info for section {child.heading!r}")
        section_summary, raw_response = result
        section_summaries.append(section_summary)
        raw_responses.append(raw_response)
        print(f"Extracted for section {child.heading}")
        if sleep_seconds and idx < len(pairs) - 1:
            print(f"Sleeping for {sleep_seconds} seconds.")
            time.sleep(sleep_seconds)
    return section_summaries, raw_responses


In [None]:
import pickle
section_summaries, raw_responses = extract_info_for_all_sections(root, full_templates)
section_results = {"section_summaries": section_summaries, "raw_responses": raw_responses}

# Persist both the parsed summaries and the raw API responses; the context
# manager closes the file even when pickling fails.
with open(f"{OUTPUT_FOLDER}/section_summaries.pkl", "wb") as summaries_file:
    pickle.dump(section_results, summaries_file)

In [None]:
# Render the per-section summaries into one markdown report.
# NOTE(review): this imports from `build_markdown_report`, while a later cell
# imports `report_order` from `Templates.build_markdown_report` — confirm
# which module path is correct.
from build_markdown_report import build_markdown_report_func

report_md = build_markdown_report_func(section_summaries)

In [None]:
print(report_md)
# Save the markdown report. The encoding is explicit so non-ASCII characters
# in the report do not depend on the platform's default encoding.
with open(f"{OUTPUT_FOLDER}/summary_report.md", "w", encoding="utf-8") as file:
    file.write(report_md)

In [None]:
from Templates.build_markdown_report import report_order
import json

# Write the section summaries and the report ordering as indented JSON files.
json_path = f"{OUTPUT_FOLDER}/section_summaries.json"
report_order_json_path = f"{OUTPUT_FOLDER}/report_order.json"

for target_path, payload in (
    (json_path, section_summaries),
    (report_order_json_path, report_order),
):
    with open(target_path, "w") as f:
        json.dump(payload, f, indent=4)