In [1]:
import os
import pandas as pd

import anthropic
from dotenv import load_dotenv
from pdf2image import convert_from_path
import base64
import requests
from llama_index.core import Document

In [100]:

# --- Experiment configuration ------------------------------------------------
# NOTE(review): the question-loading cell further down reassigns
# use_braintrust_dataset before reading it, so the value set here does not
# take effect.
use_braintrust_dataset = True

COMPARISON_FILE = 'claude-3-5-sonnet-20240620_qa.csv'  # question CSV read with pd.read_csv
PDF_LOCATION = 'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf'
DOC_ID = 'ibis-healthcare-social-assistance'
MODEL_ID = 'gpt-4o-mini'  # model name handed to the OpenAI LLM wrapper
QUESTION_COL = 'question'  # column of the question frame that holds the query text
RESPONSE_COL = 'rag_model_response'
NUM_QUESTIONS = -1  # -1 keeps every question; otherwise only the first N rows
PARSER = "claude" # "claude" or "llama-parse"
CHUNK_SIZE = 600
SPLITTER = "custom_tree"
TOP_K = 3  # similarity_top_k of the query engine
OUTPUT_FOLDER = f'./rag_outputs/{DOC_ID}'
# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
# The file name encodes the settings of this run.
OUTPUT_FILE = f'{OUTPUT_FOLDER}/output_{MODEL_ID}_{PARSER}_{CHUNK_SIZE}_{SPLITTER}_{TOP_K}.csv'


In [2]:
import pickle

# Load the cached parse result; every element is expected to expose a `.text`
# attribute (that is what the next cell reads).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load caches that were produced locally by a trusted run.
# The context manager closes the file handle, which the bare open() call
# passed straight into pickle.load never did.
with open('./rag_outputs/ibis-healthcare-social-assistance_gpt-4o-mini_claude_600_sentence_3/_claude-3-5-sonnet-20240620_pages.pkl', 'rb') as pages_file:
    documents = pickle.load(pages_file)

In [3]:
# Report title; within this notebook it is only referenced by the disabled
# header-stripping code in the next cell.
title = "Healthcare and Social Assistance in the US"
# Plain text of every loaded document, in the order they were stored.
docs = [page_document.text for page_document in documents]

In [4]:
# Break each document into its individual lines: all_sentences[k] is the list
# of lines of docs[k]. Dropping a leading line that contains the report title
# is currently disabled, so every line is kept.
all_sentences = [page_text.split("\n") for page_text in docs]




In [5]:
# Spot-check the parsed lines at list index 30 (page 31 in the 1-based page numbering used for headings).
all_sentences[30]

['# IBISWorld | Healthcare and Social Assistance in the US',
 '',
 'Mar 2024',
 '',
 "## How are the industry's products and services performing?",
 '',
 '### Hospitals are central to healthcare delivery',
 '',
 '- Hospitals provide inpatient and outpatient medical services (diagnostic, treatment, etc.) via physicians, nursing and other health services. This subsector includes Hospitals (IBISWorld 62211), Psychiatric Hospitals (62221) and Specialty Hospitals (62231).',
 '',
 '- Rising incomes, broader access to public and private insurance programs and the medical needs of older citizens sustained hospital patient volumes before the COVID-19 pandemic. Yet hospitals were hit particularly hard by the drop in elective procedures and patient volumes following the onset of the pandemic.',
 '',
 '- Federal policies and billions of dollars in funding directed to hospitals alleviated the initial financial impact of revenue loss stemming from delays in elective care and drops in ER visits. Acco

In [6]:
# Spot-check the parsed lines at list index 4 (page 5 in the 1-based page numbering used for headings).
all_sentences[4]

['# 1. About',
 '',
 'https://my.ibisworld.com/us/en/industry/62/about',
 '',
 '## Codes',
 '',
 '| NAICS 2017 - USA | 62 |',
 '|-------------------|-----|',
 '| NAICS 2022 - USA | 62 |',
 '',
 'The table shows the NAICS (North American Industry Classification System) codes for the Healthcare and Social Assistance sector in the USA. Both the 2017 and 2022 versions of NAICS assign the code 62 to this sector, indicating consistency in classification over time.',
 '',
 '## Definition',
 '',
 'The Healthcare and Social Assistance sector is composed of enterprises that provide healthcare and social assistance for individuals in the United States, including hospitals, ambulatory service providers and nursing and residential care facilities. Social assistance also includes counselors and social workers, family and welfare services and natural disaster and emergency relief services.',
 '',
 '## Related Terms',
 '',
 '### ELECTRONIC HEALTH RECORD (EHR)',
 '',
 'The systematic collection and mai

In [7]:

# Collect every heading line together with its 1-based page number and its
# 1-based line position inside that page.
headings = [
    {"text": sentence, "page": page_idx + 1, "sentence_number": line_idx + 1}
    for page_idx, sentences in enumerate(all_sentences)
    for line_idx, sentence in enumerate(sentences)
    # any line that starts with "#" counts as a heading, whatever its depth
    if sentence.startswith("#")
]

headings[:20]

[{'text': '# IBISWorld', 'page': 1, 'sentence_number': 1},
 {'text': '## INDUSTRY REPORT', 'page': 1, 'sentence_number': 3},
 {'text': '# Healthcare and Social Assistance in the US',
  'page': 1,
  'sentence_number': 5},
 {'text': '### Mar 2024', 'page': 1, 'sentence_number': 7},
 {'text': '# IBISWorld | Healthcare and Social Assistance in the US Mar 2024',
  'page': 2,
  'sentence_number': 1},
 {'text': '## About IBISWorld', 'page': 2, 'sentence_number': 3},
 {'text': '# IBISWorld | Healthcare and Social Assistance in the US',
  'page': 3,
  'sentence_number': 1},
 {'text': '## Table Of Contents', 'page': 3, 'sentence_number': 5},
 {'text': '# About', 'page': 4, 'sentence_number': 1},
 {'text': '# 1. About', 'page': 5, 'sentence_number': 1},
 {'text': '## Codes', 'page': 5, 'sentence_number': 5},
 {'text': '## Definition', 'page': 5, 'sentence_number': 13},
 {'text': '## Related Terms', 'page': 5, 'sentence_number': 17},
 {'text': '### ELECTRONIC HEALTH RECORD (EHR)',
  'page': 5,
  '

In [8]:
# Number of heading lines detected across all pages.
len(headings)

300

In [9]:
# Instruction text sent as the first part of the user message: explains that
# headings were parsed page by page and asks for corrected heading levels in
# the same "<page_number, line_number> heading" line format.
HEADINGS_PROMPT = ''' You will be given a text containing headings and subheadings parsed from the market research report of the industry 'Healthcare and Social Assistance in the US' in markdown format. 
However, the issue is that these subheadings were parsed one page at a time. So this makes it possible that the structure of the document is not preserved.
The heading 2 in the document might be parsed as heading 1 if it is the first heading on the page. Similarly, all the following subheadings on the same page might be parsed as different heading levels.
However, the order of the headings is preserved from top to bottom of each page. So you don't have to worry about the order of the headings.
Other issue is that parsed headings/ subheadings may accidentally include date or page number or report title that might have been present as a header or footer in the document.
Your task is to identify the correct structure of the document by taking into account the semantic meanings of the headings and subheadings.
Your input will be a text file with each line starting with <page_number, line_number> of the heading/ subheading followed by the heading or subheading text starting with # or ## or ### or #### or ##### denoting heading 1, heading 2, heading 3, heading 4 and heading 5 respectively.
Your output will be a text file with each line containing the same <page_number, line_number> as the one in input followed by the heading or subheading text with the correct heading level denoted by # or ## or ### or #### or #####. 
You must not change the page number and line number of the respective heading or subheading. You may entrirely remove the heading or subheading if it is a report title or date that was incorrectly parsed as a heading. Otherwise, you should correct the heading level if you think it was parsed incorrectly.
'''

# System prompt for the heading-restructuring request.
SYSTEM_MESSAGE = "You are an expert in identifying the correct structure of a document by taking into account the page numbers, line numbers and the semantic meanings of the headings and subheadings."

# Closing part of the user message, placed after the heading listing.
FINAL_MESSAGE = "Please identify the correct structure of the document by taking into account the page numbers, line numbers and the semantic meanings of the headings and subheadings. Output the headings with the correct heading level denoted by # or ## or ### or #### or #####."

In [10]:
# Load API credentials from the local .env file into the process environment.
# NOTE(review): this is a user-specific absolute path; adjust it (or make it
# configurable) before running on another machine.
load_dotenv("/Users/mbajaj/.env")

# Keys this notebook expects: llama-cloud, OpenAI (embeddings/LLM), Anthropic
# and Braintrust. Copying os.getenv(name) back into os.environ[name] is a no-op
# when the key is set and raises an opaque TypeError when it is not, so the
# keys are validated explicitly instead.
REQUIRED_API_KEYS = (
    "LLAMA_CLOUD_API_KEY",
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
    "BRAINTRUST_API_KEY",
)
missing_api_keys = [name for name in REQUIRED_API_KEYS if not os.getenv(name)]
if missing_api_keys:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(missing_api_keys)
    )
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

In [11]:
# Serialise the detected headings for the prompt, one per line, in the form
# "<page, line> heading".
input_text = "".join(
    f"<{heading['page']}, {heading['sentence_number']}> {heading['text']}\n"
    for heading in headings
)
input_text

"<1, 1> # IBISWorld\n<1, 3> ## INDUSTRY REPORT\n<1, 5> # Healthcare and Social Assistance in the US\n<1, 7> ### Mar 2024\n<2, 1> # IBISWorld | Healthcare and Social Assistance in the US Mar 2024\n<2, 3> ## About IBISWorld\n<3, 1> # IBISWorld | Healthcare and Social Assistance in the US\n<3, 5> ## Table Of Contents\n<4, 1> # About\n<5, 1> # 1. About\n<5, 5> ## Codes\n<5, 13> ## Definition\n<5, 17> ## Related Terms\n<5, 19> ### ELECTRONIC HEALTH RECORD (EHR)\n<5, 23> ### TELEMEDICINE\n<5, 27> ### HEALTH INSURANCE EXCHANGE\n<5, 31> ### FEE-FOR-SERVICE\n<5, 35> ### MEDICARE AND MEDICAID\n<5, 39> ### LONG-TERM CARE\n<5, 43> ### FEDERAL EMERGENCY MANAGEMENT AGENCY (FEMA)\n<6, 1> # IBISWorld | Healthcare and Social Assistance in the US\n<6, 5> ## GROUP HOME\n<6, 9> ## MANAGED CARE\n<6, 13> ## What's Included\n<6, 20> ## Companies\n<6, 28> ## Related Industries\n<6, 30> ### Industries in the Same Sector\n<6, 40> ### International Industries\n<6, 46> ## Additional Resources\n<7, 1> # IBISWorld 

In [12]:
import anthropic

from llama_index.core import Document

model = "claude-3-5-sonnet-20240620"
# No key is passed explicitly; presumably the SDK picks up ANTHROPIC_API_KEY
# from the environment - confirm against the SDK documentation.
client = anthropic.Anthropic()

# One user turn made of three text parts, in this order: the instructions,
# the heading listing, and the closing request.
user_parts = [
    {"type": "text", "text": part}
    for part in (HEADINGS_PROMPT, input_text, FINAL_MESSAGE)
]

response = client.messages.create(
    model=model,
    max_tokens=5000,
    system=SYSTEM_MESSAGE,
    messages=[{"role": "user", "content": user_parts}],
)

In [13]:
# Text of the first content block of the model reply.
# NOTE(review): `response` is rebound by the question-answering loop near the
# end of the notebook, so re-running this cell after that loop reads a
# different object.
output = response.content[0].text
output

"Here's the corrected structure of the document with appropriate heading levels:\n\n<1, 5> # Healthcare and Social Assistance in the US\n\n<2, 3> ## About IBISWorld\n\n<3, 5> ## Table Of Contents\n\n<5, 1> ## About\n<5, 5> ### Codes\n<5, 13> ### Definition\n<5, 17> ### Related Terms\n<5, 19> #### ELECTRONIC HEALTH RECORD (EHR)\n<5, 23> #### TELEMEDICINE\n<5, 27> #### HEALTH INSURANCE EXCHANGE\n<5, 31> #### FEE-FOR-SERVICE\n<5, 35> #### MEDICARE AND MEDICAID\n<5, 39> #### LONG-TERM CARE\n<5, 43> #### FEDERAL EMERGENCY MANAGEMENT AGENCY (FEMA)\n<6, 5> #### GROUP HOME\n<6, 9> #### MANAGED CARE\n<6, 13> ### What's Included\n<6, 20> ### Companies\n<6, 28> ### Related Industries\n<6, 30> #### Industries in the Same Sector\n<6, 40> #### International Industries\n<6, 46> ### Additional Resources\n\n<9, 3> ## At a Glance\n<9, 19> ### Key Takeaways\n<9, 21> #### Performance\n<9, 26> #### External Environment\n\n<10, 5> ### Products and Services\n<10, 7> #### Products & Services Segmentation\n\n<

In [124]:
# headings[:100]

In [14]:
# Keep only the reply lines that carry a "<page, line>" prefix; this drops the
# introductory sentence and the blank separator lines.
output_list = [
    reply_line
    for reply_line in output.split("\n")
    if reply_line.lstrip().startswith("<")
]
print(len(output_list))
output_list[:100]

207


['<1, 5> # Healthcare and Social Assistance in the US',
 '<2, 3> ## About IBISWorld',
 '<3, 5> ## Table Of Contents',
 '<5, 1> ## About',
 '<5, 5> ### Codes',
 '<5, 13> ### Definition',
 '<5, 17> ### Related Terms',
 '<5, 19> #### ELECTRONIC HEALTH RECORD (EHR)',
 '<5, 23> #### TELEMEDICINE',
 '<5, 27> #### HEALTH INSURANCE EXCHANGE',
 '<5, 31> #### FEE-FOR-SERVICE',
 '<5, 35> #### MEDICARE AND MEDICAID',
 '<5, 39> #### LONG-TERM CARE',
 '<5, 43> #### FEDERAL EMERGENCY MANAGEMENT AGENCY (FEMA)',
 '<6, 5> #### GROUP HOME',
 '<6, 9> #### MANAGED CARE',
 "<6, 13> ### What's Included",
 '<6, 20> ### Companies',
 '<6, 28> ### Related Industries',
 '<6, 30> #### Industries in the Same Sector',
 '<6, 40> #### International Industries',
 '<6, 46> ### Additional Resources',
 '<9, 3> ## At a Glance',
 '<9, 19> ### Key Takeaways',
 '<9, 21> #### Performance',
 '<9, 26> #### External Environment',
 '<10, 5> ### Products and Services',
 '<10, 7> #### Products & Services Segmentation',
 '<11, 3> ###

In [15]:
headings[:100]

[{'text': '# IBISWorld', 'page': 1, 'sentence_number': 1},
 {'text': '## INDUSTRY REPORT', 'page': 1, 'sentence_number': 3},
 {'text': '# Healthcare and Social Assistance in the US',
  'page': 1,
  'sentence_number': 5},
 {'text': '### Mar 2024', 'page': 1, 'sentence_number': 7},
 {'text': '# IBISWorld | Healthcare and Social Assistance in the US Mar 2024',
  'page': 2,
  'sentence_number': 1},
 {'text': '## About IBISWorld', 'page': 2, 'sentence_number': 3},
 {'text': '# IBISWorld | Healthcare and Social Assistance in the US',
  'page': 3,
  'sentence_number': 1},
 {'text': '## Table Of Contents', 'page': 3, 'sentence_number': 5},
 {'text': '# About', 'page': 4, 'sentence_number': 1},
 {'text': '# 1. About', 'page': 5, 'sentence_number': 1},
 {'text': '## Codes', 'page': 5, 'sentence_number': 5},
 {'text': '## Definition', 'page': 5, 'sentence_number': 13},
 {'text': '## Related Terms', 'page': 5, 'sentence_number': 17},
 {'text': '### ELECTRONIC HEALTH RECORD (EHR)',
  'page': 5,
  '

In [74]:

class Node:
    """A heading in the document outline tree.

    Stores the heading line, its level, where it was found (page and line
    number), its position in the refined heading list, and links to its
    parent and children. `text`, `text_size` and `embedding_text` start out
    unset and are filled in by later processing steps.
    """

    def __init__(self, heading, level, page, line_number, refined_output_idx):
        # what the node is and where it was found
        self.heading, self.level = heading, level
        self.page, self.line_number = page, line_number
        self.refined_output_idx = refined_output_idx
        # tree links; self_index is this node's position among its siblings
        self.parent = None
        self.children = []
        self.self_index = None
        # content, populated later
        self.text = None
        self.text_size = 0
        self.embedding_text = None
        # flags set when an oversized node is split into chunk nodes
        self.is_partial_node = False
        self.is_partial_node_parent = False

    def add_child(self, child):
        """Append `child` to this node and record its sibling index and parent."""
        child.parent = self
        child.self_index = len(self.children)
        self.children.append(child)

    def __repr__(self):
        location = (self.page, self.line_number)
        return f"self index: {self.self_index} level: {self.level} loc: <{location}> content: {self.heading}"

    # str() and repr() render identically.
    __str__ = __repr__


In [75]:
# make a tree structure of the headings and subheadings from output_list such as level 1 is parent of level 2 and so on

def get_level(heading):
    """Return the heading depth: the number of leading '#' characters.

    Leading whitespace is ignored; text without a leading '#' yields 0.
    """
    stripped = heading.lstrip()
    # Everything removed by lstrip("#") is exactly the leading run of hashes.
    return len(stripped) - len(stripped.lstrip("#"))

def get_fields_from_line(text):
    """Parse one "<page, line_number> heading" line of the model reply.

    Args:
        text: a single line, optionally preceded by whitespace.

    Returns:
        (page, line_number, heading, level), where level is computed by
        get_level(heading). Lines that do not start with "<", have no "> "
        separator after the prefix, or do not contain two integers in the
        prefix yield (None, None, None, 0), so callers can skip them by
        checking for level 0.
    """
    unparsable = (None, None, None, 0)
    line = text.lstrip()
    if not line.startswith("<"):
        return unparsable

    # Split on the FIRST "> " only, so a heading that itself contains "> "
    # is kept whole instead of being truncated.
    prefix, separator, heading = line[1:].partition("> ")
    if not separator:
        return unparsable

    parts = prefix.split(",")
    try:
        page, line_number = int(parts[0]), int(parts[1])
    except (IndexError, ValueError):
        # The reply is model-generated text; a malformed prefix should be
        # skipped rather than abort the whole tree construction.
        return unparsable
    return page, line_number, heading, get_level(heading)


# Artificial level-0 root so that every real heading has an ancestor.
root = Node(heading="", level=0, page=0, line_number=0, refined_output_idx=-1)
# `parent` tracks the most recently inserted node (the root before any insert).
parent = root
# construct the tree
# TODO: take page, line_number from the input instead of output to avoid any issues

refined_output = []
for output_line in output_list:
    page, line_number, heading, level = get_fields_from_line(output_line)
    if level == 0:
        # not a usable heading line
        continue
    # Climb until `parent` is strictly shallower than the new heading. Every
    # node is deeper than its own parent, so this lands on the same attachment
    # point as handling the deeper / equal / shallower cases separately, and
    # it always stops at the level-0 root at the latest.
    while parent.level >= level:
        parent = parent.parent
    child = Node(heading, level, page, line_number, len(refined_output))
    parent.add_child(child)
    parent = child
    refined_output.append((page, line_number, heading, level))



In [76]:
def get_node_text(node):
    """Return the body text that belongs to `node`.

    If `node.text` is already set it is returned unchanged, and the level-0
    root always yields "". Otherwise the text is every line after the node's
    heading line up to (not including) the next heading in `refined_output`,
    or up to the end of the document when the node is the last heading. Lines
    are taken from `all_sentences` and joined with newlines, including across
    page boundaries.

    Reads the module-level `refined_output` and `all_sentences`.
    """
    if node.text is not None:
        return node.text
    if node.level == 0:
        return ""

    page, line_number = node.page, node.line_number
    next_idx = node.refined_output_idx + 1
    if next_idx < len(refined_output):
        next_page, next_line_number = refined_output[next_idx][:2]
    else:
        # Last heading: behave as if the next heading sat just past the final
        # line of the final page, so that the last page is included as well.
        next_page = len(all_sentences)
        next_line_number = len(all_sentences[-1]) + 1

    # Page and line numbers are 1-based: slicing at `line_number` starts right
    # after the heading line, and `next_line_number - 1` stops before the next
    # heading line.
    if page == next_page:
        lines = all_sentences[page - 1][line_number:next_line_number - 1]
    else:
        lines = list(all_sentences[page - 1][line_number:])
        for middle_page in range(page + 1, next_page):
            lines.extend(all_sentences[middle_page - 1])
        lines.extend(all_sentences[next_page - 1][:next_line_number - 1])

    # A single join keeps a newline between the last line of one page and the
    # first line of the next, instead of gluing them together.
    return "\n".join(lines)

In [77]:
# split text to chunk sizes less than 1024

def split_text_to_chunk_size(text, target_size=1024):
    """Split `text` on line boundaries into chunks of fewer than `target_size` words.

    Words are counted per line by splitting on single spaces (an empty line
    counts as one). Lines are never broken up, so a single line that is itself
    `target_size` words or longer becomes a chunk of its own.

    Args:
        text: the text to split; lines are separated by "\\n".
        target_size: word budget per chunk.

    Returns:
        List of chunk strings; every line inside a chunk is terminated by "\\n".
        No empty chunk is produced.
    """
    chunks = []
    current_lines = []
    current_len = 0
    for line in text.split("\n"):
        line_len = len(line.split(" "))
        # Flush only when something has been collected: otherwise an
        # oversized first line would emit an empty chunk ahead of itself.
        if current_lines and current_len + line_len >= target_size:
            chunks.append("\n".join(current_lines) + "\n")
            current_lines, current_len = [], 0
        current_lines.append(line)
        current_len += line_len
    if current_lines:
        chunks.append("\n".join(current_lines) + "\n")
    return chunks

In [78]:
# let's traverse the tree and see if we need to split some nodes into multiple nodes due to text size more than selected chunk size

# do pre-order traversal 



def set_text_and_split_necessary_nodes(node, splitter_func, chunk_size=500):
    """Fill in node texts (pre-order) and split nodes whose text is too long.

    For every node in the subtree the text is taken from `node.text` when it
    is already set, otherwise from get_node_text(node). If the text has more
    than `chunk_size` space-separated words and the splitter can break it into
    at least two non-empty chunks, the node becomes an empty "partial parent"
    (heading kept, text "") with one heading-less child per chunk; the node's
    original children are re-attached under the last chunk child.

    Text that cannot be split any further (for example a single line longer
    than `chunk_size`) is kept on the node as is. Without this guard such a
    node would be split into a copy of itself over and over and recurse
    without end.

    Args:
        node: root of the subtree to process.
        splitter_func: callable(text, target_size=...) returning a list of chunks.
        chunk_size: maximum number of words per node before splitting.
    """
    if node.text is None:
        text = get_node_text(node)
        text_size = len(text.split(" "))
    else:
        text = node.text
        text_size = node.text_size

    text_chunks = []
    if text_size > chunk_size:
        # Empty chunks carry no content and would only create empty nodes.
        text_chunks = [chunk for chunk in splitter_func(text, target_size=chunk_size) if chunk]

    if len(text_chunks) > 1:
        print("Splitting node: ", node.refined_output_idx)
        # The original node keeps its heading but hands its text to the chunk children.
        node.text = ""
        node.is_partial_node_parent = True
        node.text_size = 0
        # Detach the existing children; they are re-attached below the last chunk.
        original_children = node.children
        node.children = []
        print("Node children: ", len(original_children))
        print("Splitting into: ", len(text_chunks))

        last_position = len(text_chunks) - 1
        for position, chunk in enumerate(text_chunks):
            child = Node(heading="", level=node.level, page=node.page, line_number=node.line_number, refined_output_idx=node.refined_output_idx)
            child.text = chunk
            child.text_size = len(chunk.split(" "))
            node.add_child(child)
            child.is_partial_node = True
            if position == last_position:
                for original_child in original_children:
                    child.add_child(original_child)
    else:
        node.text = text
        node.text_size = text_size

    for child in node.children:
        set_text_and_split_necessary_nodes(child, splitter_func, chunk_size)
    return




In [79]:
# Children of the first top-level heading node.
root.children[0].children

[self index: 0 level: 2 loc: <(2, 3)> content: ## About IBISWorld,
 self index: 1 level: 2 loc: <(3, 5)> content: ## Table Of Contents,
 self index: 2 level: 2 loc: <(5, 1)> content: ## About,
 self index: 3 level: 2 loc: <(9, 3)> content: ## At a Glance,
 self index: 4 level: 2 loc: <(15, 3)> content: ## Performance,
 self index: 5 level: 2 loc: <(30, 3)> content: ## Products and Markets,
 self index: 6 level: 2 loc: <(39, 3)> content: ## Geographic Breakdown,
 self index: 7 level: 2 loc: <(46, 5)> content: ## Competitive Forces,
 self index: 8 level: 2 loc: <(52, 5)> content: ## Companies,
 self index: 9 level: 2 loc: <(67, 5)> content: ## External Environment,
 self index: 10 level: 2 loc: <(73, 1)> content: ## Financial Benchmarks,
 self index: 11 level: 2 loc: <(77, 5)> content: ## Key Statistics]

In [80]:
# iterate over the tree and print nodes with text size more than 500 words
def print_nodes_with_text_size_more_than_500_words(node, large_nodes, max_words=500):
    """Walk the subtree in pre-order and collect nodes whose text is too long.

    A node counts as large when its text, as returned by get_node_text, has
    more than `max_words` space-separated words. Each large node is printed
    and appended to `large_nodes`.

    Args:
        node: root of the subtree to inspect.
        large_nodes: list that receives the large nodes (mutated in place).
        max_words: word threshold; defaults to 500, which the function name
            refers to.
    """
    word_count = len(get_node_text(node).split(" "))
    if word_count > max_words:
        print(node)
        large_nodes.append(node)
    for child in node.children:
        print_nodes_with_text_size_more_than_500_words(child, large_nodes, max_words)
    return

# Collect (and print) every node of the tree whose text exceeds 500 words, then report how many there are.
large_nodes = []
print_nodes_with_text_size_more_than_500_words(root, large_nodes)
print(len(large_nodes))


self index: 1 level: 4 loc: <(16, 17)> content: #### Employees
self index: 0 level: 4 loc: <(40, 5)> content: #### Business Concentration
self index: 4 level: 4 loc: <(55, 29)> content: #### Company's Industry Revenue, Market Share, and Profit Margin Over Time
self index: 4 level: 4 loc: <(57, 31)> content: #### Company's Industry Revenue, Market Share, and Profit Margin Over Time
self index: 3 level: 4 loc: <(60, 20)> content: #### Company's Industry Revenue, Market Share, and Profit Margin Over Time
self index: 3 level: 4 loc: <(64, 18)> content: #### Company's Industry Revenue, Market Share, and Profit Margin Over Time
self index: 3 level: 3 loc: <(74, 29)> content: ### Key Ratios
self index: 0 level: 4 loc: <(77, 11)> content: #### Values
self index: 1 level: 3 loc: <(79, 5)> content: ### Annual Change
9


In [29]:
# # try splitting one sample node text using split_text_to_chunk_size
# sample_node = large_nodes[0]
# sample_node_text = get_node_text(sample_node)
# print(len(sample_node_text.split(" ")))
# sample_node_text_chunks = split_text_to_chunk_size(sample_node_text, target_size=500)
# print(len(sample_node_text_chunks))
# for i in range(len(sample_node_text_chunks)):
#     num_words = len(sample_node_text_chunks[i].split(" "))
#     print(f"chunk {i+1}: {num_words} words")
# print(sample_node_text)
# print("***********")
# print(sample_node_text_chunks[0])
# print("***********")
# print(sample_node_text_chunks[1])


In [44]:
# print(len(refined_output))

In [65]:
# print the tree structure to see there is no loop in the tree
# Running count of printed nodes; print_tree_structure adds to it on every call.
total_nodes = 0
def print_tree_structure(node):
    """Print `node` and all its descendants in pre-order, counting them in `total_nodes`."""
    global total_nodes
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped, and therefore printed, in their original order.
    pending = [node]
    while pending:
        current = pending.pop()
        print(current)
        total_nodes += 1
        pending.extend(reversed(current.children))
    return

# Print the whole tree starting at the root, then the number of nodes counted.
# NOTE(review): total_nodes is only reset where it is defined above, so calling
# print_tree_structure again without re-running that definition keeps adding to it.
print_tree_structure(root)
print(total_nodes)


self index: None level: 0 loc: <(0, 0)> content: 
self index: 0 level: 1 loc: <(1, 5)> content: # Healthcare and Social Assistance in the US
self index: 0 level: 2 loc: <(2, 3)> content: ## About IBISWorld
self index: 1 level: 2 loc: <(3, 5)> content: ## Table Of Contents
self index: 2 level: 2 loc: <(5, 1)> content: ## About
self index: 0 level: 3 loc: <(5, 5)> content: ### Codes
self index: 1 level: 3 loc: <(5, 13)> content: ### Definition
self index: 2 level: 3 loc: <(5, 17)> content: ### Related Terms
self index: 0 level: 4 loc: <(5, 19)> content: #### ELECTRONIC HEALTH RECORD (EHR)
self index: 1 level: 4 loc: <(5, 23)> content: #### TELEMEDICINE
self index: 2 level: 4 loc: <(5, 27)> content: #### HEALTH INSURANCE EXCHANGE
self index: 3 level: 4 loc: <(5, 31)> content: #### FEE-FOR-SERVICE
self index: 4 level: 4 loc: <(5, 35)> content: #### MEDICARE AND MEDICAID
self index: 5 level: 4 loc: <(5, 39)> content: #### LONG-TERM CARE
self index: 6 level: 4 loc: <(5, 43)> content: #### FE

In [81]:
# print the text of the long nodes
# For each large node: its index in refined_output, its text, then a separator line.
for node in large_nodes:
    print(node.refined_output_idx, get_node_text(node), "************", sep="\n")

43

The Revenue Volatility is described as Moderate.

### Revenue

The graph presents the total value ($) and annual change from 2011 - 2029, including a 5-year outlook.

The graph illustrates the Annual Revenue ($bn) and Change (%) from 2011 to 2029, with forecasted data from 2024 onwards. Key observations include:

1. The Annual Revenue shows a general upward trend from 2011 to 2029.
2. The Change (%) fluctuates significantly over the years, with notable peaks and troughs.
3. For 2024, the Annual Revenue is projected to be $3554.9 billion, with a 0.3% change.
4. The graph indicates a sharp decline in the change percentage around 2020-2021, possibly reflecting the impact of external events.
5. After 2024, the revenue is forecasted to continue growing, while the change percentage shows moderate fluctuations.
6. The highest positive change appears to occur around 2014-2015.
7. The y-axis for Annual Revenue ranges from 0 to 5000 ($bn), while the Change (%) axis ranges from -5% to 7.5%.
8

In [82]:
# let's process the tree and split the nodes with text size more than 500 words into multiple nodes
# Splitter used for oversized nodes: line-based chunks bounded by a word count.
size_based_splitter = split_text_to_chunk_size
# NOTE(review): chunk_size is hard-coded to 500 here, while the config cell sets
# CHUNK_SIZE = 600 and that value is what ends up in OUTPUT_FILE's name -
# confirm which one is intended.
set_text_and_split_necessary_nodes(root, splitter_func=size_based_splitter, chunk_size=500)

Splitting node:  43
Node children:  0
Splitting into:  2
Splitting node:  98
Node children:  0
Splitting into:  3
Splitting node:  141
Node children:  0
Splitting into:  2
Splitting node:  147
Node children:  0
Splitting into:  2
Splitting node:  156
Node children:  0
Splitting into:  2
Splitting node:  172
Node children:  0
Splitting into:  2
Splitting node:  202
Node children:  0
Splitting into:  6
Splitting node:  205
Node children:  0
Splitting into:  2
Splitting node:  206
Node children:  0
Splitting into:  2


In [83]:
# iterate over large nodes and print their text, also print the text of their children and if they are partial nodes

def print_large_nodes_and_children(large_nodes):
    """Print each node's index and text, followed by the same for its direct children.

    Children flagged as partial nodes get an extra "Partial Node" line; every
    child block ends with a separator line.
    """
    for parent_node in large_nodes:
        print(parent_node.refined_output_idx)
        print(get_node_text(parent_node))
        for piece in parent_node.children:
            print("child: ", piece)
            if piece.is_partial_node:
                print("Partial Node")
            print(get_node_text(piece))
            print("************")
    return

# Show the previously collected large nodes together with their direct children.
print_large_nodes_and_children(large_nodes)
            

43

child:  self index: 0 level: 4 loc: <(16, 17)> content: 
Partial Node

The Revenue Volatility is described as Moderate.

### Revenue

The graph presents the total value ($) and annual change from 2011 - 2029, including a 5-year outlook.

The graph illustrates the Annual Revenue ($bn) and Change (%) from 2011 to 2029, with forecasted data from 2024 onwards. Key observations include:

1. The Annual Revenue shows a general upward trend from 2011 to 2029.
2. The Change (%) fluctuates significantly over the years, with notable peaks and troughs.
3. For 2024, the Annual Revenue is projected to be $3554.9 billion, with a 0.3% change.
4. The graph indicates a sharp decline in the change percentage around 2020-2021, possibly reflecting the impact of external events.
5. After 2024, the revenue is forecasted to continue growing, while the change percentage shows moderate fluctuations.
6. The highest positive change appears to occur around 2014-2015.
7. The y-axis for Annual Revenue ranges fro

In [84]:
# iterate over the tree and print nodes with text size more than 500 words
# Re-run the size check on the tree; the printed count is the number of nodes still above 500 words.
new_large_nodes = []
print_nodes_with_text_size_more_than_500_words(root, new_large_nodes)
print(len(new_large_nodes))

0


In [85]:
# for each node in the tree, set embedding text which is all the headings of the ancestors prefixed to the text of the node

def set_embedding_text(node, recursive_headings=None):
    """Set `embedding_text` on every node of the subtree that has text.

    The embedding text is the headings of all ancestors (outermost first, one
    per line), then the node's own heading, then the node's text. Nodes at
    level 0 or without text are left untouched, but their children are still
    visited.

    Args:
        node: root of the subtree to process.
        recursive_headings: headings of the ancestors of `node`, outermost
            first. Defaults to no ancestors. The list is not modified.
    """
    # A None default plus a copy avoids the shared mutable default argument
    # and keeps the caller's list untouched.
    ancestors = [] if recursive_headings is None else list(recursive_headings)
    if node.level != 0 and node.text is not None:
        node.embedding_text = "\n".join(ancestors) + "\n" + node.heading + "\n" + node.text
    if node.children:
        child_ancestors = ancestors + [node.heading]
        for child in node.children:
            set_embedding_text(child, child_ancestors)
    return

# Populate embedding_text for the whole tree, starting at the root.
set_embedding_text(root)

In [90]:
# Spot-check the embedding text of the second child of the first top-level heading.
print(root.children[0].children[1].embedding_text)


# Healthcare and Social Assistance in the US
## Table Of Contents

Standard

1. About............................................................ 5
   Codes............................................................5
   Definition....................................................... 5
   Related Terms.............................................. 5
   What's Included............................................ 6
   Companies....................................................6
   Related Industries......................................... 6
   Additional Resources.................................... 6

2. At a Glance...................................................9
   Key Takeaways.............................................9
   Products and Services................................ 10
   Major Players.............................................. 11
   Key External Drivers................................... 12
   Industry Structure........................................12
   S

In [104]:
# create llama-index text nodes from the tree structure
from llama_index.core.schema import TextNode
def create_text_nodes_from_tree(node, llama_text_nodes):
    """Append a TextNode to `llama_text_nodes` for every node that has embedding text.

    The subtree is visited in pre-order. The level-0 root and nodes whose
    `embedding_text` is None are skipped. Each TextNode carries the node's
    page number, line number and level as metadata.
    """
    # Explicit stack; children are pushed in reverse so they are visited in
    # their original order.
    pending = [node]
    while pending:
        current = pending.pop()
        if current.level != 0 and current.embedding_text is not None:
            llama_text_nodes.append(
                TextNode(
                    text=current.embedding_text,
                    metadata={"page_number": current.page, "line_number": current.line_number, "level": current.level},
                )
            )
        pending.extend(reversed(current.children))
    return
    
    

In [106]:
# Flatten the heading tree into a list of TextNode objects and report how many were created.
llama_text_nodes = []
create_text_nodes_from_tree(root, llama_text_nodes)
print(len(llama_text_nodes))

230


In [107]:
import weaviate

# Connection settings come from the environment (the .env file is loaded in an
# earlier cell) so that no credential is stored in the notebook.
# NOTE(security): an API key used to be hard-coded in this cell; treat that key
# as leaked and rotate it.
cluster_url = os.getenv(
    "WEAVIATE_CLUSTER_URL",
    "https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud",
)
api_key = os.getenv("WEAVIATE_API_KEY")
if not api_key:
    raise EnvironmentError("WEAVIATE_API_KEY is not set; add it to your .env file.")

# NOTE(review): this rebinds `client`, which earlier held the Anthropic client.
# The connection is not closed in this cell because later cells use it; call
# client.close() once the run is finished.
client = weaviate.connect_to_wcs(
    cluster_url=cluster_url,
    auth_credentials=weaviate.auth.AuthApiKey(api_key),
)

In [108]:
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
import uuid

# generate unique index for multiple runs
INDEX_NAME = ('X' + str(uuid.uuid4())).replace('-', '_')

vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name=INDEX_NAME
)

# Hand the vector store over through a StorageContext, which is the documented
# way to attach an external store. As far as I recall, VectorStoreIndex does
# not take a `vector_store` constructor argument and silently ignores it as an
# extra keyword, so the nodes never reached Weaviate - TODO confirm against the
# installed llama-index version.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex(llama_text_nodes, storage_context=storage_context)

            Please make sure to close the connection using `client.close()`.


In [109]:
from llama_index.llms.openai import OpenAI

# LLM used by the query engine; the model name comes from MODEL_ID in the config cell.
llm = OpenAI(model=MODEL_ID, api_key = OPENAI_API_KEY)
# Query engine over the vector index; similarity_top_k is set from TOP_K.
query_engine = vector_index.as_query_engine(similarity_top_k=TOP_K, llm=llm)

In [113]:
import braintrust
# NOTE: this overrides the `use_braintrust_dataset = True` from the config cell, so the
# questions are currently read from COMPARISON_FILE rather than from Braintrust.
use_braintrust_dataset = False
if use_braintrust_dataset:
    dataset = braintrust.init_dataset(project="RagMetrics", name=DOC_ID)
    # collect the dataset rows and convert them to a pandas dataframe
    df = list(dataset)
    dff = pd.DataFrame(df)
    # The question is the text between the first and second ">" of `input`; it is stored
    # under QUESTION_COL so the query loop reads the same column on both branches.
    dff[QUESTION_COL] = dff['input'].apply(lambda x: x.split(">")[1])
else:
    df = pd.read_csv(COMPARISON_FILE)
    dff = df.copy()

# NUM_QUESTIONS == -1 means "use every question"; any other value keeps only the first
# NUM_QUESTIONS rows. The limit applies regardless of where the questions came from.
if NUM_QUESTIONS != -1:
    dff = dff.head(NUM_QUESTIONS).copy()

In [114]:
# Sanity check of the loaded question set: dimensions first, then the first rows.
print(dff.shape)
print(dff.head())

(213, 7)
                                            question  \
0  What types of enterprises are included in the ...   
1  How is telemedicine defined in the context of ...   
2  What are Health Insurance Exchanges in the US ...   
3  What services are included in the Healthcare a...   
4  Who are some of the major companies operating ...   

                                              answer source type answer type  \
0  The Healthcare and Social Assistance sector in...        text       other   
1  Telemedicine is defined as an application of c...        text       other   
2  Health Insurance Exchanges are sets of state-r...        text       other   
3  The Healthcare and Social Assistance industry ...        text       other   
4  Major companies in the industry include Hca He...        text       other   

   page                                               file  \
0     5  62 Healthcare and Social Assistance in the US ...   
1     5  62 Healthcare and Social Assistance in t

In [None]:
# Accumulators filled by the query loop in the following cell.
# They live in their own cell because that loop starts at len(result), so re-running the
# loop continues from where it stopped instead of starting over.
result = []
references = []
contexts = []

In [125]:
# Resume from wherever a previous (possibly interrupted) run of this cell stopped.
start_idx = len(result)
for idx in range(start_idx, len(dff)):
    question = dff.iloc[idx][QUESTION_COL]
    response = query_engine.query(question)

    # Page number of every retrieved node; `response.metadata` is optional and may be None.
    metadata = response.metadata or {}
    refs = [m['page_number'] for m in metadata.values()]
    # Raw text of every retrieved node.
    q_contexts = [n.text for n in response.source_nodes]

    # Append to the three lists together, only after everything for this question has been
    # computed. An exception part-way through an iteration can then no longer leave the
    # lists with different lengths, which would break the `len(result)` resume logic.
    result.append(response.response)
    references.append(refs)
    contexts.append(q_contexts)




In [126]:
# Attach the model answers, the referenced page numbers and the retrieved contexts to the
# question frame, then write the combined table to OUTPUT_FILE without the index column.
dff[RESPONSE_COL] = result
dff['references'] = references
dff['context'] = contexts
dff.to_csv(OUTPUT_FILE, index=False)

In [128]:
# Preview the first rows of dff.
dff.head()

Unnamed: 0,question,answer,source type,answer type,page,file,explanation,rag_model_response,references,context
0,What types of enterprises are included in the ...,The Healthcare and Social Assistance sector in...,text,other,5,62 Healthcare and Social Assistance in the US ...,Understanding the scope of the Healthcare and ...,The Healthcare and Social Assistance sector in...,"[5, 32, 36]",[\n# Healthcare and Social Assistance in the U...
1,How is telemedicine defined in the context of ...,Telemedicine is defined as an application of c...,text,other,5,62 Healthcare and Social Assistance in the US ...,Identifying key technological trends like tele...,Telemedicine is defined as an application of c...,"[25, 5, 32]",[\n# Healthcare and Social Assistance in the U...
2,What are Health Insurance Exchanges in the US ...,Health Insurance Exchanges are sets of state-r...,text,other,5,62 Healthcare and Social Assistance in the US ...,Understanding the structure of health insuranc...,Health Insurance Exchanges in the US healthcar...,"[5, 36, 5]",[\n# Healthcare and Social Assistance in the U...
3,What services are included in the Healthcare a...,The Healthcare and Social Assistance industry ...,text,other,6,62 Healthcare and Social Assistance in the US ...,Understanding the scope of services in the ind...,The Healthcare and Social Assistance industry ...,"[5, 30, 10]",[\n# Healthcare and Social Assistance in the U...
4,Who are some of the major companies operating ...,Major companies in the industry include Hca He...,text,other,6,62 Healthcare and Social Assistance in the US ...,Identifying key players helps in understanding...,Some of the major companies operating in the H...,"[11, 30, 53]",[\n# Healthcare and Social Assistance in the U...


In [127]:
# The three accumulators should have the same length (one entry per processed question).
print(len(result), len(references), len(contexts))

213 213 213


In [140]:
def print_multi_line(response, max_chars=100):
    """Greedily wrap the response text into lines of roughly ``max_chars`` characters.

    Despite its name this function prints nothing; it returns the wrapped lines so the
    caller can join and print them.

    Args:
        response: object whose ``response`` attribute holds the answer text
            (``None`` is treated as an empty string).
        max_chars: a word is added to the current line only while
            ``len(current_line) + len(word) < max_chars``.

    Returns:
        list[str]: the wrapped lines, each ending with a trailing space. The text is split
        on single spaces only, so newlines embedded in the text stay inside their "word"
        and are counted towards the line length.
    """
    response_text = response.response or ""
    response_text_lines = []
    current_line = ""
    for word in response_text.split(" "):
        # Honour the caller's limit instead of a hard-coded 100.
        if len(current_line) + len(word) < max_chars:
            current_line += word + " "
        else:
            # Do not emit an empty line when the very first word already exceeds the limit.
            if current_line:
                response_text_lines.append(current_line)
            current_line = word + " "

    response_text_lines.append(current_line)

    return response_text_lines

In [144]:
# Ad-hoc query: run one question through the query engine and print the wrapped answer.
question = "What are US healthcare total revenue figures for recent years?"
response = query_engine.query(question)
ans = print_multi_line(response)
print("\n".join(ans))

The total revenue for the US healthcare sector is projected to be $3.6 trillion for 2024. Over the 
past five years, revenue has been expanding at a compound annual growth rate (CAGR) of 0.7%. 
Looking ahead, the revenue growth rate is expected to increase to 2.7% for the period from 2024 to 
2029. 


In [154]:
def get_references(response):
    """List where each retrieved source node of ``response`` came from.

    Args:
        response: query result exposing ``source_nodes``; every node's ``metadata``
            must contain ``"page_number"`` and ``"line_number"``.

    Returns:
        list[dict]: one ``{"page number": ..., "line number": ...}`` dict per source
        node, in the order of ``response.source_nodes``.
    """
    return [
        {
            "page number": hit.metadata["page_number"],
            "line number": hit.metadata["line_number"],
        }
        for hit in response.source_nodes
    ]

In [155]:

# Ad-hoc query: print the wrapped answer followed by the page/line of each source node.
question = "what are the fragments and their proportion of total revenue?"
response = query_engine.query(question)
ans = print_multi_line(response)
print("\n".join(ans))
print("references: ", get_references(response))


The fragments and their proportions of total revenue in the Healthcare and Social Assistance 
industry for 2024 are as follows:

1. Private health insurance: 30.4% ($1.1 trillion)
2. Medicare: 
22.2% ($789.2 billion)
3. Medicaid: 19.0% ($675.4 billion)
4. Other third-party payers: 13.3% 
($472.8 billion)
5. Out-of-pocket expenses: 11.1% ($394.6 billion)
6. Other health insurance 
programs: 4.0% ($142.2 billion) 
references:  [{'page number': 35, 'line number': 5}, {'page number': 73, 'line number': 18}, {'page number': 74, 'line number': 29}]


In [156]:
# Ad-hoc query: print the wrapped answer followed by the page/line of each source node.
question = "what are the profit margins in the industry?"
response = query_engine.query(question)
ans = print_multi_line(response)
print("\n".join(ans))
print("references: ", get_references(response))


The profit margin for the Healthcare and Social Assistance sector in the US is 10.0%. For Feeding 
America, the profit margin is 8.5%, which has been maintained from 2020 through 2024. 
references:  [{'page number': 73, 'line number': 1}, {'page number': 74, 'line number': 19}, {'page number': 60, 'line number': 20}]


In [134]:
# Debugging aid: pick one node deep in the tree and show it next to the refined_output
# entries at its refined_output_idx and at the index right after it.
node_x = root.children[0].children[4].children[3].children[0]
print(node_x, node_x.refined_output_idx)
print(refined_output[node_x.refined_output_idx])
print(refined_output[node_x.refined_output_idx+1])


self index: 0 level: 4 loc: <(20, 7)> content: #### What's driving current industry performance? 46
(20, 7, "#### What's driving current industry performance?", 4)
(20, 9, '##### Demographic trends shape the healthcare landscape', 5)


In [140]:

# NOTE(review): `doc` is not used by the rest of this cell.
doc = Document(text="hello")
# Debugging aid: show the heading and the children of one node, then the text that
# get_node_text returns for its third child.
print(root.children[0].children[4].children[3].children[0].heading)
print(root.children[0].children[4].children[3].children[0].children)
text1 = get_node_text(root.children[0].children[4].children[3].children[0].children[2])
print(text1)

#### What's driving current industry performance?
[self index: 0 level: 5 loc: <(20, 9)> content: ##### Demographic trends shape the healthcare landscape, self index: 1 level: 5 loc: <(20, 17)> content: ##### Healthcare providers grapple with persistent workforce shortages, self index: 2 level: 5 loc: <(20, 27)> content: ##### The pandemic accelerated existing consolidation trends in healthcare, self index: 3 level: 5 loc: <(21, 7)> content: ##### Social assistance providers fill in the gaps as pandemic-era benefits end]

- Larger health systems eye acquisitions to create economies of scale, lower costs and gain negotiating power. The pandemic accelerated the rising consolidation activity that had been occurring in recent years.

- Private and public insurers only partially cover the cost of delivering healthcare, incentivizing healthcare providers to acquire an upper hand in provider-payer negotiations. Independent (often small or rural) practices risking service eliminations or closu

In [50]:
# Inspect the third entry of output_list.
output_list[2]

'<1, 5> # Healthcare and Social Assistance in the US'

In [63]:
# Inspect the children of the sixth child under the first top-level node.
root.children[0].children[5].children

[self index: 0 level: 3 loc: <(30, 11)> content: ### Key Takeaways,
 self index: 1 level: 3 loc: <(30, 16)> content: ### Products and Services,
 self index: 2 level: 3 loc: <(34, 5)> content: ### Key Success Factor,
 self index: 3 level: 3 loc: <(35, 3)> content: ### Major Markets,
 self index: 4 level: 3 loc: <(37, 5)> content: ### International Trade]

# let's detail cases that are possible if we make text under each 

In [51]:
# Inspect the lines of the first parsed page.
all_sentences[0]

['# IBISWorld',
 '',
 '## INDUSTRY REPORT',
 '',
 '# Healthcare and Social Assistance in the US',
 '',
 '### Mar 2024',
 '',
 'The image shows the cover page of an industry report by IBISWorld. The report focuses on Healthcare and Social Assistance in the United States, dated March 2024. The design features a dark blue background with red circular elements in the corners. The IBISWorld logo appears in a red rectangular shape at the top left corner of the page. The title and information are presented in white text, creating a strong contrast against the dark background. This cover page design suggests a professional and authoritative report on the healthcare and social assistance sectors in the US market.']