In [1]:
# %pip install llama-index llama-index-core llama-parse openai llama_index.embeddings.huggingface -q
# %pip install llama-index-llms-anthropic -q
# %pip install llama-index-vector-stores-weaviate -q

In [1]:
import os
import pandas as pd

import anthropic
import nest_asyncio
nest_asyncio.apply()
from dotenv import load_dotenv
from pdf2image import convert_from_path
import base64
import requests
from llama_index.core import Document

In [2]:

# --- Experiment configuration ------------------------------------------------
# The pipeline settings below are also baked into OUTPUT_FILE so that runs
# with different settings write to different CSVs.

# Evaluation data: Braintrust dataset when True, the local CSV otherwise.
use_braintrust_dataset = True
# COMPARISON_FILE = 'claude-3-5-sonnet-20240620_qa.csv'
COMPARISON_FILE = "major_questions.csv"
QUESTION_COL = 'question'
RESPONSE_COL = 'rag_model_response'
NUM_QUESTIONS = -1  # -1 keeps every row of the comparison CSV

# Source document and the identifier used for its output folder.
# PDF_LOCATION = 'IndustrySource/Misc/Aerospace Ceramics copy.pdf'
# DOC_ID = 'aerospace-ceramics'
PDF_LOCATION = 'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf'
DOC_ID = 'ibis-healthcare-social-assistance'

# Parsing / chunking / retrieval settings.
PARSER = "claude"  # "claude" or "llama-parse"
SPLITTER = "tree"
CHUNK_SIZE = 600
TOP_K = 3
MODEL_ID = 'gpt-4o-mini'

# Output location (created on first run).
OUTPUT_FOLDER = './rag_outputs/' + DOC_ID
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)
OUTPUT_FILE = '{}/output_{}_{}_{}_{}_{}.csv'.format(
    OUTPUT_FOLDER, MODEL_ID, PARSER, CHUNK_SIZE, SPLITTER, TOP_K
)


In [3]:
# Show the resolved CSV path so the settings encoded in its name can be checked.
OUTPUT_FILE

'./rag_outputs/ibis-healthcare-social-assistance/output_gpt-4o-mini_claude_600_tree_3.csv'

In [4]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio

# Load API credentials from a dotenv file. The location can be overridden with
# the DOTENV_PATH environment variable; the previous hard-coded path remains
# the default so existing setups keep working.
load_dotenv(os.getenv("DOTENV_PATH", "/Users/mbajaj/.env"))

# Keys this notebook relies on: llama-cloud (LlamaParse), OpenAI
# (embeddings / LLM), Anthropic (page parsing) and Braintrust (evaluation).
_REQUIRED_API_KEYS = (
    "LLAMA_CLOUD_API_KEY",
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
    "BRAINTRUST_API_KEY",
)

# Fail fast with the names of the missing keys. The previous
# `os.environ[k] = os.getenv(k)` raised an opaque TypeError when a key was
# absent, and was a no-op when it was present.
_missing_api_keys = [key for key in _REQUIRED_API_KEYS if not os.getenv(key)]
if _missing_api_keys:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_api_keys)
    )

OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

In [5]:
# Instructions sent together with each page image (see request_claude_with_image):
# transcribe the text verbatim, and describe tables, plots and annotated
# figures in full sentences.
IMAGE_PROMPT = """
Given an image of a page from a market research report, your task is to convert all the information on the page into markdown format, preserving the original structure and content.

- Transcribe all text, including paragraphs and headings, verbatim from the page to markdown, maintaining the original format. Do not modify, omit, or add any text.
- If the page includes numerical information with arrows, percentages etc, describe the text in full sentences in markdown format. For example, if there's a box with the text "revenue of wine industry" and an arrow pointing up saying "10% (2015-2020)", describe this as "Revenue of wine industry increased by 10% from 2015 to 2020" instead of just copying the text.
- Do not explain any text that is clearly written in the page, including headings, subheadings, and paragraphs. Copy the text as it is.
- If the text structure is unclear, use your best judgement to format it in markdown.
- If the page contains tables, convert them into markdown table format and provide an explnation as well. Explain all the data that can be inferred from each table. For example, if a table shows sales data for different products, explain the sales trends and patterns with respect to each product. Try to provide as much detail as possible.
- If the page includes a plot or graph, describe it objectively in markdown format. Explain all the details that can be inferred from the plot or graph. For example, if a plot shows sales trends over time, describe the sales trends and patterns observed. Provide a detailed explanation of the data represented in the plot or graph.
- When explaining any component, understand the context of the entire page and be as specific as possible without any ambiguity. The position of the explanation should match the position of the component in the page.
- The output should not contain personal opinions or biases. Do not add personal comments or any information not present on the page. Avoid referring to the page or the report - explain without reference.
- Ensure no important information from the page is missed, as capturing all details is crucial.
"""

# System prompt for the page-to-markdown request.
# NOTE(review): SYSTEM_MESSAGE and FINAL_MESSAGE are assigned again further
# down in this notebook (heading-structure step); whichever assignment ran
# last is what request_claude_with_image sends.
SYSTEM_MESSAGE = "You are a profession converter that converts all the details in given image of page of a market research report to markdown format while preserving all the structure."

# Closing user message placed after the image in the same request.
FINAL_MESSAGE = "Please describe the provided page in markdown format. Strictly follow the criteria mentioned above to describe each component of the page."

In [6]:
import pickle

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes base64-encoded as a str."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


def request_claude_with_image(base64_image, model="claude-3-5-sonnet-20240620", max_tries=5):
    """Ask Claude to transcribe one page image to markdown, retrying on failure.

    Args:
        base64_image: Base64 string of a PNG image (sent with media type
            ``image/png``).
        model: Anthropic model identifier.
        max_tries: Maximum number of attempts before giving up.

    Returns:
        ``(text, failed)`` where ``text`` is the first content block's text, or
        ``None`` if every attempt failed, and ``failed`` is the number of
        failed attempts.

    Raises:
        Any Anthropic error not treated as transient below (for example an
        authentication or bad-request error) propagates to the caller.
    """
    # Build the client and the payload once; neither changes between attempts.
    client = anthropic.Anthropic()
    request_messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": IMAGE_PROMPT},
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": base64_image,
                    },
                },
                {"type": "text", "text": FINAL_MESSAGE},
            ],
        }
    ]

    # Failures worth another attempt: transient API problems, or a reply whose
    # first content block is missing / not text.
    retryable_errors = (
        anthropic.APIConnectionError,
        anthropic.RateLimitError,
        anthropic.InternalServerError,
        IndexError,
        AttributeError,
    )

    failed = 0
    for _ in range(max_tries):
        try:
            # The request itself is inside the try block: previously it sat
            # outside, so an API error aborted the whole parse instead of
            # being retried.
            response = client.messages.create(
                model=model,
                max_tokens=5000,
                system=SYSTEM_MESSAGE,
                messages=request_messages,
            )
            return response.content[0].text, failed
        except retryable_errors as exc:
            failed += 1
            print(f"Claude request attempt {failed}/{max_tries} failed: {exc!r}")
    return None, failed


def ClaudeParse(pdf_path, output_dir_path, model="claude-3-5-sonnet-20240620"):
    """Convert a PDF into one llama-index ``Document`` per page using Claude.

    Each page is rendered to a PNG under ``<output_dir_path>/_imgs``, sent to
    ``request_claude_with_image`` and wrapped in a ``Document`` whose metadata
    holds the 1-based ``page_number``. Results are cached in
    ``<output_dir_path>/_<model>_pages.pkl`` and also dumped to a ``.txt`` file
    for debugging.

    Args:
        pdf_path: Path of the PDF to parse.
        output_dir_path: Directory for the page images, text dump and cache.
        model: Anthropic model identifier (also part of the cache file name).

    Returns:
        List of ``Document`` objects, one per page. Pages that could not be
        parsed have empty text.
    """
    # Reuse the cached result if a previous run already parsed this document.
    output_pickle_path = os.path.join(output_dir_path, f"_{model}_pages.pkl")
    if os.path.exists(output_pickle_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # written by this function.
        with open(output_pickle_path, "rb") as file:
            return pickle.load(file)

    img_dir = os.path.join(output_dir_path, "_imgs")
    os.makedirs(img_dir, exist_ok=True)

    images = convert_from_path(pdf_path)
    pages = []
    skipped_pages = 0
    total_failures = 0
    for page_number, image in enumerate(images, start=1):
        # Persist the rendered page so it can be inspected and re-encoded.
        image_path = os.path.join(img_dir, f'page_{page_number}.png')
        image.save(image_path, 'PNG')
        base64_image = encode_image(image_path)

        response, failed = request_claude_with_image(base64_image, model)
        total_failures += failed
        if response is None:
            print(f"Failed to parse the page {page_number} after {failed} tries")
            response = ""
            skipped_pages += 1

        # convert the response to llama-index document
        pages.append(Document(text=response, metadata={"page_number": page_number}))

    # Guard the average: a PDF that renders to zero pages used to raise
    # ZeroDivisionError here.
    if images:
        print(f"Average failures in calling claude API: {total_failures/len(images)}")
    print(f"Skipped {skipped_pages} pages out of {len(images)}")

    output_text_path = os.path.join(output_dir_path, f"_{model}_pages.txt")

    # save the pages as txt file for easier debugging; UTF-8 is explicit so
    # non-ASCII markdown does not depend on the platform's default encoding
    with open(output_text_path, "w", encoding="utf-8") as f:
        for page in pages:
            f.write(f"***Page {page.metadata['page_number']}***\n\n")
            f.write(page.text)
            f.write("\n")

    # NOTE(review): the cache is written even when some pages were skipped, so
    # later runs reuse the empty pages until the pickle is deleted.
    with open(output_pickle_path, "wb") as file:
        pickle.dump(pages, file)
    return pages

In [9]:
# Print where ClaudeParse caches the parsed pages for this model.
model = "claude-3-5-sonnet-20240620"
print(os.path.join(OUTPUT_FOLDER, "_" + model + "_pages.pkl"))

./rag_outputs/ibis-healthcare-social-assistance/_claude-3-5-sonnet-20240620_pages.pkl


In [10]:
# from llama_index.core import SimpleDirectoryReader

from llama_parse import LlamaParse

# Parse the PDF into llama-index documents with the backend chosen by PARSER.
if PARSER == "llama-parse":
    documents = LlamaParse(result_type="markdown").load_data(PDF_LOCATION)
elif PARSER == "claude":
    documents = ClaudeParse(PDF_LOCATION, OUTPUT_FOLDER)
else:
    # Previously an unknown value left `documents` undefined and the failure
    # only surfaced as a NameError in a later cell.
    raise ValueError(f"Unknown PARSER {PARSER!r}; expected 'claude' or 'llama-parse'")
print(len(documents))

80


In [18]:
# Split every parsed page into its lines: all_sentences[p][l] is line l of
# page p (both 0-based), which is what the heading/tree code indexes into.
docs = [document.text for document in documents]
all_sentences = [page_text.split("\n") for page_text in docs]


In [19]:

# Collect every markdown heading (a line starting with '#') together with its
# 1-based page and line position.
headings = [
    {"text": sentence, "page": page_idx + 1, "sentence_number": line_idx + 1}
    for page_idx, sentences in enumerate(all_sentences)
    for line_idx, sentence in enumerate(sentences)
    if sentence.startswith("#")
]

print(len(headings))
headings[:2]

455


[{'text': '# Aerospace Ceramics: Global Markets to 2026',
  'page': 1,
  'sentence_number': 1},
 {'text': '# Table of Contents', 'page': 2, 'sentence_number': 1}]

In [20]:
# Prompt for the heading-repair step: the per-page parse loses the document's
# global heading hierarchy, so the model is asked to re-level (or drop) each
# heading while keeping its <page_number, line_number> tag unchanged.
# NOTE(review): the industry name is hard-coded in the prompt text, while the
# recorded outputs in this notebook show Aerospace Ceramics headings - confirm
# it matches the document actually being parsed.
HEADINGS_PROMPT = ''' You will be given a text containing headings and subheadings parsed from the market research report of the industry 'Healthcare and Social Assistance in the US' in markdown format. 
However, the issue is that these subheadings were parsed one page at a time. So this makes it possible that the structure of the document is not preserved.
The heading 2 in the document might be parsed as heading 1 if it is the first heading on the page. Similarly, all the following subheadings on the same page might be parsed as different heading levels.
However, the order of the headings is preserved from top to bottom of each page. So you don't have to worry about the order of the headings.
Other issue is that parsed headings/ subheadings may accidentally include date or page number or report title that might have been present as a header or footer in the document.
Your task is to identify the correct structure of the document by taking into account the semantic meanings of the headings and subheadings.
Your input will be a text file with each line starting with <page_number, line_number> of the heading/ subheading followed by the heading or subheading text starting with # or ## or ### or #### or ##### denoting heading 1, heading 2, heading 3, heading 4 and heading 5 respectively.
Your output will be a text file with each line containing the same <page_number, line_number> as the one in input followed by the heading or subheading text with the correct heading level denoted by # or ## or ### or #### or #####. 
You must not change the page number and line number of the respective heading or subheading. You may entrirely remove the heading or subheading if it is a report title or date that was incorrectly parsed as a heading. Otherwise, you should correct the heading level if you think it was parsed incorrectly.
'''

# NOTE(review): the two assignments below rebind SYSTEM_MESSAGE and
# FINAL_MESSAGE, which request_claude_with_image reads as globals at call time.
# Running ClaudeParse (without a cached pickle) after this cell would send
# these heading prompts with the page images - consider distinct names.
SYSTEM_MESSAGE = "You are an expert in identifying the correct structure of a document by taking into account the page numbers, line numbers and the semantic meanings of the headings and subheadings."

FINAL_MESSAGE = "Please identify the correct structure of the document by taking into account the page numbers, line numbers and the semantic meanings of the headings and subheadings. Output the headings with the correct heading level denoted by # or ## or ### or #### or #####."

In [21]:
# Serialise the headings as "<page, line> heading" lines, the input format
# described in HEADINGS_PROMPT.
input_text = "".join(
    f"<{heading['page']}, {heading['sentence_number']}> {heading['text']}\n"
    for heading in headings
)
input_text

"<1, 1> # Aerospace Ceramics: Global Markets to 2026\n<2, 1> # Table of Contents\n<2, 3> ## Chapter 1: Introduction ......................................................................................... 1\n<2, 14> ## Chapter 2: Summary and Highlights........................................................................ 9\n<2, 16> ## Chapter 3: Industry Trends and Opportunities...................................................... 13\n<2, 23> ## Chapter 4: Market Breakdown by Composition ................................................... 30\n<2, 32> ## Chapter 5: Market Breakdown by Application ..................................................... 53\n<2, 38> ## Chapter 6: Market Breakdown by Segment ......................................................... 65\n<2, 43> ## Chapter 7: Market Breakdown by Region ............................................................ 83\n<3, 10> # Chapter 8: Emerging Materials: Background and Applications............................ 134\n<3, 12>

In [22]:
import anthropic

from llama_index.core import Document

# Ask Claude to repair the heading hierarchy: the instructions, the serialised
# headings and the closing request are sent as three text parts of one message.
client = anthropic.Anthropic()
model = "claude-3-5-sonnet-20240620"

response = client.messages.create(
    model=model,
    max_tokens=5000,
    system=SYSTEM_MESSAGE,
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": HEADINGS_PROMPT},
                {"type": "text", "text": input_text},
                {"type": "text", "text": FINAL_MESSAGE},
            ],
        }
    ],
)

# A reply cut off at max_tokens drops the trailing headings without any error,
# and every section after the cut would then be folded into the last heading
# that was returned. Surface that instead of continuing unnoticed.
if response.stop_reason == "max_tokens":
    print(
        "WARNING: heading response was truncated at max_tokens; "
        "the heading list is incomplete."
    )

In [23]:
# Keep only the reply lines that carry a "<page, line>" tag; anything else
# (preamble, blank lines) is discarded.
output = response.content[0].text
output_list = [
    reply_line
    for reply_line in output.split("\n")
    if reply_line.lstrip().startswith("<")
]
print(len(output_list))
output_list[:5]

124


['<1, 1> # Aerospace Ceramics: Global Markets to 2026',
 '<2, 1> ## Table of Contents',
 '<2, 3> ### Chapter 1: Introduction',
 '<2, 14> ### Chapter 2: Summary and Highlights',
 '<2, 16> ### Chapter 3: Industry Trends and Opportunities']

In [24]:

class Node:
    """A heading in the document tree, or one text chunk of a split heading."""

    def __init__(self, heading, level, page, line_number, refined_output_idx):
        # Identity and location of the heading.
        self.heading = heading
        self.level = level
        self.page = page
        self.line_number = line_number
        self.refined_output_idx = refined_output_idx
        # Tree links; self_index is this node's position in parent.children.
        self.parent = None
        self.children = []
        self.self_index = None
        # Content, populated by later processing steps.
        self.text = None
        self.text_size = 0
        self.embedding_text = None
        # Flags used when a section is split into chunk nodes.
        self.is_partial_node = False
        self.is_partial_node_parent = False

    def add_child(self, child):
        """Append ``child`` to this node and record the back-links on it."""
        child.parent = self
        child.self_index = len(self.children)
        self.children.append(child)

    def __repr__(self):
        location = (self.page, self.line_number)
        return "self index: {} level: {} loc: <{}> content: {}".format(
            self.self_index, self.level, location, self.heading
        )

    # str() and repr() render identically.
    __str__ = __repr__


In [25]:
# make a tree structure of the headings and subheadings from output_list such as level 1 is parent of level 2 and so on

def get_level(heading):
    """Return the number of leading '#' characters, ignoring leading whitespace."""
    stripped = heading.lstrip()
    return len(stripped) - len(stripped.lstrip("#"))

def get_fields_from_line(text):
    """Parse one "<page, line> heading" line.

    Args:
        text: A line such as ``"<2, 3> ## Chapter 1"``; leading whitespace is
            ignored.

    Returns:
        ``(page, line_number, heading, level)`` where ``level`` comes from
        ``get_level(heading)``. Lines that do not start with ``<`` or whose
        tag cannot be parsed yield ``(None, None, None, 0)``.
    """
    no_fields = (None, None, None, 0)
    line = text.lstrip()
    if not line.startswith("<"):
        return no_fields

    # Split on the FIRST "> " only: a heading may itself contain "> ", which
    # the previous split-and-index approach silently truncated.
    prefix, separator, heading = line[1:].partition("> ")
    page_str, comma, line_str = prefix.partition(",")
    if not separator or not comma:
        return no_fields
    try:
        page, line_number = int(page_str), int(line_str)
    except ValueError:
        # Malformed tag: report "no fields" instead of raising.
        return no_fields
    return page, line_number, heading, get_level(heading)


# Build the heading tree: each heading becomes a child of the nearest earlier
# heading with a strictly smaller level; the level-0 root holds the top level.
root = Node(heading="", level=0, page=0, line_number=0, refined_output_idx=-1)
parent = root
# TODO: take page, line_number from the input instead of output to avoid any issues

refined_output = []
for raw_line in output_list:
    page, line_number, heading, level = get_fields_from_line(raw_line)
    if level == 0:
        # Not a heading line.
        continue
    # Climb until `parent` is shallower than the new heading; the root has
    # level 0, so the climb always stops.
    while parent.level >= level:
        parent = parent.parent
    child = Node(heading, level, page, line_number, len(refined_output))
    parent.add_child(child)
    parent = child
    refined_output.append((page, line_number, heading, level))



In [26]:
def get_node_text(node):
    """Return the body text that belongs to ``node``'s heading.

    The body is every line after the heading up to (not including) the next
    heading in ``refined_output``, or up to the end of the document when the
    node is the last heading. Lines are read from ``all_sentences`` and joined
    with newlines.

    Args:
        node: A tree node with ``text``, ``level``, ``page``, ``line_number``
            (both 1-based) and ``refined_output_idx``.

    Returns:
        ``node.text`` if it is already set, ``""`` for the level-0 root,
        otherwise the assembled text.
    """
    if node.text is not None:
        return node.text
    if node.level == 0:
        return ""

    page = node.page
    line_number = node.line_number
    next_idx = node.refined_output_idx + 1
    has_next = next_idx < len(refined_output)
    if has_next:
        next_page, next_line_number = refined_output[next_idx][:2]

    # Gather lines first and join once, so a newline is kept at every page
    # boundary (previously the pages were concatenated with no separator).
    lines = []
    if has_next and next_page == page:
        lines.extend(all_sentences[page - 1][line_number:next_line_number - 1])
    else:
        lines.extend(all_sentences[page - 1][line_number:])
        # Whole pages between this heading and the next one; with no next
        # heading this runs through the final page, which used to be skipped.
        last_whole_page = next_page - 1 if has_next else len(all_sentences)
        for p in range(page + 1, last_whole_page + 1):
            lines.extend(all_sentences[p - 1])
        if has_next:
            lines.extend(all_sentences[next_page - 1][:next_line_number - 1])
    return "\n".join(lines)

In [27]:
# split text to chunk sizes less than 1024

def split_text_to_chunk_size(text, target_size=1024):
    """Greedily pack the lines of ``text`` into chunks below ``target_size`` words.

    Words are counted per line by splitting on single spaces. A line is added
    to the current chunk while the running count stays strictly below
    ``target_size``; otherwise the current chunk is closed and the line starts
    a new one. Lines are never split, so a single line of ``target_size`` or
    more words becomes an oversized chunk of its own.

    Args:
        text: Text whose lines are separated by ``"\\n"``.
        target_size: Word budget per chunk.

    Returns:
        List of chunk strings; every line in a chunk ends with ``"\\n"``.
    """
    chunks = []
    current_chunk = ""
    current_chunk_len = 0
    for sentence in text.split("\n"):
        sentence_len = len(sentence.split(" "))
        if current_chunk_len + sentence_len < target_size:
            current_chunk += sentence + "\n"
            current_chunk_len += sentence_len
        else:
            # Only close a chunk that has content: when the very first line
            # was already over budget, an empty chunk used to be emitted.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence + "\n"
            current_chunk_len = sentence_len
    if current_chunk_len > 0:
        chunks.append(current_chunk)
    return chunks

In [28]:
# let's traverse the tree and see if we need to split some nodes into multiple nodes due to text size more than selected chunk size

# do pre-order traversal 



def set_text_and_split_necessary_nodes(node, splitter_func, chunk_size=500):
    """Attach text to every node (pre-order) and split over-long sections.

    If a node's text has more than ``chunk_size`` space-separated words, the
    node keeps its heading but gets empty text and is marked
    ``is_partial_node_parent``. Its text is distributed over new heading-less
    child nodes (``is_partial_node``), one per chunk returned by
    ``splitter_func``. The node's original children are re-attached under the
    last chunk node.

    Args:
        node: Subtree root to process (modified in place).
        splitter_func: Callable ``(text, target_size=...) -> list[str]``.
        chunk_size: Maximum word count before a node is split.
    """
    if node.text is None:
        text = get_node_text(node)
        text_size = len(text.split(" "))
    else:
        text = node.text
        text_size = node.text_size

    # Chunk nodes are never split again. A chunk can only still be over the
    # limit when it is a single unsplittable line, and re-splitting it would
    # reproduce the same chunk forever (RecursionError).
    if text_size > chunk_size and not node.is_partial_node:
        print("Splitting node: ", node.refined_output_idx)
        text_chunks = splitter_func(text, target_size=chunk_size)
        # The original node becomes a heading-only parent; its text moves to
        # the chunk children created below.
        node.text = ""
        node.is_partial_node_parent = True
        node.text_size = 0
        # Detach the existing children; they are re-attached under the last chunk.
        original_children = node.children
        node.children = []
        print("Node children: ", len(original_children))
        print("Splitting into: ", len(text_chunks))

        last_chunk_idx = len(text_chunks) - 1
        for i, chunk_text in enumerate(text_chunks):
            child = Node(heading="", level=node.level, page=node.page, line_number=node.line_number, refined_output_idx=node.refined_output_idx)
            child.text = chunk_text
            child.text_size = len(chunk_text.split(" "))
            child.is_partial_node = True
            node.add_child(child)
            if i == last_chunk_idx:
                for original_child in original_children:
                    child.add_child(original_child)
    else:
        node.text = text
        node.text_size = text_size

    for child in node.children:
        set_text_and_split_necessary_nodes(child, splitter_func, chunk_size)
    return




In [29]:
# Inspect the top-level headings attached directly to the tree root.
root.children

[self index: 0 level: 1 loc: <(1, 1)> content: # Aerospace Ceramics: Global Markets to 2026,
 self index: 1 level: 1 loc: <(8, 1)> content: # Chapter 1: Introduction,
 self index: 2 level: 1 loc: <(17, 1)> content: # Chapter 2: Summary and Highlights,
 self index: 3 level: 1 loc: <(21, 1)> content: # Chapter 3: Industry Trends and Opportunities,
 self index: 4 level: 1 loc: <(38, 1)> content: # Chapter 4: Market Breakdown by Composition,
 self index: 5 level: 1 loc: <(61, 1)> content: # Chapter 5: Market Breakdown by Application,
 self index: 6 level: 1 loc: <(73, 1)> content: # Chapter 6: Market Breakdown by Segment,
 self index: 7 level: 1 loc: <(91, 1)> content: # Chapter 7: Market Breakdown by Region,
 self index: 8 level: 1 loc: <(142, 1)> content: # Chapter 8: Emerging Materials: Background and Applications,
 self index: 9 level: 1 loc: <(156, 1)> content: # Chapter 9: Company Profiles,
 self index: 10 level: 1 loc: <(166, 1)> content: # Appendix: Acronyms,
 self index: 11 level:

In [30]:
# for debugging purposes, let's print the tree
# iterate over the tree and print nodes with text size more than 500 words
def print_nodes_with_text_size_more_than_500_words(node, large_nodes):
    """Print and collect, in pre-order, every node whose text exceeds 500 words.

    Words are counted by splitting the result of ``get_node_text`` on single
    spaces. Matching nodes are appended to ``large_nodes``.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        word_count = len(get_node_text(current).split(" "))
        if word_count > 500:
            print(current)
            large_nodes.append(current)
        # Reverse so that the first child is popped (visited) first.
        pending.extend(reversed(current.children))
    return

# Collect the over-long nodes of the whole tree and report how many there are.
large_nodes = []
print_nodes_with_text_size_more_than_500_words(root, large_nodes)
print(len(large_nodes))

# only for debug purpose
# print the tree structure to see there is no loop in the tree and count the nodes

def print_tree_structure(node):
    """Print the subtree rooted at ``node`` in pre-order and return its node count."""
    print(node)
    return 1 + sum(print_tree_structure(child) for child in node.children)

# Print the whole tree and the total number of nodes, root included.
total_nodes = print_tree_structure(root)
print(total_nodes)

# only for debugging purposes, print the text of big nodes
# iterate over large nodes and print their text, also print the text of their children and if they are partial nodes

def print_large_nodes_and_children(large_nodes):
    """Debug helper: dump each large node's text followed by its direct children.

    For every child the node itself, a "Partial Node" marker (when flagged)
    and the child's text are printed, followed by a separator line.
    """
    for big_node in large_nodes:
        print(big_node.refined_output_idx)
        print(get_node_text(big_node))
        for kid in big_node.children:
            print("child: ", kid)
            if kid.is_partial_node:
                print("Partial Node")
            print(get_node_text(kid))
            print("************")
    return

# Dump the text of the over-long nodes collected earlier in this cell.
print_large_nodes_and_children(large_nodes)


self index: 1 level: 2 loc: <(4, 1)> content: ## List of Tables
self index: 2 level: 2 loc: <(6, 1)> content: ## List of Figures
self index: 5 level: 2 loc: <(11, 1)> content: ## Geographic Breakdown
self index: 2 level: 1 loc: <(17, 1)> content: # Chapter 2: Summary and Highlights
self index: 1 level: 3 loc: <(21, 15)> content: ### Material Recyclers
self index: 1 level: 2 loc: <(26, 5)> content: ## Industry and Key Research Organizations
self index: 1 level: 3 loc: <(33, 7)> content: ### Efficiency and Fuel Cost
self index: 4 level: 1 loc: <(38, 1)> content: # Chapter 4: Market Breakdown by Composition
self index: 0 level: 3 loc: <(39, 26)> content: ### Alumina
self index: 1 level: 3 loc: <(41, 27)> content: ### Zirconia
self index: 1 level: 2 loc: <(44, 1)> content: ## Non-oxides
self index: 0 level: 3 loc: <(45, 14)> content: ### Silicon Carbide
self index: 1 level: 3 loc: <(47, 33)> content: ### Silicon Nitride
self index: 2 level: 2 loc: <(49, 1)> content: ## Composites
self inde

In [31]:
# let's process the tree and split the nodes with text size more than 500 words into multiple nodes
# Split every node above the word limit, using the line-based word-count splitter.
# NOTE(review): chunk_size is the literal 500 here, while the config cell sets
# CHUNK_SIZE = 600 and that value is what ends up in OUTPUT_FILE and in the
# Eval metadata - confirm which one is intended.
size_based_splitter = split_text_to_chunk_size
set_text_and_split_necessary_nodes(root, splitter_func=size_based_splitter, chunk_size=500)

Splitting node:  13
Node children:  0
Splitting into:  3
Splitting node:  14
Node children:  0
Splitting into:  3
Splitting node:  21
Node children:  0
Splitting into:  2
Splitting node:  25
Node children:  0
Splitting into:  4
Splitting node:  29
Node children:  0
Splitting into:  2
Splitting node:  43
Node children:  0
Splitting into:  11
Splitting node:  46
Node children:  0
Splitting into:  2
Splitting node:  51
Node children:  3
Splitting into:  2
Splitting node:  53
Node children:  0
Splitting into:  4
Splitting node:  54
Node children:  0
Splitting into:  3
Splitting node:  55
Node children:  2
Splitting into:  2
Splitting node:  56
Node children:  0
Splitting into:  2
Splitting node:  57
Node children:  0
Splitting into:  3
Splitting node:  58
Node children:  0
Splitting into:  13
Splitting node:  60
Node children:  0
Splitting into:  4
Splitting node:  62
Node children:  0
Splitting into:  3
Splitting node:  64
Node children:  3
Splitting into:  3
Splitting node:  65
Node chil

In [32]:
# for each node in the tree, set embedding text which is all the headings of the ancestors prefixed to the text of the node

def set_embedding_text(node, recursive_headings=None):
    """Set ``embedding_text`` on every node of the subtree rooted at ``node``.

    For each non-root node that has text, the embedding text is the headings
    of all its ancestors (outermost first), then its own heading, then its
    text, each separated by a newline.

    Args:
        node: Subtree root; nodes are modified in place.
        recursive_headings: Headings of ``node``'s ancestors. Defaults to none.
            The list passed in is not modified.
    """
    # None instead of a mutable [] default, so no list is shared between calls.
    ancestor_headings = [] if recursive_headings is None else recursive_headings
    if node.level != 0 and node.text is not None:
        node.embedding_text = "\n".join(ancestor_headings) + "\n" + node.heading + "\n" + node.text
    if node.children:
        # Build the extended heading path once for all children instead of
        # appending/popping on a shared list for each child.
        child_headings = ancestor_headings + [node.heading]
        for child in node.children:
            set_embedding_text(child, child_headings)
    return

set_embedding_text(root)

In [33]:
# create llama-index text nodes from the tree structure
from llama_index.core.schema import TextNode
def create_text_nodes_from_tree(node, llama_text_nodes):
    """Append a ``TextNode`` to ``llama_text_nodes`` for each embeddable tree node.

    The subtree rooted at ``node`` is visited in pre-order. Every non-root
    node whose ``embedding_text`` is set contributes one ``TextNode`` carrying
    that text plus ``page_number``, ``line_number`` and ``level`` metadata.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if current.level != 0 and current.embedding_text is not None:
            llama_text_nodes.append(
                TextNode(
                    text=current.embedding_text,
                    metadata={
                        "page_number": current.page,
                        "line_number": current.line_number,
                        "level": current.level,
                    },
                )
            )
        # Reverse so that the first child is popped (visited) first.
        pending.extend(reversed(current.children))
    return
    
    

In [41]:
# Flatten the processed tree into llama-index TextNodes and report the count.
llama_text_nodes = []
create_text_nodes_from_tree(root, llama_text_nodes)
print(len(llama_text_nodes))

280


In [39]:
import weaviate

# NOTE(review): cluster_url is not passed to the connect call below.
cluster_url = "http://localhost:8080"

# Connect to a local Weaviate instance on port 8080.
# NOTE(review): this rebinds `client`, which an earlier cell used for the
# Anthropic client; re-running cells out of order will pick up whichever was
# assigned last.
client = weaviate.connect_to_local(
    port=8080,
)

# cluster_url = "https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud"
# api_key = os.environ["WEAVIATE_API_KEY"]  # credential redacted: load it from the environment, and rotate the key that was previously pasted here

# client = weaviate.connect_to_wcs(
#     cluster_url=cluster_url,
#     auth_credentials=weaviate.auth.AuthApiKey(api_key),
# )

In [42]:
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
import uuid

# generate unique index for multiple runs
INDEX_NAME = ('X' + str(uuid.uuid4())).replace('-', '_')

vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name=INDEX_NAME
)

# The vector store has to be supplied through a StorageContext. A bare
# `vector_store=` keyword on VectorStoreIndex is not one of its parameters,
# so the index would not be backed by Weaviate.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex(llama_text_nodes, storage_context=storage_context)

In [43]:
from llama_index.llms.openai import OpenAI

# Answering LLM (MODEL_ID) and a query engine that retrieves the TOP_K most
# similar nodes per question.
llm = OpenAI(model=MODEL_ID, api_key = OPENAI_API_KEY)
query_engine = vector_index.as_query_engine(similarity_top_k=TOP_K, llm=llm)

In [53]:
from braintrust import Eval

from autoevals import AnswerCorrectness, ContextRecall
from autoevals import Factuality
from autoevals.ragas import *
# BRAINTRUST_MODEL = "claude-3-5-sonnet-20240620"
BRAINTRUST_MODEL = "gpt-4o"


# Wrap ContextRecall() to propagate along the "answer" and "context" values separately
async def context_recall(output, **kwargs):
    """Score context recall, passing the task's answer and context separately."""
    scorer = ContextRecall(model=BRAINTRUST_MODEL)
    return await scorer.eval_async(
        output=output["answer"],
        context=output["context"],
        **kwargs,
    )

async def answer_correctness(output, **kwargs):
    """Score answer correctness on the task's ``answer`` field."""
    scorer = AnswerCorrectness(model=BRAINTRUST_MODEL)
    return await scorer.eval_async(output=output["answer"], **kwargs)

async def factuality(output, **kwargs):
    """Score factuality on the task's ``answer`` field.

    NOTE(review): unlike context_recall / answer_correctness, no ``model`` is
    passed here, so the scorer's own default is used - confirm that is intended
    given that BRAINTRUST_MODEL is logged as the evaluation model.
    """
    scorer = Factuality()
    return await scorer.eval_async(output=output["answer"], **kwargs)

async def answer_similarity(output, **kwargs):
    """Score answer similarity on the task's ``answer`` field (scorer defaults)."""
    scorer = AnswerSimilarity()
    return await scorer.eval_async(output=output["answer"], **kwargs)



In [54]:
def my_task_braintrust(input):
    """Braintrust task: answer one dataset question with the query engine.

    Args:
        input: Dataset input of the form ``"<index>> <question>"``. Everything
            after the first ``"> "`` is the question; an input without that
            separator is used as the question unchanged.

    Returns:
        Dict with ``answer`` (response text), ``context`` (text of each
        retrieved source node) and ``refs`` (``page_number`` of each entry in
        the response metadata).
    """
    # Split on the first "> " only, so a question that itself contains "> "
    # is no longer truncated.
    _, separator, question = input.partition("> ")
    if not separator:
        question = input

    response = query_engine.query(question)
    refs = [entry['page_number'] for entry in response.metadata.values()]
    q_contexts = [source.text for source in response.source_nodes]
    return {"answer": response.response, "context": q_contexts, "refs": refs}

In [51]:
# Run the Braintrust evaluation of the RAG task with the four scorers.
# NOTE(review): DOC_ID, COMPARISON_FILE, PDF_LOCATION and use_braintrust_dataset
# repeat values already set in the config cell; qa_model is not used in this cell.
DOC_ID = 'ibis-healthcare-social-assistance'
COMPARISON_FILE = "major_questions.csv"
PDF_LOCATION = 'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf'
qa_model = "claude-3-5-sonnet-20240620"
dataset_name = f"{DOC_ID}/{COMPARISON_FILE}"
use_braintrust_dataset = True

if use_braintrust_dataset:
    #TODO : init dataset from braintrust and add the references to output instead of metadata
    
    # NOTE(review): `df`, PROJECT_NAME, EXPERIMENT_NAME and init_dataset are not
    # defined in any cell above this one; a later cell builds `df` as a plain
    # list, which would not support this column selection - presumably this
    # cell relied on kernel state from an earlier session.
    labelled_df = df[["input", "expected", "references", "context", "rag_model_response", "metadata"]]

    # NOTE(review): labelled_df and idx_to_row_number are not read again in this
    # cell; assigning a column to a selection may also emit pandas'
    # SettingWithCopyWarning - verify.
    labelled_df["index_key"] = labelled_df.input.apply(lambda x: int(x.split("> ")[0]))
    idx_to_row_number = {idx: row_number for row_number, idx in labelled_df["index_key"].items()}
    
    # Settings are recorded as experiment metadata alongside the scores.
    eval_result = await Eval(
    name=PROJECT_NAME,
    experiment_name=EXPERIMENT_NAME,
    data=init_dataset(project=PROJECT_NAME, name=dataset_name),
    task=my_task_braintrust,
    scores=[context_recall, answer_correctness, factuality, answer_similarity],
    metadata=dict(model=MODEL_ID, topk=TOP_K, parser=PARSER, chunksize=CHUNK_SIZE, split=SPLITTER, braintrust_model=BRAINTRUST_MODEL, num_questions=NUM_QUESTIONS),
    )

In [None]:
import braintrust

# Load the evaluation questions into `dff`, either from the Braintrust dataset
# or from the local comparison CSV.
if use_braintrust_dataset:
    dataset = braintrust.init_dataset(project="RagMetrics", name=DOC_ID)
    df = list(dataset)
    # convert list of dict to pandas dataframe
    dff = pd.DataFrame(df)
    # Inputs look like "<index>> <question>". Split on the first ">" only, so
    # a question containing ">" is not truncated, and strip the separator's
    # trailing space so the text matches what my_task_braintrust queries.
    dff['question'] = dff['input'].apply(lambda x: x.split(">", 1)[1].strip())
else:
    df = pd.read_csv(COMPARISON_FILE)
    # NUM_QUESTIONS == -1 keeps every row; otherwise only the first N.
    if NUM_QUESTIONS == -1:
        dff = df.copy()
    else:
        dff = df.head(NUM_QUESTIONS).copy()

dff.head()

In [41]:
# Run every question through the query engine once, then derive the answer,
# page-reference and retrieved-context columns from the responses.
responses = [query_engine.query(question) for question in dff[QUESTION_COL]]

result = [response.response for response in responses]
references = [
    [entry['page_number'] for entry in response.metadata.values()]
    for response in responses
]
contexts = [
    [source.text for source in response.source_nodes]
    for response in responses
]

dff[RESPONSE_COL] = result
dff['references'] = references
dff['context'] = contexts
dff.to_csv(OUTPUT_FILE, index=False)


In [47]:
def print_multi_line(response, max_chars=100):
    """Wrap ``response.response`` into lines of roughly ``max_chars`` characters.

    The text is split on single spaces and words are packed greedily: a word
    joins the current line while the line's length plus the word's length
    stays below ``max_chars``. Each word is followed by a space, so lines end
    with a trailing space.

    Args:
        response: Object with a ``response`` string attribute.
        max_chars: Line-length budget (previously ignored in favour of a
            hard-coded 100).

    Returns:
        List of line strings; nothing is printed despite the function's name.
    """
    response_text_lines = []
    current_line = ""
    for word in response.response.split(" "):
        if len(current_line) + len(word) < max_chars:
            current_line += word + " "
        else:
            response_text_lines.append(current_line)
            current_line = word + " "

    response_text_lines.append(current_line)

    return response_text_lines

def get_references(response):
    """Return the page and line number of every source node of ``response``."""
    return [
        {
            "page number": source.metadata["page_number"],
            "line number": source.metadata["line_number"],
        }
        for source in response.source_nodes
    ]

In [50]:
# Ad-hoc spot check: ask one question and print the wrapped answer with its sources.
# NOTE(review): the question is about aerospace ceramics while the config cell
# points PDF_LOCATION at the healthcare report - the recorded output presumably
# comes from an earlier run on the other document.
question = "Describe the performance of the aerospace ceramics market in the US in recent years."
response = query_engine.query(question)
ans = print_multi_line(response)
print("\n".join(ans))
print("references: ", get_references(response))


The aerospace ceramics market in the U.S. has shown significant growth in recent years. In the 
commercial aerospace sector, the market value increased from $1,356.1 million in 2020 to an 
expected $1,833.1 million by 2026, reflecting a compound annual growth rate (CAGR) of 5.2%. In the 
commercial space sector, the market has experienced even more remarkable growth, rising from $158.2 
million in 2020 to a projected $902.9 million in 2026, with a CAGR of 39.2%. This indicates a 
robust demand for aerospace ceramics, driven by the need for lightweight materials and advancements 
in technology, particularly in the context of recovery from the impacts of the COVID-19 pandemic. 
Overall, the U.S. remains a dominant player in the aerospace ceramics market, contributing 
significantly to both commercial aerospace and commercial space sectors. 
references:  [{'page number': 75, 'line number': 1}, {'page number': 91, 'line number': 1}, {'page number': 83, 'line number': 3}]


In [49]:
# Ad-hoc spot check: same steps as the previous query cell with a different question.
question = "Please describe the major headwinds and tailwinds in the market."
response = query_engine.query(question)
ans = print_multi_line(response)
print("\n".join(ans))
print("references: ", get_references(response))

The market is currently facing several headwinds, primarily due to the impact of COVID-19, which 
has led to a temporary crash in travel demand expected to last 12 to 24 months. This instability is 
likely to hinder airlines' ability to invest in new equipment and reduce the purchase of new planes 
until at least 2022 or 2023. Additionally, economic restructuring may result in a long-term 
reduction in travel demand, particularly as businesses adapt to less reliance on in-person 
meetings.

On the other hand, there are tailwinds supporting the market, such as the significant 
restructuring efforts undertaken by airlines following the 2007 global economic downturn, which 
have led to more consistent profitability. The expansion of production capacity by major companies 
like Airbus and Boeing in the Asia-Pacific region is also a positive factor, as it aligns with the 
rapid recovery of domestic and international aircraft manufacturing activities. Furthermore, the 
growth of local compet