In [38]:
# %pip install llama-index llama-index-core llama-parse openai llama_index.embeddings.huggingface -q
# %pip install llama-index-llms-anthropic -q
# %pip install llama-index-vector-stores-weaviate -q

In [42]:
import os
import pandas as pd

import anthropic
import nest_asyncio
nest_asyncio.apply()
from dotenv import load_dotenv
from pdf2image import convert_from_path
import base64
import requests
from llama_index.core import Document

In [43]:

# --- Run configuration ---------------------------------------------------
# Every value a reader might tune for an experiment lives in this cell.

# True: pull the questions from the Braintrust dataset named DOC_ID.
# False: read them from COMPARISON_FILE instead.
use_braintrust_dataset = True

COMPARISON_FILE = 'claude-3-5-sonnet-20240620_qa.csv'
PDF_LOCATION = 'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf'
DOC_ID = 'ibis-healthcare-social-assistance'
MODEL_ID = 'gpt-4o-mini'  # model name handed to the OpenAI LLM wrapper
QUESTION_COL = 'question'  # column holding the question text
RESPONSE_COL = 'rag_model_response'  # column the generated answers are written to
NUM_QUESTIONS = -1  # -1 keeps every question; any other value keeps the first N
PARSER = "claude" # "claude" or "llama-parse"
CHUNK_SIZE = 600  # chunk_size given to the sentence splitter
SPLITTER = "sentence"
TOP_K = 3  # number of chunks retrieved per question
OUTPUT_FOLDER = f'./rag_outputs/{DOC_ID}'
# exist_ok avoids the check-then-create race and makes the cell safely re-runnable.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
OUTPUT_FILE = f'{OUTPUT_FOLDER}/output_{MODEL_ID}_{PARSER}_{CHUNK_SIZE}_{SPLITTER}_{TOP_K}.csv'


In [48]:
import braintrust


def _question_from_input(raw_input):
    """Drop the leading ``"<id>> "`` marker from a Braintrust input string.

    Only the first ``>`` is treated as the separator, so a question that itself
    contains ``>`` is kept whole; a string with no ``>`` is returned as-is
    (whitespace-trimmed) instead of raising IndexError.
    """
    _prefix, separator, remainder = raw_input.partition(">")
    return (remainder if separator else raw_input).strip()


# Build `dff`, the frame of evaluation questions used by the rest of the notebook.
if use_braintrust_dataset:
    dataset = braintrust.init_dataset(project="RagMetrics", name=DOC_ID)
    # Materialise the dataset rows once and build the frame in a single step.
    df = list(dataset)
    dff = pd.DataFrame(df)
    # Inputs look like "57> What factors ..."; keep only the question text and
    # store it under the same column name the query loop reads.
    dff[QUESTION_COL] = dff['input'].apply(_question_from_input)
else:
    df = pd.read_csv(COMPARISON_FILE)
    dff = df.copy()

# NUM_QUESTIONS == -1 means "use everything"; otherwise keep the first N rows.
# Applied to both sources so the setting behaves the same either way.
if NUM_QUESTIONS != -1:
    dff = dff.head(NUM_QUESTIONS).copy()

In [49]:
# Preview the first rows of the question frame.
dff.head()

Unnamed: 0,id,_xact_id,created,project_id,dataset_id,input,expected,metadata,tags,span_id,root_span_id,is_root,question
0,00028ea4-a9e3-4293-ad1c-6e0d5bc2e672,1000193854671299437,2024-10-06T20:42:29.637Z,a3e7ed80-604a-4d72-ab2c-ba0040ab4f8d,5ddc103a-9916-4c19-aaa2-3d3d79115c35,57> What factors influence the financial healt...,The financial health of healthcare facilities ...,{'file': '62 Healthcare and Social Assistance ...,,7f368491-28bb-45bf-bfbb-7fe16ad3b2b7,7f368491-28bb-45bf-bfbb-7fe16ad3b2b7,True,What factors influence the financial health o...
1,01260b71-e5f5-4d2d-9028-8ce3c540f1fe,1000193854671299437,2024-10-06T20:42:29.638Z,a3e7ed80-604a-4d72-ab2c-ba0040ab4f8d,5ddc103a-9916-4c19-aaa2-3d3d79115c35,90> How do Medicaid reimbursements compare to ...,Medicaid reimbursements make up a significantl...,{'file': '62 Healthcare and Social Assistance ...,,306ffb8d-cf38-40f1-9bac-daa3ad6889b6,306ffb8d-cf38-40f1-9bac-daa3ad6889b6,True,How do Medicaid reimbursements compare to Med...
2,01300719-22e4-4033-ba2f-47f61da8a11f,1000193854671299437,2024-10-06T20:42:29.639Z,a3e7ed80-604a-4d72-ab2c-ba0040ab4f8d,5ddc103a-9916-4c19-aaa2-3d3d79115c35,99> Which state in the US allocates the most o...,California allocates the most on personal heal...,{'file': '62 Healthcare and Social Assistance ...,,537c615b-dcde-47e8-9ae1-632ed227ffc6,537c615b-dcde-47e8-9ae1-632ed227ffc6,True,Which state in the US allocates the most on p...
3,017a3c26-d858-4dc8-8fa3-0353235938c3,1000193854671299436,2024-10-06T20:42:29.639Z,a3e7ed80-604a-4d72-ab2c-ba0040ab4f8d,5ddc103a-9916-4c19-aaa2-3d3d79115c35,115> What are the crucial elements for referra...,"Referral networks are crucial, with referrers ...",{'file': '62 Healthcare and Social Assistance ...,,3949f686-0c77-4250-863f-4402f396c51a,3949f686-0c77-4250-863f-4402f396c51a,True,What are the crucial elements for referral ne...
4,03d8756e-f49d-41a8-acf9-48120a60a674,1000193854671299437,2024-10-06T20:42:29.637Z,a3e7ed80-604a-4d72-ab2c-ba0040ab4f8d,5ddc103a-9916-4c19-aaa2-3d3d79115c35,23> What is the projected revenue growth rate ...,The sector revenue will grow at a CAGR of 2.7%...,{'file': '62 Healthcare and Social Assistance ...,,27d60f2d-ef0e-4a39-9d6c-27d059ab2f97,27d60f2d-ef0e-4a39-9d6c-27d059ab2f97,True,What is the projected revenue growth rate for...


In [24]:
# Load API credentials from a dotenv file rather than hard-coding them.
# Set DOTENV_PATH to point at a specific file; the default is ~/.env so the
# notebook is not tied to one user's home directory.
load_dotenv(os.environ.get("DOTENV_PATH", os.path.expanduser("~/.env")))

# Keys the notebook expects to find in the environment once the dotenv file
# has been loaded:
#   LLAMA_CLOUD_API_KEY - API access to llama-cloud
#   OPENAI_API_KEY      - OpenAI API for embeddings/llms
#   ANTHROPIC_API_KEY   - picked up by the Anthropic client
#   BRAINTRUST_API_KEY  - picked up by braintrust
REQUIRED_API_KEYS = (
    "LLAMA_CLOUD_API_KEY",
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
    "BRAINTRUST_API_KEY",
)

# Fail early with a readable message; re-assigning os.environ[k] = os.getenv(k)
# would only surface a missing key as "TypeError: str expected, not NoneType".
missing_keys = [key for key in REQUIRED_API_KEYS if not os.getenv(key)]
if missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_keys)
    )

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [25]:
# Instructions sent as the first text part of the user message, ahead of the
# page image. (Typo fixed: "explnation" -> "explanation".)
IMAGE_PROMPT = """
Given an image of a page from a market research report, your task is to convert all the information on the page into markdown format, preserving the original structure and content.

- Transcribe all text, including paragraphs and headings, verbatim from the page to markdown, maintaining the original format. Do not modify, omit, or add any text.
- If the page includes numerical information with arrows, percentages etc, describe the text in full sentences in markdown format. For example, if there's a box with the text "revenue of wine industry" and an arrow pointing up saying "10% (2015-2020)", describe this as "Revenue of wine industry increased by 10% from 2015 to 2020" instead of just copying the text.
- Do not explain any text that is clearly written in the page, including headings, subheadings, and paragraphs. Copy the text as it is.
- If the text structure is unclear, use your best judgement to format it in markdown.
- If the page contains tables, convert them into markdown table format and provide an explanation as well. Explain all the data that can be inferred from each table. For example, if a table shows sales data for different products, explain the sales trends and patterns with respect to each product. Try to provide as much detail as possible.
- If the page includes a plot or graph, describe it objectively in markdown format. Explain all the details that can be inferred from the plot or graph. For example, if a plot shows sales trends over time, describe the sales trends and patterns observed. Provide a detailed explanation of the data represented in the plot or graph.
- When explaining any component, understand the context of the entire page and be as specific as possible without any ambiguity. The position of the explanation should match the position of the component in the page.
- The output should not contain personal opinions or biases. Do not add personal comments or any information not present on the page. Avoid referring to the page or the report - explain without reference.
- Ensure no important information from the page is missed, as capturing all details is crucial.
"""

# System prompt for the page-to-markdown request.
# (Typo fixed: "profession converter" -> "professional converter".)
SYSTEM_MESSAGE = "You are a professional converter that converts all the details in given image of page of a market research report to markdown format while preserving all the structure."

# Closing text part of the user message, placed after the page image.
FINAL_MESSAGE = "Please describe the provided page in markdown format. Strictly follow the criteria mentioned above to describe each component of the page."

In [29]:
import pickle

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the encoded bytes are
    decoded as UTF-8 so the result is a ``str``.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')


def request_claude_with_image(base64_image, model="claude-3-5-sonnet-20240620", max_tries=5):
    """Ask Claude to transcribe one page image to markdown, retrying on failure.

    Args:
        base64_image: Base64-encoded PNG data for the page.
        model: Anthropic model name to call.
        max_tries: Maximum number of attempts before giving up.

    Returns:
        A ``(text, failed)`` tuple. ``text`` is the text of the first content
        block of the reply, or ``None`` if every attempt failed. ``failed`` is
        the number of attempts that failed before returning.
    """
    # One client serves all attempts; there is no need to rebuild it per retry.
    client = anthropic.Anthropic()
    user_content = [
        {"type": "text", "text": IMAGE_PROMPT},
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": base64_image,
            },
        },
        {"type": "text", "text": FINAL_MESSAGE},
    ]

    failed = 0
    for attempt in range(1, max_tries + 1):
        # The request itself must be inside the try: API/network errors are the
        # failures worth retrying, not only a malformed reply.
        try:
            response = client.messages.create(
                model=model,
                max_tokens=5000,
                system = SYSTEM_MESSAGE,
                messages=[{"role": "user", "content": user_content}],
            )
            return response.content[0].text, failed
        except Exception as exc:  # not a bare except: let KeyboardInterrupt through
            failed += 1
            print(f"Claude request attempt {attempt}/{max_tries} failed: {exc!r}")
    return None, failed


def ClaudeParse(pdf_path, output_dir_path, model="claude-3-5-sonnet-20240620"):
    """Convert a PDF to per-page llama-index Documents by asking Claude to transcribe each page image.

    Results are cached: if ``_{model}_pages.pkl`` already exists inside
    ``output_dir_path`` it is loaded and returned without touching the PDF.
    Otherwise each page is rendered to ``_imgs/page_<n>.png``, sent to Claude,
    and the resulting documents are written both as a readable
    ``_{model}_pages.txt`` dump and as the pickle cache.

    Args:
        pdf_path: Path of the PDF to parse.
        output_dir_path: Directory for page images, the text dump and the cache.
        model: Anthropic model name; also part of the cache/dump file names.

    Returns:
        A list with one Document per page, each carrying a 1-based
        ``page_number`` in its metadata. Pages that could not be parsed have
        empty text (and are cached that way).
    """
    # Check whether a cached result from a previous run exists.
    output_pickle_path = os.path.join(output_dir_path, f"_{model}_pages.pkl")
    if os.path.exists(output_pickle_path):
        # NOTE(security): unpickling can execute arbitrary code; only load
        # cache files this notebook wrote itself.
        with open(output_pickle_path, "rb") as cache_file:
            return pickle.load(cache_file)

    img_dir = os.path.join(output_dir_path, "_imgs")
    # Also creates output_dir_path itself if it is missing.
    os.makedirs(img_dir, exist_ok=True)

    images = convert_from_path(pdf_path)
    pages = []
    skipped_pages = 0
    total_failures = 0
    for page_number, image in enumerate(images, start=1):
        # Persist the rendered page, then send its base64 form to Claude.
        image_path = os.path.join(img_dir, f'page_{page_number}.png')
        image.save(image_path, 'PNG')
        base64_image = encode_image(image_path)
        response, failed = request_claude_with_image(base64_image, model)
        total_failures += failed
        if response is None:
            print(f"Failed to parse the page {page_number} after {failed} tries")
            # Keep a placeholder so list positions still line up with page numbers.
            response = ""
            skipped_pages += 1

        # Convert the response to a llama-index document.
        pages.append(Document(text=response, metadata={"page_number": page_number}))

    # Guard against a PDF that rendered to zero pages.
    average_failures = total_failures / len(images) if images else 0.0
    print(f"Average failures in calling claude API: {average_failures}")
    print(f"Skipped {skipped_pages} pages out of {len(images)}")

    output_text_path = os.path.join(output_dir_path, f"_{model}_pages.txt")

    # Save the pages as a txt file for easier debugging. The encoding is
    # explicit because the transcription may contain non-ASCII characters.
    with open(output_text_path, "w", encoding="utf-8") as f:
        for page in pages:
            # Write the page number header.
            f.write(f"***Page {page.metadata['page_number']}***\n\n")
            f.write(page.text)
            f.write("\n")

    # Close the cache file deterministically so it is fully flushed to disk.
    with open(output_pickle_path, "wb") as cache_file:
        pickle.dump(pages, cache_file)
    return pages

In [30]:
# from llama_index.core import SimpleDirectoryReader

from llama_parse import LlamaParse
# Parse the PDF into `documents` with the backend selected by PARSER.
if PARSER == "llama-parse":
    documents = LlamaParse(result_type="markdown").load_data(PDF_LOCATION)
elif PARSER == "claude":
    documents = ClaudeParse(PDF_LOCATION, OUTPUT_FOLDER)
else:
    # Without this, a typo in PARSER leaves `documents` undefined and the
    # failure only shows up as a NameError in a later cell.
    raise ValueError(f"Unknown PARSER {PARSER!r}; expected 'claude' or 'llama-parse'")
print(len(documents))

80


In [31]:
import uuid

# A new index name for every run: the letter 'X' followed by a random UUID4
# whose dashes are replaced by underscores.
INDEX_NAME = 'X' + '_'.join(str(uuid.uuid4()).split('-'))

In [32]:
import weaviate

# Weaviate Cloud connection settings are read from the environment (they can
# live in the same dotenv file as the other API keys). The API key must never
# be written into the notebook; any key that has been committed to version
# control should be rotated.
cluster_url = os.environ.get(
    "WEAVIATE_CLUSTER_URL",
    "https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud",
)
api_key = os.environ.get("WEAVIATE_API_KEY")
if not api_key:
    raise RuntimeError(
        "WEAVIATE_API_KEY is not set; export it or add it to your dotenv file."
    )

client = weaviate.connect_to_wcs(
    cluster_url=cluster_url,
    auth_credentials=weaviate.auth.AuthApiKey(api_key),
)

# weaviate vector database & llamaparse Integrated

In [35]:
# Spot-check one parsed document (index 15, i.e. the 16th entry) and the total count.
print(documents[15].text)
print(len(documents))

# IBISWorld | Healthcare and Social Assistance in the US Mar 2024

## Performance Snapshot

### Revenue:

The 2019-24 Revenue CAGR increased by 0.7%.

### Revenue

Revenue for 2024 is projected to be $3.6tr. The revenue growth rate for the period 2019-24 is 0.7%, while for 2024-29 it is expected to be 2.7%.

### 2024 Revenue CAGR

The 2024 Revenue CAGR shows an increase of 0.3%.

### Revenue Volatility

The Revenue Volatility is described as Moderate.

### Revenue

The graph presents the total value ($) and annual change from 2011 - 2029, including a 5-year outlook.

The graph illustrates the Annual Revenue ($bn) and Change (%) from 2011 to 2029, with forecasted data from 2024 onwards. Key observations include:

1. The Annual Revenue shows a general upward trend from 2011 to 2029.
2. The Change (%) fluctuates significantly over the years, with notable peaks and troughs.
3. For 2024, the Annual Revenue is projected to be $3554.9 billion, with a 0.3% change.
4. The graph indicates a shar

In [36]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode

# Split every parsed document into chunks and wrap each chunk in a TextNode
# tagged with the 1-based position of its document.
if SPLITTER == "sentence":
    splitter = SentenceSplitter(chunk_size=CHUNK_SIZE)
else:
    # Previously an unknown SPLITTER silently produced an empty node list.
    raise ValueError(f"Unknown SPLITTER {SPLITTER!r}; expected 'sentence'")

nodes = [
    TextNode(text=chunk, metadata={'page_number': page_number})
    for page_number, doc in enumerate(documents, start=1)
    for chunk in splitter.split_text(doc.text)
    # Pages that failed to parse carry empty text; do not index blank chunks.
    if chunk.strip()
]

print(len(nodes))


97


In [37]:
from llama_index.vector_stores.weaviate import WeaviateVectorStore

# Wrap the Weaviate client in a llama-index vector store that uses this run's index name.
vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name=INDEX_NAME
)

In [38]:
from llama_index.core import StorageContext, VectorStoreIndex

# VectorStoreIndex receives its backing store through a StorageContext. A bare
# `vector_store=` keyword is not one of its parameters, so the Weaviate store
# would not be used and the index would fall back to the default store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

In [39]:
from llama_index.llms.openai import OpenAI

# LLM used by the query engine, and a query engine over the index that is
# configured with similarity_top_k=TOP_K.
llm = OpenAI(model=MODEL_ID, api_key = OPENAI_API_KEY)
query_engine = vector_index.as_query_engine(similarity_top_k=TOP_K, llm=llm)

In [40]:
# Sanity check before the long-running loop: question frame shape and the output path.
dff.shape, OUTPUT_FILE

((213, 7),
 './RagOutputs/ibis-healthcare-social-assistance_gpt-4o-mini_claude_600_sentence_3/output.csv')

In [41]:
# Run every question through the query engine and record, per question:
#   - the generated answer,
#   - the page numbers of the retrieved chunks,
#   - the text of the retrieved chunks.
result = []
references = []
contexts = []
for question in dff[QUESTION_COL]:
    response = query_engine.query(question)
    result.append(response.response)
    # Take both the page references and the contexts from the same
    # source_nodes list so they stay aligned. This also avoids calling
    # .values() on response.metadata, which is None when nothing was retrieved.
    source_nodes = response.source_nodes or []
    references.append([n.node.metadata.get('page_number') for n in source_nodes])
    contexts.append([n.text for n in source_nodes])

dff[RESPONSE_COL] = result
dff['references'] = references
dff['context'] = contexts
dff.to_csv(OUTPUT_FILE, index=False)
