In [38]:
# %pip install llama-index llama-index-core llama-parse openai llama_index.embeddings.huggingface -q
# %pip install llama-index-llms-anthropic -q
# %pip install llama-index-vector-stores-weaviate -q

In [39]:
# --- Question source ---------------------------------------------------------
# True  -> pull the evaluation questions from a Braintrust dataset named DOC_ID.
# False -> read them from COMPARISON_FILE instead.
use_braintrust_dataset = True
COMPARISON_FILE = 'claude-3-5-sonnet-20240620_qa.csv'
QUESTION_COL = 'question'            # column holding the question text
RESPONSE_COL = 'rag_model_response'  # column the RAG answers are written to
NUM_QUESTIONS = -1                   # -1 keeps every row

# --- Document under test -----------------------------------------------------
DOC_ID = 'ibis-healthcare-social-assistance'
PDF_LOCATION = 'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf'

# --- RAG pipeline knobs ------------------------------------------------------
MODEL_ID = 'gpt-4o-mini'
PARSER = "llama-parse"
SPLITTER = "sentence"
CHUNK_SIZE = 400
TOP_K = 3

# Every knob above is encoded in the file name so runs with different settings
# do not overwrite each other.
OUTPUT_FILE = f'./RagOutputs/{DOC_ID}_{MODEL_ID}_{PARSER}_{CHUNK_SIZE}_{SPLITTER}_{TOP_K}.csv'


In [None]:
import braintrust
import pandas as pd  # `pd` is used below; without this import the cell fails on a fresh kernel

# Load the evaluation questions into `dff`, either from the Braintrust dataset
# named DOC_ID or from the local comparison CSV.
if use_braintrust_dataset:
    dataset = braintrust.init_dataset(project="RagMetrics", name=DOC_ID)
    # Materialise the dataset records, then convert the list of dicts to a DataFrame.
    df = list(dataset)
    dff = pd.DataFrame(df)
    # The question is taken as the second ">"-delimited segment of `input`.
    # NOTE(review): this raises IndexError when an input has no ">" — presumably
    # every input carries a "...>" prefix; confirm against the dataset.
    dff[QUESTION_COL] = dff['input'].apply(lambda x: x.split(">")[1])
else:
    df = pd.read_csv(COMPARISON_FILE)
    dff = df.copy()

# NUM_QUESTIONS == -1 means "use everything"; otherwise keep only the first
# NUM_QUESTIONS rows. Applied to both sources so the setting is never ignored.
if NUM_QUESTIONS != -1:
    dff = dff.head(NUM_QUESTIONS).copy()

In [41]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
from dotenv import load_dotenv

# Location of the .env file. Set DOTENV_PATH to use a different file without
# editing the notebook; the previous hard-coded path remains the default.
load_dotenv(os.getenv("DOTENV_PATH", "/Users/mbajaj/.env"))

# load_dotenv() already exports the values into os.environ, so re-assigning
# os.environ[k] = os.getenv(k) adds nothing and raises an opaque TypeError when
# a key is absent. Validate explicitly instead so the failure names the key.
#   LLAMA_CLOUD_API_KEY - API access to llama-cloud
#   OPENAI_API_KEY      - OpenAI API for embeddings/llms
#   BRAINTRUST_API_KEY  - Braintrust dataset access
_required_keys = ("LLAMA_CLOUD_API_KEY", "OPENAI_API_KEY", "BRAINTRUST_API_KEY")
_missing_keys = [key for key in _required_keys if not os.getenv(key)]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

In [42]:
# from llama_index.core import SimpleDirectoryReader

from llama_parse import LlamaParse

# Parse the source PDF into markdown documents. Later cells index into
# `documents` and use each document's position as its page number.
if PARSER == "llama-parse":
    documents = LlamaParse(result_type="markdown").load_data(PDF_LOCATION)
else:
    # Without this, `documents` would be undefined and a later cell would fail
    # with an unrelated-looking NameError.
    raise ValueError(f"Unsupported PARSER {PARSER!r}; only 'llama-parse' is implemented.")

# Last expression of the cell so the document count is actually displayed.
len(documents)

Started parsing the file under job_id 95c39d88-adae-402b-ab10-cb466b01e63b


In [43]:
import uuid

# A fresh, random index name per run keeps repeated runs from colliding in the
# vector database. Dashes are swapped for underscores and an "X" is prepended
# (presumably to satisfy the vector store's naming rules — TODO confirm).
INDEX_NAME = f"X{uuid.uuid4()}".replace("-", "_")

In [44]:
import os

import weaviate

# Credentials are read from the environment (e.g. the same .env file used for
# the other API keys) instead of being committed in the notebook.
# NOTE(review): the API key that used to be hard-coded in this cell must be
# treated as leaked and rotated in the Weaviate console.
cluster_url = os.getenv(
    "WEAVIATE_CLUSTER_URL",
    "https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud",
)
api_key = os.getenv("WEAVIATE_API_KEY")
if not api_key:
    raise EnvironmentError(
        "WEAVIATE_API_KEY is not set; add it to your environment or .env file."
    )

# The client holds an open connection; call client.close() when the run is done.
client = weaviate.connect_to_wcs(
    cluster_url=cluster_url,
    auth_credentials=weaviate.auth.AuthApiKey(api_key),
)

# weaviate vector database & llamaparse Integrated

In [45]:
# Sanity-check the parse: print one sample document and the total count.
# The sample index is clamped so a PDF with fewer documents does not raise
# IndexError; for longer PDFs this prints documents[20] exactly as before.
PREVIEW_INDEX = 20
if documents:
    print(documents[min(PREVIEW_INDEX, len(documents) - 1)].text)
print(len(documents))

# IBISWorld | Healthcare and Social Assistance in the US

# Mar 2024

- The dramatic rise in the adoption of telemedicine in 2020 has fallen from its pandemic high but is still more integral to healthcare delivery in 2024 than pre-COVID, especially for providers offering mental health treatment.
- The challenges impacting healthcare providers are also expediting the adoption of other digital tools (AI, patient engagement products, data analytics) to streamline operations and produce cost savings.

# Social assistance providers fill in the gaps as pandemic-era benefits end

- The pandemic highlighted societal inequities. Social assistance providers have had an outsized role in addressing these inequities since the COVID-19 pandemic subsided.
- Pandemic relief funding that cushioned savings accounts and lifted many out of poverty ended between 2022 and 2023, with exact end dates varying between states. As benefits disappeared, millions turned to social assistance providers for help with 

In [46]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode

# Fail loudly on an unknown splitter; previously this silently produced an
# empty `nodes` list and therefore an empty index.
if SPLITTER != "sentence":
    raise ValueError(f"Unsupported SPLITTER {SPLITTER!r}; only 'sentence' is implemented.")

splitter = SentenceSplitter(chunk_size=CHUNK_SIZE)

# One TextNode per chunk. `page_number` is the 1-based position of the source
# document in `documents`, which is what the query cell reports as a reference.
nodes = [
    TextNode(text=chunk, metadata={'page_number': page_number})
    for page_number, doc in enumerate(documents, start=1)
    for chunk in splitter.split_text(doc.text)
]

print(len(nodes))


104


In [47]:
from llama_index.vector_stores.weaviate import WeaviateVectorStore

# Wrap the already-open Weaviate client in a LlamaIndex vector store, using the
# per-run index name generated earlier.
vector_store = WeaviateVectorStore(index_name=INDEX_NAME, weaviate_client=client)

            Please make sure to close the connection using `client.close()`.


In [48]:
from llama_index.core import StorageContext, VectorStoreIndex

# A custom vector store is attached to an index through a StorageContext.
# `VectorStoreIndex(nodes, vector_store=...)` is not a recognised keyword, so
# the Weaviate store was not being used for the embeddings.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

In [49]:
from llama_index.llms.openai import OpenAI

# Answering model: the OpenAI chat model selected in the config cell.
llm = OpenAI(api_key=OPENAI_API_KEY, model=MODEL_ID)

# Query engine over the index; each query retrieves the TOP_K most similar
# chunks and hands them to `llm`.
query_engine = vector_index.as_query_engine(llm=llm, similarity_top_k=TOP_K)

In [52]:
# Show how many questions will be asked and where the answers will be written.
(dff.shape, OUTPUT_FILE)

((213, 7),
 './RagOutputs/ibis-healthcare-social-assistance_gpt-4o-mini_llama-parse_400_sentence_3.csv')

In [53]:
import os

# Create the output directory up front so a missing folder fails fast, rather
# than after every question has already been sent to the LLM.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

result = []      # answer text per question
references = []  # page numbers of the retrieved chunks, per question
contexts = []    # raw text of the retrieved chunks, per question

for question in dff[QUESTION_COL]:
    response = query_engine.query(question)
    result.append(response.response)
    # Each metadata value is a retrieved node's metadata dict, which carries the
    # 'page_number' set when the nodes were built. Guard against a response
    # with no metadata so one empty retrieval cannot abort the whole run.
    references.append(
        [node_meta['page_number'] for node_meta in (response.metadata or {}).values()]
    )
    contexts.append([source.text for source in response.source_nodes])

dff[RESPONSE_COL] = result
dff['references'] = references
dff['context'] = contexts
dff.to_csv(OUTPUT_FILE, index=False)
