In [1]:
# %pip install llama-index llama-index-core llama-parse openai llama_index.embeddings.huggingface -q
# %pip install llama-index-llms-anthropic -q
# %pip install llama-index-vector-stores-weaviate -q

In [2]:
# --- Inputs ---------------------------------------------------------------
# CSV holding the questions to ask (read with pandas in a later cell).
COMPARISON_FILE = "claude-3-5-sonnet-20240620_qa.csv"
# PDF that gets parsed and indexed.
PDF_LOCATION = "IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf"
# Column of COMPARISON_FILE that contains the question text.
QUESTION_COL = "question"
# Only the first NUM_QUESTIONS rows of COMPARISON_FILE are used.
NUM_QUESTIONS = 250

# --- Pipeline selection ---------------------------------------------------
# Which PDF parser to use: "llama-parse" or "llm-sherpa".
PDF_PARSER = "llama-parse"
# OpenAI model used to answer the questions.
MODEL_ID = "gpt-4o-mini"
# Short identifier of the source document; only used to name the output file.
DOC_ID = "ibis-healthcare-social-assistance"

# --- Outputs --------------------------------------------------------------
# Column that receives the model's answers.
RESPONSE_COL = "rag_model_response"
# Results file, named <doc>_<parser>_<model>.csv so runs with different
# settings do not overwrite each other.
OUTPUT_FILE = "_".join((DOC_ID, PDF_PARSER, MODEL_ID)) + ".csv"

In [3]:
import pandas as pd

# Read the full question file, then work on an independent copy of its first
# NUM_QUESTIONS rows so adding the response column never touches `df`.
df = pd.read_csv(COMPARISON_FILE)
dff = df.iloc[:NUM_QUESTIONS].copy()

In [4]:
# llama-parse is async-first; running its async code inside a notebook
# requires nest_asyncio.
import nest_asyncio
nest_asyncio.apply()

import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# API access to llama-cloud.
# SECURITY: credentials must never be hard-coded in the notebook. Provide
# LLAMA_CLOUD_API_KEY through the environment or the .env file. Any key that
# was previously committed in this cell should be revoked and re-issued.
# The key is only needed when llama-parse is the selected parser (every
# PDF_PARSER value other than 'llm-sherpa' uses it).
if PDF_PARSER != 'llm-sherpa' and not os.getenv("LLAMA_CLOUD_API_KEY"):
    raise RuntimeError(
        "LLAMA_CLOUD_API_KEY is not set; add it to your environment or .env file."
    )

# Using OpenAI API for embeddings/llms.
# This notebook reads the key from OPENAI_APIKEY; the standard OPENAI_API_KEY
# is accepted as a fallback. Fail with a clear message instead of the
# TypeError that assigning None to os.environ would raise.
OPENAI_API_KEY = os.getenv("OPENAI_APIKEY") or os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "No OpenAI key found; set OPENAI_APIKEY (or OPENAI_API_KEY) in your "
        "environment or .env file."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [5]:
# from llama_index.core import SimpleDirectoryReader

from typing import List
from llama_index.core.schema import Document

def llama_parse_docs() -> List[Document]:
    """Parse the PDF at ``PDF_LOCATION`` with LlamaParse.

    The parser is configured for markdown output and receives the OpenAI key
    through its ``gpt4o_api_key`` argument.

    Returns:
        Whatever ``LlamaParse.load_data`` produces for the PDF (annotated here
        as a list of ``Document`` objects).
    """
    # Local import: llama_parse is only needed when this parser is used.
    from llama_parse import LlamaParse

    parser = LlamaParse(
        result_type="markdown",
        gpt4o_api_key=OPENAI_API_KEY,
        # fast_mode=True  (left disabled)
    )
    parsed = parser.load_data(PDF_LOCATION)
    return parsed

def llmsherpa_docs() -> List[Document]:
    """Parse the PDF at ``PDF_LOCATION`` with llmsherpa, one Document per page.

    The chunks returned by the llmsherpa layout reader are grouped by their
    ``page_idx``. Each page's chunk texts are joined with newlines (keeping the
    reader's chunk order) and pages are emitted in ascending page order. Pages
    without any chunk produce no Document.

    Returns:
        One ``Document`` per page that has at least one chunk, or an empty
        list when the reader yields no chunks at all.
    """
    # Local import: llmsherpa is only needed when this parser is used.
    from llmsherpa.readers import LayoutPDFReader

    # The reader talks to a parsing endpoint on localhost:5501.
    llmsherpa_api_url = "http://localhost:5501/api/parseDocument?renderFormat=all"
    pdf_reader = LayoutPDFReader(llmsherpa_api_url)
    doc = pdf_reader.read_pdf(PDF_LOCATION)

    # Single pass over the chunks: bucket their text by page. This avoids
    # rescanning every chunk for every page and copes with an empty chunk
    # list (min()/max() on an empty sequence would raise ValueError).
    texts_by_page = {}
    for chunk in doc.chunks():
        texts_by_page.setdefault(chunk.page_idx, []).append(chunk.to_text())

    return [
        Document(text='\n'.join(texts_by_page[page_idx]))
        for page_idx in sorted(texts_by_page)
    ]


In [6]:
# Map every supported PDF_PARSER value to the function that loads the PDF.
# An unknown value is rejected explicitly: silently falling back to
# llama-parse would produce results whose OUTPUT_FILE name claims a
# different parser than the one actually used.
_PARSER_LOADERS = {
    'llm-sherpa': llmsherpa_docs,
    'llama-parse': llama_parse_docs,
}
if PDF_PARSER not in _PARSER_LOADERS:
    raise ValueError(
        f"Unknown PDF_PARSER {PDF_PARSER!r}; expected one of {sorted(_PARSER_LOADERS)}"
    )
documents = _PARSER_LOADERS[PDF_PARSER]()

In [7]:
# Spot-check the parse by printing the text of one document. The preferred
# index is clamped to the last document so that a shorter parse (e.g. from a
# different parser or PDF) does not raise IndexError.
PREVIEW_INDEX = 25
if documents:
    print(documents[min(PREVIEW_INDEX, len(documents) - 1)].text)
else:
    print("No documents were parsed.")

Hospitals are central to healthcare delivery
• Hospitals provide inpatient and outpatient medical services (diagnostic, treatment, etc.) via physicians, nursing and other health services.
This subsector includes Hospitals (IBISWorld 62211), Psychiatric Hospitals (62221) and Specialty Hospitals (62231).
• Rising incomes, broader access to public and private insurance programs and the medical needs of older citizens sustained hospital patient volumes before the COVID-19 pandemic.
Yet hospitals were hit particularly hard by the drop in elective procedures and patient volumes following the onset of the pandemic.
• Federal policies and billions of dollars in funding directed to hospitals alleviated the initial financial impact of revenue loss stemming from delays in elective care and drops in ER visits.
According to the Kaiser Family Foundation (KFF), the size of grants varied significantly per hospital based on the payor mix.
• Rapidly escalating costs for medical consumables, equipment an

In [8]:
import uuid

# Fresh index name on every run so repeated runs never collide:
# the letter 'X' followed by a random UUID4 with '-' replaced by '_'.
# NOTE(review): the 'X' prefix presumably exists because the vector store
# needs a name that starts with a letter - confirm.
INDEX_NAME = f"X{uuid.uuid4()}".replace('-', '_')

In [9]:
import weaviate

# Connection settings are read from the environment (load_dotenv() has
# already populated it from .env), never from the notebook source.
# SECURITY: an API key used to be hard-coded in this cell; that key should be
# revoked and a new one supplied through WEAVIATE_API_KEY.
cluster_url = os.getenv(
    "WEAVIATE_URL",
    "https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud",
)
api_key = os.getenv("WEAVIATE_API_KEY")
if not api_key:
    raise RuntimeError(
        "WEAVIATE_API_KEY is not set; add it to your environment or .env file."
    )

# NOTE(review): newer weaviate-client releases may prefer
# connect_to_weaviate_cloud over connect_to_wcs - confirm against the
# installed version before switching.
client = weaviate.connect_to_wcs(
    cluster_url=cluster_url,
    auth_credentials=weaviate.auth.AuthApiKey(api_key),
)

# weaviate vector database & llamaparse Integrated

In [10]:
from llama_index.core.node_parser import SentenceSplitter

# Break the parsed documents into nodes with a SentenceSplitter configured
# for chunk_size=CHUNK_SIZE; these nodes are what gets indexed.
CHUNK_SIZE = 1024
splitter = SentenceSplitter(chunk_size=CHUNK_SIZE)
nodes = splitter.get_nodes_from_documents(documents)

In [11]:
from llama_index.vector_stores.weaviate import WeaviateVectorStore

# Wrap the connected Weaviate client as a LlamaIndex vector store that uses
# this run's unique INDEX_NAME.
vector_store = WeaviateVectorStore(index_name=INDEX_NAME, weaviate_client=client)

In [12]:
from llama_index.core import StorageContext, VectorStoreIndex

# VectorStoreIndex has no `vector_store` parameter: passing one as a keyword is
# swallowed by **kwargs and the index is built in the default in-memory store,
# so nothing ever reaches Weaviate. The vector store has to be supplied
# through a StorageContext.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

In [13]:
from llama_index.llms.openai import OpenAI

# LLM that writes the answers, using the model chosen in the config cell.
llm = OpenAI(api_key=OPENAI_API_KEY, model=MODEL_ID)

# Query engine over the vector index, created with similarity_top_k=3 and the
# LLM above.
query_engine = vector_index.as_query_engine(llm=llm, similarity_top_k=3)

/home/yakov/anaconda3/envs/probe/lib/python3.11/site-packages/pydantic/main.py:1059: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


In [14]:
# Ask every question in order; `result` keeps the raw response objects so
# they remain available for inspection after the run.
result = [query_engine.query(question) for question in dff[QUESTION_COL]]

# Store the answer *text* in the frame rather than the response objects, so the
# column holds plain strings instead of relying on an implicit str() when the
# CSV is written. The CSV contents are the same either way.
dff[RESPONSE_COL] = [str(response) for response in result]
dff.to_csv(OUTPUT_FILE, index=False)