In [1]:
# One-time setup (uncomment to install into the running kernel's environment):
# %pip install -q llama-index llama-index-core llama-parse openai llama-index-embeddings-huggingface
# %pip install -q llama-index-llms-anthropic

In [2]:
import pandas as pd

# CSV whose 'question' column supplies the sample queries used below
# (presumably question/answer pairs, judging by the file name — confirm).
comparison_file = 'claude-3-5-sonnet-20240620_qa.csv'
df = pd.read_csv(comparison_file)

# Work from the first rows only (DataFrame.head()), then pick the first two questions.
dff = df.head()
questions = [question for question in dff['question']]
query1, query2 = questions[0:2]

# Bare last expression: the notebook displays the (query1, query2) tuple.
query1, query2

('What types of enterprises are included in the Healthcare and Social Assistance sector in the United States?',
 'How is telemedicine defined in the context of the Healthcare and Social Assistance industry?')

In [3]:
  # llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()


def _require_env(*names):
    """Return the first non-empty value among the environment variables `names`.

    Args:
        *names: environment variable names, tried in the order given.

    Returns:
        The value of the first variable that is set to a non-empty string.

    Raises:
        RuntimeError: if none of the variables is set, so the notebook stops here
            with a clear message instead of failing later inside an API client.
    """
    for name in names:
        value = os.getenv(name)
        if value:
            return value
    raise RuntimeError(
        "Missing required environment variable: set one of "
        + ", ".join(names)
        + " (e.g. in your .env file)."
    )


# API access to llama-cloud.
# SECURITY: the key must come from the environment / .env, never from the notebook.
# A key was previously hardcoded in this cell; treat it as leaked and rotate it.
os.environ["LLAMA_CLOUD_API_KEY"] = _require_env("LLAMA_CLOUD_API_KEY")

# Using OpenAI API for embeddings/llms.
# The legacy variable name OPENAI_APIKEY is still honoured first for backward
# compatibility; the standard OPENAI_API_KEY is accepted as a fallback.
OPENAI_API_KEY = _require_env("OPENAI_APIKEY", "OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [4]:
from llama_parse import LlamaParse

# Report to parse. Alternative input kept for reference:
#   'IndustrySource/Misc/32592 Explosives Manufacturing in the US Industry Report.pdf'
location = 'IndustrySource/Misc/62 Healthcare and Social Assistance in the US Industry Report.pdf'

# Parse the PDF with LlamaParse, requesting markdown output; `documents` is
# consumed by the splitter / index cells further down.
parser = LlamaParse(result_type="markdown")
documents = parser.load_data(location)

Started parsing the file under job_id 6e49875a-aec5-4cb6-8c30-4611c8f5cda9
...

In [5]:
import os
import re

# Derive the vector-store index name from the report path:
#   1. strip only the final file extension (the previous `.split('.')[0]` cut the
#      name at the FIRST dot anywhere in the path, e.g. a dotted directory name);
#   2. replace every character other than ASCII letters, digits and "_" with "_"
#      (previously only "/" and " " were replaced).
# For the current `location` this yields exactly the same name as before.
# NOTE(review): Weaviate collection names are expected to start with a capital
# letter — confirm for paths that do not.
INDEX_NAME = re.sub(r'[^0-9A-Za-z_]', '_', os.path.splitext(location)[0])
INDEX_NAME

'IndustrySource_Misc_62_Healthcare_and_Social_Assistance_in_the_US_Industry_Report'

In [6]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

# --- Chunking: split the parsed documents into nodes (chunk_size=1024) ---
splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents)

# --- Vector index over the nodes, embedded with a local BGE model ---
# No external vector store is passed to the index in this cell.
embed_model = "local:BAAI/bge-small-en-v1.5"
vector_index = VectorStoreIndex(nodes, embed_model=embed_model)

# --- Chat model (gpt-4o-mini) and a query engine retrieving the top 3 matches ---
llm_gpt4o = OpenAI(api_key=OPENAI_API_KEY, model="gpt-4o-mini")
query_engine_gpt4o = vector_index.as_query_engine(llm=llm_gpt4o, similarity_top_k=3)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Ask the first sample question through the gpt-4o-mini query engine.
# Alternative ad-hoc query kept for reference:
#   query1 = "What's driving current industry performance?"
resp = query_engine_gpt4o.query(query1)

# Label line followed by the answer text on its own line.
print("GPT-4o-mini:", resp, sep="\n")

GPT-4o-mini:
The Healthcare and Social Assistance sector in the United States includes enterprises that provide a variety of services such as hospitals, ambulatory service providers, nursing homes, residential care facilities, and social assistance services. This encompasses counselors, social workers, family and welfare services, as well as natural disaster and emergency relief services.


In [8]:
# Bare expression: the notebook displays the answer as a quoted Python string.
str(resp)

'The Healthcare and Social Assistance sector in the United States includes enterprises that provide a variety of services such as hospitals, ambulatory service providers, nursing homes, residential care facilities, and social assistance services. This encompasses counselors, social workers, family and welfare services, as well as natural disaster and emergency relief services.'

In [9]:
# Spot-check the text of one chunk produced by the splitter. The index 12 is
# hard-coded. NOTE(review): raises IndexError if `nodes` has fewer than 13 items.
print(nodes[12].text)

# Healthcare and Social Assistance in the US

# SWOT

|Strengths|Weaknesses|Opportunities|Threats|
|---|---|---|---|
|High Profit vs. Sector Average|Low Revenue per Employee|High Revenue Growth (2019-2024)|Low Revenue Growth (2005-2024)|
|Low Customer Class Concentration|Low Product/Service Concentration|High Revenue Growth (2024-2029)|Low Outlier Number of people with private health insurance|
|Low Capital Requirements| |Federal funding for Medicare and Medicaid| |

# Executive Summary

Demographic and economic factors are the driving forces behind rising healthcare spending in the US. The population's medical needs are expanding the oldest demographic's consumption of medical services. At the same time, growing incomes and broader insurance coverage facilitate access to healthcare providers. Increasing healthcare spending was abruptly disrupted by the pandemic, exposing healthcare and social assistance providers to unprecedented financial and operational pressures. Though pandemic co

In [10]:
# Ask the second sample question (query2) through the same query engine.
# Alternative ad-hoc query kept for reference:
#   "Tell me about Oil States International Inc. to acquire GEODynamics Inc."
resp = query_engine_gpt4o.query(query2)
print("GPT-4o-mini:")
# Bare last expression: the notebook displays the answer as a quoted string.
str(resp)

GPT-4o-mini:


'Telemedicine is defined as an application of clinical medicine where medical information is transferred through interactive audiovisual media for the purpose of consulting and conducting remote medical procedures or examinations.'

In [11]:
#####################################################################################################################################################

In [12]:
# Weaviate vector database: RAG

In [13]:
# %pip install llama-index-vector-stores-weaviate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [14]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from IPython.display import Markdown, display

In [15]:
import logging
import sys

# Route INFO-and-above log records to stdout so they appear under each cell.
# basicConfig() already attaches a stdout StreamHandler to the root logger; the
# additional addHandler(StreamHandler(stdout)) that used to follow it made every
# record print twice (once with the "LEVEL:logger:" prefix, once bare), so it
# has been dropped.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [16]:
import os

import weaviate

# Weaviate Cloud connection settings.
# SECURITY: the API key is read from the environment (or .env) instead of being
# hardcoded. A key was previously committed in this cell; treat it as leaked and
# rotate it. The cluster URL may be overridden with WEAVIATE_URL and otherwise
# defaults to the cluster this notebook has been using.
cluster_url = os.getenv(
    "WEAVIATE_URL",
    "https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud",
)
api_key = os.getenv("WEAVIATE_API_KEY")
if not api_key:
    raise RuntimeError(
        "WEAVIATE_API_KEY is not set; export it or add it to your .env file "
        "before connecting to Weaviate."
    )

# Open the client used by the WeaviateVectorStore cells below.
# NOTE(review): the client is never closed in the visible cells — consider
# calling client.close() once the notebook is done with it.
client = weaviate.connect_to_wcs(
    cluster_url=cluster_url,
    auth_credentials=weaviate.auth.AuthApiKey(api_key),
)

INFO:httpx:HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/meta "HTTP/1.1 200 OK"
HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"


In [17]:
from llama_index.core import StorageContext

# Name the index explicitly so it can be loaded again later.
vector_store = WeaviateVectorStore(index_name=INDEX_NAME, weaviate_client=client)

# Wrap the Weaviate store in a storage context and build the index straight
# from the parsed documents (no explicit splitter or embed model given here).
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

INFO:httpx:HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema/IndustrySource_Misc_62_Healthcare_and_Social_Assistance_in_the_US_Industry_Report "HTTP/1.1 404 Not Found"
HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema/IndustrySource_Misc_62_Healthcare_and_Social_Assistance_in_the_US_Industry_Report "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: POST https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema "HTTP/1.1 200 OK"
HTTP Request: POST https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema "HTTP/1.1 200 OK"
HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schem

In [18]:
# set Logging to DEBUG for more detailed outputs
query_engine = vector_index.as_query_engine()

# Only query2 is issued. A preceding call,
#   query_engine.query("Tell me about Oil States International Inc. to acquire GEODynamics Inc.")
# was removed: its result was overwritten on the very next line without being
# used, so it only cost an extra embedding + chat-completion round trip.
response = query_engine.query(query2)

/home/yakov/anaconda3/envs/probe/lib/python3.11/site-packages/pydantic/main.py:1059: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema/IndustrySource_Misc_62_Healthcare_and_Social_Assistance_in_the_US_Industry_Report "HTTP/1.1 200 OK"
HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema/IndustrySource_Misc_62_Healthcare_and_Social_Assistance_in_the_US_Industry_Report "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema/IndustrySource_Misc_62_Healthcare_and_Social_Assistance_in_the_US_Industry_Report "HTTP/1.1 200 OK"
HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema/IndustrySource_Misc_62_Healthcare_and_Social_Assistance_in_the_US_Industry_Report "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/cha

In [19]:
# Render the response text as Markdown in the cell output.
display(Markdown(f"{response}"))

Oil States International Inc. is a company that has agreed to acquire GEODynamics Inc.

In [20]:
#########################################################################################################################################################

In [21]:
# Weaviate vector database integrated with LlamaParse

In [22]:
from llama_index.core.node_parser import SentenceSplitter

######## SentenceSplitter ########
# Split the parsed documents into nodes again with chunk_size=1024 (the same
# setting as the earlier splitter cell); rebinds `splitter` and `nodes`.
splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents)

In [23]:
# Vector store bound to the INDEX_NAME collection through the open Weaviate client.
vector_store = WeaviateVectorStore(index_name=INDEX_NAME, weaviate_client=client)

INFO:httpx:HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema/IndustrySource_Misc_62_Healthcare_and_Social_Assistance_in_the_US_Industry_Report "HTTP/1.1 200 OK"
HTTP Request: GET https://a0dlgmcaspopjrn2mtx4ha.c0.us-east1.gcp.weaviate.cloud/v1/schema/IndustrySource_Misc_62_Healthcare_and_Social_Assistance_in_the_US_Industry_Report "HTTP/1.1 200 OK"


In [24]:
######## Vector Index ########
from llama_index.core import StorageContext, VectorStoreIndex

# The vector store must be handed to the index through a StorageContext.
# `VectorStoreIndex(nodes, vector_store=...)` does not do that: `vector_store`
# is not a constructor parameter, so it was silently ignored and the nodes never
# reached Weaviate (the recorded run of this cell shows only an OpenAI
# embeddings call and no Weaviate request).
# NOTE(review): an earlier cell already loaded `documents` into the same
# INDEX_NAME collection, so inserting `nodes` here may store the content twice —
# confirm whether the collection should be cleared or renamed first.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [25]:
######## GPT-4o-mini to Chat ########
from llama_index.llms.openai import OpenAI

# Chat model plus a query engine over `vector_index` that retrieves the top 3 matches.
llm_gpt4o = OpenAI(api_key=OPENAI_API_KEY, model="gpt-4o-mini")
query_engine_gpt4o = vector_index.as_query_engine(llm=llm_gpt4o, similarity_top_k=3)

In [26]:
# Ask the first sample question (query1) through the rebuilt query engine.
# Alternative ad-hoc query kept for reference:
#   query1 = "What's driving current industry performance?"
resp = query_engine_gpt4o.query(query1)
print("GPT-4o-mini:")
# Bare last expression: the notebook displays the answer as a quoted string.
str(resp)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
GPT-4o-mini:


'The Healthcare and Social Assistance sector in the United States includes enterprises such as hospitals, ambulatory service providers, nursing and residential care facilities, as well as social assistance services like counseling, social work, family and welfare services, and emergency relief services.'

In [27]:
# Ask the second sample question (query2) through the rebuilt query engine.
# Alternative ad-hoc query kept for reference:
#   "Tell me about Oil States International Inc. to acquire GEODynamics Inc."
resp = query_engine_gpt4o.query(query2)
print("GPT-4o-mini:")
# Bare last expression: the notebook displays the answer as a quoted string.
str(resp)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
GPT-4o-mini:


'Telemedicine is defined as an application of clinical medicine where medical information is transferred through interactive audiovisual media for the purpose of consulting and conducting remote medical procedures or examinations.'