In [42]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
)
from llama_index.core import SummaryIndex
from llama_index.core.schema import IndexNode
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.llms.openai import OpenAI
from llama_index.core.callbacks import CallbackManager

import os

# SECURITY: an OpenAI API key used to be hard-coded on this line. Anything that
# was ever saved in the notebook must be treated as leaked and revoked.
# The key is now taken from the environment; when it is missing the user is
# prompted for it, so the secret is never stored in the notebook file.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [65]:
# Wikipedia article titles to download. Each title is passed to the Wikipedia
# API as-is, and later becomes one text file, one index and one agent.
wiki_titles = [
    "Healthcare in the United States",
    "Patient Protection and Affordable Care Act",
    "Medicaid",
    "Medicare (United States)",
    "Health insurance in the United States",
    "Health insurance marketplace",
    "Children's Health Insurance Program",
    "Health maintenance organization (HMO)",
    "Hospital readmission",
    "Electronic health record",
    "Telemedicine",
    "Mental health in the United States",
    "Opioid epidemic in the United States",
    "Centers for Disease Control and Prevention",
    "National Institutes of Health",
    "Food and Drug Administration",
    "Public health in the United States",
    "American Medical Association",
    "United States Department of Health and Human Services",
    "Health care reform in the United States",
    "Medical malpractice in the United States",
    "Health care prices in the United States",
    "COVID-19 pandemic in the United States",
    "UnitedHealth Group",
    "Anthem Inc.",
    "Aetna",
    "Cigna",
    "Humana",
    "Centene Corporation",
    "Molina Healthcare",
    "WellCare",
    "Blue Cross Blue Shield",
    "Kaiser Permanente",
    "Cerner",
    "Epic Systems",  # article title is "Epic Systems"
    "Teladoc Health",
    "Amwell",
    "Preferred Provider Organization",  # NOTE(review): confirm this matches the article title exactly
    "Personal health record"  # singular form chosen because "Personal Health Records" might not match an article title
]


In [None]:
# Additional health-IT, billing and coding topics to download.
extra_wiki_titles = [
    "Health Informatics",
    "Medical billing",
    "Health Insurance Portability and Accountability Act",
    "HL7",
    "International Classification of Diseases",
    "Current Procedural Terminology",
    "Electronic Data Interchange",
    "Healthcare Common Procedure Coding System",
    "Value-based healthcare",
    "Clinical audit",  # "Medical Audit" might be covered under "Clinical audit" on Wikipedia
]

# Append in place, skipping titles that are already listed, so that running
# this cell more than once does not create duplicate entries (duplicates would
# later produce duplicate chunks and duplicate tool names).
for extra_title in extra_wiki_titles:
    if extra_title not in wiki_titles:
        wiki_titles.append(extra_title)

In [66]:
from pathlib import Path
import requests
import re

# Turn an article title into a string that is safe to use as a file name
def sanitize_title(title):
    """Return a file-name-friendly version of ``title``.

    Parentheses are stripped first, then every character other than ASCII
    letters, digits, underscores and spaces is removed. The remaining spaces
    become underscores and the result is cut to at most 64 characters.
    """
    for paren in ("(", ")"):
        title = title.replace(paren, "")
    allowed_only = re.sub(r"[^a-zA-Z0-9_ ]", "", title)
    return allowed_only.replace(" ", "_")[:64]

# Directory that receives one plain-text file per downloaded article.
data_path = Path("data")
data_path.mkdir(exist_ok=True)

# Titles for which the API returned no usable article text.
titles_with_no_extract = []

# Download the plain-text extract of every title in wiki_titles
# (wiki_titles must already be defined by an earlier cell).
for title in wiki_titles:
    api_reply = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
            # Follow redirects so that a title which only redirects to the
            # real article still yields that article's text.
            "redirects": 1,
        },
        # Never hang the notebook on a stalled connection.
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    api_reply.raise_for_status()
    response = api_reply.json()

    # A reply without "query"/"pages" (e.g. an API error payload) is treated
    # like a missing page rather than raising KeyError / StopIteration.
    pages = response.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), {})
    wiki_text = page.get("extract")

    if wiki_text:
        # Apply sanitization when creating the file name
        safe_title = sanitize_title(title)
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent non-ASCII characters in the article text.
        with open(data_path / f"{safe_title}.txt", "w", encoding="utf-8") as fp:
            fp.write(wiki_text)
    else:
        # Missing page or empty extract: nothing worth indexing.
        print(f"Extract not found for: {title}")
        titles_with_no_extract.append(title)

# Drop the failed titles in place so later cells only see downloaded articles.
wiki_titles[:] = [
    kept_title
    for kept_title in wiki_titles
    if kept_title not in titles_with_no_extract
]

print("\nTitles with extracts successfully fetched and saved:")
for title in wiki_titles:
    print(title)



Titles with extracts successfully fetched and saved:
Healthcare in the United States
Patient Protection and Affordable Care Act
Medicaid
Medicare (United States)
Health insurance in the United States
Health insurance marketplace
Children's Health Insurance Program
Health maintenance organization (HMO)
Hospital readmission
Electronic health record
Telemedicine
Mental health in the United States
Opioid epidemic in the United States
Centers for Disease Control and Prevention
National Institutes of Health
Food and Drug Administration
Public health in the United States
American Medical Association
United States Department of Health and Human Services
Health care reform in the United States
Medical malpractice in the United States
Health care prices in the United States
COVID-19 pandemic in the United States
UnitedHealth Group
Anthem Inc.
Aetna
Cigna
Humana
Centene Corporation
Molina Healthcare
WellCare
Blue Cross Blue Shield
Kaiser Permanente
Cerner
Epic Systems
Teladoc Health
Amwell
Prefe

In [67]:
# Titles for which the download loop found no extract; an empty list means
# every article was fetched.
titles_with_no_extract

[]

In [69]:
# Replace every title with its sanitized form, i.e. the stem used for the
# text files written under data/, then display the resulting list.
wiki_titles = list(map(sanitize_title, wiki_titles))
wiki_titles

['Healthcare_in_the_United_States',
 'Patient_Protection_and_Affordable_Care_Act',
 'Medicaid',
 'Medicare_United_States',
 'Health_insurance_in_the_United_States',
 'Health_insurance_marketplace',
 'Childrens_Health_Insurance_Program',
 'Health_maintenance_organization_HMO',
 'Hospital_readmission',
 'Electronic_health_record',
 'Telemedicine',
 'Mental_health_in_the_United_States',
 'Opioid_epidemic_in_the_United_States',
 'Centers_for_Disease_Control_and_Prevention',
 'National_Institutes_of_Health',
 'Food_and_Drug_Administration',
 'Public_health_in_the_United_States',
 'American_Medical_Association',
 'United_States_Department_of_Health_and_Human_Services',
 'Health_care_reform_in_the_United_States',
 'Medical_malpractice_in_the_United_States',
 'Health_care_prices_in_the_United_States',
 'COVID19_pandemic_in_the_United_States',
 'UnitedHealth_Group',
 'Anthem_Inc',
 'Aetna',
 'Cigna',
 'Humana',
 'Centene_Corporation',
 'Molina_Healthcare',
 'WellCare',
 'Blue_Cross_Blue_Shield'

In [70]:
# Read each saved article from disk; wiki_docs maps a title to the list of
# documents returned by the reader for that article's text file.
wiki_docs = {}
for wiki_title in wiki_titles:
    article_reader = SimpleDirectoryReader(input_files=[f"data/{wiki_title}.txt"])
    wiki_docs[wiki_title] = article_reader.load_data()

In [71]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

from llama_index.agent.openai import OpenAIAgent
from llama_index.core import load_index_from_storage, StorageContext
from llama_index.core.node_parser import SentenceSplitter
import os


# Splitter used to cut documents into nodes; constructed with its defaults.
node_parser = SentenceSplitter()


# Register the LLM and the embedding model on the global Settings object.
# temperature=0 is requested for the LLM.
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

## Create Agents for each wiki page

In [72]:

# Splitter that turns each article's documents into chunk nodes.
node_parser = SentenceSplitter()

# Per-article agents and plain vector query engines, keyed by the entries of
# wiki_titles.
agents = {}
query_engines = {}

# Chunks of every article, collected for the flat baseline index.
all_nodes = []

# The function-calling LLM does not depend on the article, so it is created
# once and shared by all agents instead of being rebuilt on every iteration.
function_llm = OpenAI(model="gpt-4")

for wiki_title in wiki_titles:
    # Split this article into chunks and remember them for the baseline.
    nodes = node_parser.get_nodes_from_documents(wiki_docs[wiki_title])
    all_nodes.extend(nodes)

    # Build the vector index once and persist it; later runs reload it from
    # disk. NOTE: a persisted index is reused as-is, so delete the directory
    # to force a rebuild after the article text changes.
    persist_dir = f"./data/{wiki_title}"
    if not os.path.exists(persist_dir):
        vector_index = VectorStoreIndex(nodes)
        vector_index.storage_context.persist(persist_dir=persist_dir)
    else:
        vector_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=persist_dir),
        )

    # The summary index is rebuilt from the chunks on every run.
    summary_index = SummaryIndex(nodes)

    # One query engine per index.
    vector_query_engine = vector_index.as_query_engine()
    summary_query_engine = summary_index.as_query_engine()

    # Expose both engines to the article agent as tools; the descriptions
    # tell the agent when to pick which one.
    query_engine_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name="vector_tool",
                description=(
                    "Useful for questions related to specific aspects of"
                    f" {wiki_title} (e.g. for business, technical, support,"
                    " providing documentation, the health care and claims processing industry, or more)."
                ),
            ),
        ),
        QueryEngineTool(
            query_engine=summary_query_engine,
            metadata=ToolMetadata(
                name="summary_tool",
                description=(
                    "Useful for any requests that require a holistic summary"
                    f" of EVERYTHING about {wiki_title}. For questions about"
                    " more specific sections, please use the vector_tool."
                ),
            ),
        ),
    ]

    # Article-specific agent that answers by calling the two tools above.
    agent = OpenAIAgent.from_tools(
        query_engine_tools,
        llm=function_llm,
        verbose=True,
        system_prompt=f"""\
You are a specialized agent designed to answer queries about {wiki_title}.
You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
""",
    )

    agents[wiki_title] = agent
    # Plain retrieval engine over the same vector index (top 2 matches).
    query_engines[wiki_title] = vector_index.as_query_engine(
        similarity_top_k=2
    )

In [73]:
# Wrap every per-article agent as a tool that the top-level agent can call.
all_tools = []
for wiki_title in wiki_titles:
    # Description used to decide whether this tool is relevant to a question.
    wiki_summary = (
        f"This content contains Wikipedia articles about {wiki_title}. Use"
        f" this tool if you want to answer any questions about {wiki_title}.\n"
    )
    doc_tool = QueryEngineTool(
        query_engine=agents[wiki_title],
        metadata=ToolMetadata(
            # A sanitized title may be up to 64 characters long, so the
            # "tool_" prefix could push the name past the 64-character limit
            # that the OpenAI API documents for function names. Cap it here;
            # names that already fit are unchanged.
            # NOTE(review): two very long titles could collide after the cut.
            name=f"tool_{wiki_title}"[:64],
            description=wiki_summary,
        ),
    )
    all_tools.append(doc_tool)

In [74]:
# define an "object" index and retriever over these tools
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex, SimpleToolNodeMapping

# Map each tool to an index node, then build a vector index over the tools so
# that the relevant ones can be retrieved for a given question.
tool_mapping = SimpleToolNodeMapping.from_objects(all_tools)
obj_index = ObjectIndex.from_objects(
    all_tools,
    tool_mapping,
    # Passed by keyword so the call does not depend on the position of
    # index_cls in the from_objects signature.
    index_cls=VectorStoreIndex,
)

In [75]:
from llama_index.agent.openai_legacy import FnRetrieverOpenAIAgent

# Top-level agent: for each question it retrieves the 3 most similar document
# tools from obj_index and answers by calling them.
# NOTE(review): "decided" in the prompt looks like a typo for "decide"; it is
# left untouched because the text is sent to the model verbatim.
top_agent = FnRetrieverOpenAIAgent.from_retriever(
    obj_index.as_retriever(similarity_top_k=3),
    system_prompt=(
        " \n"
        "You are an agent designed to answer queries about a wide range of topics. Please decided which document to use to answer the question.\n"
        "Please always use the tools provided to answer a question. Do not rely on prior knowledge.\n"
    ),
    verbose=True,
)

In [76]:
# Baseline for comparison: a single flat vector index over the chunks of all
# articles (all_nodes), queried with similarity_top_k=4.
base_index = VectorStoreIndex(all_nodes)
base_query_engine = base_index.as_query_engine(similarity_top_k=4)

In [77]:
# Single-topic question for the top-level agent; the recorded output shows it
# being routed to tool_Food_and_Drug_Administration.
# Alternative, broader question kept for reference:
# response = top_agent.query("Tell me about the healthcare industry in the united states")
response = top_agent.query("What does the FDA do, and what oversight do they have over the health industry?")


STARTING TURN 1
---------------

=== Calling Function ===
Calling function: tool_Food_and_Drug_Administration with args: {"input": "What does the FDA do?"}
Added user message to memory: What does the FDA do?
=== Calling Function ===
Calling function: summary_tool with args: {
  "input": "FDA"
}
Got output: The FDA is a federal agency responsible for overseeing a wide range of products related to public health, including food safety, pharmaceutical drugs, medical devices, cosmetics, and more. It enforces various laws and regulations to ensure the safety and efficacy of these products. The agency is led by the Commissioner of Food and Drugs, who is appointed by the President. The FDA's headquarters is located in Maryland, with field offices and laboratories across the United States and in some foreign countries. The FDA plays a crucial role in regulating and monitoring products to protect public health.

Got output: The Food and Drug Administration (FDA) is a federal agency in the United

In [57]:
# NOTE(review): in the recorded output this Medicare question was routed to
# tool_Medicaid rather than a Medicare tool, so check the answer's sources.
response = top_agent.query("I am an adult living in america, why might I want to get on medicare?")

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: tool_Medicaid with args: {"input":"reasons to get on Medicare"}
Added user message to memory: reasons to get on Medicare
=== Calling Function ===
Calling function: vector_tool with args: {
  "input": "reasons to get on Medicare"
}
Got output: One reason to get on Medicare is if an individual is eligible for both Medicare and Medicaid, as the cost of prescription drugs for those eligible for both programs is shifted to Medicare. Additionally, low-income individuals who are HIV positive may qualify for Medicaid assistance if they meet the eligibility criteria under the "disabled" category, such as progressing to AIDS.

Got output: There are several reasons why someone might want to get on Medicare:

1. Age: Once you turn 65, you are eligible for Medicare. This can help cover the costs of healthcare as you age and potentially face more health issues.

2. Disability: If you have a disability, you may be eligible fo

In [78]:
# Comparison across two articles; the visible part of the recorded output
# shows only tool_Aetna being called.
response = top_agent.query("What are the main differences between aetna and cigna?")

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: tool_Aetna with args: {"input": "main differences between Aetna and Cigna"}
Added user message to memory: main differences between Aetna and Cigna
=== Calling Function ===
Calling function: vector_tool with args: {
  "input": "main differences between Aetna and Cigna"
}
Got output: The main differences between Aetna and Cigna are related to their historical backgrounds, corporate developments, and notable events.

Got output: Aetna and Cigna are both major players in the health insurance industry, but they have different historical backgrounds, corporate developments, and notable events.

Aetna, founded in 1853, is one of the nation's leading diversified health care benefits companies. It offers a broad range of traditional, voluntary, and consumer-directed health insurance products and related services. These include medical, pharmacy, dental, behavioral health, group life, and disability plans, as well as med

In [60]:
# Known issue: this query returns the wrong output. In the recorded run the
# Anthem tool reports "revenue" ($136 billion) while the UnitedHealth Group
# tool reports "earnings" (US$20.64 billion), so the two figures do not appear
# to describe the same metric.
response = top_agent.query("compare anthem and united health group. Who made more money in 2022?")

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: tool_Anthem_Inc with args: {"input": "revenue in 2022"}
Added user message to memory: revenue in 2022
=== Calling Function ===
Calling function: vector_tool with args: {
  "input": "Anthem Inc revenue 2022"
}
Got output: $136 billion

Got output: The revenue of Anthem Inc in 2022 was $136 billion.

=== Calling Function ===
Calling function: tool_UnitedHealth_Group with args: {"input": "revenue in 2022"}
Added user message to memory: revenue in 2022
=== Calling Function ===
Calling function: vector_tool with args: {
  "input": "UnitedHealth Group revenue in 2022"
}
Got output: UnitedHealth Group reported earnings of US$20.64 billion for the fiscal year 2022.

Got output: UnitedHealth Group reported earnings of US$20.64 billion for the fiscal year 2022.

STARTING TURN 2
---------------



In [61]:
# Comparison that names two articles explicitly; the visible part of the
# recorded output shows a call to tool_Patient_Protection_and_Affordable_Care_Act.
response = top_agent.query("How do the healthcare policies under the 'Patient Protection and Affordable Care Act' compare with those proposed in the 'Health Care Reform in the United States' article?")

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: tool_Patient_Protection_and_Affordable_Care_Act with args: {"input": "healthcare policies"}
Added user message to memory: healthcare policies
=== Calling Function ===
Calling function: vector_tool with args: {
  "input": "healthcare policies"
}
Got output: The document likely contains information about healthcare policies.

Got output: The Patient Protection and Affordable Care Act (PPACA), often shortened to the Affordable Care Act (ACA) or nicknamed Obamacare, is a United States federal statute enacted by the 111th United States Congress and signed into law by President Barack Obama on March 23, 2010. 

The ACA includes a number of healthcare policies aimed at improving the healthcare system in the United States. Some of the key policies include:

1. **Individual Mandate**: This policy requires most Americans to have health insurance or pay a penalty. The goal of this policy is to ensure that everyone partici