RAG WITH FHIR DATA LEVERAGING KNOWLEDGE GRAPHS 

In [1]:
from neo4j import GraphDatabase
import json
import os
from dotenv import load_dotenv


load_dotenv()
# Set up Neo4j connection
uri = os.getenv("NEO4J_URI", "")
username = os.getenv("NEO4J_USERNAME", "")
password = os.getenv("NEO4J_PASSWORD", "")
driver = GraphDatabase.driver(uri, auth=(username, password))

# Function to remove 'text.display' fields
def remove_text_display(data):
    for entry in data.get("entry", []):
        resource = entry.get("resource", {})
        if "text" in resource and "display" in resource["text"]:
            del resource["text"]["display"]
    return data

# Load and preprocess FHIR JSON file
with open(r"C:\Users\STORM\Documents\GitHub\RAGwithFHIR\Dataset\Bart73_King743_26b2b916-50a8-2b98-71b0-3150050312c8.json", "r") as file:
    fhir_data = json.load(file)

# Clean the data by removing 'text.display' fields
cleaned_data = remove_text_display(fhir_data)

# Functions for Neo4j database interactions

# 1. Clear the database
def clear_database(tx):
    tx.run("MATCH (n) DETACH DELETE n")

# 2. Load FHIR data using CyFHIR
def load_bundle(tx, json_string):
    tx.run("CALL cyfhir.bundle.load($json, {validation: true, version: 'R4'})", json=json_string)


# Execute script
with driver.session() as session:
    # Clear the database
    session.write_transaction(clear_database)
    
    # Load the cleaned FHIR data as a JSON string
    cleaned_json_string = json.dumps(cleaned_data)
    session.write_transaction(load_bundle, cleaned_json_string)
    
    # Add indexes
    # session.write_transaction(add_indexes)
    
    # Create relationships for structured knowledge graph
    # session.write_transaction(create_relationships)

driver.close()


  session.write_transaction(clear_database)
  session.write_transaction(load_bundle, cleaned_json_string)


In [2]:
driver = GraphDatabase.driver(uri, auth=(username, password))

# Define queries
queries = [
    "MATCH (n:resource {resourceType: 'Condition'}) SET n:Condition",
    "MATCH (n:resource {resourceType: 'Condition'}) SET n:Embeddable",
    "MATCH (n:resource {resourceType: 'Observation'}) SET n:Observation",
    "MATCH (n:resource {resourceType: 'Observation'}) SET n:Embeddable",
    "MATCH (n:resource {resourceType: 'Medication'}) SET n:Medication",
    "MATCH (n:resource {resourceType: 'Medication'}) SET n:Embeddable",
    "MATCH (n:resource {resourceType: 'Patient'}) SET n:Patient",
    "MATCH (n:resource {resourceType: 'Patient'}) SET n:Embeddable",
    "MATCH (n:resource {resourceType: 'MedicationRequest'}) SET n:MedicationRequest",
    "MATCH (n:resource {resourceType: 'MedicationRequest'}) SET n:Embeddable",

]

# Function to run each query
def tag_resources(driver, queries):
    with driver.session() as session:
        for query in queries:
            session.run(query)
            # print(f"Executed: {query}")

# Run the tagging queries
tag_resources(driver, queries)

# Close the driver connection
driver.close()

Creating embedding text for patient node

In [3]:
from neo4j import GraphDatabase

driver = GraphDatabase.driver(uri, auth=(username, password))

def create_patient_embedding(driver):
    query = """
        MATCH (p:Patient)
        OPTIONAL MATCH (p)-[:address]->(a)
        OPTIONAL MATCH (p)-[:maritalStatus]->(m)
        OPTIONAL MATCH (p)-[:name]->(n)
        RETURN 
            n.given as name,
            n.family as lname,
            p.gender AS gender,
            p.birthDate AS birthdate,
            a.city as city,
            a.country as country,
            a.state as state,
            a.line as line,
            m.text as marrigialStatus
    """

    # Run the query and process results
    with driver.session() as session:
        result = session.run(query)
        patient_info = []
        
        for record in result:
            # Debugging: print the raw record to see what is returned
            # print("Raw record:", record)
            
            # Collect data from each record
            name = record.get("name", "Unknown")
            lname = record.get("lname", "Unknown")
            gender = record.get("gender", "Not specified")
            birthdate = record.get("birthdate", "Unknown")
            marital_status = record.get("marrigialStatus", "Unknown")
            city = record.get("city", "Unknown")
            country = record.get("country", "Unknown")
            state = record.get("state", "Unknown")
            line = record.get("line", "Unknown")
            
            # Debugging: check the values of fields
            # print(f"Name: {name}, Line: {line}, City: {city}, State: {state}, Country: {country}")
            
            # Combine into one embedding text
            address = f"{line}, {city}, {state}, {country}" if line != "Unknown" else f"{city}, {state}, {country}"
            embedding_text = f"This is information about the patient, patient's Name: {name} {lname}, patient's Address: {address}, patient's Gender: {gender}, patient's Birthdate: {birthdate}, patient's Marital Status: {marital_status}"
            patient_info.append(embedding_text)

        update_query = """
            MATCH (p:Patient)

            SET p.embeddingText = $embedding_text
        """
        session.run(update_query, embedding_text=embedding_text)

        return patient_info

# Get the patient embedding text
patient_embeddingText = create_patient_embedding(driver)




In [4]:
from neo4j import GraphDatabase

# Initialize the Neo4j driver
driver = GraphDatabase.driver(uri, auth=(username, password))

def create_patient_embedding(driver):
    query = """
        MATCH (c:Condition)
        OPTIONAL MATCH (c)-[:code]->(t)
        RETURN 
            c.recordedDate AS date,
            t.text AS text,
            id(c) AS nodeId
    """

    # Run the query and process results
    with driver.session() as session:
        result = session.run(query)
        patient_info = []
        
        for record in result:
            # Extract the data safely
            date = record.get("date", "Unknown")
            text = record.get("text", "Unknown")
            node_id = record["nodeId"]

            # Construct the embedding text
            embedding_text = f"The patient had the condition: {text}, and it was recorded on this date: {date}"

            # Store the embedding text in the corresponding node
            update_query = """
                MATCH (c:Condition)
                WHERE id(c) = $node_id
                SET c.embeddingText = $embedding_text
            """
            session.run(update_query, node_id=node_id, embedding_text=embedding_text)

            # Collect the embedding text for debugging or further use
            patient_info.append(embedding_text)

        return patient_info

# Get the patient embedding text
patient_embedding_texts = create_patient_embedding(driver)

# Debugging: Print all generated embedding texts
# for text in patient_embedding_texts:
#     print(text)




In [5]:
from neo4j import GraphDatabase

# Initialize the Neo4j driver
driver = GraphDatabase.driver(uri, auth=(username, password))

def create_patient_embedding(driver):
    query = """
        MATCH (o:Observation)
                OPTIONAL MATCH (o)-[:code]->(t)
                OPTIONAL MATCH (o)-[:valueQuantity]->(v)
                RETURN 
                    o.status as status,
                    t.text AS text,
                    v.unit as unit,
                    v.value as value,
                    id(o) AS nodeId
    """

    # Run the query and process results
    with driver.session() as session:
        result = session.run(query)
        patient_info = []
        
        for record in result:
            # Extract the data safely
            status = record.get("status", "Unknown")
            text = record.get("text", "Unknown")
            unit = record.get("unit", "Unknown")
            value = record.get("value", "Unknown")
            node_id = record["nodeId"]

            # Construct the embedding text
            embedding_text = f"The patient had an observation for : {text}, and it was recorded : {value}{unit} with the status of the observation being {status}"

            # Store the embedding text in the corresponding node
            update_query = """
                MATCH (o:Observation)
                WHERE id(o) = $node_id
                SET o.embeddingText = $embedding_text
            """
            session.run(update_query, node_id=node_id, embedding_text=embedding_text)

            # Collect the embedding text for debugging or further use
            patient_info.append(embedding_text)

        return patient_info

# Get the patient embedding text
patient_embedding_texts = create_patient_embedding(driver)

# Debugging: Print all generated embedding texts
# for text in patient_embedding_texts:
#     print(text)




In [6]:
from neo4j import GraphDatabase

driver = GraphDatabase.driver(uri, auth=(username, password))

def create_patient_embedding(driver):
    query = """
        MATCH (m:Medication)
                OPTIONAL MATCH (m)-[:code]->(t)
                RETURN 
                    m.status as status,
                    t.text AS text,
                    id(m) AS nodeId
    """

    # Run the query and process results
    with driver.session() as session:
        result = session.run(query)
        patient_info = []
        
        for record in result:
            # Debugging: print the raw record to see what is returned
            # print("Raw record:", record)
            
            # Collect data from each record
            status = record.get("status", "Unknown")
            text = record.get("text", "Unknown")
            node_id = record["nodeId"]

            
            # Combine into one embedding text
            embedding_text = f"Medication given to patient with the name: {text} and the status: {status}"
            patient_info.append(embedding_text)

            update_query = """
                MATCH (m:Medication)
                WHERE id(m) = $node_id
                SET m.embeddingText = $embedding_text
            """
            session.run(update_query, node_id=node_id, embedding_text=embedding_text)

        return patient_info

# Get the patient embedding text
patient_embeddingText = create_patient_embedding(driver)






In [7]:
# from neo4j import GraphDatabase

# driver = GraphDatabase.driver(uri, auth=(username, password))

# def create_patient_embedding(driver):
#     query = """
#         MATCH (m:MedicationRequest) 
#         optional match (m)-[:requester]->(r)
#         optional match (m)-[:medicationCodeableConcept]->(mc)
#         optional match (m)-[:dosageInstruction]->(d)
#         RETURN 
#             m.status as status,
#             r.display as requester,
#             d.text as instrocution,
#             mc.text as medication,
#             id(m) as nodeId
#     """

#     # Run the query and process results
#     with driver.session() as session:
#         result = session.run(query)
#         patient_info = []
        
#         for record in result:
#             # Debugging: print the raw record to see what is returned
#             # print("Raw record:", record)
            
#             # Collect data from each record
#             status = record.get("status", "Unknown")
#             instrocution = record.get("text", "Unknown")
#             requester = record.get("requester", "Unknown")
#             medication = record.get("medication", "Unknown")
#             node_id = record["nodeId"]

            
#             # Combine into one embedding text
#             embedding_text = f"The medication: {medication} was requested for the patient from {requester} with instrocution {instrocution}, the status of the medication request is {status}."
#             patient_info.append(embedding_text)

#             update_query = """
#                 MATCH (m:MedicationRequest)
#                 WHERE id(m) = $node_id
#                 SET m.embeddingText = $embedding_text
#             """
#             session.run(update_query, node_id=node_id, embedding_text=embedding_text)

#         return patient_info

# # Get the patient embedding text
# patient_embeddingText = create_patient_embedding(driver)




Vector index from graph

In [14]:
from langchain.graphs import Neo4jGraph
from langchain.vectorstores.neo4j_vector import Neo4jVector

from langchain.embeddings import HuggingFaceEmbeddings
import langchain
langchain.debug = True
# Specify the model from Hugging Face, e.g., "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_index = None
# Create the vector index from the existing Neo4j graph
vector_index = Neo4jVector.from_existing_graph(
    hf_embeddings,
    url=uri,
    username=username,
    password=password,
    index_name='FHIR_INDEX',
    node_label="Embeddable",  # Replace with the actual label of your nodes
    text_node_properties=['embeddingText'],  # Properties to embed
    embedding_node_property='embedding',  # Property to store embeddings in Neo4j
)

# print(vector_index)
# question = "what condition does the patient have"

# response = vector_index.similarity_search(question, k=100)
# print(response)


In [10]:
print(vector_index.index_name)
print(vector_index.embedding_node_property)
print(vector_index.node_label)


FHIR_INDEX
embedding
Embeddable


In [59]:
from langchain.llms import Ollama
from pprint import pprint
query = "What is the patients phone number"
langchain.debug = False

context = vector_index.similarity_search(query, k=5)

print(context[0].page_content)

prompt_template = """
You are a helpful assistant with knowledge of FHIR data, the current year is 2024. Use the following context to answer the query:

Context:
{context}

Question:
{query}

Provide a concise and accurate answer.
"""
prompt = prompt_template.format(context=context, query=query)



# Initialize the LLM
llm = Ollama(model="llama3.2", temperature=0.7)

# Generate the answer
answer = llm(prompt)
print("LLM Response:", answer)




embeddingText: This is information about the patient, patient's Name: [Bart73, Caleb651] King743, patient's Address: [270 Johns Brook Suite 56], North Plymouth, MA, US, patient's Gender: male, patient's Birthdate: 1991-06-13, patient's Marital Status: Married
LLM Response: Unfortunately, there is no mention of the patient's phone number in the provided documents. The metadata and page_content sections only contain information about the patient's name, address, gender, birthdate, marital status, and medical conditions, but do not include any contact information such as a phone number.


Contextualize the search 


In [64]:
resourceId = context[0].metadata['_resourceId']

contextualize_query = f"""
MATCH (n)-[:telecom]->(p)
WHERE n.`_resourceId` = '{resourceId}'
WITH p.value + ' ' + p.use AS self, {{}} AS metadata, 1.0 AS score
RETURN self AS text, score, metadata
"""



contextualized_vectorstore = Neo4jVector.from_existing_index(
    hf_embeddings,
    url=uri,
    username=username,
    password=password,
    index_name="FHIR_INDEX",
    retrieval_query=contextualize_query,
)

26b2b916-50a8-2b98-71b0-3150050312c8


In [63]:
print(query)
response = contextualized_vectorstore.similarity_search(query, k=1)
print(response[0].page_content)

What is the patients phone number


CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input '}': expected ':' (line 3, column 36 (offset: 136))
"where n.`_resourceId` = {resourceId}"
                                    ^}