In [None]:
import glob
import json
import os
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pyplot import figure
from neo4j import GraphDatabase

In [None]:
# Helper class for easier runtime calculations

class timer:
    """Minimal stopwatch: starts on construction, stops on ``end()``.

    The elapsed time is measured with ``time.perf_counter()`` (monotonic, so it
    is not affected by wall-clock adjustments) and reported in seconds rounded
    to three decimal places.
    """

    def __init__(self):
        self._start = time.perf_counter()
        self._end = None
        self._runtime = None

    def end(self):
        """Stop the stopwatch and return the elapsed seconds as a float.

        The value is rounded numerically rather than by slicing the string
        form of the float: slicing breaks for values printed in scientific
        notation (e.g. ``9.5e-07`` would become ``9.5``) and for large values.
        """
        self._end = time.perf_counter()
        self._runtime = round(self._end - self._start, 3)
        return self._runtime

# Connection settings for the Neo4j instance used by every query below.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the defaults are the
# original local-development values.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

# Helper function that runs cypher transaction on local database
def cypher_transaction(cypher):
    """Run one Cypher statement and return all result rows.

    A new driver is opened for the call using NEO4J_URI / USERNAME / PASSWORD
    and is always closed again, even when the statement raises.

    Args:
        cypher: The Cypher statement to execute.

    Returns:
        A list with one entry per record, each being ``record.values()``.

    Raises:
        Whatever the neo4j driver raises for connection or query errors.
    """
    driver = GraphDatabase.driver(NEO4J_URI, auth=(USERNAME, PASSWORD))
    try:
        with driver.session() as session:
            # Consume the result while the session is still open.
            return [record.values() for record in session.run(cypher)]
    finally:
        # Previously the driver leaked whenever session.run() failed.
        driver.close()

# Helper function wrapped around cypher_transaction() for timing
def query(cypher):
    """Execute ``cypher`` via cypher_transaction() and time the call.

    Returns:
        A ``(rows, seconds)`` tuple: the rows from cypher_transaction() and
        the elapsed time reported by the ``timer`` helper.
    """
    stopwatch = timer()
    rows = cypher_transaction(cypher)
    elapsed = stopwatch.end()
    return rows, elapsed


In [None]:
# Get type and number of each FHIR resource in the database
def resource_metrics():
    """Count the ``:resource`` nodes in the database per ``resourceType``.

    Returns:
        The rows produced by the query, one ``[resource_type, resource_count]``
        row per distinct resource type, ordered by ascending count.
    """
    rows, _elapsed = query('''
        MATCH (r:resource) 
        WITH DISTINCT(r.resourceType) AS resource_types
            ORDER BY resource_types
        UNWIND resource_types as resource_type
        MATCH (r:resource)
        WHERE r.resourceType = resource_type
        WITH resource_type, COUNT(r) as resource_count
        RETURN resource_type, resource_count
            ORDER BY resource_count
    ''')
    return rows
    
# Standard metrics for counting nodes and relationships
def database_metrics():
    """Return the total number of nodes and relationships in the database.

    Returns:
        A ``(node_count, relationship_count)`` tuple; ``(0, 0)`` if the query
        yields no row.
    """
    # OPTIONAL MATCH keeps the node_count row alive when the graph has no
    # relationships. With a plain MATCH the grouped aggregation produced zero
    # rows in that case, so a database of unconnected nodes reported 0 nodes.
    cypher = '''
        MATCH (n)
        WITH count(n) AS node_count
        OPTIONAL MATCH ()-[r]->()
        RETURN node_count, count(r) AS relationship_count
    '''

    count_result, _runtime = query(cypher)
    if not count_result:
        return 0, 0

    node_count, relationship_count = count_result[0][0], count_result[0][1]
    return node_count, relationship_count
    
# Deletes all nodes and their relationships in database
def wipe_database():
    """Delete every node (and its relationships) and report what was removed.

    The node and relationship totals are read with database_metrics() before
    the delete so they can be included in the returned summary string.
    """
    nodes_before, relationships_before = database_metrics()

    _rows, elapsed = query('''
        MATCH (n) DETACH DELETE n
    ''')

    summary = 'Deleted {} nodes and {} relationships in {} seconds'
    return summary.format(nodes_before, relationships_before, elapsed)

In [None]:
# WARNING: destructive. Removes every node and relationship from the database
# at NEO4J_URI so the bundles below are loaded into an empty graph.
wipe_database()

In [None]:
def read_json(filename):
    """Read a JSON file and return it as text safe to embed in a Cypher string.

    The file is parsed and re-serialised with ``json.dumps`` and the result is
    escaped so it can be placed between double quotes in a Cypher statement.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The serialised JSON with every backslash and double quote escaped.
    """
    with open(filename, encoding="utf-8") as json_file:
        bundle = json.load(json_file)

    bundle_string = json.dumps(bundle)
    # Escape backslashes BEFORE quotes. json.dumps already emits sequences
    # such as \" and \n inside string values; escaping only the quote turned
    # \" into \\" (which terminates the Cypher literal early) and let Cypher
    # interpret \n itself, handing invalid JSON to the procedure.
    return bundle_string.replace('\\', '\\\\').replace('"', '\\"')

# Calls cyfhir.bundle.load() to load in patient medical histories via FHIR bundles
def load_bundles(synthea_bundles):
    """Load each FHIR bundle file into Neo4j with ``cyfhir.bundle.load``.

    Args:
        synthea_bundles: Iterable of paths to bundle JSON files.

    Returns:
        The summed per-file load time in seconds, rounded to 3 decimals.
    """
    total_time = 0.0
    for bundle_path in synthea_bundles:
        bundle_string = read_json(bundle_path)

        # The bundle is embedded in the statement text, so it relies on
        # read_json() having escaped it for a double-quoted Cypher literal.
        cypher = f'''
            CALL cyfhir.bundle.load("{bundle_string}")
        '''

        # Label for the progress message: the first two "_"-separated parts of
        # the file name (the files used here look like "<First>_<Last>_<uuid>.json").
        # basename() works for any directory depth; indexing split('/')[2]
        # raised IndexError for paths not exactly "./<dir>/<file>".
        patient = ' '.join(os.path.basename(bundle_path).split('_')[:2])
        result, runtime = query(cypher)
        print('--- Loaded patient "{}" in {} seconds ---'.format( patient , runtime))
        total_time += runtime

    return round(total_time, 3)
    
# Load every bundle in ./synthea-bundles (sorted for a reproducible order),
# then summarise what ended up in the database.
synthea_bundles = sorted(glob.glob("./synthea-bundles/*.json"))
total_time = load_bundles(synthea_bundles)

resource_rows = resource_metrics()
# Kept as an ndarray because a later cell builds a DataFrame from it.
resource_result = np.array(resource_rows)
# Sum from the raw rows: indexing the array with [:, 1] fails when no
# resources were returned (the empty array is one-dimensional).
total_resources = sum(int(count) for _, count in resource_rows)
node_count, relationship_count = database_metrics()

print("\nIn {} seconds...".format(total_time))
# The bundle count was previously hard-coded as "20" and the format argument ignored.
print("Loaded {} Patient's Medical Histories as FHIR Bundles".format(len(synthea_bundles)))
print("Which Contained {} total FHIR Resources".format(total_resources))
print("For a total of {} Nodes and {} Relationships in Neo4j".format(node_count, relationship_count))

# Guard the ratios so an empty bundle folder or database does not end the
# cell with ZeroDivisionError.
if total_time > 0 and total_resources > 0:
    print("At a rate of {} resources per second".format(round(total_resources/total_time, 3)))
    print("Each resource has an average of {} Nodes and {} Relationships".format(round(node_count/total_resources, 3), round(relationship_count/total_resources, 3)))
else:
    print("No resources were loaded; skipping rate statistics")

In [None]:
# Show the per-resource-type counts gathered by the loading cell as a table.
pd.DataFrame(data=resource_result, columns=["resourceType", "count"])

In [None]:
# See occurrences of which FHIR Resources point to other kinds of FHIR resources
def reference_metrics():
    """Count how often one resource type reaches another resource type.

    Matches every path of 1 to 4 relationships from one ``:resource`` node to
    another and groups the paths by ``[source type, target type]``.

    Returns:
        The query rows, each ``[[source_type, target_type], path_count]``.
    """
    rows, _elapsed = query('''
        MATCH (a:resource)-[*1..4]->(b:resource)
        WITH [a.resourceType, b.resourceType] AS path
        RETURN distinct(path) AS nodes, count(path) AS path_count
    ''')
    return rows

# Flatten each [[source, target], count] row into three columns and list the
# ten most frequent reference pairs.
path_metrics = reference_metrics()
as_array = [[pair[0], pair[1], count] for pair, count in path_metrics]
df_references = pd.DataFrame(
    as_array, columns=["Resource", "References", "Count"]
).sort_values(["Count"], ascending=False, ignore_index=True)
df_references.head(10)

In [None]:
# Recursive Function that Deep sorts all the keys of a json and returns it as an object for comparison
def ordered(obj):
    """Return an order-independent canonical form of a JSON-like value.

    Dicts become lists of ``(key, canonical value)`` pairs sorted by key, and
    lists become sorted lists of canonical values, so two documents that
    differ only in key order or list order produce equal results.

    Args:
        obj: A value built from dicts, lists and scalars (e.g. ``json.load``
            output).

    Returns:
        The canonical form; scalars are returned unchanged.
    """
    if isinstance(obj, dict):
        # Keys are unique, so sorting on the key alone gives the same order as
        # sorting the pairs while never comparing the values.
        return sorted(((k, ordered(v)) for k, v in obj.items()), key=lambda kv: kv[0])
    if isinstance(obj, list):
        # JSON lists may mix types (str, int, None, nested objects) that
        # Python 3 cannot order against each other; sorting on repr() is
        # deterministic for these canonical values and never raises TypeError.
        return sorted((ordered(x) for x in obj), key=repr)
    return obj

# Find resource by matching 'resource.id', expand out all of its properties, format, and return
def get_resource(_id):
    """Fetch one resource by ``id`` and rebuild it with the CyFHIR procedures.

    The ``:resource`` node whose ``id`` equals ``_id`` is expanded with
    ``cyfhir.resource.expand`` and the collected paths are passed to
    ``cyfhir.resource.format``.

    Returns:
        ``(rows, seconds, cypher)``: the query rows, the elapsed time and the
        exact statement that was run.
    """
    statement = '''
        WITH "%s" as _id
        MATCH (r:resource {id: _id})
        CALL cyfhir.resource.expand(r) YIELD path
        WITH cyfhir.resource.format(collect(path)) AS resource
        RETURN resource
    ''' % _id

    rows, elapsed = query(statement)
    return rows, elapsed, statement

# Deep sort FHIR Resources, return matching string if/if not equivalent
def equivalency_test(json1, json2):
    """Compare two JSON-like values after canonicalising both with ordered().

    Returns:
        ``' equivalent'`` when the canonical forms are equal, otherwise
        ``' not equivalent'`` (both with a leading space, ready to append to a
        message).
    """
    if ordered(json1) != ordered(json2):
        return ' not equivalent'
    return ' equivalent'


# Round-trip check: take the first entry's resource from a source bundle file,
# fetch the same resource back out of Neo4j and compare the two.
bundle_file = "./synthea-bundles/Theola421_Haag279_6aff2910-82fc-44d6-84a6-c29e4b756b11.json"
with open(bundle_file, encoding="utf-8") as json_file:
    bundle = json.load(json_file)

patient_resource = bundle.get('entry')[0]['resource']
_id = patient_resource['id']
resource, runtime, cypher = get_resource(_id)
# Deliberately NOT named `resource_result`: that name already holds the
# resource-count array shown by an earlier cell, and overwriting it with a
# dict made that cell fail when re-run.
patient_resource_result = resource[0][0]

print(cypher)
print('Resources are' + equivalency_test(patient_resource, patient_resource_result))
#print(json.dumps(patient_resource_result, indent=2))

In [None]:
# Similar to getting a resource, but here we collect the Resource "Entry" nodes that are created.
# These are used to create the FHIR Bundle.
# This query builds a bundle of the resources that directly point to, or are directly pointed to
# by, a single resource with id matching '_id':
# [(resource)]->(resource {id: _id})->[(resource)]
def get_bundle(_id):
    """Build a FHIR bundle around the entry whose ``_resourceId`` is ``_id``.

    Collects that ``:entry`` node plus every ``:entry`` node connected to it
    by a path of 1 to 3 relationships in either direction, expands each with
    ``cyfhir.resource.expand`` and formats the collected paths with
    ``cyfhir.bundle.format``.

    Returns:
        ``(rows, seconds)`` as produced by query().
    """
    statement = '''
        WITH "%s" as _id
        MATCH (_entry:entry {_resourceId: _id})
        WITH [_entry] + [(a:entry)-[*1..3]->(_entry) | a] + [(_entry)-[*1..3]->(a:entry) | a] as entries
        WITH DISTINCT(entries) AS entries
        UNWIND entries AS entry
        CALL cyfhir.resource.expand(entry) YIELD path
        RETURN cyfhir.bundle.format(collect(path))
        ''' % _id

    rows, elapsed = query(statement)
    return rows, elapsed

# Rebuild a bundle around the first entry of a source bundle file and compare
# it with the file's contents.
bundle_file = "./synthea-bundles/Theola421_Haag279_6aff2910-82fc-44d6-84a6-c29e4b756b11.json"
bundle = None
with open(bundle_file) as json_file:
    bundle = json.load(json_file)

_id = bundle.get('entry')[0]['resource']['id']
bundle_result, runtime = get_bundle(_id)
# First column of the first row holds the formatted bundle.
bundle_result = bundle_result[0][0]

print('Bundles are', equivalency_test(bundle, bundle_result), sep='')
print('Runtime: ', runtime, sep='')
print(len(bundle_result['entry']))
#print(json.dumps(bundle_result, indent=2))

In [None]:
# Similar to the previous function but this time it takes matching resources
# and then finds all the resources that directly point to or are directly pointed to 
# by those entries, creating a bundle with a 2nd layer around the center node. 
def get_deeper_bundle(_id):
    """Build a bundle two neighbourhood layers deep around an entry.

    Starts from the ``:entry`` node whose ``_resourceId`` is ``_id``, gathers
    the entries connected to it by paths of 1 to 4 relationships in either
    direction, repeats the same gathering step from each of those entries,
    then expands the distinct entries and formats them with
    ``cyfhir.bundle.format``.

    Returns:
        ``(rows, seconds)`` as produced by query().
    """
    statement = '''
        WITH "%s" as _id
        MATCH (_entry:entry {_resourceId: _id})
        WITH [_entry] + [(a:entry)-[*1..4]->(_entry) | a] + [(_entry)-[*1..4]->(a:entry) | a] as entries
        WITH entries AS entries
        UNWIND entries AS _entry
        WITH [_entry] + [(a:entry)-[*1..4]->(_entry) | a] + [(_entry)-[*1..4]->(a:entry) | a] as entries
        UNWIND entries AS _entry
        WITH DISTINCT(_entry) AS entry
        CALL cyfhir.resource.expand(entry) YIELD path
        RETURN cyfhir.bundle.format(collect(path))
        ''' % _id

    rows, elapsed = query(statement)
    return rows, elapsed

# Compare the one-layer and two-layer bundles built around the same entry.
bundle_file = "./synthea-bundles/Antone63_Lebsack687_a7fefb0a-e326-b7ef-ce3c-2f97e77f15c7.json"
bundle = None
with open(bundle_file) as json_file:
    bundle = json.load(json_file)

_id = bundle.get('entry')[0]['resource']['id']

bundle_shallow, runtime1 = get_bundle(_id)
bundle_shallow = bundle_shallow[0][0]

bundle_deep, runtime2 = get_deeper_bundle(_id)
bundle_deep = bundle_deep[0][0]

print('Bundles are', equivalency_test(bundle_shallow, bundle_deep), sep='')
print(len(bundle_shallow['entry']), len(bundle_deep['entry']))
print("Runtime Shallow: ", runtime1, "\nRuntime Deep: ", runtime2, sep="")
#print(json.dumps(bundle_deep, indent=2))

In [None]:
# Collects all Possible conditions per "Condition" Resource, finds all conditions a patient
# is associated with, then One hot encodes all of the patient's conditions
def patient_conditions_encoded():
    """One-hot encode every patient's conditions.

    Runs two queries: the first lists the distinct ``display`` values of
    ``:coding`` nodes two hops below ``Condition`` resources; the second
    encodes, per ``Patient`` resource, which of those values are reachable
    from conditions linked to that patient, using
    ``gds.alpha.ml.oneHotEncoding``.

    Returns:
        ``(conditions, encodings, runtime)`` where ``runtime`` is the elapsed
        time of the second (encoding) query only.
    """
    conditions, _first_elapsed = query('''
        MATCH path=((r:resource {resourceType: "Condition"})-[*2]->(c:coding))
        WITH distinct(c.display) as _display
          ORDER BY _display
        RETURN _display
    ''')

    encodings, runtime = query('''
        MATCH path=((r:resource {resourceType: "Condition"})-[*2]->(c:coding))
        WITH distinct(c.display) as _display
          ORDER BY _display
        WITH _display, collect(_display) AS _displays
        MATCH (patient:resource {resourceType: "Patient"})
        WITH patient.id AS Patient_ID, gds.alpha.ml.oneHotEncoding(_displays, [(coding{display: _display})<-[*2]-(r:resource {resourceType: "Condition"})-[*1..3]->(patient) | _display]) AS embedding
        UNWIND embedding as embed
        WITH Patient_ID, collect(embed) as Condition_Embeddings
        RETURN Patient_ID, Condition_Embeddings
          ORDER BY Patient_ID
        ''')

    return conditions, encodings, runtime

# Build a table with one row per patient: the patient id followed by that
# patient's condition encoding.
conditions, encodings, runtime = patient_conditions_encoded()
# Column headers: "Patient_ID" followed by the condition display names
# (`conditions` holds one single-value row per name, hence `.T[0]`).
condition_names = np.hstack((np.array(["Patient_ID"]), np.array(conditions).T[0]))
# Elementwise numpy comparison: drops entries that are None.
# NOTE(review): if a None display name is dropped here, the header count may no
# longer match the encoding width below (confirm against the data).
condition_names = condition_names[condition_names != None]

# Column vector of patient ids and the matrix of per-patient encodings.
patients = np.array([[x[0]] for x in encodings])
conditions_encoded = np.array([x[1] for x in encodings])

table = np.hstack((patients, conditions_encoded))
print(table[0:2])
print("Runtime: " + str(runtime))
display(pd.DataFrame(table, columns=list(condition_names)).head())

In [None]:
def delete_native_projection():
    """Drop the GDS in-memory graph named ``referenceGraph``.

    Returns:
        ``(rows, seconds)`` as produced by query().
    """
    rows, elapsed = query('''
        CALL gds.graph.drop('referenceGraph') YIELD graphName;
        ''')
    return rows, elapsed

# Finds all paths in which a resource points to another resource
# It then finds all the node labels and relationship types in those paths
# Using those two lists we can create a Native Projection of all of the resource references without any of the other 
# information we may not need
def create_native_projection():
    """(Re)create the GDS native projection ``referenceGraph``.

    Matches every path of 1 to 4 relationships between two ``:resource``
    nodes, gathers the first label of each node and the type of each
    relationship on those paths, and passes the two distinct lists to
    ``gds.graph.create``.

    Returns:
        ``(rows, seconds)`` from query(); the single row holds
        ``graphName, nodeCount, relationshipCount``.
    """
    try:
        delete_native_projection()
    except Exception:
        # Best effort: the drop fails when no projection exists yet, which is
        # expected on a first run. `Exception` instead of a bare `except` so
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        pass

    cypher = '''
        Match path=((a:resource)-[*1..4]->(b:resource))
        WITH path
        CALL {
            WITH path
            WITH nodes(path) AS _nodes
            UNWIND _nodes AS _node
            WITH DISTINCT(labels(_node)[0]) AS _label
            RETURN collect(_label) AS _labels
        }
        WITH path, _labels
        CALL {
            WITH path
            WITH relationships(path) AS _relationships
            UNWIND _relationships AS _relationship
            WITH DISTINCT(type(_relationship)) AS _type
            RETURN collect(_type) AS _types
        }
        WITH _labels, _types
        UNWIND _labels AS _label
        UNWIND _types AS _type
        WITH collect(DISTINCT(_label)) AS label, collect(DISTINCT(_type)) AS type
        CALL gds.graph.create("referenceGraph", label, type) YIELD graphName, nodeCount, relationshipCount
        RETURN graphName, nodeCount, relationshipCount
        '''

    result, runtime = query(cypher)
    return result, runtime

# Create the projection and print its name and size (first three columns of
# the first result row).
result, runtime = create_native_projection()
graphName, nodeCount, relationshipCount = (result[0][column] for column in range(3))
print(graphName, nodeCount, relationshipCount, "| runtime: " + str(runtime))

In [None]:
# Page Rank Resource nodes to find most relevant nodes in database
def page_rank_nodes():
    """Stream PageRank scores over ``referenceGraph`` for resource nodes.

    Only nodes whose first label is ``resource`` are kept; rows are
    ``[resourceId, resourceType, score]`` ordered by descending score.

    Returns:
        ``(rows, seconds)`` as produced by query().
    """
    rows, elapsed = query('''
        CALL gds.pageRank.stream('referenceGraph', 
            { dampingFactor: 0.868378238, maxIterations: 52, tolerance: 0.00001 })
        YIELD nodeId, score
        WITH gds.util.asNode(nodeId) AS node, score
        WHERE labels(node)[0] = "resource"
        WITH node.id AS resourceId, node.resourceType AS resourceType, score
        RETURN resourceId, resourceType, score
        ORDER BY score DESC
        ''')
    return rows, elapsed

# Ten highest-ranked resources according to PageRank.
result, runtime = page_rank_nodes()
df = pd.DataFrame(
    data=result,
    columns=['resourceId', 'resourceType', 'score'],
)
display(df.head(n=10), "runtime: " + str(runtime))

In [None]:
# Betweenness Centrality on nodes to find bridge nodes in database
def betweenness_centrality():
    """Stream betweenness-centrality scores over ``referenceGraph``.

    Only nodes whose first label is ``resource`` are kept; rows are
    ``[resourceId, resourceType, score]`` ordered by descending score.

    Returns:
        ``(rows, seconds)`` as produced by query().
    """
    rows, elapsed = query('''
        CALL gds.betweenness.stream('referenceGraph')
        YIELD nodeId, score
        WITH gds.util.asNode(nodeId) AS node, score
        WHERE labels(node)[0] = "resource"
        WITH node.id AS resourceId, node.resourceType AS resourceType, score
        RETURN resourceId, resourceType, score
        ORDER BY score DESC
        ''')
    return rows, elapsed

# Ten resources with the highest betweenness centrality.
result, runtime = betweenness_centrality()
df = pd.DataFrame(
    data=result,
    columns=['resourceId', 'resourceType', 'score'],
)
display(df.head(n=10), "runtime: " + str(runtime))




In [None]:
# Node2vec on Nodes
def node2vec():
    """Stream 2-dimensional node2vec embeddings over ``referenceGraph``.

    Only nodes whose first label is ``resource`` are kept; rows are
    ``[resourceId, resourceType, embedding]`` ordered by the Euclidean norm of
    the embedding, ascending.

    Returns:
        ``(rows, seconds)`` as produced by query().
    """
    rows, elapsed = query('''
    CALL gds.alpha.node2vec.stream('referenceGraph', {
            embeddingSize: 2, 
            iterations: 4, 
            walkLength: 128,
            walksPerNode: 16,
            windowSize: 16
        })
    YIELD nodeId, embedding
    WITH gds.util.asNode(nodeId) AS node, embedding
    WHERE labels(node)[0] = "resource"
    WITH node.id AS resourceId, node.resourceType AS resourceType, embedding
    RETURN resourceId, resourceType, embedding
    ORDER BY sqrt(embedding[0]*embedding[0] + embedding[1]*embedding[1])
    ''')
    return rows, elapsed

# First ten embeddings (smallest norm first); `df` is reused by the plot cell.
result, runtime = node2vec()
df = pd.DataFrame(
    data=result,
    columns=['resourceId', 'resourceType', 'embedding'],
)
display(df.head(n=10), "runtime: " + str(runtime))


In [None]:
# Scatter-plot the first `num` two-dimensional embeddings held in `df`.
num = 100
#display(df.head(num))
embedding = np.array([[vector[0], vector[1]] for vector in df['embedding'].values])[:num]
X, Y = embedding[:, 0], embedding[:, 1]
_min, _max = min(X), max(X)

plt.figure(figsize=(15, 10))
plt.plot(X, Y, 'b.')
# Thick black lines through the origin to act as the x and y axes.
plt.axhline(color='black', linewidth=2)
plt.axvline(color='black', linewidth=2)
plt.show()


In [None]:
# Finds every patient's previous medical conditions, medications, procedures, physicians, and the organizations they have visited
def patient_history():
    """Collect a medical-history summary for every Patient resource.

    A single Cypher statement iterates over all ``Patient`` resources and, in
    five ``CALL { ... }`` subqueries, gathers the distinct values reachable
    from resources that link to the patient within 1 to 3 relationships:

    * conditions: ``text`` of ``code`` nodes under ``Condition`` resources
    * organizations: ``display`` of ``serviceProvider`` nodes under
      ``Encounter`` resources (matched against ``Organization`` names)
    * practitioners: ``display`` of ``individual`` nodes under ``Encounter``
    * medications: ``text`` of ``medicationCodeableConcept`` nodes under
      ``MedicationRequest`` resources
    * procedures: ``text`` of ``code`` nodes under ``Procedure`` resources

    Returns:
        ``(result, runtime)`` from query(); each row is
        ``[patient.id, conditions, organizations, practitioners, medications,
        procedures]``.
    """
    
    cypher = '''
    MATCH (p:resource {resourceType: "Patient"})
    WITH collect(p) as patients
    UNWIND patients as patient
    CALL{
      WITH patient
      MATCH path=((r:resource {resourceType: "Condition"})-[*1]->(c:code))
      WITH DISTINCT(c.text) as _text, patient
        ORDER BY _text
      WITH patient, _text, collect(_text) AS _texts
      WITH patient, [(code{text: _text})<-[*1]-(r:resource {resourceType: "Condition"})-[*1..3]->(patient) | _text] AS embedding
      UNWIND embedding as embed
      WITH collect(DISTINCT(embed)) as conditions
      RETURN conditions
    }
    WITH patient, conditions
    CALL{
      WITH patient
      MATCH path=((r:resource {resourceType: "Organization"}))
      WITH DISTINCT(r.name) as _name, patient
        ORDER BY _name
      WITH patient, _name, collect(_name) AS _names
      WITH patient, [(s:serviceProvider {display: _name})<-[*1]-(r:resource {resourceType: "Encounter"})-[*1..3]->(patient) | _name] AS embedding
      UNWIND embedding as embed
      WITH collect(DISTINCT(embed)) as organizations
      RETURN organizations
    }
    WITH patient, conditions, organizations
    CALL{
      WITH patient
      MATCH path=((r:resource {resourceType: "Encounter"})-[*2]->(i:individual))
      WITH DISTINCT(i.display) as _display, patient
        ORDER BY _display
      WITH patient, _display, collect(_display) AS _displays
      WITH patient, [(individual{display: _display})<-[*2]-(r:resource {resourceType: "Encounter"})-[*1..3]->(patient) | _display] AS embedding
      UNWIND embedding as embed
      WITH collect(DISTINCT(embed)) as practitioners
      RETURN practitioners
    }
    WITH patient, conditions, organizations, practitioners
    CALL{
      WITH patient
      MATCH path=((r:resource {resourceType: "MedicationRequest"})-[*1]->(m:medicationCodeableConcept))
      WITH DISTINCT(m.text) as _text, patient
        ORDER BY _text
      WITH patient, _text, collect(_text) AS _texts
      WITH patient, [(medicationCodeableConcept{text: _text})<-[*1]-(r:resource {resourceType: "MedicationRequest"})-[*1..3]->(patient) | _text] AS embedding
      UNWIND embedding as embed
      WITH collect(DISTINCT(embed)) as medications
      RETURN medications
    }
    WITH patient, conditions, organizations, practitioners, medications
    CALL{
     WITH patient
      MATCH path=((r:resource {resourceType: "Procedure"})-[*1]->(c:code))
      WITH DISTINCT(c.text) as _text, patient
        ORDER BY _text
      WITH patient, _text, collect(_text) AS _texts
      WITH patient, [(code{text: _text})<-[*1]-(r:resource {resourceType: "Procedure"})-[*1..3]->(patient) | _text] AS embedding
      UNWIND embedding as embed
      WITH collect(DISTINCT(embed)) as procedures
      RETURN procedures
    }
    RETURN patient.id, conditions, organizations, practitioners, medications, procedures
    '''

    # Run the statement through the timed helper and hand both parts back.
    result, runtime = query(cypher)
    return result, runtime


# Show the history of the first two patients, then the raw values of the first.
result, runtime = patient_history()
df = pd.DataFrame(
    data=result,
    columns=["patient", "conditions", "organizations", "practitioners", "medications", "procedures"],
).head(n=2)
display(df)
print(df.values[0])

In [None]:
# Takes info from above (same beginning portion of query) and one hot encodes/indexes it for each pair of patients, including same patients to show
# that it is for the most part working, then runs jaccard similarity on them to find a flat patient similarity in medical history
def patient_similarity():
    """Compute a Jaccard similarity of medical histories for every patient pair.

    The first part of the statement gathers, per ``Patient`` resource, the
    same five lists as the patient-history query (conditions, organizations,
    practitioners, medications, procedures) and concatenates them into one
    ``medical_history`` list. The ``[patientId, medical_history]`` pairs are
    then collected twice and unwound against each other, so every ordered pair
    of patients is produced, including each patient paired with itself.

    For each pair both histories are one-hot encoded against their
    concatenation with ``gds.alpha.ml.oneHotEncoding``, the positions holding
    a 1 are extracted as index lists, and those index lists are compared with
    ``gds.alpha.similarity.jaccard``.

    Returns:
        ``(result, runtime)`` from query(); rows are
        ``[id1, id2, similarity]`` ordered by descending similarity.
    """
    cypher = '''
    MATCH (p:resource {resourceType: "Patient"})
    WITH collect(p) AS patients
    UNWIND patients AS patient
    CALL {
      WITH patient
      MATCH path=((r:resource {resourceType: "Condition"})-[*1]->(c:code))
      WITH DISTINCT(c.text) AS _text, patient
        ORDER BY _text
      WITH patient, _text, collect(_text) AS _texts
      WITH patient, [(code{text: _text})<-[*1]-(r:resource {resourceType: "Condition"})-[*1..3]->(patient) | _text] AS embedding
      UNWIND embedding AS embed
      WITH collect(DISTINCT(embed)) AS conditions
      RETURN conditions
    }
    WITH patient, conditions
    CALL {
      WITH patient
      MATCH path=((r:resource {resourceType: "Organization"}))
      WITH DISTINCT(r.name) AS _name, patient
        ORDER BY _name
      WITH patient, _name, collect(_name) AS _names
      WITH patient, [(s:serviceProvider {display: _name})<-[*1]-(r:resource {resourceType: "Encounter"})-[*1..3]->(patient) | _name] AS embedding
      UNWIND embedding AS embed
      WITH collect(DISTINCT(embed)) AS organizations
      RETURN organizations
    }
    WITH patient, conditions, organizations
    CALL {
      WITH patient
      MATCH path=((r:resource {resourceType: "Encounter"})-[*2]->(i:individual))
      WITH DISTINCT(i.display) AS _display, patient
        ORDER BY _display
      WITH patient, _display, collect(_display) AS _displays
      WITH patient, [(individual{display: _display})<-[*2]-(r:resource {resourceType: "Encounter"})-[*1..3]->(patient) | _display] AS embedding
      UNWIND embedding AS embed
      WITH collect(DISTINCT(embed)) AS practitioners
      RETURN practitioners
    }
    WITH patient, conditions, organizations, practitioners
    CALL {
      WITH patient
      MATCH path=((r:resource {resourceType: "MedicationRequest"})-[*1]->(m:medicationCodeableConcept))
      WITH DISTINCT(m.text) AS _text, patient
        ORDER BY _text
      WITH patient, _text, collect(_text) AS _texts
      WITH patient, [(medicationCodeableConcept{text: _text})<-[*1]-(r:resource {resourceType: "MedicationRequest"})-[*1..3]->(patient) | _text] AS embedding
      UNWIND embedding AS embed
      WITH collect(DISTINCT(embed)) AS medications
      RETURN medications
    }
    WITH patient, conditions, organizations, practitioners, medications
    CALL {
     WITH patient
      MATCH path=((r:resource {resourceType: "Procedure"})-[*1]->(c:code))
      WITH DISTINCT(c.text) AS _text, patient
        ORDER BY _text
      WITH patient, _text, collect(_text) AS _texts
      WITH patient, [(code{text: _text})<-[*1]-(r:resource {resourceType: "Procedure"})-[*1..3]->(patient) | _text] AS embedding
      UNWIND embedding AS embed
      WITH collect(DISTINCT(embed)) AS procedures
      RETURN procedures
    }
    WITH patient, patient.id AS patientId, conditions + organizations + practitioners + medications + procedures AS medical_history
    WITH patient, medical_history, [patientId, medical_history] AS pair
    WITH collect(pair) AS pairs1, collect(pair) AS pairs2
    WITH pairs1, pairs2, [] AS complete
    UNWIND pairs1 AS pair1
    UNWIND pairs2 AS pair2
    WITH pair1[0] AS id1, pair2[0] AS id2, pair1[1] AS history1, pair2[1] AS history2
    CALL { 
      WITH id1, history1, id2, history2
      WITH DISTINCT(history1+history2) AS list, history1, history2
      WITH gds.alpha.ml.oneHotEncoding(list, history1) AS list1, gds.alpha.ml.oneHotEncoding(list, history2) AS list2, history1, history2
      WITH size(list1) AS len, list1, list2
      WITH [x IN range(0,len) WHERE list1[x] = 1 ] AS encoding1, [x IN range(0,len) WHERE list2[x] = 1 ] AS encoding2
      RETURN gds.alpha.similarity.jaccard(encoding1, encoding2) AS similarity
    }
    RETURN id1, id2, similarity
      ORDER BY similarity DESC
    '''
    
    # Run the statement through the timed helper and hand both parts back.
    result, runtime = query(cypher)
    return result, runtime

# Show a ten-row window (rows 17-26) of the similarity ranking.
result, runtime = patient_similarity()
print("runtime: ", runtime, sep="")
pd.DataFrame(data=result, columns=["id1", "id2", "similarity"]).iloc[17:27]