In [1]:
import glob
import json
import os
import re
import time

import numpy as np
import pandas as pd
from neo4j import GraphDatabase

In [2]:
class timer:
    """Minimal wall-clock stopwatch.

    The clock starts when the instance is created; ``end()`` stops it and
    returns the elapsed time in seconds.
    """

    def __init__(self):
        self._start = time.time()
        self._end = None
        self._runtime = None

    def end(self):
        """Stop the stopwatch and return the elapsed seconds, rounded to 3 decimals.

        The elapsed time is rounded numerically rather than by slicing its
        string form: very small floats print in scientific notation
        (e.g. ``5e-05``), so a string slice would turn 0.00005 s into 5.0 s.
        """
        self._end = time.time()
        self._runtime = round(self._end - self._start, 3)
        return self._runtime

In [3]:
def cypher_transaction(cypher, parameters=None, uri="bolt://localhost:7687", auth=("neo4j", "")):
    """Run one Cypher statement and return every result row as a list of values.

    Args:
        cypher: Cypher query text.
        parameters: Optional dict of query parameters forwarded to
            ``session.run``; ``None`` runs the query without parameters.
        uri: Bolt URI of the Neo4j server.
        auth: ``(user, password)`` tuple passed to the driver.

    Returns:
        A list with one entry per record, each being ``record.values()``.

    A new driver is opened for each call and is always closed, including when
    the query raises.
    """
    driver = GraphDatabase.driver(uri, auth=auth)
    try:
        with driver.session() as session:
            # Consume the result inside the session; records are streamed lazily.
            return [record.values() for record in session.run(cypher, parameters)]
    finally:
        driver.close()

def query(cypher):
    """Execute ``cypher`` and report how long it took.

    Returns:
        A ``(rows, seconds)`` tuple: the rows from ``cypher_transaction`` and
        the elapsed time reported by ``timer.end()``.
    """
    stopwatch = timer()
    rows = cypher_transaction(cypher)
    elapsed = stopwatch.end()
    return rows, elapsed


In [4]:
def resource_metrics():
    """Count ``:resource`` nodes per ``resourceType``.

    Returns:
        The rows returned by ``query``: ``[resource_type, resource_count]``
        pairs ordered by ascending count, with ties ordered by type name.

    The counts are computed with a single grouped aggregation. Resources
    without a ``resourceType`` property are left out.
    """
    cypher = '''

        MATCH (r:resource)
        WHERE r.resourceType IS NOT NULL
        RETURN r.resourceType AS resource_type, count(r) AS resource_count
            ORDER BY resource_count, resource_type

    '''

    resource_count, _runtime = query(cypher)
    return resource_count
    
def database_metrics():
    """Return ``(node_count, relationship_count)`` for the whole graph.

    Relationships are counted with ``OPTIONAL MATCH`` so that a graph with
    nodes but no relationships still yields a row; with a plain ``MATCH`` the
    row carrying the node count would be dropped and the function would
    report ``(0, 0)``.

    Falls back to ``(0, 0)`` if the query returns no rows.
    """
    node_count = 0
    relationship_count = 0

    cypher = '''

        MATCH (n)
        WITH count(n) AS node_count
        OPTIONAL MATCH ()-[r]->()
        RETURN node_count, count(r) AS relationship_count

    '''

    count_result, _runtime = query(cypher)
    if count_result:
        node_count, relationship_count = count_result[0][0], count_result[0][1]

    return node_count, relationship_count
    
def wipe_database():
    """Delete every node and relationship and return a summary string.

    The counts in the summary are taken from ``database_metrics()`` before
    the delete runs; the time is the runtime of the delete query.
    """
    summary_template = 'Deleted {} nodes and {} relationships in {} seconds'
    counts_before = database_metrics()

    _rows, elapsed = query('''

        MATCH (n) DETACH DELETE n

    ''')

    return summary_template.format(counts_before[0], counts_before[1], elapsed)

In [5]:
# Start from an empty graph so the metrics below describe only this run.
# Destructive: wipe_database() runs `MATCH (n) DETACH DELETE n`, which removes
# every node and relationship in the connected database.
wipe_database()

'Deleted 0 nodes and 0 relationships in 0.011 seconds'

In [6]:
def read_json(filename):
    """Read a JSON file and return it escaped for a double-quoted Cypher literal.

    The document is re-serialised with ``json.dumps`` and then escaped so that
    it can be placed between double quotes in Cypher text.

    Backslashes are escaped before quotes. ``json.dumps`` itself emits
    backslash escapes (``\\"``, ``\\n``, ``\\\\`` ...); if only the quotes
    were escaped, Cypher would consume those backslashes while un-escaping the
    literal and the procedure would receive different (or invalid) JSON.
    """
    with open(filename, encoding="utf-8") as json_file:
        bundle = json.load(json_file)

    bundle_string = json.dumps(bundle)
    # Order matters: doubling backslashes after escaping the quotes would
    # also double the backslashes that were just added.
    return bundle_string.replace('\\', '\\\\').replace('"', '\\"')

def load_bundles(synthea_bundles):
    """Load each bundle file into Neo4j via ``cyfhir.bundle.load``.

    Args:
        synthea_bundles: Iterable of paths to bundle JSON files. The file
            name is expected to start with ``<first>_<last>_``; those two
            parts are used as the patient label in the progress output.

    Returns:
        Total query time in seconds, rounded to 3 decimals.

    Prints one progress line per bundle.
    """
    total_time = 0.0
    for bundle_path in synthea_bundles:
        bundle_string = read_json(bundle_path)

        # The escaped JSON is embedded directly in the query text, so the
        # query is only as safe as the escaping done by read_json().
        cypher = f'''

            CALL cyfhir.bundle.load("{bundle_string}")

        '''

        # Use the file name itself rather than a fixed position in the path,
        # so bundles can live in any directory (and on any OS).
        file_name = os.path.basename(bundle_path)
        patient = ' '.join(file_name.split('_')[:2])

        result, runtime = query(cypher)
        print('--- Loaded patient "{}" in {} seconds ---'.format( patient , runtime))
        total_time += runtime

    return round(total_time, 3)
    
# Load every bundle found on disk, then summarise what ended up in the graph.
synthea_bundles = sorted(glob.glob("./synthea-bundles/*.json"))
total_time = load_bundles(synthea_bundles)

# np.array coerces the mixed (type, count) rows to strings, hence the int() below.
resource_result = np.array(resource_metrics())
# Iterating rows (instead of slicing column 1) also works for an empty result.
total_resources = sum(int(row[1]) for row in resource_result)
node_count, relationship_count = database_metrics()

# Guard the ratios so an empty bundle directory or empty graph reports 0.0
# instead of raising ZeroDivisionError.
resources_per_second = round(total_resources / total_time, 3) if total_time else 0.0
nodes_per_resource = round(node_count / total_resources, 3) if total_resources else 0.0
relationships_per_resource = round(relationship_count / total_resources, 3) if total_resources else 0.0

print("\nIn {} seconds...".format(total_time))
print("Loaded {} Patient's Medical Histories as FHIR Bundles".format(len(synthea_bundles)))
print("Which Contained {} total FHIR Resources".format(total_resources))
print("For a total of {} Nodes and {} Relationships in Neo4j".format(node_count, relationship_count))

print("At a rate of {} resources per second".format(resources_per_second))
print("Each resource has an average of {} Nodes and {} Relationships".format(nodes_per_resource, relationships_per_resource))

--- Loaded patient "Agustin529 Ontiveros947" in 0.048 seconds ---
--- Loaded patient "Ali918 Bashirian201" in 0.168 seconds ---
--- Loaded patient "Antone63 Lebsack687" in 0.082 seconds ---
--- Loaded patient "Cassaundra447 Bartell116" in 0.322 seconds ---
--- Loaded patient "Charise827 Durgan499" in 0.405 seconds ---
--- Loaded patient "Elanor679 Williamson769" in 0.29 seconds ---
--- Loaded patient "Georgianne697 Conroy74" in 0.523 seconds ---
--- Loaded patient "Gracia333 Carroll471" in 0.459 seconds ---
--- Loaded patient "Kum811 Purdy2" in 1.783 seconds ---
--- Loaded patient "Leo278 Schaden604" in 0.551 seconds ---
--- Loaded patient "Misha8 Oberbrunner298" in 1.068 seconds ---
--- Loaded patient "Pauline896 Aufderhar910" in 0.961 seconds ---
--- Loaded patient "Randolph418 Bernhard322" in 1.173 seconds ---
--- Loaded patient "Rocky100 Erdman779" in 0.611 seconds ---
--- Loaded patient "Rodrigo242 Little434" in 0.508 seconds ---
--- Loaded patient "Rosendo998 Wiza601" in 0.781 seconds ---

In [7]:
# resource_result is a NumPy array built from mixed (str, int) rows, so the
# counts are stored as strings; cast them back so the column is numeric.
pd.DataFrame(resource_result, columns=["resourceType", "count"]).astype({"count": "int64"})

Unnamed: 0,resourceType,count
0,Device,3
1,AllergyIntolerance,10
2,ImagingStudy,10
3,Organization,11
4,Practitioner,11
5,Patient,20
6,CarePlan,66
7,CareTeam,66
8,Immunization,100
9,DiagnosticReport,111


In [8]:
def reference_metrics():
    """Count reference paths between pairs of resource types.

    Returns:
        The rows produced by the query below: each row holds a
        ``[source resourceType, target resourceType]`` pair and the number of
        matching paths.
    """
    reference_query = '''

        MATCH (a:resource)-[*1..3]->()-[r:reference]->()-->(b:resource)
        WITH [a.resourceType, b.resourceType] AS path
        RETURN distinct(path) AS nodes, count(path) AS path_count

    '''

    # query() returns (rows, runtime); only the rows are of interest here.
    return query(reference_query)[0]


# Flatten each ([source, target], count) row into three columns and show the
# most frequent reference pairs first.
path_metrics = reference_metrics()
as_array = [[row[0][0], row[0][1], row[1]] for row in path_metrics]
df_references = (
    pd.DataFrame(as_array, columns=["Resource", "References", "Count"])
    .sort_values(by=["Count"], ascending=False, ignore_index=True)
)
df_references

Unnamed: 0,Resource,References,Count
0,Observation,Patient,1428
1,Observation,Encounter,1428
2,ExplanationOfBenefit,Practitioner,1312
3,ExplanationOfBenefit,Patient,1005
4,DiagnosticReport,Observation,738
5,Claim,Encounter,532
6,Claim,Patient,532
7,Claim,Organization,524
8,ExplanationOfBenefit,Encounter,335
9,ExplanationOfBenefit,Claim,335


In [9]:
def _canonical_sort(items):
    """Sort ``items`` naturally, falling back to a type/repr key if they are not mutually comparable."""
    items = list(items)
    try:
        return sorted(items)
    except TypeError:
        # Mixed types (e.g. an object next to a string or null) cannot be
        # compared directly; order them by type name, then by repr, so the
        # result is still independent of the input order.
        return sorted(items, key=lambda item: (type(item).__name__, repr(item)))


def ordered(obj):
    """Return an order-independent form of a JSON-like structure.

    Dicts become sorted lists of ``(key, ordered(value))`` pairs and lists are
    sorted after their elements are normalised, so two structures that differ
    only in key order or list order compare equal. Other values are returned
    unchanged.

    Lists mixing incomparable types are handled by a deterministic fallback
    ordering instead of raising ``TypeError``.
    """
    if isinstance(obj, dict):
        return _canonical_sort((k, ordered(v)) for k, v in obj.items())
    if isinstance(obj, list):
        return _canonical_sort(ordered(x) for x in obj)
    return obj

def get_resource(_id):
    """Fetch the resource whose ``id`` property equals ``_id``.

    The matching ``:resource`` node is passed through
    ``cyfhir.resource.expand`` and ``cyfhir.resource.format``.

    Args:
        _id: Resource id to look up.

    Returns:
        The rows returned by ``query`` (one column, ``resource``).
    """
    # json.dumps yields a double-quoted literal with `"` and `\\` escaped, which
    # Cypher string literals also accept; the id therefore cannot terminate
    # the literal early or alter the query.
    id_literal = json.dumps(str(_id), ensure_ascii=False)

    cypher = f'''
        WITH {id_literal} as _id
        MATCH (r:resource)
        WHERE (r.id = _id)
        CALL cyfhir.resource.expand(r) YIELD path
        WITH cyfhir.resource.format(collect(path)) AS resource
        RETURN resource

    '''

    resource, _runtime = query(cypher)
    return resource


# Round-trip check: the Patient resource read back from Neo4j should equal the
# one in the source bundle once ordering differences are normalised.
bundle_file = "./synthea-bundles/Theola421_Haag279_6aff2910-82fc-44d6-84a6-c29e4b756b11.json"
with open(bundle_file) as json_file:
    bundle = json.load(json_file)

patient_resource = bundle.get('entry')[0]['resource']
_id = patient_resource['id']

patient_rows = get_resource(_id)
assert patient_rows, 'no resource returned for id "{}"'.format(_id)
# Stored under its own name so the `resource_result` array built while loading
# the bundles is not overwritten.
patient_result = patient_rows[0][0]

ordered(patient_resource) == ordered(patient_result)

True

In [10]:
def get_bundle(_id):
    """Rebuild a bundle from the entries linked by reference to resource ``_id``.

    Collects every pair of ``:entry`` nodes joined by a reference path where
    either end has ``_resourceId`` equal to ``_id``, expands each entry with
    ``cyfhir.resource.expand`` and formats the result with
    ``cyfhir.bundle.format``.

    Args:
        _id: Resource id used to select the entries.

    Returns:
        The rows returned by ``query`` (one column holding the bundle).
    """
    # json.dumps yields a double-quoted literal with `"` and `\\` escaped, which
    # Cypher string literals also accept; the id therefore cannot terminate
    # the literal early or alter the query.
    id_literal = json.dumps(str(_id), ensure_ascii=False)

    cypher = '''
        WITH %s AS _id
        CALL {
            WITH _id
            MATCH (a:entry)-[*1..3]->()-[r:reference]->(b:entry)
            WHERE (b._resourceId = _id)
            RETURN a,b
            UNION
            WITH _id
            MATCH (a:entry)-[*1..3]->()-[r:reference]->(b:entry)
            WHERE (a._resourceId = _id)
            RETURN a,b
        }
        WITH collect(a)+collect(b) AS entries
        UNWIND entries AS entry
        CALL cyfhir.resource.expand(entry) YIELD path
        RETURN cyfhir.bundle.format(collect(path))
        ''' % (id_literal,)

    resource, _runtime = query(cypher)
    return resource

# Round-trip check: the bundle rebuilt from Neo4j should equal the source file
# once ordering differences are normalised.
bundle_file = "./synthea-bundles/Theola421_Haag279_6aff2910-82fc-44d6-84a6-c29e4b756b11.json"
with open(bundle_file) as json_file:
    bundle = json.load(json_file)

_id = bundle.get('entry')[0]['resource']['id']

bundle_rows = get_bundle(_id)
# Fail with a readable message rather than an IndexError if nothing came back.
assert bundle_rows, 'no bundle returned for resource id "{}"'.format(_id)
bundle_result = bundle_rows[0][0]

ordered(bundle) == ordered(bundle_result)

True