In [2]:
# Import required Python libraries for data manipulation, file handling, and performance monitoring
import pandas as pd
import numpy as np
import json
import glob
from tqdm import tqdm
import time
import os
import gc

In [3]:
# Derive the path delimiter from the platform the kernel is running on rather than
# from a hand-edited flag, so the notebook runs unchanged on Windows, Linux and macOS.
# OPERATING_SYS is still defined for any code that reads it; it is now detected
# automatically ('Win' on Windows, 'Linux' everywhere else).
OPERATING_SYS = 'Win' if os.name == 'nt' else 'Linux'

# os.sep is '\\' on Windows and '/' on POSIX systems
delim = os.sep

In [8]:
# Resolve the input folder (FHIR JSON bundles) and the output folder (CSV files)
# relative to the current user's home directory.
input_root_folder_path, output_folder_path = (
    os.path.expanduser(raw_path)
    for raw_path in ('~/Downloads/synthea/output/fhir', '~/Desktop/Output')
)

# Create the output folder on first use and tell the user that this happened
output_folder_missing = not os.path.exists(output_folder_path)
if output_folder_missing:
    os.makedirs(output_folder_path)
    print("The new Output directory is created!")

# Echo both locations so the reader can verify them before the long-running cells
print(f'Input Folder: {input_root_folder_path}')
print(f'Output Folder: {output_folder_path}')

The new Output directory is created!
Input Folder: /home/test/Downloads/synthea/output/fhir
Output Folder: /home/test/Desktop/Output


In [9]:
# Collect every *.json file found anywhere below the input root folder
delim = os.sep  # platform path separator, reused by the cells that write CSV files
json_pattern = delim.join([input_root_folder_path, '**', '*.json'])
files = glob.glob(json_pattern, recursive=True)

print('---Found {} Json Files---'.format(len(files)))

---Found 1450 Json Files---


In [10]:
# Select the entries of a FHIR bundle that hold one particular resource type
def filter_resource(data, resource_type):
    """Return the bundle entries whose resource is of the requested type.

    Parameters
    ----------
    data : dict
        Parsed bundle; must have an 'entry' list whose items each carry
        item['resource']['resourceType'].
    resource_type : str
        Resource type to keep (e.g. 'Patient'); surrounding whitespace is ignored.

    Returns
    -------
    list
        The matching entries, in their original order (empty if none match).

    Raises
    ------
    KeyError
        If 'entry', 'resource' or 'resourceType' is missing.
    """
    return [
        entry
        for entry in data['entry']
        if entry['resource']['resourceType'] == resource_type.strip()
    ]

## Patient

In [None]:

# Build one demographics row per patient bundle, overriding city / postal code with a
# fixed location distribution, and write a first version of Patient.csv.

# Define the column names for the patient DataFrame
cols = ['id','gender','birthDate','maritalStatus','city','state','postalCode','country','deceased','deceasedDateTime']
arr = []  # List to hold extracted patient data
start = time.time()  # Start timer to monitor processing time
f_count = 0  # Counter for failed files

# Define city and postal code distribution for generated patient data
# Format: (City Name, Postal Code, Number of Patients)
# NOTE(review): "Camulet" may be a misspelling of "Calumet" — confirm before changing,
# because the value is written to Patient.csv.
locations = [("Camulet", "49911", 20), ("Baraga", "49908", 210), ("Marquette", "49855", 1200), ("Houghton", "49931", 81)]
total_patients_needed = sum(x[2] for x in locations)  # Total number of patient entries required
loc_index = 0  # Index to track current location from the list
loc_count = 0  # Counter to track how many patients have been added for the current location

# Iterate through all FHIR JSON files
for file in tqdm(files):
    # Stop once every location quota is filled; this also guarantees that
    # locations[loc_index] below is never indexed past the end of the list.
    if len(arr) >= total_patients_needed:
        break
    try:
        # Load the JSON file
        with open(file) as f:
            data = json.load(f)

        # Extract the Patient resource (bundles without one raise and count as failed)
        patient = filter_resource(data, 'Patient')[0]
        resource = patient['resource']
        address = resource['address'][0]

        # City and postal code come from the custom distribution list,
        # state and country are taken from the original record
        city, postal, count = locations[loc_index]

        # Handle deceased status
        if 'deceasedDateTime' in resource:
            deceased = True
            deceased_date_time = resource['deceasedDateTime']
        else:
            deceased = False
            deceased_date_time = np.nan

        ar = [
            resource['id'],
            resource['gender'],
            resource['birthDate'],
            resource['maritalStatus']['text'],
            city,
            address['state'],
            postal,
            address['country'],
            deceased,
            deceased_date_time,
        ]

        arr.append(ar)  # Append the processed patient data to the master list
        loc_count += 1  # Increment location-specific counter

        # If we've added enough patients for the current location, move to the next
        if loc_count == count:
            loc_index += 1
            loc_count = 0

    except Exception as e:
        # Best effort: a file that cannot be parsed or lacks a required field is skipped
        f_count += 1  # Count the file as failed if an error occurs
        continue

end = time.time()  # Stop timer

# Output summary of the processing
print(str(f_count)+' Files Failed...')
print(str(len(arr))+' Patient bundles extracted as DataFrame in '+str(end-start)+ 'Seconds')

# Convert the list of patient data into a DataFrame and save to CSV
df_patient = pd.DataFrame(arr, columns = cols)
df_patient.to_csv(output_folder_path+delim+'Patient.csv', index=False)

# df_patient is intentionally kept alive here: the following cells inspect it,
# drop duplicates and write the final Patient.csv, and only then delete it.


100%|███████████████████████████████████████| 1450/1450 [03:30<00:00,  6.88it/s]

2 Files Failed...
1448 Patient bundles extracted as DataFrame in 210.91057419776917Seconds





In [12]:
# Sanity check: number of patient rows assigned to each city
df_patient['city'].value_counts()

city
Marquette    1448
Name: count, dtype: int64

In [14]:
# Fraction of patient rows with deceased == True / False
df_patient['deceased'].value_counts()/len(df_patient)

deceased
False    0.828729
True     0.171271
Name: count, dtype: float64

In [15]:
# Keep only the first row for every patient id and renumber the index from 0
df_patient = df_patient.drop_duplicates(subset='id', ignore_index=True)

In [16]:
# Preview the first rows of the de-duplicated patient table
df_patient.head()

Unnamed: 0,id,gender,birthDate,maritalStatus,city,state,postalCode,country,deceased,deceasedDateTime
0,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,female,2005-05-09,Never Married,Marquette,MI,49855,US,False,
1,e8558a66-dcdf-d817-0703-0d01cb214f83,male,1952-01-07,Widowed,Marquette,MI,49855,US,True,2007-10-20T04:29:16-04:00
2,bc9ab037-b444-6147-00b1-23ca8743f39f,male,1959-08-27,Married,Marquette,MI,49855,US,False,
3,144c3d44-91d4-52f7-f7bc-8d2cd42702c1,male,1946-07-02,Married,Marquette,MI,49855,US,True,2011-01-11T12:22:13-05:00
4,c0873ef2-9462-56ef-27a8-267116507046,female,2020-01-04,Never Married,Marquette,MI,49855,US,False,


In [17]:
# Write the final patient table (index included as the first column), then free it
patient_csv_path = f'{output_folder_path}{delim}Patient.csv'
df_patient.to_csv(patient_csv_path)
del df_patient
gc.collect()

0

## Conditions

In [None]:
# Build one row per Condition resource found in the patient bundles.
# Define the column names for the condition DataFrame
cols = ['code','codeText','patientId','encounterId','onsetDateTime','recordedDate','clinicalStatusCode']

arr = []  # One list per extracted condition
start = time.time()
f_count = 0  # Counter for failed files

for file in tqdm(files):
    try:
        # Load the JSON file; the context manager closes it even if parsing fails
        with open(file) as f:
            data = json.load(f)

        conditions = filter_resource(data, 'Condition')
        for cond in conditions:
            resource = cond['resource']
            ar = [
                resource['code']['coding'][0]['code'],
                resource['code']['text'],
                # References look like 'urn:uuid:<id>'; keep only the id part
                resource['subject']['reference'].strip().split('urn:uuid:')[1],
                resource['encounter']['reference'].strip().split('urn:uuid:')[1],
                resource['onsetDateTime'],
                resource['recordedDate'],
                resource['clinicalStatus']['coding'][0]['code'],
            ]

            arr.append(ar)
    except Exception as e:
        # Best effort: the first bad record stops this file (rows already collected
        # from it are kept) and the file is counted as failed
        f_count += 1
        continue

end = time.time()
print(str(f_count)+' Files Failed...')
print(str(len(arr))+' Patient condition bundles extracted as DataFrame in '+str(end-start)+ 'Seconds')

df_condition = pd.DataFrame(arr, columns = cols)

100%|███████████████████████████████████████| 1450/1450 [03:43<00:00,  6.49it/s]

0 Files Failed...
66401 Patient condition bundles extracted as DataFrame in 223.58940529823303Seconds





In [28]:
# Preview the first rows of the raw condition table
df_condition.head()

Unnamed: 0,code,codeText,patientId,encounterId,onsetDateTime,recordedDate,clinicalStatusCode
0,314529007,Medication review due (situation),8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,6a7648a6-2b11-9ef2-59cf-69ad2de5f8ca,2015-05-18 05:50:28+00:00,2015-05-18 05:50:28+00:00,resolved
1,66383009,Gingivitis (disorder),8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,6a7648a6-2b11-9ef2-59cf-69ad2de5f8ca,2015-05-18 05:50:28+00:00,2015-05-18 05:50:28+00:00,resolved
2,18718003,Gingival disease (disorder),8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,1e64f87e-bb4b-f0d8-b7b1-ec7c29e03b42,2015-05-25 09:12:05+00:00,2015-05-25 09:12:05+00:00,resolved
3,314529007,Medication review due (situation),8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,c4c7e813-7519-2d90-4d46-a8b173f5e4d6,2016-05-23 05:50:28+00:00,2016-05-23 05:50:28+00:00,resolved
4,444814009,Viral sinusitis (disorder),8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,081bac22-f929-d00d-85b6-fc5eda48d484,2017-03-31 15:50:28+00:00,2017-03-31 15:50:28+00:00,resolved


In [29]:
# Parse both condition timestamp columns into timezone-aware UTC datetimes
for date_col in ('onsetDateTime', 'recordedDate'):
    df_condition[date_col] = pd.to_datetime(
        df_condition[date_col],
        format="%Y-%m-%dT%H:%M:%S%z",
        utc=True,
    )

In [31]:
# Derive a resolvedDateTime for every condition from the Conditions DataFrame.
# One output row is produced per (patientId, encounterId, onsetDateTime, code, codeText).

cols = ['patientId','code','encounterId','onsetDateTime','resolvedDateTime','codeText']
arr = []
condition_groups = df_condition.groupby(['patientId','encounterId','onsetDateTime'])
for (patient_id, encounter_id, onset), encounter_rows in tqdm(condition_groups):
    # Split each encounter-level group again by condition code
    for (code, code_text), code_rows in encounter_rows.groupby(['code','codeText']):
        # Records whose clinical status is "resolved"
        resolved_rows = code_rows[code_rows['clinicalStatusCode'] == 'resolved']

        # Use the latest recordedDate of the resolved records when there are any,
        # otherwise fall back to the latest recordedDate of the whole group
        date_source = resolved_rows if len(resolved_rows) > 0 else code_rows

        ar = [
            patient_id,
            code,
            encounter_id,
            onset,
            date_source['recordedDate'].max(),
            code_text,
        ]
        arr.append(ar)

df_condition_new = pd.DataFrame(arr, columns = cols)

100%|████████████████████████████████████| 54932/54932 [03:35<00:00, 255.16it/s]


In [32]:
# Make sure both timestamp columns of the derived table are UTC datetimes, then preview it
for date_col in ('onsetDateTime', 'resolvedDateTime'):
    df_condition_new[date_col] = pd.to_datetime(
        df_condition_new[date_col],
        format="%Y-%m-%dT%H:%M:%S%z",
        utc=True,
    )
df_condition_new.head()

Unnamed: 0,patientId,code,encounterId,onsetDateTime,resolvedDateTime,codeText
0,000555d1-916a-0323-b3e1-f00b5e740d48,239873007,0038aac1-104d-02b5-002a-a23404ffe1b2,1990-08-06 15:25:44+00:00,1990-08-06 15:25:44+00:00,Osteoarthritis of knee (disorder)
1,000555d1-916a-0323-b3e1-f00b5e740d48,156073000,04085c33-2ecb-6715-2745-988533406711,1992-03-02 15:25:44+00:00,1992-03-02 15:25:44+00:00,Complete miscarriage (disorder)
2,000555d1-916a-0323-b3e1-f00b5e740d48,19169002,04085c33-2ecb-6715-2745-988533406711,1992-03-02 15:25:44+00:00,1992-03-02 15:25:44+00:00,Miscarriage in first trimester (disorder)
3,000555d1-916a-0323-b3e1-f00b5e740d48,72892002,04085c33-2ecb-6715-2745-988533406711,1992-03-02 15:25:44+00:00,1992-03-02 15:25:44+00:00,Normal pregnancy (finding)
4,000555d1-916a-0323-b3e1-f00b5e740d48,88805009,04a7716e-2b20-02a9-7949-5acfce394c59,1995-03-31 15:25:44+00:00,1995-03-31 15:25:44+00:00,Chronic congestive heart failure (disorder)


In [33]:
# Show the condition rows recorded with code 840539006
df_condition_new.query('code == "840539006"')

Unnamed: 0,patientId,code,encounterId,onsetDateTime,resolvedDateTime,codeText
480,0111dc2c-204c-44ec-5a41-84cd5fc856a2,840539006,d8cc36a1-9965-4d38-80a2-cddd2a088b53,2020-12-26 14:50:24+00:00,2020-12-26 14:50:24+00:00,Disease caused by severe acute respiratory syn...
1073,03a81a51-4aa5-71a7-db56-2bba005ab1c1,840539006,9a8a04c5-2ada-63a3-f466-c656d02b8d4f,2020-10-28 12:27:07+00:00,2020-10-28 12:27:07+00:00,Disease caused by severe acute respiratory syn...
1090,03dc55bb-6672-f44b-f687-7283d071f715,840539006,1fbcafd9-012d-f15a-4b6b-4d342bc34837,2020-08-02 03:20:31+00:00,2020-08-02 03:20:31+00:00,Disease caused by severe acute respiratory syn...
2729,08b98c08-a8fc-ec59-1e0b-632bd168c18a,840539006,574647dd-397a-f24a-9c08-69d4bdcf0249,2020-09-24 02:35:13+00:00,2020-09-24 02:35:13+00:00,Disease caused by severe acute respiratory syn...
3863,0c5f95e1-7bcc-a4fb-94a5-6c2524981285,840539006,d278d5fb-e894-89b3-e913-6ae7817bae4f,2021-01-16 10:49:16+00:00,2021-01-16 10:49:16+00:00,Disease caused by severe acute respiratory syn...
...,...,...,...,...,...,...
64968,fad73c98-e0c8-696c-065b-55611e409f54,840539006,144a1397-0053-514b-1498-7cafe021c672,2021-01-06 12:15:05+00:00,2021-01-06 12:15:05+00:00,Disease caused by severe acute respiratory syn...
65014,fb2355bc-ba27-9c50-f42d-38232263f4a7,840539006,1b859b67-4ae0-0417-3da0-e59007c03821,2020-12-06 12:00:49+00:00,2020-12-06 12:00:49+00:00,Disease caused by severe acute respiratory syn...
65153,fbd1b1f4-c51f-1941-4b0e-8f632aed9fee,840539006,d1937c98-6653-92c4-ec55-7c56d64f258a,2021-03-03 13:51:11+00:00,2021-03-03 13:51:11+00:00,Disease caused by severe acute respiratory syn...
65690,fd0641a7-8edc-b5e2-f20f-0e0ddedc0e02,840539006,f4dd72fc-6d1d-d531-c548-62cdd02ed8de,2021-04-06 16:34:11+00:00,2021-04-06 16:34:11+00:00,Disease caused by severe acute respiratory syn...


840539006 is the SNOMED CT code for COVID-19 ("Disease caused by severe acute respiratory syndrome coronavirus 2").

In [34]:
# Write the derived condition table (index included as the first column), then free it
condition_csv_path = f'{output_folder_path}{delim}Condition.csv'
df_condition_new.to_csv(condition_csv_path)
del df_condition_new
gc.collect()

6

## Encounters

In [35]:
# Build one row per Encounter resource found in the patient bundles.
cols = ['id','status','code','codeText','start','end','patientId','location','serviceProvider','encounterClass']

arr = []  # One list per extracted encounter
start = time.time()
f_count = 0  # Counter for failed files

for file in tqdm(files):
    try:
        # Load the JSON file; the context manager closes it even if parsing fails
        with open(file) as f:
            data = json.load(f)

        encounters = filter_resource(data, 'Encounter')
        for encounter in encounters:
            resource = encounter['resource']
            ar = [
                resource['id'],
                resource['status'],
                resource['type'][0]['coding'][0]['code'],
                resource['type'][0]['text'],
                resource['period']['start'],
                resource['period']['end'],
                # References look like 'urn:uuid:<id>'; keep only the id part
                resource['subject']['reference'].strip().split('urn:uuid:')[1],
                resource['location'][0]['location']['display'],
                resource['serviceProvider']['display'],
                resource['class']['code'],
            ]

            arr.append(ar)
    except Exception as e:
        # Best effort: the first bad record stops this file (rows already collected
        # from it are kept) and the file is counted as failed
        f_count += 1
        continue

end = time.time()
print(str(f_count)+' Files Failed...')
print(str(len(arr))+' Patient encounter bundles extracted as DataFrame in '+str(end-start)+ 'Seconds')

df_encounter = pd.DataFrame(arr, columns = cols)

100%|███████████████████████████████████████| 1450/1450 [04:50<00:00,  4.99it/s]


0 Files Failed...
118977 Patient encounter bundles extracted as DataFrame in 290.79170298576355Seconds


In [36]:
# Preview the first rows of the encounter table
df_encounter.head()

Unnamed: 0,id,status,code,codeText,start,end,patientId,location,serviceProvider,encounterClass
0,6a7648a6-2b11-9ef2-59cf-69ad2de5f8ca,finished,410620009,Well child visit (procedure),2015-05-18T01:50:28-04:00,2015-05-18T02:05:28-04:00,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,Marquette Tribal Community Health Center,Marquette Tribal Community Health Center,AMB
1,1e64f87e-bb4b-f0d8-b7b1-ec7c29e03b42,finished,185349003,Encounter for check up (procedure),2015-05-25T01:50:28-04:00,2015-05-25T08:25:37-04:00,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,DLP MARQUETTE GENERAL HOSPITAL LLC,DLP MARQUETTE GENERAL HOSPITAL LLC,AMB
2,c4c7e813-7519-2d90-4d46-a8b173f5e4d6,finished,410620009,Well child visit (procedure),2016-05-23T01:50:28-04:00,2016-05-23T02:05:28-04:00,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,Marquette Tribal Community Health Center,Marquette Tribal Community Health Center,AMB
3,081bac22-f929-d00d-85b6-fc5eda48d484,finished,185345009,Encounter for symptom (procedure),2017-03-31T11:50:28-04:00,2017-03-31T12:05:28-04:00,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,DLP MARQUETTE GENERAL HOSPITAL LLC,DLP MARQUETTE GENERAL HOSPITAL LLC,AMB
4,a60d3005-e80a-c83e-4a7a-d79a3a7c672f,finished,410620009,Well child visit (procedure),2017-05-29T01:50:28-04:00,2017-05-29T02:14:24-04:00,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,Marquette Tribal Community Health Center,Marquette Tribal Community Health Center,AMB


In [37]:
# Number of encounters per encounter class code
df_encounter['encounterClass'].value_counts()

encounterClass
AMB     111265
EMER      4483
IMP       2067
HH        1001
VR         161
Name: count, dtype: int64

In [38]:
# Write the encounter table (index included as the first column), then free it
encounter_csv_path = f'{output_folder_path}{delim}Encounter.csv'
df_encounter.to_csv(encounter_csv_path)
del df_encounter
gc.collect()

0

## Observations

In [39]:
# Build the observation table: one row per Observation, or one row per component
# for observations that carry a 'component' list.
cols = ['id','patientId','issuedDate','effectiveDateTime','category','encounter','code','codeText','value','units','snomedCode','observationType']


def _observation_base(resource):
    """Return the six leading columns (id ... encounter) shared by all rows of one Observation."""
    return [
        resource['id'],
        # References look like 'urn:uuid:<id>'; keep only the id part
        resource['subject']['reference'].strip().split('urn:uuid:')[1],
        resource['issued'],
        resource['effectiveDateTime'],
        resource['category'][0]['coding'][0]['code'],
        resource['encounter']['reference'].strip().split('urn:uuid:')[1],
    ]


arr = []  # One list per output row
start = time.time()
f_count = 0  # Counter for failed files

for file in tqdm(files):
    try:
        # Load the JSON file; the context manager closes it even if parsing fails
        with open(file) as f:
            data = json.load(f)

        observations = filter_resource(data, 'Observation')
        for observation in observations:
            resource = observation['resource']

            if 'component' in resource:
                # Multi-component observation: emit one row per component
                for comp in resource['component']:
                    ar = _observation_base(resource)
                    ar.append(comp['code']['coding'][0]['code'])
                    ar.append(comp['code']['coding'][0]['display'])

                    # Remaining columns: value, units, snomedCode, observationType
                    if 'valueCodeableConcept' in comp:
                        concept = comp['valueCodeableConcept']['coding'][0]
                        ar.extend([concept['display'], np.nan, concept['code'], 'text'])
                    elif 'valueQuantity' in comp:
                        quantity = comp['valueQuantity']
                        ar.extend([quantity['value'], quantity['unit'], np.nan, 'numeric'])
                    else:
                        ar.extend([comp['valueString'], np.nan, np.nan, 'text'])

                    arr.append(ar)
            else:
                ar = _observation_base(resource)
                ar.append(resource['code']['coding'][0]['code'])
                ar.append(resource['code']['coding'][0]['display'])

                # Remaining columns: value, units, snomedCode, observationType
                if 'valueCodeableConcept' in resource:
                    concept = resource['valueCodeableConcept']['coding'][0]
                    ar.extend([concept['display'], np.nan, concept['code'], 'text'])
                elif 'valueString' in resource:
                    ar.extend([resource['valueString'], np.nan, np.nan, 'text'])
                else:
                    quantity = resource['valueQuantity']
                    ar.extend([quantity['value'], quantity['unit'], np.nan, 'numeric'])

                arr.append(ar)
    except Exception as e:
        # Best effort: the first bad record stops this file (rows already collected
        # from it are kept) and the file is counted as failed
        f_count += 1
        continue

end = time.time()
print(str(f_count)+' Files Failed...')
print(str(len(arr))+' Patient observation bundles extracted as DataFrame in '+str(end-start)+ 'Seconds')

df_observation = pd.DataFrame(arr, columns = cols)

100%|███████████████████████████████████████| 1450/1450 [05:40<00:00,  4.26it/s]


0 Files Failed...
1560675 Patient observation bundles extracted as DataFrame in 340.3905622959137Seconds


In [40]:
# Display the observation table
df_observation

Unnamed: 0,id,patientId,issuedDate,effectiveDateTime,category,encounter,code,codeText,value,units,snomedCode,observationType
0,a1f4d1c4-6128-db07-644c-8c863cda0923,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,2015-05-18T01:50:28.487-04:00,2015-05-18T01:50:28-04:00,vital-signs,6a7648a6-2b11-9ef2-59cf-69ad2de5f8ca,8302-2,Body Height,127.5,cm,,numeric
1,8c676538-70d0-6679-581a-b59fd7620c14,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,2015-05-18T01:50:28.487-04:00,2015-05-18T01:50:28-04:00,vital-signs,6a7648a6-2b11-9ef2-59cf-69ad2de5f8ca,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,3,{score},,numeric
2,2f946d41-1cdd-3c97-7e0f-95116a19f994,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,2015-05-18T01:50:28.487-04:00,2015-05-18T01:50:28-04:00,vital-signs,6a7648a6-2b11-9ef2-59cf-69ad2de5f8ca,29463-7,Body Weight,35.8,kg,,numeric
3,d3e29cde-6143-0a3c-8a09-29ffd88c05d8,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,2015-05-18T01:50:28.487-04:00,2015-05-18T01:50:28-04:00,vital-signs,6a7648a6-2b11-9ef2-59cf-69ad2de5f8ca,39156-5,Body mass index (BMI) [Ratio],22.03,kg/m2,,numeric
4,73289f95-e040-40b0-3b0a-96e8877f034a,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,2015-05-18T01:50:28.487-04:00,2015-05-18T01:50:28-04:00,vital-signs,6a7648a6-2b11-9ef2-59cf-69ad2de5f8ca,59576-9,Body mass index (BMI) [Percentile] Per age and...,93.041,%,,numeric
...,...,...,...,...,...,...,...,...,...,...,...,...
1560670,1b0a5f5f-7820-5598-6ff6-0de19e0335c3,e28eb437-d40f-ebc7-8e1c-d48982755e7b,2024-09-17T07:10:13.445-04:00,2024-09-17T07:10:13-04:00,survey,1d2ef3ce-da7d-004c-f370-d17b3b6fb5a6,56051-6,Do you consider yourself Hispanic/Latino?,No,,LA32-8,text
1560671,b6014e2f-0648-79b0-05ea-ae0804d1c42b,e28eb437-d40f-ebc7-8e1c-d48982755e7b,2024-09-17T07:46:26.445-04:00,2024-09-17T07:46:26-04:00,survey,1d2ef3ce-da7d-004c-f370-d17b3b6fb5a6,55758-7,Patient Health Questionnaire 2 item (PHQ-2) to...,1,{score},,numeric
1560672,c115dd51-0cfd-9c05-758d-bc30b15a1829,e28eb437-d40f-ebc7-8e1c-d48982755e7b,2024-09-17T08:25:54.445-04:00,2024-09-17T08:25:54-04:00,survey,1d2ef3ce-da7d-004c-f370-d17b3b6fb5a6,82667-7,Total score [DAST-10],1,{score},,numeric
1560673,70125cf0-0393-d0f4-0751-2612f3711b6d,e28eb437-d40f-ebc7-8e1c-d48982755e7b,2024-09-24T06:23:35.445-04:00,2024-09-24T06:23:35-04:00,vital-signs,b1edf136-3cb7-2508-98ba-d816b7c4d994,8310-5,Body temperature,37.91,Cel,,numeric


In [41]:
# Write the observation table (index included as the first column), then release
# the frame and the row buffers used to build it
observation_csv_path = f'{output_folder_path}{delim}Observation.csv'
df_observation.to_csv(observation_csv_path)
del df_observation
del ar, arr
gc.collect()

0

## Care Plan

In [46]:
# Build the care plan table: one row per CarePlan activity, or a single row with
# empty activity columns for care plans that have no 'activity' key.
cols = ['id','status','patientId','start','end','category','code','codeText','intent','encounter','careTeam','activityCode','activityCodeText','activityStatus','activityLocation']


def _careplan_base(resource):
    """Return the eleven leading columns (id ... careTeam) shared by all rows of one CarePlan."""
    period = resource['period']
    return [
        resource['id'],
        resource['status'],
        # References look like 'urn:uuid:<id>'; keep only the id part
        resource['subject']['reference'].strip().split('urn:uuid:')[1],
        period['start'],
        # Care plans without an end date get NaN
        period['end'] if 'end' in period else np.nan,
        resource['category'][0]['coding'][0]['code'],
        resource['category'][1]['coding'][0]['code'],
        resource['category'][1]['coding'][0]['display'],
        resource['intent'],
        resource['encounter']['reference'].strip().split('urn:uuid:')[1],
        resource['careTeam'][0]['reference'].strip().split('urn:uuid:')[1],
    ]


arr = []  # One list per output row
start = time.time()
f_count = 0  # Counter for failed files

for file in tqdm(files):
    try:
        # Load the JSON file; the context manager closes it even if parsing fails
        with open(file) as f:
            data = json.load(f)

        cps = filter_resource(data, 'CarePlan')
        for cp in cps:
            resource = cp['resource']

            if 'activity' in resource:
                # One row per activity of the care plan
                for activity in resource['activity']:
                    detail = activity['detail']
                    ar = _careplan_base(resource)
                    ar.extend([
                        detail['code']['coding'][0]['code'],
                        detail['code']['coding'][0]['display'],
                        detail['status'],
                        detail['location']['display'],
                    ])

                    arr.append(ar)
            else:
                # No activities: keep the care plan with the four activity columns empty
                ar = _careplan_base(resource)
                ar.extend([np.nan, np.nan, np.nan, np.nan])

                arr.append(ar)
    except Exception as e:
        # Best effort: the first bad record stops this file (rows already collected
        # from it are kept) and the file is counted as failed
        f_count += 1
        continue

end = time.time()
print(str(f_count)+' Files Failed...')
print(str(len(arr))+' Patient CarePlan bundles extracted as DataFrame in '+str(end-start)+ 'Seconds')

df_cp = pd.DataFrame(arr, columns = cols)

100%|███████████████████████████████████████| 1450/1450 [06:02<00:00,  4.00it/s]


0 Files Failed...
13249 Patient CarePlan bundles extracted as DataFrame in 362.48060059547424Seconds


In [47]:
# Display the care plan table
df_cp

Unnamed: 0,id,status,patientId,start,end,category,code,codeText,intent,encounter,careTeam,activityCode,activityCodeText,activityStatus,activityLocation
0,0242e6bb-7399-5951-502d-36b506b133b5,completed,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,2017-08-10T01:50:28-04:00,2017-08-29T01:50:28-04:00,assess-plan,773513001,Physiotherapy care plan (record artifact),order,ad358527-952e-41ef-e544-ebb97c5b6c87,47263462-c9d3-f380-c329-ab4e3b0e788c,229586001,"Rest, ice, compression and elevation treatment...",completed,LAKE SUPERIOR HOSPICE ASSOCIATION
1,0242e6bb-7399-5951-502d-36b506b133b5,completed,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,2017-08-10T01:50:28-04:00,2017-08-29T01:50:28-04:00,assess-plan,773513001,Physiotherapy care plan (record artifact),order,ad358527-952e-41ef-e544-ebb97c5b6c87,47263462-c9d3-f380-c329-ab4e3b0e788c,229070002,Stretching exercises (regime/therapy),completed,LAKE SUPERIOR HOSPICE ASSOCIATION
2,741550cb-1f0f-57c3-ffa5-b7590e8dcf1e,completed,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,2024-06-24T01:50:28-04:00,2024-07-22T01:50:28-04:00,assess-plan,408869004,Musculoskeletal care (regime/therapy),order,772dfceb-756a-270b-198a-9b3bf6ebe653,86bd6ebc-6ca0-e55c-c797-162a33835a69,266694003,Heat therapy (procedure),completed,LAKE SUPERIOR HOSPICE ASSOCIATION
3,741550cb-1f0f-57c3-ffa5-b7590e8dcf1e,completed,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,2024-06-24T01:50:28-04:00,2024-07-22T01:50:28-04:00,assess-plan,408869004,Musculoskeletal care (regime/therapy),order,772dfceb-756a-270b-198a-9b3bf6ebe653,86bd6ebc-6ca0-e55c-c797-162a33835a69,183051005,Recommendation to rest (procedure),completed,LAKE SUPERIOR HOSPICE ASSOCIATION
4,3c4ef000-e413-ac8c-5406-59038dd0e890,active,8b4f9612-a797-f5d3-8e56-7a4e06b7d8c0,2024-12-27T12:50:28-05:00,,assess-plan,53950000,Respiratory therapy (procedure),order,bc328a7a-0af7-4d56-0858-f19a1e98d8bc,61cdd2a6-49bd-0c45-6e4a-af14710de413,304510005,Recommendation to avoid exercise (procedure),in-progress,DLP MARQUETTE GENERAL HOSPITAL LLC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13244,5ab6d923-ec72-e028-aaa4-de60135bb182,active,e28eb437-d40f-ebc7-8e1c-d48982755e7b,2012-06-05T06:23:35-04:00,,assess-plan,736285004,Hyperlipidemia clinical management plan (recor...,order,3042fdb9-9ce1-ed20-be2a-9a21a7b1d142,7baac31f-8c57-1f0f-c63f-febdf9b77d4e,183301007,Physical exercises (regime/therapy),in-progress,DLP MARQUETTE GENERAL HOSPITAL LLC
13245,c3d09305-3cb3-a9e0-84b8-c57db959375c,completed,e28eb437-d40f-ebc7-8e1c-d48982755e7b,2014-08-25T18:09:57-04:00,2015-06-02T06:23:35-04:00,assess-plan,53950000,Respiratory therapy (procedure),order,a21297a9-57e4-ca86-33f9-c14edfad4ba1,3a2b8673-2825-efda-cf92-c34de9e79f31,304510005,Recommendation to avoid exercise (procedure),completed,DLP MARQUETTE GENERAL HOSPITAL LLC
13246,c3d09305-3cb3-a9e0-84b8-c57db959375c,completed,e28eb437-d40f-ebc7-8e1c-d48982755e7b,2014-08-25T18:09:57-04:00,2015-06-02T06:23:35-04:00,assess-plan,53950000,Respiratory therapy (procedure),order,a21297a9-57e4-ca86-33f9-c14edfad4ba1,3a2b8673-2825-efda-cf92-c34de9e79f31,371605008,Deep breathing and coughing exercises (regime/...,completed,DLP MARQUETTE GENERAL HOSPITAL LLC
13247,c7230795-c1fc-d077-b658-1b5b6587e5cf,completed,e28eb437-d40f-ebc7-8e1c-d48982755e7b,2019-11-09T22:36:40-05:00,2019-11-16T22:36:40-05:00,assess-plan,53950000,Respiratory therapy (procedure),order,574b7fe5-5103-338b-ff76-ac9e211360ec,9627002e-8ea1-03ba-9e0a-dd21f9b19b82,304510005,Recommendation to avoid exercise (procedure),completed,DLP MARQUETTE GENERAL HOSPITAL LLC


In [48]:
# Write the care plan table (index included as the first column), then free it
careplan_csv_path = f'{output_folder_path}{delim}CarePlan.csv'
df_cp.to_csv(careplan_csv_path)
del df_cp
gc.collect()

0