# Notebook #2: Federated Data Engineering
In this notebook, we'll convert CXR DICOM to JPG files and apply the conversion code to multiple sites.

### Install the Rhino Health Python SDK, Load All Necessary Libraries and Login to the Rhino FCP

In [None]:
# pip install --upgrade rhino_health

In [1]:
import getpass
import rhino_health as rh
from rhino_health.lib.endpoints.code_object.code_object_dataclass import (
    CodeObject,
    CodeObjectCreateInput,
    CodeObjectRunInput,
    CodeTypes,
)



In [2]:
my_username = "drew@rhinohealth.com"

print("Logging In")
session = rh.login(
    username=my_username, 
    password=getpass.getpass(), 
    show_traceback=True,
    rhino_api_url='https://dev.rhinohealth.com/api/'
)
print("Logged In")

Logging In


 ········


Logged In


### Retrieve Project and Cohort Information

In [51]:
project_name = "Federated Datasets and Predictive Modeling (Setup Project + Collab) - drew@"

# Confirm project creation by listing all projects for the user
for project in session.project.get_projects():
    if project.name == project_name:
        print(project.name,"\n")
        print(project,"\n")
        project_uid = project.uid

project_uid

Federated Datasets and Predictive Modeling (Setup Project + Collab) - drew@ 

Project session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0> creator_uid='e50f2461-3d7a-4fe0-b457-b2fe45d05532' created_at='2024-02-29T16:32:14.440680Z' name='Federated Datasets and Predictive Modeling (Setup Project + Collab) - drew@' description='Example Project for end-to-end Federated Modeling (project creation,                  add collaborator (invite + accept), Data Ext/Eng, Stat Analysis, training, and evaluation' type='Validation' primary_workgroup_uid='97d7456b-baef-4d88-bfb1-afd4975eaff2' permissions=None uid='1752fb63-c36c-4221-85a4-f979c5e921bb' slack_channel='' collaborating_workgroup_uids=['97d7456b-baef-4d88-bfb1-afd4975eaff2', '33cf1db0-de14-472a-8dcb-8d83de22d946', '48cb366f-b05f-4ca2-8e1d-6dfc336cd344'] 



'1752fb63-c36c-4221-85a4-f979c5e921bb'

In [4]:
data_schemas = session.project.get_data_schemas(project_uid)
data_schema_mimic_ehr_demo_dev = session.project.get_data_schemas(project_uid)[0]
data_schema_mimic_ehr_obs_dev = session.project.get_data_schemas(project_uid)[1]
data_schema_mimic_cxr_dev = session.project.get_data_schemas(project_uid)[2]

print(f"Loaded dataschema '{data_schema_mimic_ehr_demo_dev.name}' with uid '{data_schema_mimic_ehr_demo_dev.uid}'")
print(f"Loaded dataschema '{data_schema_mimic_ehr_obs_dev.name}' with uid '{data_schema_mimic_ehr_obs_dev.uid}'")
print(f"Loaded dataschema '{data_schema_mimic_cxr_dev.name}' with uid '{data_schema_mimic_cxr_dev.uid}'")

Loaded dataschema 'mimic_ehr_demo_dev schema' with uid '35ae63d7-146d-4990-b788-cb204568c9b6'
Loaded dataschema 'mimic_ehr_obs_dev schema' with uid 'e333b639-87ea-4860-b202-96bb4c1d09cd'
Loaded dataschema 'mimic_cxr_dev schema' with uid '44a33417-767b-4357-86c0-8e05d195672e'


In [5]:
cxr_schema_mimic_ehr_demo_dev = session.project.get_data_schema_by_name(data_schema_mimic_ehr_demo_dev.name, project_uid=project_uid)
cxr_schema_uid_mimic_ehr_demo_dev = cxr_schema_mimic_ehr_demo_dev.uid
print(cxr_schema_uid_mimic_ehr_demo_dev)

cxr_schema_mimic_ehr_obs_dev = session.project.get_data_schema_by_name(data_schema_mimic_ehr_obs_dev.name, project_uid=project_uid)
cxr_schema_uid_mimic_ehr_obs_dev = cxr_schema_mimic_ehr_obs_dev.uid
print(cxr_schema_uid_mimic_ehr_obs_dev)

cxr_schema_mimic_cxr_dev = session.project.get_data_schema_by_name(data_schema_mimic_cxr_dev.name, project_uid=project_uid)
cxr_schema_uid_mimic_cxr_dev = cxr_schema_mimic_cxr_dev.uid
print(cxr_schema_uid_mimic_cxr_dev)

35ae63d7-146d-4990-b788-cb204568c9b6
e333b639-87ea-4860-b202-96bb4c1d09cd
44a33417-767b-4357-86c0-8e05d195672e


In [6]:
collaborators = session.project.get_collaborating_workgroups(project_uid)

workgroups_by_name = {x.name: x for x in collaborators}
workgroups_by_uid = {x.uid: x for x in collaborators}

In [54]:
workgroups_by_name

{'Rhino Health Test': Workgroup(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, uid='97d7456b-baef-4d88-bfb1-afd4975eaff2', name='Rhino Health Test', org_name='Rhino Health Test'),
 'rhino-sandbox-aidev': Workgroup(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, uid='33cf1db0-de14-472a-8dcb-8d83de22d946', name='rhino-sandbox-aidev', org_name='Rhino Sandbox'),
 'rhino-sandbox-hco': Workgroup(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, uid='48cb366f-b05f-4ca2-8e1d-6dfc336cd344', name='rhino-sandbox-hco', org_name='Rhino Sandbox')}

In [56]:
for k,v in workgroups_by_name.items():
    print(f"{k}: {v}", "\n")

Rhino Health Test: Workgroup session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0> uid='97d7456b-baef-4d88-bfb1-afd4975eaff2' name='Rhino Health Test' org_name='Rhino Health Test' 

rhino-sandbox-aidev: Workgroup session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0> uid='33cf1db0-de14-472a-8dcb-8d83de22d946' name='rhino-sandbox-aidev' org_name='Rhino Sandbox' 

rhino-sandbox-hco: Workgroup session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0> uid='48cb366f-b05f-4ca2-8e1d-6dfc336cd344' name='rhino-sandbox-hco' org_name='Rhino Sandbox' 



In [8]:
workgroups_by_uid

{'97d7456b-baef-4d88-bfb1-afd4975eaff2': Workgroup(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, uid='97d7456b-baef-4d88-bfb1-afd4975eaff2', name='Rhino Health Test', org_name='Rhino Health Test'),
 '33cf1db0-de14-472a-8dcb-8d83de22d946': Workgroup(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, uid='33cf1db0-de14-472a-8dcb-8d83de22d946', name='rhino-sandbox-aidev', org_name='Rhino Sandbox'),
 '48cb366f-b05f-4ca2-8e1d-6dfc336cd344': Workgroup(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, uid='48cb366f-b05f-4ca2-8e1d-6dfc336cd344', name='rhino-sandbox-hco', org_name='Rhino Sandbox')}

In [57]:
for k,v in workgroups_by_uid.items():
    print(f"{k}: {v}", "\n")

97d7456b-baef-4d88-bfb1-afd4975eaff2: Workgroup session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0> uid='97d7456b-baef-4d88-bfb1-afd4975eaff2' name='Rhino Health Test' org_name='Rhino Health Test' 

33cf1db0-de14-472a-8dcb-8d83de22d946: Workgroup session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0> uid='33cf1db0-de14-472a-8dcb-8d83de22d946' name='rhino-sandbox-aidev' org_name='Rhino Sandbox' 

48cb366f-b05f-4ca2-8e1d-6dfc336cd344: Workgroup session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0> uid='48cb366f-b05f-4ca2-8e1d-6dfc336cd344' name='rhino-sandbox-hco' org_name='Rhino Sandbox' 



In [9]:
hco_workgroup = workgroups_by_name["rhino-sandbox-hco"]
aidev_workgroup = workgroups_by_name["rhino-sandbox-aidev"]

print(f"Found workgroups '{aidev_workgroup.name}' and collaborators '{hco_workgroup.name}'")

Found workgroups 'rhino-sandbox-aidev' and collaborators 'rhino-sandbox-hco'


### Get the CXR Cohorts From Both Sites

In [59]:
cohorts = session.project.get_datasets(project_uid)
print(cohorts)

[Dataset(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, creator_uid='e50f2461-3d7a-4fe0-b457-b2fe45d05532', created_at='2024-03-04T19:56:16.819244Z', name='mimic_ehr_demo_dev', description='', base_version_uid='0d2fa03c-52ff-499e-9ffc-3dd758c0005d', project_uid='1752fb63-c36c-4221-85a4-f979c5e921bb', workgroup_uid='97d7456b-baef-4d88-bfb1-afd4975eaff2', data_schema_uid='35ae63d7-146d-4990-b788-cb204568c9b6', uid='854a8cc9-afe6-4363-95d2-5bfee7c531af', version=1, num_cases=1679, import_status='Complete'), Dataset(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, creator_uid='e50f2461-3d7a-4fe0-b457-b2fe45d05532', created_at='2024-03-04T19:57:40.953639Z', name='mimic_ehr_obs_dev', description='', base_version_uid='d1a796e7-f73a-4d70-9450-84224ab37685', project_uid='1752fb63-c36c-4221-85a4-f979c5e921bb', workgroup_uid='97d7456b-baef-4d88-bfb1-afd4975eaff2', data_schema_uid='e333b639-87ea-4860-b202-96bb4c1d09cd', uid='d1a796e7-f73a-4

In [60]:
cohorts[0]

Dataset(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, creator_uid='e50f2461-3d7a-4fe0-b457-b2fe45d05532', created_at='2024-03-04T19:56:16.819244Z', name='mimic_ehr_demo_dev', description='', base_version_uid='0d2fa03c-52ff-499e-9ffc-3dd758c0005d', project_uid='1752fb63-c36c-4221-85a4-f979c5e921bb', workgroup_uid='97d7456b-baef-4d88-bfb1-afd4975eaff2', data_schema_uid='35ae63d7-146d-4990-b788-cb204568c9b6', uid='854a8cc9-afe6-4363-95d2-5bfee7c531af', version=1, num_cases=1679, import_status='Complete')

In [61]:
cohorts[1]

Dataset(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, creator_uid='e50f2461-3d7a-4fe0-b457-b2fe45d05532', created_at='2024-03-04T19:57:40.953639Z', name='mimic_ehr_obs_dev', description='', base_version_uid='d1a796e7-f73a-4d70-9450-84224ab37685', project_uid='1752fb63-c36c-4221-85a4-f979c5e921bb', workgroup_uid='97d7456b-baef-4d88-bfb1-afd4975eaff2', data_schema_uid='e333b639-87ea-4860-b202-96bb4c1d09cd', uid='d1a796e7-f73a-4d70-9450-84224ab37685', version=0, num_cases=1679, import_status='Complete')

In [62]:
cohorts[2]

Dataset(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, creator_uid='e50f2461-3d7a-4fe0-b457-b2fe45d05532', created_at='2024-03-04T20:14:33.913781Z', name='mimic_cxr_dev', description='mimic_cxr_dev', base_version_uid='a45aabb1-bc58-4233-a2f1-65dc1a151178', project_uid='1752fb63-c36c-4221-85a4-f979c5e921bb', workgroup_uid='97d7456b-baef-4d88-bfb1-afd4975eaff2', data_schema_uid='44a33417-767b-4357-86c0-8e05d195672e', uid='a45aabb1-bc58-4233-a2f1-65dc1a151178', version=0, num_cases=76, import_status='Complete')

In [63]:
cohorts_by_workgroup = {
    workgroups_by_uid[x.workgroup_uid].name: x for x in cohorts
}
cohorts_by_workgroup

{'Rhino Health Test': Dataset(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, creator_uid='e50f2461-3d7a-4fe0-b457-b2fe45d05532', created_at='2024-03-04T20:14:33.913781Z', name='mimic_cxr_dev', description='mimic_cxr_dev', base_version_uid='a45aabb1-bc58-4233-a2f1-65dc1a151178', project_uid='1752fb63-c36c-4221-85a4-f979c5e921bb', workgroup_uid='97d7456b-baef-4d88-bfb1-afd4975eaff2', data_schema_uid='44a33417-767b-4357-86c0-8e05d195672e', uid='a45aabb1-bc58-4233-a2f1-65dc1a151178', version=0, num_cases=76, import_status='Complete')}

In [70]:
def get_dataset_names(project_uid: str):
    dataset_names = session.project.get_datasets(project_uid)
    for dataset in dataset_names:
        print(dataset.name)

get_dataset_names(project_uid)

mimic_ehr_demo_dev
mimic_ehr_obs_dev
mimic_cxr_dev


In [21]:
# HCO
my_username_hco = "adrish+2@rhinohealth.com"

print("Logging In")
session_hco = rh.login(
    username=my_username_hco, 
    password=getpass.getpass(), 
    show_traceback=True,
    rhino_api_url='https://dev.rhinohealth.com/api/'
)
print("Logged In")

Logging In


 ········


Logged In


In [71]:
# hco collaborator needs to upload the data to their rhino client (i.e. via UI/SDK)
hco_cxr_cohort = session_hco.project.get_dataset_by_name("mimic_cxr_hco")
hco_cxr_cohort_uid = hco_cxr_cohort.uid

aidev_cxr_cohort = session.project.get_dataset_by_name("mimic_cxr_dev")
aidev_cxr_cohort_uid = aidev_cxr_cohort.uid

print(f"Loaded CXR cohorts '{hco_cxr_cohort.uid}', '{aidev_cxr_cohort.uid}'")

Loaded CXR cohorts 'b3321c41-85f1-492b-b56f-a6fa99c5c79e', 'a45aabb1-bc58-4233-a2f1-65dc1a151178'


### We will use a Pre-defined Container Image with our Model

### Define the Generalized Compute Model that will Convert DICOM Images to JPG Files

```python
cxr_image_uri= "913123821419.dkr.ecr.us-east-1.amazonaws.com/rhino-gc-workgroup-rhino-sandbox-decode-health:data-prep-sb-1"

compute_params = CodeObjectCreateInput(
    session=session,
    name="DICOM to JPG Transformation Code",
    description="CXR JPG transformation the AI dev and Health System datasets",
    input_data_schema_uids = [cxr_schema_uid_mimic_cxr_dev],
    output_data_schema_uids = [None], # Auto-Generating the Output Data Schema for the Model
    project=project_uid,
    type=CodeTypes.GENERALIZED_COMPUTE,
    config={
        "container_image_uri": cxr_image_uri
    }
)
compute_model = session.code_object.create_code_object(compute_params)

print(f"Got aimodel/code object '{compute_model.name}' with uid {compute_model.uid}")
```

In [72]:
python_code = """
import pandas as pd
import os
import pydicom
import numpy as np
from PIL import Image
from sklearn.impute import SimpleImputer
import glob


def convert_dcm_image_to_jpg(name):
	dcm = pydicom.dcmread(name)
	img = dcm.pixel_array.astype(float)
	rescaled_image = (np.maximum(img, 0) / img.max()) * 255  # float pixels
	final_image = np.uint8(rescaled_image)  # integers pixels
	final_image = Image.fromarray(final_image)
	return final_image


def dataset_dcm_to_jpg(dataset_df):
	input_dir = '/input/dicom_data/'
	output_dir = '/output/file_data/'
	dcm_list = glob.glob(input_dir + '/*/*.dcm')

	dataset_df['JPG_file'] = 'Nan'
	for dcm_file in dcm_list:
		image = convert_dcm_image_to_jpg(dcm_file)
		jpg_file_name = dcm_file.split('/')[-1].split('.dcm')[0] + '.jpg'
		ds = pydicom.dcmread(dcm_file)
		idx = dataset_df['Pneumonia'][dataset_df.SeriesUID == ds.SeriesInstanceUID].index[0]
		ground_truth = '1' if dataset_df.loc[idx, 'Pneumonia'] else '0'
		class_folder = output_dir + ground_truth
		if not os.path.exists(class_folder):
			os.makedirs(class_folder)
		image.save('/'.join([class_folder, jpg_file_name]))
		dataset_df.loc[idx, 'JPG file'] = '/'.join([ground_truth, jpg_file_name])

	return dataset_df


if __name__ == '__main__':
	# Read dataset from /input
	dataset = pd.read_csv('/input/dataset.csv')

	# Convert DICOM to JPG
	dataset = dataset_dcm_to_jpg(dataset)

	# Write dataset to /output
	dataset.to_csv('/output/dataset.csv', index=False)
 """


In [73]:
code_object_params = CodeObjectCreateInput(
    name="DICOM to JPG Transformation Code",
    description="CXR JPG transformation the AI dev and Health System datasets",
    input_data_schema_uids = [cxr_schema_uid_mimic_cxr_dev],
    output_data_schema_uids = [None], # a schema will be automatically generated
    project_uid = project_uid,
    code_type = CodeTypes.PYTHON_CODE,
    code_execution_mode = 'AUTO_CONTAINER_SNIPPET',
    requirements_mode = 'PYTHON_PIP',
    config = {
		   "python_code": python_code,
           "requirements" : ["pandas == 1.3.4", "numpy == 1.21.3","sklearn==0.0", "sklearn-pandas==1.8.0", "scikit-learn==1.0.2","pydicom==2.2.0","Pillow==8.4.0"],
    }
)

data_code_object = session.code_object.create_code_object(code_object_params)
print(f"Got Code Object '{data_code_object.name}' with uid {data_code_object.uid}")

Got Code Object 'DICOM to JPG Transformation Code' with uid 47510df7-8fc6-4e13-af5b-7553e7abacab


In [74]:
code_object_params

CodeObjectCreateInput(session=None, name='DICOM to JPG Transformation Code', description='CXR JPG transformation the AI dev and Health System datasets', input_data_schema_uids=['44a33417-767b-4357-86c0-8e05d195672e'], output_data_schema_uids=[None], project_uid='1752fb63-c36c-4221-85a4-f979c5e921bb', code_type='Python Code', base_version_uid='', config={'python_code': "\nimport pandas as pd\nimport os\nimport pydicom\nimport numpy as np\nfrom PIL import Image\nfrom sklearn.impute import SimpleImputer\nimport glob\n\n\ndef convert_dcm_image_to_jpg(name):\n\tdcm = pydicom.dcmread(name)\n\timg = dcm.pixel_array.astype(float)\n\trescaled_image = (np.maximum(img, 0) / img.max()) * 255  # float pixels\n\tfinal_image = np.uint8(rescaled_image)  # integers pixels\n\tfinal_image = Image.fromarray(final_image)\n\treturn final_image\n\n\ndef dataset_dcm_to_jpg(dataset_df):\n\tinput_dir = '/input/dicom_data/'\n\toutput_dir = '/output/file_data/'\n\tdcm_list = glob.glob(input_dir + '/*/*.dcm')\n\n\

In [75]:
code_object = session.code_object.get_code_object_by_name("DICOM to JPG Transformation Code", project_uid=project_uid)
code_object

CodeObject(session=<rhino_health.lib.rhino_session.RhinoSession object at 0x106cc1fd0>, creator_uid='e50f2461-3d7a-4fe0-b457-b2fe45d05532', created_at='2024-03-04T20:35:31.649690Z', name='DICOM to JPG Transformation Code', description='CXR JPG transformation the AI dev and Health System datasets', input_data_schema_uids=['44a33417-767b-4357-86c0-8e05d195672e'], output_data_schema_uids=[None], project_uid='1752fb63-c36c-4221-85a4-f979c5e921bb', code_type='Generalized Compute', base_version_uid='47510df7-8fc6-4e13-af5b-7553e7abacab', config={'container_image_uri': '913123821419.dkr.ecr.us-east-1.amazonaws.com/rhino-gc-workgroup-rhino-sandbox-decode-health:data-prep-sb-1', 'container_base_type': 'base_image', 'base_image_uri': 'public.ecr.aws/u7h0g2s9/rhinohealth/instant_containers/python:3.9.7-slim-bullseye', 'requirements_mode': 'python_pip', 'requirements': ['numpy == 1.22.*', 'pandas ~= 1.4.2'], 'code_location': 'single_non_binary_file', 'code_execution_mode': 'snippet'}, uid='47510df

In [76]:
code_object_params = CodeObjectRunInput(
  code_object_uid = code_object.uid,
  input_dataset_uids = [[aidev_cxr_cohort_uid],[hco_cxr_cohort_uid]],
  output_dataset_names_suffix = "_conv",
  timeout_seconds = 600
)

  warn(


In [77]:
# NOTE: This step has been verified already, but failing due to collaborator that hasn't accepted the permissions policy.

code_run = session.code_object.run_code_object(code_object_params)

Exception: Failed to make request
Status is 400, Error: , Content is b'\n<!doctype html>\n<html lang="en">\n<head>\n  <title>Bad Request (400)</title>\n</head>\n<body>\n  <h1>Bad Request (400)</h1><p></p>\n</body>\n</html>\n'


In [None]:
# NOTE: This step has been verified already, but failing due to collaborator that hasn't accepted the permissions policy.

run_result = code_run.wait_for_completion()
print(f"Finished running {code_object.name}")
print(f"Result status is '{run_result.status.value}', errors={run_result.result_info.get('errors') if run_result.result_info else None}")

Exception: Failed to make request
Status is 400, Error: , Content is b'\n<!doctype html>\n<html lang="en">\n<head>\n  <title>Bad Request (400)</title>\n</head>\n<body>\n  <h1>Bad Request (400)</h1><p></p>\n</body>\n</html>\n'
