# Tutorial 2 - Multi-cohort Data Harmonization

Demonstrate how to script out data harmonization, cohort import, and reusable ETLs with the Rhino Health Python SDK

### 1. Install the Rhino Health Python SDK

In [None]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
! pip install rhino_health

### 2. Initialization and Login

In [None]:
import pandas as pd
from datetime import datetime
import os
import sys
from pathlib import Path
import getpass

import rhino_health as rh
from rhino_health.lib.endpoints.cohort.cohort_dataclass import CohortCreateInput
from rhino_health.lib.endpoints.aimodel.aimodel_dataclass import (
    AIModel,
    AIModelCreateInput,
    ModelTypes,
    AIModelRunInput,
)

#### Replace the values of the following variables in the cell below
1. my_username - This should be your username that you use to log into the Rhino Health Platform
2. password - When you run the cell you will be prompted for the password that you use to log into the Rhino Health Platform (it is read with `getpass`, so there is no variable to edit)
3. ecr_base_uri - Remove your ECR workgroup from the URL you were given and put that value here. It should be of the form `XXXXXXXXXXXXXX.dkr.ecr.<aws-region>.amazonaws.com` where aws-region might be us-east-1 or something similar
4. project_uid - Copy the UID from the project you just created in the UI by navigating to the homepage, pressing the three vertical dots button in your project's square and then selecting the button _Copy UID_.

In [None]:
# --- Values to edit before running this cell ---
my_username = "USERNAME"
ecr_base_uri = "XXXXXXXXXXXX.dkr.ecr.us-east-1.amazonaws.com"
project_uid = "XXXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"  # Paste your project uid here as a string

# The password is read interactively with getpass and passed straight to the
# login call, so it is never assigned to a notebook variable.
print("Logging In")
session = rh.login(
    username=my_username,
    password=getpass.getpass(),
)
print("Logged In")

### 3. Import Cohorts:

#### Collect all Necessary Parameters for Importing your cohort:

In [None]:
# Use the first collaborating workgroup and the first data schema returned for the project
collaborating_workgroups = session.project.get_collaborating_workgroups(project_uid)
workgroup = collaborating_workgroups[0]
project_data_schemas = session.project.get_data_schemas(project_uid)
dataschema = project_data_schemas[0]
print(f"Loaded dataschema '{dataschema.name}' with uid '{dataschema.uid}'")

# Change this directory according to where you placed the files on your client
part1_cohort_dir = "/rhino_data/tutorial_2/cohorts"
cohort1_path = f"{part1_cohort_dir}/site1_part1_cohort_data.csv"
cohort2_path = f"{part1_cohort_dir}/site2_part1_cohort_data.csv"
cohort3_path = f"{part1_cohort_dir}/site3_part1_cohort_data.csv"

#### Trigger cohort import

In [None]:
# Settings that are identical for all three Part 1 cohort imports
part1_shared_settings = dict(
    project_uid=project_uid,
    workgroup_uid=workgroup.uid,
    data_schema_uid=dataschema.uid,
    is_data_deidentified=True,
    method="filesystem",
)

# Site 1
cohort_creation_params = CohortCreateInput(
    name="Site 1 Cohort",
    description="Diabetes cohort for site 1",
    csv_filesystem_location=cohort1_path,
    **part1_shared_settings,
)
site1_cohort = session.cohort.add_cohort(cohort_creation_params)
print(f"Created new cohort '{site1_cohort.name}' with uid '{site1_cohort.uid}'")

# Site 2
cohort_creation_params = CohortCreateInput(
    name="Site 2 Cohort",
    description="Diabetes cohort for site 2",
    csv_filesystem_location=cohort2_path,
    **part1_shared_settings,
)
site2_cohort = session.cohort.add_cohort(cohort_creation_params)
print(f"Created new cohort '{site2_cohort.name}' with uid '{site2_cohort.uid}'")

# Site 3
cohort_creation_params = CohortCreateInput(
    name="Site 3 Cohort",
    description="Diabetes cohort for site 3",
    csv_filesystem_location=cohort3_path,
    **part1_shared_settings,
)
site3_cohort = session.cohort.add_cohort(cohort_creation_params)
print(f"Created new cohort '{site3_cohort.name}' with uid '{site3_cohort.uid}'")

print("You should now have 3 new cohorts in the project within the GUI. Feel free to take a look!")

### 4. Harmonize Cohorts Using Containerless GC
By reviewing the cohort analytics on the GUI, you can see several inconsistencies across the 3 cohorts (specifically for 'Outcome', 'Weight', 'Height' and 'SkinThickness'). These kinds of inconsistencies can often occur when collecting data from multiple sources. 

In this next part, you will use simple pandas operations to produce harmonized versions of these cohorts. 

#### 4.1 Define harmonization code for each cohort

In [None]:
# Each snippet is pandas code that is passed to `run_code` later in this notebook
# and refers to the cohort's data as `df`.

# Site 1: map the textual 'Outcome' labels to 1/0 and rescale 'Weight' by 0.453592,
# rounded to a whole number (presumably pounds -> kilograms; confirm against the data).
site_1_code = (
    "df.replace({'Outcome': { 'Positive': 1, 'Negative': 0}}, inplace=True)\n"
    "df.Weight = round(df.Weight*0.453592, 0).astype(int)"
)

# Site 2: rescale 'SkinThickness' (x100) and 'Height' (/100)
# (presumably unit conversions; confirm against the data).
site_2_code = (
    "df['SkinThickness'] = df['SkinThickness']*100\n"
    "df['Height'] = df['Height']/100"
)

# Site 3: map the textual 'Outcome' labels to 1/0 and replace the string 'None'
# in 'Pregnancies' with 0. The replacement is assigned back to the column instead
# of calling `.replace(..., inplace=True)` on `df['Pregnancies']`: the chained
# in-place form acts on a temporary object and leaves `df` unchanged when pandas
# Copy-on-Write is active.
site_3_code = (
    "df.replace({'Outcome': { 'Positive': 1, 'Negative': 0}},inplace=True)\n"
    "df['Pregnancies'] = df['Pregnancies'].replace('None', 0)"
)

#### 4.3 Site 1 Data Harmonization by Defining a Run of Our AI Model

In [None]:
# Run the site 1 snippet against the site 1 cohort; the result is written to a
# new cohort whose name is the original name plus the " Fixed" suffix.
print("Starting to run harmonization on site 1 data")
output_cohort, run_results = site1_cohort.run_code(
    site_1_code,
    output_data_schema_uid=dataschema.uid,
    output_cohort_names_suffix=" Fixed",
)
print("Finished running harmonization on site 1 data")

print("You can now see a new cohort in the GUI named 'Site 1 Cohort Fixed'")
print("View the Results below")
# Keep this as the last expression so the notebook displays the raw JSON response
run_results.raw_response().json()

#### 4.4 Site 2 Data Harmonization

In [None]:
# Run the site 2 snippet against the site 2 cohort; the result is written to a
# new cohort whose name is the original name plus the " Fixed" suffix.
print("Starting to run harmonization on site 2 data")
output_cohort, run_results = site2_cohort.run_code(
    site_2_code,
    output_data_schema_uid=dataschema.uid,
    output_cohort_names_suffix=" Fixed",
)
print("Finished running harmonization on site 2 data")

print("You can now see a new cohort in the GUI named 'Site 2 Cohort Fixed'")
print("View the Results below")
# Keep this as the last expression so the notebook displays the raw JSON response
run_results.raw_response().json()

#### 4.5 Site 3 Data Harmonization

In [None]:
# Run the site 3 snippet against the site 3 cohort; the result is written to a
# new cohort whose name is the original name plus the " Fixed" suffix.
print("Starting to run harmonization on site 3 data")
output_cohort, run_results = site3_cohort.run_code(
    site_3_code,
    output_data_schema_uid=dataschema.uid,
    output_cohort_names_suffix=" Fixed",
)
print("Finished running harmonization on site 3 data")

print("You can now see a new cohort in the GUI named 'Site 3 Cohort Fixed'")
print("View the Results below")
# Keep this as the last expression so the notebook displays the raw JSON response
run_results.raw_response().json()

### 5. Import updated cohorts

Now let's imagine you have some updated data (the `*_part2_cohort_data.csv` files) that you would like to harmonize in a similar fashion. By making simple modifications to the code above, we can harmonize the new data with little effort.

First let's import the updated data as new cohorts.

In [None]:
# Replace this directory according to where you placed the files on your client
part2_cohort_dir = "/rhino_data/tutorial_2/cohorts"
cohort1_part2_path = f"{part2_cohort_dir}/site1_part2_cohort_data.csv"
cohort2_part2_path = f"{part2_cohort_dir}/site2_part2_cohort_data.csv"
cohort3_part2_path = f"{part2_cohort_dir}/site3_part2_cohort_data.csv"

In [None]:
# Settings that are identical for all three Part 2 cohort imports
part2_shared_settings = dict(
    project_uid=project_uid,
    workgroup_uid=workgroup.uid,
    data_schema_uid=dataschema.uid,
    is_data_deidentified=True,
    method="filesystem",
)

# Site 1 - Part 2
cohort_creation_params = CohortCreateInput(
    name="Site 1 Cohort - Part 2",
    description="Updated diabetes cohort for site 1 - part 2",
    csv_filesystem_location=cohort1_part2_path,
    **part2_shared_settings,
)
site1_part2_cohort = session.cohort.add_cohort(cohort_creation_params)
print(f"Created new cohort '{site1_part2_cohort.name}' with uid '{site1_part2_cohort.uid}'")

# Site 2 - Part 2
cohort_creation_params = CohortCreateInput(
    name="Site 2 Cohort - Part 2",
    description="Updated diabetes cohort for site 2 - part 2",
    csv_filesystem_location=cohort2_part2_path,
    **part2_shared_settings,
)
site2_part2_cohort = session.cohort.add_cohort(cohort_creation_params)
print(f"Created new cohort '{site2_part2_cohort.name}' with uid '{site2_part2_cohort.uid}'")

# Site 3 - Part 2
cohort_creation_params = CohortCreateInput(
    name="Site 3 Cohort - Part 2",
    description="Updated diabetes cohort for site 3 - part 2",
    csv_filesystem_location=cohort3_part2_path,
    **part2_shared_settings,
)
site3_part2_cohort = session.cohort.add_cohort(cohort_creation_params)
print(f"Created new cohort '{site3_part2_cohort.name}' with uid '{site3_part2_cohort.uid}'")

### 6. Harmonize data reusing the previous harmonization code
The new Part 2 cohorts suffer from the same inconsistencies as the Part 1 cohorts.
You can easily fix this by running the same preprocessing code you defined earlier for each site:

In [None]:
# Re-run each site's own harmonization snippet on that site's Part 2 cohort.
# Every site must use its matching snippet (site_1_code / site_2_code / site_3_code):
# running site 1's snippet on the other sites would apply site 1's 'Weight'
# rescaling to them and skip their own fixes.

# Site 1
print("Starting to run harmonization on site 1 - part 2 data")
output_cohort, run_results = site1_part2_cohort.run_code(
    site_1_code,
    output_data_schema_uid=dataschema.uid,
    output_cohort_names_suffix=" Fixed",
)
print("Finished running harmonization on site 1 - part 2 data")
print("You can now see a new cohort in the GUI named 'Site 1 Cohort - Part 2 Fixed'")

# Site 2
print("Starting to run harmonization on site 2 - part 2 data")
output_cohort, run_results = site2_part2_cohort.run_code(
    site_2_code,
    output_data_schema_uid=dataschema.uid,
    output_cohort_names_suffix=" Fixed",
)
print("Finished running harmonization on site 2 - part 2 data")
print("You can now see a new cohort in the GUI named 'Site 2 Cohort - Part 2 Fixed'")

# Site 3
print("Starting to run harmonization on site 3 - part 2 data")
output_cohort, run_results = site3_part2_cohort.run_code(
    site_3_code,
    output_data_schema_uid=dataschema.uid,
    output_cohort_names_suffix=" Fixed",
)
print("Finished running harmonization on site 3 - part 2 data")
print("You can now see a new cohort in the GUI named 'Site 3 Cohort - Part 2 Fixed'")

#### Your datasets are now harmonized! Use the filters on the Cohort Analytics tab in the GUI to visualize the results.
# End of tutorial 2! 