In [7]:
import os
import sys 
from getpass import getpass, getuser

import rhino_health # Only need this in final version

print("Logging In")
my_username = "my_email@example.com" # Replace this with the email you use to log into Rhino Health
my_password = getpass()
rhino_api_url= rhino_health.ApiEnvironment.PROD_API_URL
session = rhino_health.login(username=my_username, password=my_password, rhino_api_url=rhino_api_url, show_traceback=True)

print("Logged In")

Logging In
········
Logged In


# Intersection Mode

The data we are interested in lives at two different sites.
Lahey has information about the patient, age and blood type
MGH has the gender SpO2 values.

In [8]:
WORKGROUP_UID = "e590e0fa-ae37-48b3-b50e-c232536cefab"  # Replace this with your workgroup

from rhino_health.lib.endpoints.cohort.cohort_dataclass import CohortCreateInput
from rhino_health.lib.endpoints.data_schema.data_schema_dataclass import DataschemaCreateInput
from rhino_health.lib.endpoints.project.project_dataclass import ProjectCreateInput
from rhino_health.lib.metrics.basic import Mean, StandardDeviation

new_project = ProjectCreateInput(
    name="Federated Join Metrics",
    description="Example Project for Federated Join",
    type="Validation",
    primary_workgroup_uid=WORKGROUP_UID,
)
project = session.project.add_project(new_project)

dataschema_input = DataschemaCreateInput(
    name="Federated Join Input Schema",
    description="Federated Join Input Schema",
    primary_workgroup_uid=WORKGROUP_UID,
    projects=[project.uid],
    file_path="./FederatedDataSchema.csv",
)
dataschema = session.data_schema.create_data_schema(dataschema_input)
data_schema_uid = dataschema.uid

filter_cohort_input = CohortCreateInput(
    name="Blood Test Results",
    description="Identifiers with Blood Type, No SPO2 or Gender",
    project_uid=project.uid,
    workgroup_uid=WORKGROUP_UID,
    data_schema_uid=data_schema_uid,
    csv_filesystem_location="./rhinoOnPrem/rhinoAgent/tests/test_data/FederatedJoinFilterCohort.csv",
    image_filesystem_location="",
    method="filesystem",
    is_data_deidentified=True,
    file_base_path="",
)
filter_cohort_at_lahey = session.cohort.add_cohort(filter_cohort_input)

first_data_cohort_input = CohortCreateInput(
    name="SpO2 Values on 1/1",
    description="Identifiers with SPO2 and Gender no Blood Type",
    project_uid=project.uid,
    workgroup_uid=WORKGROUP_UID,
    data_schema_uid=data_schema_uid,
    csv_filesystem_location="./rhinoOnPrem/rhinoAgent/tests/test_data/FederatedJoinDataCohort.csv",
    image_filesystem_location="",
    method="filesystem",
    is_data_deidentified=True,
    file_base_path="",
)
first_data_cohort_at_mgh = session.cohort.add_cohort(first_data_cohort_input)

print(filter_cohort_at_lahey.dict(include={'uid', 'import_status'}))
print(first_data_cohort_at_mgh.dict(include={'uid', 'import_status'}))

{'uid': 'ef60b5d1-a043-4c5c-ba1b-c988ffae1926', 'import_status': 'Complete'}
{'uid': '5c22a788-3908-4cac-8ed7-422b7cf277db', 'import_status': 'Complete'}


In [9]:
from rhino_health.lib.metrics import Count, Mean, StandardDeviation

In [26]:
print("SpO2 values in data cohort for patents > 35 years old")
configuration = Mean(
    variable="SpO2",
    join_field={"data_column": "UID", "filter_column": "Age", "filter_value": 35, "filter_type": ">"},
)

joined_results = session.project.joined_cohort_metric(
    filter_cohorts=[filter_cohort_at_lahey.uid],
    data_cohorts=[first_data_cohort_at_mgh.uid],
    configuration=configuration
)

print(joined_results.output)

SpO2 values in data cohort for patents > 35 years old
{'mean': 1.0231055900621118}


We can compare the federated result to what we would have gotten if we ran this unfederated and see that we get the same response.

In [27]:
import pandas as pd
import numpy as np
filter_df = pd.read_csv("./FederatedJoinFilterCohort.csv")
first_data_df = pd.read_csv("./FederatedJoinDataCohort.csv")

centralized_result = np.mean(first_data_df.SpO2[filter_df["Age"] > 35])
print(centralized_result)
federated_result = joined_results.output['mean']
print(centralized_result == federated_result)

1.0231055900621118
True


In [31]:
print("SpO2 values in data cohort for male patents > 35 years old")

configuration = Mean(
    variable={"data_column": "SpO2", "filter_column": "Gender", "filter_value": "m", "filter_type": "="},
    join_field={"data_column": "UID", "filter_column": "Age", "filter_value": 35, "filter_type": ">"},
)

joined_results = session.project.joined_cohort_metric(
    filter_cohorts=[filter_cohort_at_lahey.uid],
    data_cohorts=[first_data_cohort_at_mgh.uid],
    configuration=configuration
)

federated_result = joined_results.output['mean']
print(f"Federeated Mean: {federated_result}")

a = filter_df["Age"] > 35
b = first_data_df["Gender"] == "m"
centralized_result = np.mean(first_data_df.SpO2[a&b])
print(f"Centralized Mean: {centralized_result}")
print(centralized_result == federated_result)

SpO2 values in data cohort for male patents > 35 years old
Federeated Mean: 1.0224691358024691
Centralized Mean: 1.0224691358024691
True


In [34]:
print("Works with grouping. SpO2 values for both genders >35 years old")

configuration = Mean(
    variable="SpO2",
    join_field={"data_column": "UID", "filter_column": "Age", "filter_value": 35, "filter_type": ">"},
    group_by={"groupings": ["Gender"]},
)

joined_results = session.project.joined_cohort_metric(
    filter_cohorts=[filter_cohort_at_lahey.uid],
    data_cohorts=[first_data_cohort_at_mgh.uid],
    configuration=configuration
)

print(joined_results.output)

c = first_data_df["Gender"] == "f"
female_centralized_result = np.mean(first_data_df.SpO2[a&c])
print(female_centralized_result == joined_results.output['f']['mean'])

Works with grouping. SpO2 values for both genders >35 years old
{'m': {'mean': 1.0224691358024691}, 'f': {'mean': 1.02375}}
True


In [50]:
print("Filtering on multiple data columns")

configuration = Mean(
    variable="SpO2",
    join_field="UID",
    data_filters=[
        {
            "filter_column": "Age",
            "filter_value": 35,
            "filter_type": ">",
            "filter_cohort": filter_cohort_at_lahey.uid,  
        },
        {
            "filter_column": "Age",
            "filter_value": 35,
            "filter_type": ">",
            # For intersection mode, if unspecified defaults to the data cohort.
            # "filter_cohort": first_data_cohort_at_mgh.uid,  
        },
        {
            "filter_column": "BloodType",
            "filter_value": "a",
            "filter_type": "=",
            "filter_cohort": filter_cohort_at_lahey.uid,
        },
        {
            "filter_column": "Gender",
            "filter_value": "m",
            "filter_type": "=",
            "filter_cohort": first_data_cohort_at_mgh.uid,
        }
    ],
)

joined_results = session.project.joined_cohort_metric(
    filter_cohorts=[filter_cohort_at_lahey.uid],
    data_cohorts=[first_data_cohort_at_mgh.uid],
    configuration=configuration
)
print(joined_results.output)
federated_result = joined_results.output['mean']

a = filter_df["Age"] > 35
b = filter_df["BloodType"] == "a"
valid_uids = first_data_df.UID.isin(filter_df[a&b].UID)
c = first_data_df["Age"] > 35
d = first_data_df["Gender"] == "m"
centralized_result = np.mean(first_data_df.SpO2[valid_uids&c&d])
print(centralized_result)
print(centralized_result == federated_result)

Filtering on multiple data columns
{'mean': 1.0492307692307692}
1.0492307692307692
True


# Union Mode

We have three datasets for the same metric.

> - Current Latest SpO2 
> - Backup dataset with old SpO2 values
> - Separate initial diagnostics labratory where patients might have transferred from

We want a deduplicated view of the data where we prefer using our own data first, and if the patient is not found we fall back to the other data sources

In [45]:
second_data_cohort_input = CohortCreateInput(
    name="SpO2 Values (Old)",
    description="Identifiers with SPO2 and Gender no Blood Type",
    project_uid=project.uid,
    workgroup_uid=WORKGROUP_UID,
    data_schema_uid=data_schema_uid,
    csv_filesystem_location="./rhinoOnPrem/rhinoAgent/tests/test_data/FederatedJoinUnionCohort1.csv",
    image_filesystem_location="",
    method="filesystem",
    is_data_deidentified=True,
    file_base_path="",
)
second_data_cohort_old_values = session.cohort.add_cohort(second_data_cohort_input)

third_data_cohort_input = CohortCreateInput(
    name="SpO2 Values at Diagnostics Lab",
    description="Identifiers with SPO2 and Gender no Blood Type",
    project_uid=project.uid,
    workgroup_uid=WORKGROUP_UID,
    data_schema_uid=data_schema_uid,
    csv_filesystem_location="./rhinoOnPrem/rhinoAgent/tests/test_data/FederatedJoinUnionCohort2.csv",
    image_filesystem_location="",
    method="filesystem",
    is_data_deidentified=True,
    file_base_path="",
)
third_data_cohort_at_lab = session.cohort.add_cohort(third_data_cohort_input)


union_cohort_uids = [first_data_cohort_at_mgh.uid, second_data_cohort_old_values.uid, third_data_cohort_at_lab.uid]

print(union_cohort_uids)

['5c22a788-3908-4cac-8ed7-422b7cf277db', 'abcaa0d7-1cc5-4092-9ba8-43cadc6b8f2c', '9ff343c8-01a3-4ea7-b263-209d4b0ff524']


In [38]:
second_data_df = pd.read_csv("./FederatedJoinUnionCohort1.csv")
third_data_df = pd.read_csv("./FederatedJoinUnionCohort2.csv")

In [49]:
print("Union mean using data filters")

configuration = Mean(
    variable="SpO2",
    join_field="UID",
    join_mode="union",
    data_filters=[  # Data Filters are applied before any other operations
        {
            "filter_column": "Age",
            "filter_value": 35,
            "filter_type": ">",
            # Applied to every Union cohort
        },
        {
            "filter_column": "Gender",
            "filter_value": "m",
            "filter_type": "=",
            "filter_cohort": second_data_cohort_old_values.uid,
        },
        {
            "filter_column": "Gender",
            "filter_value": "f",
            "filter_type": "=",
            "filter_cohort": third_data_cohort_at_lab.uid,
        },
    ],
)

joined_results = session.project.joined_cohort_metric(
    data_cohorts=union_cohort_uids,
    configuration=configuration
)

print(joined_results.output)

federated_result = joined_results.output['mean']

one = first_data_df[(first_data_df.Age > 35)]
two = second_data_df[(second_data_df.Age > 35) & (second_data_df.Gender == "m") & (~second_data_df.UID.isin(one.UID))]
three = third_data_df[(third_data_df.Age > 35) & (third_data_df.Gender == "m") & (~third_data_df.UID.isin(two.UID)) & (~third_data_df.UID.isin(one.UID))]

centralized_result = np.mean(pd.concat([one.SpO2, two.SpO2, three.SpO2]))

print(centralized_result)
print(centralized_result == federated_result)
            

Union mean using data filters
{'mean': 1.0182239382239384}
1.0182239382239384
True


### Note
Unlike using DataFilters, FilterVariables on the variable field are applied **after** joins are performed. See the example below for the difference

In [52]:
print("Data Filter are applied prior to joining")
configuration = Mean(
    variable="SpO2",
    join_field="UID",
    join_mode="union",
    data_filters=[  # Data Filters are applied before any other operations
        {
            "filter_column": "Age",
            "filter_value": 35,
            "filter_type": ">",
        },
        {
            "filter_column": "Gender",
            "filter_value": "m",
            "filter_type": "=",
        },
    ],
)
joined_results = session.project.joined_cohort_metric(
    data_cohorts=union_cohort_uids,
    configuration=configuration
)

print(joined_results.output)

federated_result = joined_results.output['mean']

one = first_data_df[(first_data_df.Age > 35) & (first_data_df.Gender == "m")]
two = second_data_df[(second_data_df.Age > 35) & (second_data_df.Gender == "m") & (~second_data_df.UID.isin(one.UID))]
three = third_data_df[(third_data_df.Age > 35) & (third_data_df.Gender == "m") & (~third_data_df.UID.isin(two.UID)) & (~third_data_df.UID.isin(one.UID))]

centralized_data_filter_result = np.mean(pd.concat([one.SpO2, two.SpO2, three.SpO2]))
print(centralized_data_filter_result)
print(centralized_data_filter_result == federated_result)

Data Filter are applied prior to joining
{'mean': 1.0301869158878503}
1.0301869158878503
True


In [67]:
print("FilterVariables are applied after joining")  # TODO: Fix this bug

from rhino_health.lib.metrics.base_metric import JoinMode

configuration = Mean(
    variable={"data_column": "SpO2", "filter_column": "Gender", "filter_value": "m", "filter_type": "="},
    join_field={"data_column": "UID", "filter_column": "Age", "filter_value": 35, "filter_type": ">"},
    join_mode=JoinMode.UNION
)

joined_results = session.project.joined_cohort_metric(
    data_cohorts=union_cohort_uids,
    configuration=configuration
)

print(joined_results.output)

federated_result = joined_results.output['mean']
print(centralized_data_filter_result)
print(centralized_data_filter_result == federated_result)

# This is the logic performed instead
one = first_data_df[(first_data_df.Gender == "m")]
one_join = first_data_df[(first_data_df.Age > 35)]
two = second_data_df[(second_data_df.Gender == "m") & (~second_data_df.UID.isin(one_join.UID))]
two_join = second_data_df[(second_data_df.Age > 35) & (~second_data_df.UID.isin(one_join.UID))]
three = third_data_df[(third_data_df.Gender == "m") & (~third_data_df.UID.isin(two_join.UID)) & (~third_data_df.UID.isin(one_join.UID))]

centralized_filter_variable_result = np.mean(pd.concat([one.SpO2, two.SpO2, three.SpO2]))
print(centralized_filter_variable_result)

print(centralized_filter_variable_result == federated_result)


FilterVariables are applied after joining
{'mean': 1.0259375000000002}
1.0301869158878503
False
1.0259375000000002
True
