# Performing Exploratory Data Analysis (EDA) with FCP
This notebook demonstrates how to use the Rhino Health Python SDK to perform EDA using federated analytics.

#### Prerequisites 
1. Have two datasets imported in FCP with Height, Weight, and Gender fields (e.g. from Tutorial 1)

### Initialization and Login

In [None]:
from getpass import getpass
import rhino_health as rh

In [None]:
print("Logging In")
my_username = "my_email@example.com" # Replace this with the email you use to log into Rhino Health
# The SDK is imported as `rh` in the cell above, so it must be referenced through
# that alias; the bare name `rhino_health` is never bound and raises NameError.
# getpass() prompts for the password so it is not stored in the notebook.
session = rh.login(username=my_username, password=getpass())
print("Logged In")

### Run this cell to list all of the projects in your environment:

In [None]:
# Searching with an empty name is used here to list every project in the environment.
projects = session.project.search_for_projects_by_name(name="")

# Print one project name per line.
for found_project in projects:
    print(found_project.name)

### Load the Project you would like to calculate metrics for by entering the Project's name below
Replace `PROJECT_NAME` with the name of your project

In [None]:
# Look the project up by name; a falsy result is treated as "not found".
project = session.project.get_project_by_name("PROJECT_NAME")

if project:
    print("Selected project name:", project.name)
else:
    # Stop here rather than letting later cells fail on a missing project.
    raise ValueError("Project not found.")

### List the available datasets in your project, then load the ones you would like to calculate federated metrics for

Run this cell to list all of your available datasets in your project

In [None]:
# Fetch every dataset visible to the session (empty name as the search term),
# then keep only those attached to the selected project.
all_datasets = session.dataset.search_for_datasets_by_name(name="")
project_datasets = [
    dataset
    for dataset in all_datasets
    # Skip datasets with no project before comparing UIDs.
    if dataset.project and dataset.project.uid == project.uid
]

print(f"\nDatasets in project '{project.name}':")
# One line per dataset: its name followed by the UID needed in the next cell.
for dataset in project_datasets:
    print(f"- {dataset.name} : {dataset.uid}")

In [None]:
# Replace this with the ID of the first test dataset
FIRST_TEST_DATASET_UID = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"
# Replace this with the ID of the second test dataset
SECOND_TEST_DATASET_UID = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"

# Load the two dataset objects that the metric cells below operate on.
first_dataset = session.dataset.get_dataset(FIRST_TEST_DATASET_UID)
second_dataset = session.dataset.get_dataset(SECOND_TEST_DATASET_UID)

# NOTE: this rebinds `all_datasets` (the search results from the listing cell)
# to a list holding just the two selected UIDs.
all_datasets = [selected.uid for selected in (first_dataset, second_dataset)]

### Calculate Metrics Per Site
All calculations are performed on-prem; only aggregate data is returned to the notebook.

In [None]:
from rhino_health.lib.metrics import Count, FilterType, Mean, StandardDeviation

In [None]:
print("Simple counts per site")

# Count metric over the Height column, evaluated separately for each dataset.
count_verification = Count(variable="Height")
individual_results = {
    site: session.dataset.get_dataset_metric(dataset.uid, count_verification).output
    for site, dataset in (("site1", first_dataset), ("site2", second_dataset))
}

print(f"{individual_results}")

In [None]:
print("Simple mean per site")

# Mean metric over the Height column, evaluated separately for each dataset.
mean_verification = Mean(variable="Height")
individual_results = {
    site: session.dataset.get_dataset_metric(dataset.uid, mean_verification).output
    for site, dataset in (("site1", first_dataset), ("site2", second_dataset))
}

print(f"{individual_results}")

In [None]:
print("Filtered Height mean per site")

# Height values filtered on the Gender column with the value "M".
# NOTE(review): no filter_type is given, so presumably this is an equality
# match - confirm against the SDK documentation.
filtered_height = {
    "data_column": "Height",
    "filter_column": "Gender",
    "filter_value": "M",
}
mean_verification = Mean(variable=filtered_height)

individual_results = {
    site: session.dataset.get_dataset_metric(dataset.uid, mean_verification).output
    for site, dataset in (("site1", first_dataset), ("site2", second_dataset))
}

print(f"{individual_results}")

In [None]:
# Mean of Height grouped by the Gender column, evaluated per dataset.
gender_grouping = {"groupings": ["Gender"]}
mean_verification = Mean(variable="Height", group_by=gender_grouping)

individual_results = {
    site: session.dataset.get_dataset_metric(dataset.uid, mean_verification).output
    for site, dataset in (("site1", first_dataset), ("site2", second_dataset))
}

print(f"{individual_results}")


### Calculate Aggregated Metrics Across Multiple Sites
Similarly, all calculations are performed on-prem; only aggregate data is returned to the notebook.

This ensures data privacy is preserved at each site while enabling secure federated analysis.


#### Aggregate Grouped Height Mean

In [None]:
print("Aggregate Grouped Height Mean:")

# Mean of Height grouped by Gender, aggregated across both datasets in one call.
mean_verification = Mean(variable="Height", group_by={"groupings": ["Gender"]})

# The aggregate call is given the dataset UIDs as strings.
dataset_uid_strings = [str(dataset.uid) for dataset in (first_dataset, second_dataset)]
grouped_results = session.project.aggregate_dataset_metric(
    dataset_uids=dataset_uid_strings,
    metric_configuration=mean_verification,
)

print(f"{grouped_results.output}")

#### Complex Aggregation (Filter + Grouping)

In [None]:
print("Complex Aggregation (Filtered and Grouped):")

# Height values filtered on Weight using GREATER_THAN_EQUAL with the value 70.
height_filtered_by_weight = {
    "data_column": "Height",
    "filter_column": "Weight",
    "filter_value": 70,
    "filter_type": FilterType.GREATER_THAN_EQUAL,
}
# The filtered mean is additionally grouped by the Gender column.
configuration = Mean(
    variable=height_filtered_by_weight,
    group_by={"groupings": ["Gender"]},
)

# Aggregate across both datasets; UIDs are passed as strings.
dataset_uid_strings = [str(dataset.uid) for dataset in (first_dataset, second_dataset)]
grouped_results = session.project.aggregate_dataset_metric(
    dataset_uids=dataset_uid_strings,
    metric_configuration=configuration,
)

print(f"{grouped_results.output}")

#### Complex Aggregation with Complex Filtering

In [None]:
print("Complex Aggregation with Complex Filtering:")

# For a BETWEEN filter, each bound carries its own value and comparison type.
weight_bounds = {
    "lower": {"filter_value": 70, "filter_type": FilterType.GREATER_THAN_EQUAL},
    "upper": {"filter_value": 100, "filter_type": FilterType.LESS_THAN_EQUAL},
}
height_filtered_by_weight = {
    "data_column": "Height",
    "filter_column": "Weight",
    "filter_value": weight_bounds,
    "filter_type": FilterType.BETWEEN,
}
# The filtered mean is additionally grouped by the Gender column.
configuration = Mean(
    variable=height_filtered_by_weight,
    group_by={"groupings": ["Gender"]},
)

# Aggregate across both datasets; UIDs are passed as strings.
dataset_uid_strings = [str(dataset.uid) for dataset in (first_dataset, second_dataset)]
grouped_results = session.project.aggregate_dataset_metric(
    dataset_uids=dataset_uid_strings,
    metric_configuration=configuration,
)

print(f"{grouped_results.output}")

#### Standard Deviation of Height (Per Site, Not Federated)

In [None]:
print("Standard Deviation of Height (per site, not aggregated):")

# Standard deviation of the Height column, requested from each dataset
# individually rather than through the project-level aggregate call.
configuration = StandardDeviation(variable="Height")
individual_results = {
    site: session.dataset.get_dataset_metric(dataset.uid, configuration).output
    for site, dataset in (("site1", first_dataset), ("site2", second_dataset))
}

print(f"{individual_results}")