# Performing Exploratory Data Analysis (EDA) with FCP
This notebook demonstrates how to use the Rhino Health Python SDK to perform EDA with federated analytics.

#### Prerequisites 
1. Have two datasets imported in FCP with Height, Weight, and Gender fields (e.g. from Tutorial 1)

### Initialization and Login

In [None]:
from getpass import getpass
import rhino_health

In [None]:
# Log in to Rhino Health. The password is read interactively via getpass() and
# passed straight to login(), so it is not assigned to a notebook variable.
print("Logging In")
my_username = "my_email@example.com" # Replace this with the email you use to log into Rhino Health
# `session` is the handle used by the following cells for dataset and project calls.
session = rhino_health.login(username=my_username, password=getpass())
print("Logged In")

In [None]:
# Identifiers of the two datasets to analyse; substitute real dataset IDs.
FIRST_TEST_DATASET_ID = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"  # Replace this
SECOND_TEST_DATASET_ID = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"  # Replace this

# Fetch each dataset object through the logged-in session.
first_dataset = session.dataset.get_dataset(FIRST_TEST_DATASET_ID)
second_dataset = session.dataset.get_dataset(SECOND_TEST_DATASET_ID)

# UIDs of both datasets, in site order, for the cross-site aggregation calls.
all_datasets = [dataset.uid for dataset in (first_dataset, second_dataset)]

### Calculate Metrics Per Site
All calculations are performed on-prem; only aggregate data is returned to the notebook.

In [None]:
from rhino_health.lib.metrics import Count, FilterType, Mean, StandardDeviation

In [None]:
print("Simple counts per site")

# Count metric over the "Height" column, evaluated on each dataset separately.
count_verification = Count(variable="Height")

# Map a site label to that site's metric output (site1 is queried first).
individual_results = {
    site_label: session.dataset.get_dataset_metric(dataset.uid, count_verification).output
    for site_label, dataset in (("site1", first_dataset), ("site2", second_dataset))
}

print(f"{individual_results}")

In [None]:
print("Simple mean per site")

# Unfiltered, ungrouped mean of the "Height" column.
mean_verification = Mean(variable="Height")

# Map a site label to that site's metric output (site1 is queried first).
individual_results = {
    site_label: session.dataset.get_dataset_metric(dataset.uid, mean_verification).output
    for site_label, dataset in (("site1", first_dataset), ("site2", second_dataset))
}

print(f"{individual_results}")

In [None]:
print("Filtered Height mean per site")

# Mean of "Height" filtered on the "Gender" column with the value "M".
# No filter_type is given, so the SDK's default comparison applies
# (presumably equality - confirm against the SDK documentation).
mean_verification = Mean(
    variable={
        "data_column": "Height",
        "filter_column": "Gender",
        "filter_value": "M",
    },
)

# Map a site label to that site's metric output (site1 is queried first).
individual_results = {
    site_label: session.dataset.get_dataset_metric(dataset.uid, mean_verification).output
    for site_label, dataset in (("site1", first_dataset), ("site2", second_dataset))
}

print(f"{individual_results}")

In [None]:
print("Grouped Height mean per site")

# Mean of "Height" with results grouped by the "Gender" column.
mean_verification = Mean(variable="Height", group_by={"groupings": ["Gender"]})

# Map a site label to that site's metric output (site1 is queried first).
individual_results = {
    site_label: session.dataset.get_dataset_metric(dataset.uid, mean_verification).output
    for site_label, dataset in (("site1", first_dataset), ("site2", second_dataset))
}

print(f"{individual_results}")


### Calculate Aggregated Metrics Across Multiple Sites
As with the per-site metrics, all calculations are performed on-prem; only aggregate data is returned to the notebook.

In [None]:
print("Aggregate Grouped Height mean")

# Define the grouped metric in this cell rather than reusing whatever
# `mean_verification` was last bound to by an earlier cell. That way the result
# always matches the label above, even when cells are re-run or executed out
# of order (e.g. after re-running the filtered-mean cell).
mean_verification = Mean(
    variable="Height",
    group_by={"groupings": ["Gender"]},
)

# Evaluate the metric over every dataset listed in `all_datasets` together.
grouped_results = session.project.aggregate_dataset_metric(all_datasets, mean_verification)

print(f"{grouped_results.output}")

In [None]:
print("Complex Aggregation")

# Mean of "Height", filtered on the "Weight" column with
# FilterType.GREATER_THAN_EQUAL against 70, and grouped by "Gender".
configuration = Mean(
    variable={
        "data_column": "Height",
        "filter_column": "Weight",
        "filter_value": 70,
        "filter_type": FilterType.GREATER_THAN_EQUAL,
    },
    group_by={"groupings": ["Gender"]}
)

# Evaluate the metric over every dataset listed in `all_datasets` together.
grouped_results = session.project.aggregate_dataset_metric(all_datasets, configuration)
print(f"{grouped_results.output}")


In [None]:
print("Complex Aggregation with Complex Filtering")

# Mean of "Height", filtered on the "Weight" column with FilterType.BETWEEN and
# grouped by "Gender". For BETWEEN, filter_value is a dict holding a "lower" and
# an "upper" bound, each with its own value and comparison type
# (here >= 70 and <= 100).
configuration = Mean(
    variable={
        "data_column": "Height",
        "filter_column": "Weight",
        "filter_value": {
            "lower": {"filter_value": 70, "filter_type": FilterType.GREATER_THAN_EQUAL},
            "upper": {"filter_value": 100, "filter_type": FilterType.LESS_THAN_EQUAL},
        },
        "filter_type": FilterType.BETWEEN,
    },
    group_by={"groupings": ["Gender"]}
)

# Evaluate the metric over every dataset listed in `all_datasets` together.
grouped_results = session.project.aggregate_dataset_metric(all_datasets, configuration)
print(f"{grouped_results.output}")

In [None]:
print("Standard Deviation of Height")

# Standard deviation of the "Height" column, evaluated on each dataset separately.
configuration = StandardDeviation(variable="Height")

# Map a site label to that site's metric output (site1 is queried first).
individual_results = {
    site_label: session.dataset.get_dataset_metric(dataset.uid, configuration).output
    for site_label, dataset in (("site1", first_dataset), ("site2", second_dataset))
}
print(f"{individual_results}")