# Data Scientist

### Install dependencies

In [None]:
!uv pip install syft_flwr numpy pandas matplotlib seaborn

## Log into DOs' datasites

In [None]:
from pathlib import Path

from syft_core import Client

# Email addresses of the two data owners (DOs); used below as session hosts
# and as the datasites passed to the bootstrap step.
DO1 = "flower-test-group-1@openmined.org"
DO2 = "flower-test-group-2@openmined.org"
# Email of the client returned by `Client.load()`; used below as the data
# scientist (DS) identity and as the aggregator when bootstrapping.
DS = Client.load().email
DS  # bare last expression so the notebook displays the value

In [None]:
# Toggle: False opens sessions through `syft_rds` directly, True goes through a
# locally set-up RDS server stack.
LOCAL_TEST = False

if not LOCAL_TEST:
    import syft_rds as sy

    do_client_1 = sy.init_session(host=DO1)
    do_client_2 = sy.init_session(host=DO2)
else:
    from syft_rds.orchestra import setup_rds_server

    ds_stack = setup_rds_server(email=DS, key="flwr", root_dir=Path("."))
    do_client_1 = ds_stack.init_session(host=DO1)
    do_client_2 = ds_stack.init_session(host=DO2)

# Report the host of each opened session.
for _do_client in (do_client_1, do_client_2):
    print("Logged into: ", _do_client.host)

### Inspect DOs' Mock Datasets

In [None]:
# Dataset name looked up on both datasites and reused when submitting jobs.
SYFTBOX_DATASET_NAME = "pima-indians-diabetes-database"

# Fetch the dataset by name from the first data owner's session; `describe()`
# is the cell's last expression so its output is displayed.
dataset1 = do_client_1.dataset.get(name=SYFTBOX_DATASET_NAME)
dataset1.describe()

In [None]:
# Same lookup through the second data owner's session; `describe()` is the
# cell's last expression so its output is displayed.
dataset2 = do_client_2.dataset.get(name=SYFTBOX_DATASET_NAME)
dataset2.describe()

## DS does some data analytics on mock datasets

For each client:
1. Concatenate the dataframes loaded from `train.csv` and `test.csv`
2. Calculate key statistical moments (count, sum, mean) and frequency distributions (histograms) for important features ('Glucose', 'BMI', 'Age'), and visualize the histograms

In [None]:
import pandas as pd

# Read the train and test CSVs from each dataset's mock path.
df1_train, df1_test = (
    pd.read_csv(dataset1.get_mock_path() / split_file)
    for split_file in ("train.csv", "test.csv")
)
df2_train, df2_test = (
    pd.read_csv(dataset2.get_mock_path() / split_file)
    for split_file in ("train.csv", "test.csv")
)

# Stack the four frames into a single table with a fresh integer index.
df = pd.concat(
    [df1_train, df1_test, df2_train, df2_test],
    ignore_index=True,
)
df.info()

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from loguru import logger


# --- Plotting Function (adapted to take feature-specific metrics) ---
def plot_feature_histogram_from_metrics_plt(
    feature_name: str, metrics_dict: dict, feature_bins_config: dict
):
    """Plot, save and show overlaid per-outcome histograms for one feature.

    The bars are drawn with ``plt.bar`` from pre-computed bin counts rather
    than from raw data.

    Args:
        feature_name: Feature to plot. Used to look up the bin edges in
            ``feature_bins_config`` and the ``<feature>_hist_outcome0/1`` and
            ``<feature>_count_outcome0/1`` entries in ``metrics_dict``.
        metrics_dict: Pre-computed metrics. An outcome is drawn only when its
            histogram entry is present, its count is greater than zero and the
            histogram has exactly one value per bin.
        feature_bins_config: Maps feature names to a 1-D sequence of bin edges
            (at least two edges, i.e. one bin).

    Returns:
        None. When there is nothing valid to draw, a message is printed and
        the function returns before any plotting state is touched. Otherwise
        the figure is saved to ``./mock_figures/<feature>_histogram.png``,
        shown, and closed.
    """
    print(f"\nPlotting histogram for: {feature_name} using plt.bar")

    bin_edges = feature_bins_config.get(feature_name)
    if bin_edges is None:
        print(
            f"Error: Bin edges not defined for feature '{feature_name}'. Cannot plot."
        )
        return

    # np.diff needs an array; fewer than two edges would describe zero bins
    # and silently produce an empty plot.
    bin_edges = np.asarray(bin_edges)
    if bin_edges.ndim != 1 or bin_edges.size < 2:
        print(
            f"Error: Bin edges for feature '{feature_name}' must be a 1-D sequence of at least two values. Cannot plot."
        )
        return

    bin_widths = np.diff(bin_edges)
    n_bins = len(bin_widths)

    # (outcome value, legend label, bar colour) for each series to overlay.
    outcome_styles = (
        (0, "No Diabetes (0)", "skyblue"),
        (1, "Diabetes (1)", "salmon"),
    )

    # Validate every series first so that no figure is created (and the global
    # seaborn theme is left alone) when there is nothing to draw.
    series_to_plot = []
    for outcome, legend_label, color in outcome_styles:
        frequencies = _plottable_frequencies(
            feature_name, outcome, metrics_dict, n_bins
        )
        if frequencies is not None:
            series_to_plot.append((frequencies, legend_label, color))

    if not series_to_plot:
        print(f"Info: No valid histogram data to plot for feature '{feature_name}'.")
        return

    sns.set_theme(style="whitegrid")  # Apply Seaborn style
    fig = plt.figure(figsize=(10, 6))
    try:
        for frequencies, legend_label, color in series_to_plot:
            plt.bar(
                bin_edges[:-1],
                frequencies,
                width=bin_widths,
                align="edge",
                alpha=0.6,
                label=legend_label,
                color=color,
            )

        plt.title(f"Local Histogram: {feature_name}")
        plt.xlabel(feature_name)
        plt.ylabel("Local Frequency")
        plt.xticks(bin_edges, rotation=45, ha="right")
        plt.legend(title="Diabetes Status")
        plt.grid(axis="y", linestyle="--")
        plt.tight_layout()

        # Save plots
        save_dir = Path("./mock_figures")
        if not save_dir.exists():
            save_dir.mkdir(parents=True, exist_ok=True)
            print(f"Created directory: {save_dir}")

        file_path = save_dir / f"{feature_name}_histogram.png"
        try:
            plt.savefig(file_path)
            print(f"Plot saved to {file_path}")
        except Exception as e:
            # A failed save should not prevent the figure from being shown.
            print(f"Error saving plot for {feature_name}: {e}")

        plt.show()
    finally:
        # Close this specific figure even if drawing or showing raised.
        plt.close(fig)


def _plottable_frequencies(
    feature_name: str, outcome: int, metrics_dict: dict, n_bins: int
):
    """Return one outcome's bin counts as an array, or None if not plottable."""
    hist = metrics_dict.get(f"{feature_name}_hist_outcome{outcome}")
    count = metrics_dict.get(f"{feature_name}_count_outcome{outcome}", 0)
    if hist is None or not count > 0:
        return None

    frequencies = np.array(hist)
    if len(frequencies) != n_bins:
        print(
            f"Warning: Mismatch for {feature_name} Outcome {outcome}. Frequencies length {len(frequencies)}, Expected bins {n_bins}."
        )
        return None
    return frequencies

In [None]:
# Features to summarise, the label column, and the histogram bin edges per feature.
KEY_DIABETES_FEATURES = ["Glucose", "BMI", "Age"]
DIABETES_OUTCOME_COLUMN = "y"
FEATURE_BINS = {
    "Glucose": np.linspace(40, 250, 11),  # 10 bins from 40 to 250
    "BMI": np.linspace(15, 60, 10),  # 9 bins from 15 to 60
    "Age": np.linspace(20, 90, 15),  # 14 bins from 20 to 90
}

# Flat dict keyed "<feature>_<metric>_outcome<0|1>", metric in {hist, mean, sum, count}.
metrics = {}

for feature_name in KEY_DIABETES_FEATURES:
    logger.info(f"Calculating metrics for feature: {feature_name}")

    if feature_name not in df.columns:
        logger.warning(f"Feature '{feature_name}' not found in DataFrame. Skipping.")
        continue

    # Coerce to numeric only after the column is known to exist, so a missing
    # column is skipped with a warning instead of raising KeyError.
    # Unparseable values become NaN and are dropped below.
    df[feature_name] = pd.to_numeric(df[feature_name], errors="coerce")

    current_bin_edges = FEATURE_BINS[feature_name]

    # Same metrics for each outcome class: y=0 first, then y=1.
    for outcome in (0, 1):
        key_suffix = f"outcome{outcome}"
        outcome_rows = df[DIABETES_OUTCOME_COLUMN] == outcome
        feature_data = df.loc[outcome_rows, feature_name].dropna()

        if not feature_data.empty:
            # NOTE: np.histogram ignores values outside the configured edges,
            # while mean/sum/count below still include them.
            freqs = np.histogram(feature_data, bins=current_bin_edges)[0]
            metrics[f"{feature_name}_hist_{key_suffix}"] = freqs.tolist()
            metrics[f"{feature_name}_mean_{key_suffix}"] = feature_data.mean()
            metrics[f"{feature_name}_sum_{key_suffix}"] = feature_data.sum()
            metrics[f"{feature_name}_count_{key_suffix}"] = len(feature_data)
        else:  # No rows for this outcome: emit an all-zero histogram and NaN moments
            metrics[f"{feature_name}_hist_{key_suffix}"] = [0] * (
                len(current_bin_edges) - 1
            )
            metrics[f"{feature_name}_mean_{key_suffix}"] = np.nan
            metrics[f"{feature_name}_sum_{key_suffix}"] = np.nan
            metrics[f"{feature_name}_count_{key_suffix}"] = 0

    plot_feature_histogram_from_metrics_plt(
        feature_name=feature_name,
        metrics_dict=metrics,
        feature_bins_config=FEATURE_BINS,
    )

logger.info("--- Calculated Metrics ---")
# Log every metric; long lists are truncated and floats shown with two decimals.
for k, v in metrics.items():
    if isinstance(v, list) and len(v) > 5:
        logger.info(f"  {k}: {str(v)[:30]}... (length {len(v)})")
    elif isinstance(v, (np.float64, float)):
        logger.info(f"  {k}: {v:.2f}")
    else:
        logger.info(f"  {k}: {v}")
logger.info("------------------------")

## Preparing `syft_flwr` project code

```bash
fed-analytics-diabetes/
├── fed_analytics_diabetes/
│   ├── __init__.py
│   ├── client_app.py
│   └── server_app.py
├── pyproject.toml
└── README.md
```

In [None]:
from pathlib import Path

# Location of the project used by the simulation, bootstrap and job-submission cells below.
SYFT_FLWR_PROJECT_DIR = Path("./fed-analytics-diabetes")
# Fail early and say which path was looked up, since it is resolved relative to
# the notebook's working directory.
assert (
    SYFT_FLWR_PROJECT_DIR.exists()
), f"syft_flwr project directory not found: {SYFT_FLWR_PROJECT_DIR.resolve()}"

## Run `flwr` simulation

After preparing `syft_flwr` code, DS runs `flwr run` to make sure that it's compatible with Flower

In [None]:
RUN_SIMULATION = 1

if RUN_SIMULATION:
    !flwr run {SYFT_FLWR_PROJECT_DIR}

### Bootstrapping the `flwr` project
DS runs `syft_flwr.bootstrap` to turn a `flwr` project into a `syft_flwr` project

In [None]:
import shutil
from pathlib import Path

import syft_flwr

# Remove artifacts left by earlier runs before bootstrapping. Done in Python
# rather than `!rm -rf` so that the `__pycache__` search is recursive whatever
# shell IPython uses (`**` only recurses when the shell supports globstar).
stale_paths = [
    SYFT_FLWR_PROJECT_DIR / "main.py",
    SYFT_FLWR_PROJECT_DIR / "simulation_logs",
    Path("figures"),
    *SYFT_FLWR_PROJECT_DIR.rglob("__pycache__"),
]
for stale_path in stale_paths:
    if stale_path.is_dir():
        shutil.rmtree(stale_path, ignore_errors=True)
    elif stale_path.exists() or stale_path.is_symlink():
        stale_path.unlink()

try:
    syft_flwr.bootstrap(SYFT_FLWR_PROJECT_DIR, aggregator=DS, datasites=[DO1, DO2])
    print("Bootstrapped project successfully ✅")
except Exception as e:
    print(f"Bootstrapped project failed with error: '{e}' ❌")
    # Re-raise: later cells use `main.py` as the entrypoint, which was deleted
    # above, so continuing after a failed bootstrap only hides the real error.
    raise

### DS runs `syft_flwr` simulation

In [None]:
# Run only when the simulation toggle is on, passing the mock path of each dataset.
if RUN_SIMULATION:
    mock_dataset_paths = [
        mock_dataset.get_mock_path() for mock_dataset in (dataset1, dataset2)
    ]
    syft_flwr.run(SYFT_FLWR_PROJECT_DIR, mock_dataset_paths)

## DS submits jobs to DOs

In [None]:
import shutil
from pathlib import Path

# Remove caches, simulation logs and figures before submitting the project.
# Done in Python rather than `!rm -rf` so that the `__pycache__` search is
# recursive whatever shell IPython uses (`**` only recurses with globstar).
stale_dirs = [
    SYFT_FLWR_PROJECT_DIR / "simulation_logs",
    Path("figures"),
    *SYFT_FLWR_PROJECT_DIR.rglob("__pycache__"),
]
for stale_dir in stale_dirs:
    if stale_dir.is_dir():
        shutil.rmtree(stale_dir, ignore_errors=True)
    elif stale_dir.exists() or stale_dir.is_symlink():
        stale_dir.unlink()

In [None]:
# Jobs submission: send the same project to each data owner's session.
datasites = [do_client_1, do_client_2]

for client in datasites:
    job = client.jobs.submit(
        # NOTE(review): the name still says "iris" although the dataset is
        # SYFTBOX_DATASET_NAME (the diabetes one). Left unchanged in case
        # anything looks jobs up by name; confirm and rename.
        name="iris_fed_analytics",
        # Description corrected to name the dataset actually used.
        description="Syft Flower Federated Analytics on the Pima Indians Diabetes Dataset",
        user_code_path=SYFT_FLWR_PROJECT_DIR,
        dataset_name=SYFTBOX_DATASET_NAME,
        tags=["federated", "analytics", "syft_flwr", "flwr"],
        entrypoint="main.py",
    )
    print(job)

## DS runs FL server code

In [None]:
import os

if LOCAL_TEST:
    os.environ["SYFTBOX_CLIENT_CONFIG_PATH"] = str(ds_stack.client.config_path)

os.environ["LOGURU_LEVEL"] = "DEBUG"
os.environ["SYFT_FLWR_MSG_TIMEOUT"] = "60"

!uv run {str(SYFT_FLWR_PROJECT_DIR / "main.py")} --active