# Data Scientist

## Log into DOs' datasites

In [None]:
from pathlib import Path

from syft_rds.orchestra import setup_rds_server

# Email identities: the data scientist (DS) and the two data owners (DOs).
DS = "ds@openmined.org"
DO1 = "do1@openmined.org"
DO2 = "do2@openmined.org"

# RDS server stack for the data scientist, rooted at the current directory.
ds_stack = setup_rds_server(email=DS, key="flwr", root_dir=Path("."))


def _login(host):
    """Open a session on `host` through the DS stack and report the host reached."""
    session = ds_stack.init_session(host=host)
    print("Logged into: ", session.host)
    return session


do_client_1 = _login(DO1)
do_client_2 = _login(DO2)

### Inspect DOs' Mock Datasets

In [None]:
# Name used to look the dataset up on each DO's datasite; it is reused for the
# second DO and again when submitting jobs.
SYFTBOX_DATASET_NAME = "pima-indians-diabetes-database"

# Fetch the dataset handle from the first DO and display its description.
dataset1 = do_client_1.dataset.get(name=SYFTBOX_DATASET_NAME)
dataset1.describe()

In [None]:
# Same lookup against the second DO, using the shared dataset name.
dataset2 = do_client_2.dataset.get(name=SYFTBOX_DATASET_NAME)
dataset2.describe()

## DS does some data analytics on mock datasets

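Using the mock data from both DOs:
1. Load `train.csv` and `test.csv` from each DO's mock dataset
2. Concatenate them into a single dataframe and derive a "Nutritional Status" column from `BMI`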

In [None]:
import pandas as pd

# Read the train/test splits from each DO's mock dataset directory.
# Read order is unchanged: DO1 train, DO1 test, DO2 train, DO2 test.
df1_train, df1_test = (
    pd.read_csv(dataset1.get_mock_path() / split_file)
    for split_file in ("train.csv", "test.csv")
)

df2_train, df2_test = (
    pd.read_csv(dataset2.get_mock_path() / split_file)
    for split_file in ("train.csv", "test.csv")
)

In [None]:
# Stack all four mock splits into one frame with a fresh 0..n-1 index,
# then preview the first 11 rows.
pima = pd.concat(
    (df1_train, df1_test, df2_train, df2_test),
    ignore_index=True,
)
pima.iloc[:11]

In [None]:
# Keep the column index of the combined frame in a variable and display it.
pima_col_idx = pima.columns
pima_col_idx

In [None]:
# Show the dtype of each column of the combined frame.
pima.dtypes

In [None]:
# Create Nutritional status column
#
# Label every row of `pima` by its BMI value:
#   BMI == 0.0        -> "NA"
#   BMI <  18.5       -> "Underweight"
#   18.5 <= BMI < 25  -> "Normal"
#   25   <= BMI < 30  -> "Overweight"
#   BMI >= 30         -> "Obese"
#   missing BMI (NaN) -> stays NaN
#
# The labels are computed in one vectorised pass and share `pima`'s index.
# This avoids growing an empty, dtype-less Series one element at a time and
# the chained `pima["BMI"][i]` label lookup.

bmi = pima["BMI"]

# Left-closed bins ([low, high)) reproduce the `<` / `>=` comparisons above.
Nutritional_status = pd.cut(
    bmi,
    bins=[float("-inf"), 18.5, 25, 30, float("inf")],
    labels=["Underweight", "Normal", "Overweight", "Obese"],
    right=False,
).astype(object)  # plain strings, so the extra "NA" label can be assigned

# A BMI of exactly 0.0 gets the literal string "NA" instead of a weight class.
Nutritional_status[bmi == 0.0] = "NA"

In [None]:
# Add the derived labels as column number 6 (0-based) of `pima`.
# NOTE(review): position 6 presumably places it right after "BMI" — confirm
# against the dataset's column order.
# `DataFrame.insert` raises ValueError if the column already exists, so on a
# re-run of this cell the existing column is refreshed in place instead.
if "Nutritional Status" not in pima.columns:
    pima.insert(6, "Nutritional Status", Nutritional_status)
else:
    pima["Nutritional Status"] = Nutritional_status

In [None]:
# Count how many rows carry each nutritional-status label.
pima["Nutritional Status"].value_counts()

## Preparing `syft_flwr` project code

```bash
fed-analytics-diabetes/
├── fed_analytics_diabetes/
│   ├── __init__.py
│   ├── client_app.py
│   └── server_app.py
├── pyproject.toml
└── README.md
```

In [None]:
from pathlib import Path

# Directory holding the Flower project, relative to the notebook's working
# directory. Fail early, naming the resolved path, if it is not there.
SYFT_FLWR_PROJECT_DIR = Path("./fed-analytics-diabetes")
assert SYFT_FLWR_PROJECT_DIR.exists(), (
    f"syft_flwr project directory not found: {SYFT_FLWR_PROJECT_DIR.resolve()}"
)

## Run `flwr` simulation

After preparing `syft_flwr` code, DS runs `flwr run` to make sure that it's compatible with Flower

In [None]:
# Flag gating the simulation runs: it is checked here and again by the later
# `syft_flwr.run` cell. Set it to 0 to skip both.
RUN_SIMULATION = 1

if RUN_SIMULATION:
    # IPython shell escape; `{...}` is expanded to the project directory path.
    !flwr run {SYFT_FLWR_PROJECT_DIR}

### Bootstrapping the `flwr` project
DS runs `syft_flwr.bootstrap` to turn a `flwr` project into a `syft_flwr` project

In [None]:
import syft_flwr

try:
    !rm -rf {SYFT_FLWR_PROJECT_DIR / "main.py"}
    syft_flwr.bootstrap(SYFT_FLWR_PROJECT_DIR, aggregator=DS, datasites=[DO1, DO2])
    print("Bootstrapped project successfully ✅")
except Exception as e:
    print(f"Bootstrapped project failed with error: '{e}' ❌")

### DS runs `syft_flwr` simulation

In [None]:
# Run the syft_flwr project against both DOs' mock dataset paths,
# only when simulations are enabled.
if RUN_SIMULATION:
    syft_flwr.run(
        SYFT_FLWR_PROJECT_DIR,
        [mock_ds.get_mock_path() for mock_ds in (dataset1, dataset2)],
    )

## DS submits jobs to DOs

In [None]:
!rm -rf {SYFT_FLWR_PROJECT_DIR / "pandas_example" / "__pycache__/"}
!rm -rf {SYFT_FLWR_PROJECT_DIR / "simulation_logs"}

In [None]:
# Jobs submission
# Submit the bootstrapped project as a job to every DO session. The job name
# and description now match the dataset actually used
# (`pima-indians-diabetes-database`) rather than the Iris dataset.
datasites = [do_client_1, do_client_2]

for client in datasites:
    job = client.jobs.submit(
        name="diabetes_fed_analytics",
        description="Syft Flower Federated Analytics on the Pima Indians Diabetes Dataset",
        user_code_path=SYFT_FLWR_PROJECT_DIR,
        dataset_name=SYFTBOX_DATASET_NAME,
        tags=["federated", "analytics", "syft_flwr", "flwr"],
        entrypoint="main.py",
    )
    print(job)

## DS runs FL server code

In [None]:
import os

os.environ["SYFTBOX_CLIENT_CONFIG_PATH"] = str(ds_stack.client.config_path)
os.environ["LOGURU_LEVEL"] = "DEBUG"

!uv run {str(SYFT_FLWR_PROJECT_DIR / "main.py")} --active