In [None]:
import os
from syft_core.config import CONFIG_PATH_ENV
import syft_rds as sy
from pathlib import Path

In [None]:
# NOTE: normally this is all set for users; they can skip to the next cell.
# Because we are running multiple isolated clients, we need to make sure syftbox gets the right config path.

DATA_OWNER = "data_owner@openmined.org"
DATA_SCIENTIST = "data_scientist@openmined.org"
CLIENTS_PATH = sy.RDS_REPO_PATH / "syft-rds" / ".clients"
syftbox_config_path = CLIENTS_PATH / DATA_SCIENTIST / "config.json"

# Fail early, and say which file is missing, instead of raising a bare AssertionError.
assert syftbox_config_path.exists(), (
    f"SyftBox config for {DATA_SCIENTIST} not found at {syftbox_config_path}"
)

# Point syftbox at the data scientist's config for the rest of this kernel session.
os.environ[CONFIG_PATH_ENV] = syftbox_config_path.as_posix()

In [None]:
# Both the data scientist and the data owner are running an RDS app.
# In this notebook, we only connect to the data owner's RDS app.
# The call is the cell's last expression, so its return value is displayed as the cell output.

sy.discover_rds_apps()

In [None]:
# Open a session with the data owner as host; `ds_client` is the handle used by the later cells.
ds_client = sy.init_session(host=DATA_OWNER)

In [None]:
# To check if we're connected, we call the health endpoint over RPC.
# NOTE: this takes about 5 seconds on a local stack.

# Under the hood (DS = data scientist, DO = data owner):
# - DS makes request file
# - sync server syncs to DO
# - DO makes response file
# - sync server syncs to DS
ds_client.rpc.health()

# DS investigates datasets

In [None]:
# Display the client's `datasets` collection as the cell output.
ds_client.datasets

In [None]:
# Use the first entry of `ds_client.datasets` for the rest of the notebook.
# NOTE(review): indexing with [0] presumably raises if no dataset is listed — confirm.
dataset = ds_client.datasets[0]

### Experiment on mock data

In [None]:
import pandas as pd

# Dry run of the analysis on the mock copy of the dataset.
mock_csv = dataset.mock_path / "data.csv"
print(f"Loading data from {mock_csv.name}...")
mock_df = pd.read_csv(mock_csv, sep=";")

row_count = len(mock_df)
print(f"Calculating feature means for {row_count} rows...")
# Last expression: the per-column means of the numeric columns are shown as the cell output.
mock_df.mean(numeric_only=True)

# DS submits a job on private data

Create a `calculate_mean.py` anywhere on your machine. The cell below uses the `%%writefile` Jupyter magic to make this a bit easier.

In [None]:
# Location of the script that will be submitted as a job: <cwd>/data/calculate_mean.py.
CWD = Path.cwd()
my_file = CWD / "data" / "calculate_mean.py"

# `%%writefile` does not create missing directories, so make sure the target
# folder exists; otherwise the next cell fails on a fresh checkout.
my_file.parent.mkdir(parents=True, exist_ok=True)

In [None]:
%%writefile {my_file}

import os
from pathlib import Path

import pandas as pd

# Standard RDS environment variables; the RDS runner fills these in.
DATA_DIR = Path(os.environ["DATA_DIR"])
OUTPUT_DIR = Path(os.environ["OUTPUT_DIR"])

# DATA_DIR will contain the real data, laid out the same way as the mock data.
input_csv = DATA_DIR / "data.csv"
print(f"Loading data from {input_csv.name}...")
frame = pd.read_csv(input_csv, sep=';')

# Mean of every numeric column.
row_count = len(frame)
print(f"Calculating feature means for {row_count} rows...")
feature_means = frame.mean(numeric_only=True)

# Save the result into OUTPUT_DIR.
feature_means.to_csv(OUTPUT_DIR / "output.csv")

In [None]:
# Submit the script at `my_file` as a job against the selected dataset, referenced by name.
# `job.describe()` is the last expression, so whatever it returns is shown as the cell output.
job = ds_client.jobs.submit(user_code_path=my_file, dataset_name=dataset.name)
job.describe()

# DS views the results

Before executing the cells below, switch to `do_flow.ipynb` to review, execute, and publish the results.

In [None]:
# Refresh the local job object (presumably pulling its latest state — confirm),
# then show its description again.
job.refresh()
job.describe()

In [None]:
# View outputs: load `output/output.csv` under the job's output path and display it.
pd.read_csv(job.output_path / "output" / "output.csv")

In [None]:
# View logs: print the contents of `logs/stdout.log` under the job's output path.
print((job.output_path / "logs" / "stdout.log").read_text())