In [None]:
import os
from syft_core.config import CONFIG_PATH_ENV
import syft_rds as sy

# Setup 

In [None]:
# NOTE normally this is all set for users, they can skip to the next cell
# Because we are running multiple isolated clients, we need to make sure syftbox gets the right config path

# Email identities of the two local clients used throughout the notebooks.
DATA_OWNER = "data_owner@openmined.org"
DATA_SCIENTIST = "data_scientist@openmined.org"

# Each client keeps its own config under <repo>/syft-rds/.clients/<email>/config.json
CLIENTS_PATH = sy.RDS_REPO_PATH / "syft-rds" / ".clients"
syftbox_config_path = CLIENTS_PATH / DATA_OWNER / "config.json"

# Fail early, and say which file is missing, instead of a bare AssertionError.
assert syftbox_config_path.exists(), (
    f"SyftBox config not found at {syftbox_config_path.as_posix()}. "
    f"Is the {DATA_OWNER} client set up under {CLIENTS_PATH.as_posix()}?"
)

# Point syftbox at the data owner's config for the rest of this kernel session.
os.environ[CONFIG_PATH_ENV] = syftbox_config_path.as_posix()

## Connect

In [None]:
# Both the data scientist and the data owner are running an RDS app.
# In this notebook, we only connect to the data owner's RDS app.
# The call below is the last expression of the cell, so its return value is shown as the cell output.
# NOTE(review): presumably this lists the RDS apps visible to this client; confirm against the syft_rds docs.

sy.discover_rds_apps()

In [None]:
# Session with the data owner's RDS app; `do_client` is the handle used by every cell below.
do_client = sy.init_session(host=DATA_OWNER)

In [None]:
# To check that we are connected, call the health endpoint over RPC.
# NOTE this is almost instant, because the server is running on this datasite
do_client.rpc.health()

# Data owner creates a dataset

Prepare a small wine quality dataset from the UCI archive<br>
https://archive.ics.uci.edu/dataset/186/wine+quality

In [None]:
from pathlib import Path
import urllib.request
import os
import random

dataset_name = "red-wine-quality"
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# Fraction of the downloaded rows published as the mock dataset; the rest stay private.
MOCK_FRACTION = 0.2
# Fixed seed so the mock/private split is identical on every fresh run.
SPLIT_SEED = 42
# Abort the download instead of hanging the kernel if the server does not respond.
DOWNLOAD_TIMEOUT_S = 30

# Create paths
CWD = Path.cwd()
dataset_dir = CWD / "data" / dataset_name
private_dir = dataset_dir / "private"
mock_dir = dataset_dir / "mock"
markdown_path = dataset_dir / "description.md"
private_csv_path = private_dir / "data.csv"
mock_csv_path = mock_dir / "data.csv"

readme = """
# Red wine quality dataset

## About
The two datasets are related to red and white variants of the Portuguese "Vinho Verde" wine. 
For more details, consult: http://www.vinhoverde.pt/en/ or the reference [Cortez et al., 2009].
Due to privacy and logistic issues, only physicochemical (inputs) and sensory (the output) 
variables are available (e.g. there is no data about grape types, wine brand, wine selling price).

These datasets can be viewed as classification or regression tasks. The classes are ordered and 
not balanced (e.g. there are many more normal wines than excellent or poor ones). Outlier detection 
algorithms could be used to detect the few excellent or poor wines.

## Source
https://archive.ics.uci.edu/dataset/186/wine+quality
"""

# Download the dataset to a local folder (no-syftbox yet)
# Check for the output files rather than the directory: a previous run that failed
# part-way may have left the directory behind without the data in it.
if all(path.exists() for path in (private_csv_path, mock_csv_path, markdown_path)):
    print("Dataset already exists, skipping download")
else:
    print(f"Downloading {dataset_name} dataset to {dataset_dir.as_posix()}")

    # Download files in memory; blank lines are dropped so they never become CSV rows
    with urllib.request.urlopen(url, timeout=DOWNLOAD_TIMEOUT_S) as response:
        data = [
            line for line in response.read().decode("utf-8").splitlines() if line.strip()
        ]
    if len(data) < 2:
        raise ValueError(f"Downloaded file from {url} contains no data rows")
    header = data[0]
    data_rows = data[1:]

    # Split 20/80 into mock/private.
    # A dedicated seeded generator keeps the split reproducible without touching
    # the global `random` state.
    random.Random(SPLIT_SEED).shuffle(data_rows)
    split_idx = int(len(data_rows) * MOCK_FRACTION)
    mock_rows = data_rows[:split_idx]
    private_rows = data_rows[split_idx:]
    mock_data = [header] + mock_rows
    private_data = [header] + private_rows

    # Create directories only once the download has succeeded
    private_dir.mkdir(parents=True, exist_ok=True)
    mock_dir.mkdir(parents=True, exist_ok=True)

    private_csv_path.write_text("\n".join(private_data), encoding="utf-8")
    mock_csv_path.write_text("\n".join(mock_data), encoding="utf-8")
    markdown_path.write_text(readme, encoding="utf-8")

    # Report data rows only; the header line is not counted
    print(
        f"Dataset {dataset_name} downloaded and split into mock ({len(mock_rows)} rows) and private ({len(private_rows)} rows)"
    )

In [None]:
# Add the dataset to syftbox, using the local folders and description file prepared in the previous cell.

dataset = do_client.dataset.create(
    name=dataset_name,  # Must be unique: an exception is thrown if a dataset with this name already exists.
    path=private_dir,  # Must exist on disk.
    mock_path=mock_dir,
    summary="A dataset on red wine quality, downloaded from the UCI Machine Learning Repository",
    description_path=markdown_path,
)

In [None]:
# Call describe() on the dataset object returned by create() above to inspect what was registered.
dataset.describe()

# Data owner reviews incoming jobs

Before executing the cells below, switch to `ds_flow.ipynb` and submit a job on the wine quality dataset.

In [None]:
# Fetch the jobs whose status is "pending_code_review".
jobs = do_client.job.get_all(status="pending_code_review")
# Bare last expression: shows the fetched jobs as the cell output.
jobs

In [None]:
# Review the first pending job. If nothing has been submitted yet, explain what to do
# instead of surfacing a bare "list index out of range".
try:
    job = jobs[0]
except IndexError as err:
    raise IndexError(
        "No jobs with status 'pending_code_review' were found. "
        "Submit a job from ds_flow.ipynb first, then re-run the previous cell."
    ) from err

# Same as job.user_code.describe()
job.show_user_code()

# Data owner executes the job and shares the result

In [None]:
# Run the job selected above through the data owner's client.
# NOTE(review): presumably this executes the submitted code on the private data, per the method name; confirm.
do_client.run_private(job)

In [None]:
# share_results returns a pair: the first element is discarded here, and `job` is rebound to the second.
_, job = do_client.job.share_results(job)

In [None]:
# The job results are now shared and visible in the job's output_path.
# describe() is called on the `job` object returned by share_results above.
job.describe()