In [None]:
from syft_runtimes.high_low.setup import (
    initialize_high_datasite,
    initialize_sync_config,
    high_side_connect,
    sync,
    prepare_dataset_for_low_side,
    prepare_datasets_from_high_side,
)

from syft_runtimes.high_low.rsync import Side

## Initialize the high datasite and connect to it

In [None]:
# Low side settings

lowside_email = "alice@openmined.org"

# NOTE: This needs to point to an existing syftbox directory. If it does not exist, create one first: https://www.syftbox.net/
# The directory is derived from `lowside_email` so the two settings cannot drift apart.
lowside_syftbox_dir = f"~/.syftbox/clients/{lowside_email}/SyftBox"

# Only required if you want to connect to the low side via SSH (for syncing from high to low)
# ssh_config: dict = {
#     "low_ssh_host": "example.com",
#     "low_ssh_user": "username",
#     "low_ssh_port": 22,
#     "low_ssh_key_path": "/path/to/private/key",
# }

# For local testing, without SSH connection
ssh_config: dict = {}

In [None]:
# High side settings
highside_identifier = "highside-1234"

# Important: the high-side email must match the low-side one, otherwise the
# dataset URLs would have to be changed.
highside_email = lowside_email

In [None]:
# Step 1: initialize the high datasite (force_overwrite=True is passed on every run),
# then open a client connected to it.
initialize_high_datasite(force_overwrite=True, email=highside_email)
high_syftbox_client = high_side_connect(email=highside_email)

## Initial sync

In [None]:
# Build the sync configuration for this high side. Any SSH settings in
# `ssh_config` are forwarded as keyword arguments (none for local testing).
sync_config = initialize_sync_config(
    highside_identifier=highside_identifier,
    lowside_syftbox_dir=lowside_syftbox_dir,
    syftbox_client=high_syftbox_client,
    force_overwrite=True,
    **ssh_config,
)

In [None]:
# TODO - ensure the sync folders exist on the lowside (over ssh?)
# For now the low-side jobs / outputs / datasets folders are created with mkdir.
lowside_sync_folders = [
    resolve_dir(Side.LOW)
    for resolve_dir in (
        sync_config.jobs_dir,
        sync_config.outputs_dir,
        sync_config.datasets_dir,
    )
]

for folder in lowside_sync_folders:
    # parents/exist_ok make this cell safe to re-run
    folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Run the initial sync with the high-side client; no direction is passed, so sync() uses its default.
sync(syftbox_client=high_syftbox_client)

## Create a high-side dataset

In [None]:
import syft_datasets as syd
from syft_core.config import CONFIG_PATH_ENV
import os

# Make the high client the default syftbox client by exporting the path of the
# config.json that sits next to its data directory.
high_config_path = high_syftbox_client.workspace.data_dir.parent / "config.json"
os.environ[CONFIG_PATH_ENV] = str(high_config_path)

In [None]:
# Display the config path currently stored in the environment variable
os.environ[CONFIG_PATH_ENV]

In [None]:
# Create some random mock and private data

from pathlib import Path
import random
import pandas as pd

# Fixed seed so that Restart & Run All regenerates identical CSV files.
RANDOM_SEED = 42

num_mock_rows = 100
num_private_rows = 100


def make_random_rows(num_rows: int, rng: random.Random) -> dict:
    """Return `num_rows` random records as a column-name -> list-of-values dict.

    Columns: "age" (int in 0..100), "height" (float in 150..200) and
    "income" (int in 20000..100000). All values are drawn from `rng`.
    """
    return {
        "age": [rng.randint(0, 100) for _ in range(num_rows)],
        "height": [rng.uniform(150, 200) for _ in range(num_rows)],
        "income": [rng.randint(20000, 100000) for _ in range(num_rows)],
    }


# A single generator feeds both tables, so the mock and private rows come from
# consecutive parts of the same seeded stream and therefore differ.
rng = random.Random(RANDOM_SEED)
mock_data = make_random_rows(num_mock_rows, rng)
private_data = make_random_rows(num_private_rows, rng)

mock_df = pd.DataFrame(mock_data)
private_df = pd.DataFrame(private_data)

# Write both tables to ./data; mkdir is idempotent so the cell can be re-run.
data_dir = Path("./data")
data_dir.mkdir(parents=True, exist_ok=True)
mock_df.to_csv(data_dir / "mock_data.csv", index=False)
private_df.to_csv(data_dir / "private_data.csv", index=False)

In [None]:
# Write a minimal README for the dataset into the data directory
readme_path = data_dir / "README.md"
readme_content = """
# My example high-side dataset
"""
readme_path.write_text(readme_content)

In [None]:
# Create the dataset from the files written above; this high side's identifier
# is passed as the dataset's `location`.
highside_dataset = syd.create(
    name="highside_example_dataset",
    summary="This is a mock dataset for demonstration purposes.",
    tags=["example", "testing", "highside"],
    location=highside_identifier,
    readme_path=readme_path,
    mock_path=data_dir / "mock_data.csv",
    private_path=data_dir / "private_data.csv",
)

In [None]:
# Display the dataset's mock_dir attribute
highside_dataset.mock_dir

In [None]:
# Copy the mock part of the dataset to the sync folder
# NOTE(review): presumably the private files are left out of the sync folder — confirm
# against prepare_dataset_for_low_side.
prepare_dataset_for_low_side(highside_dataset, syftbox_client=high_syftbox_client)

In [None]:
from syft_notebook_ui import show_dir
from syft_runtimes.high_low.rsync import Side, SyncDirection

# Inspect the high-side base sync folder: the prepared dataset should now be listed there
show_dir(sync_config.base_sync_dir(Side.HIGH))

In [None]:
# Now we can sync from high to low.
# The direction is given explicitly as LOCAL_TO_REMOTE, using the high-side client.

sync(syftbox_client=high_syftbox_client, direction=SyncDirection.LOCAL_TO_REMOTE)

## Switch to low-side

In [None]:
from syft_core import Client as SyftBoxClient

# Load the low-side client from the config.json of the account configured in
# `lowside_email`, rather than from a second hard-coded copy of the address,
# so that changing the low-side settings also changes which config is loaded.
lowside_syftbox_client = SyftBoxClient.load(
    filepath=f"~/.syftbox/clients/{lowside_email}/config.json"
)

In [None]:
# On the low side, publish the datasets synced from the high side to our datasite.
# overwrite=True: overwrite existing datasets on the low side.
prepare_datasets_from_high_side(
    syftbox_client=lowside_syftbox_client,
    high_side_name=highside_identifier,
    overwrite=True,
)

In [None]:
# We can now see the dataset on the low side!

from syft_datasets import SyftDatasetManager

# Build a dataset manager on the low-side client and display the result of get_all()
lowside_dataset_manager = SyftDatasetManager(syftbox_client=lowside_syftbox_client)
lowside_dataset_manager.get_all()

In [None]:
# Look the synced dataset up by the name it was created with, then show its description
dataset = lowside_dataset_manager.get(name="highside_example_dataset")

dataset.describe()

In [None]:
# Sanity check: the private data is not available on the low side
# TODO improve error reporting for this case
# NOTE(review): presumably this attribute access raises on the low side — confirm.
# If it does, Restart & Run All will stop at this cell.

dataset.private_dir

## High side Job Runner
- Run the job runner on the high side as a daemon process that watches the folder `private/job_runners/highside-1234/jobs` for new jobs
- Manually sync pending jobs and results (for now, just from a notebook)

## Low side submits jobs to the correct runner on the high side. Check for results in the outputs folder
- Low side: submit each job to the correct runner, chosen by the dataset location: `private/job_runners/<dataset_location>/jobs`
- Watch for results in the corresponding outputs directory