In [None]:
import syft_runtimes.high_low as syhl
from syft_runtimes.high_low.rsync import RsyncConfig
from syft_rds.orchestra import setup_rds_server, remove_rds_stack_dir

In [None]:
# Remove the RDS stack directory registered under the "low_datasites" key in
# the current directory (presumably a leftover from a previous run — confirm).
remove_rds_stack_dir(key="low_datasites", root_dir="./")

In [None]:
# Email identity of the data owner (DO); reused below as the low-side email.
DO = "do1@openmined.org"

# Set up the RDS server stack for the data owner under the "low_datasites" key.
do_stack_1 = setup_rds_server(
    email=DO,
    key="low_datasites",
    root_dir="./",
)

In [None]:
# Handles and paths reused throughout the rest of the notebook.
lowside_email = DO
lowside_client = do_stack_1.client
lowside_syftbox_dir = do_stack_1.client.workspace.data_dir

# The high-side data directory is a sibling of the low-side SyftBox directory.
highside_data_dir = lowside_syftbox_dir.parent / "high_datasites"

In [None]:
# Remove any high-side data directory left over from a previous run.
# shutil.rmtree replaces `!rm -rf {path}` so that a path containing spaces or
# shell metacharacters is handled safely and the cell does not depend on a
# POSIX shell. ignore_errors=True mirrors `-f`: a missing directory is fine.
import shutil

shutil.rmtree(highside_data_dir, ignore_errors=True)

In [None]:
# Display the email attached to the low-side client.
lowside_client.email

## Initialize the high datasite and connect to it

In [None]:
# Low side settings

# SSH is only needed when syncing from high to low over the network.
# To enable it, replace the empty dict below with something like:
# ssh_config: dict = {
#     "low_ssh_host": "example.com",
#     "low_ssh_user": "username",
#     "low_ssh_port": 22,
#     "low_ssh_key_path": "/path/to/private/key",
# }

# Local testing: no SSH connection, so no extra sync options.
ssh_config = {}

# NOTE: the low-side path must point to an existing syftbox directory.
# If it does not exist, create one first: https://www.syftbox.net/
print(f"{lowside_syftbox_dir = }")
print(f"{highside_data_dir = }")

In [None]:
# High side settings
highside_identifier = "highside-1234"
highside_identifier

In [None]:
# First, initialize the high datasite - runs on the low side
syhl.initialize_high_datasite(
    lowside_syftbox_client=lowside_client,
    highside_identifier=highside_identifier,
    highside_data_dir=highside_data_dir,
    force_overwrite=True,
)

# TODO: move initialization to the high side, where it should create the folder
# structure and the default sync config. The target API is sketched below. It is
# kept as a comment because it uses a placeholder email and none of the
# arguments passed above, so it is not meant to run as part of this notebook.
#
# syhl.initialize_high_datasite(
#     email="your_email",
#     force_overwrite=True,
# )

Note: `syft_core`'s `Client` is a Python handle for working with the `SyftBox` file system. The high side has the `SyftBox` file system, but we never install the `syftbox` Go client there (`curl -fsSL https://syftbox.net/install.sh | sh`).

In [None]:
# Connect to the high datasite using the same email as the low-side client.
# This can run on either side, but should run on the high side when
# demonstrating the security model.
highside_email = lowside_client.email
highside_client = syhl.high_side_connect(
    email=highside_email,
    data_dir=highside_data_dir,
)

In [None]:
# Display the data directory of the high-side client's workspace.
highside_client.workspace.data_dir

# Create a high-side dataset

Runs on the high side

In [None]:
import os

import syft_datasets as syd
from syft_core.config import CONFIG_PATH_ENV

# Make the high-side client the default syftbox client by exporting its
# config path through the environment variable named by CONFIG_PATH_ENV.
os.environ[CONFIG_PATH_ENV] = str(highside_client.config_path)

In [None]:
# Display the current value of the SyftBox config-path environment variable.
os.environ[CONFIG_PATH_ENV]

In [None]:
# Create some random mock and private data
from pathlib import Path
import random
import pandas as pd

# Fixed seed so that "Restart Kernel -> Run All" regenerates identical CSVs.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)

num_mock_rows = 100
num_private_rows = 100


def make_random_people(num_rows, rng):
    """Return a column-name -> values dict describing ``num_rows`` random people.

    Args:
        num_rows: Number of values to generate for each column.
        rng: ``random.Random`` instance used as the source of randomness.

    Returns:
        Dict with keys ``age`` (ints in [0, 100]), ``height`` (floats in
        [150, 200]) and ``income`` (ints in [20000, 100000]).
    """
    return {
        "age": [rng.randint(0, 100) for _ in range(num_rows)],
        "height": [rng.uniform(150, 200) for _ in range(num_rows)],
        "income": [rng.randint(20000, 100000) for _ in range(num_rows)],
    }


# Mock and private data share the same schema but hold independent draws.
mock_data = make_random_people(num_mock_rows, rng)
private_data = make_random_people(num_private_rows, rng)

mock_df = pd.DataFrame(mock_data)
private_df = pd.DataFrame(private_data)

# Write both tables to ./data, creating the directory if needed.
data_dir = Path("./data")
data_dir.mkdir(parents=True, exist_ok=True)
mock_df.to_csv(data_dir / "mock_data.csv", index=False)
private_df.to_csv(data_dir / "private_data.csv", index=False)

In [None]:
# Write a minimal README next to the CSV files; it is attached to the dataset
# when the dataset is created.
readme_path = data_dir / "README.md"
readme_content = """
# My example high-side dataset
"""
readme_path.write_text(readme_content)

In [None]:
DATASET_NAME = "my_cool_dataset"

# Create the dataset from the generated mock/private CSVs and the README,
# with its location set to the high-side identifier.
highside_dataset = syd.create(
    name=DATASET_NAME,
    summary="This is a mock dataset for demonstration purposes.",
    tags=["example", "testing", "highside"],
    readme_path=readme_path,
    mock_path=data_dir / "mock_data.csv",
    private_path=data_dir / "private_data.csv",
    location=highside_identifier,
)

In [None]:
# Display the dataset's `mock_dir` as seen from the high side.
highside_dataset.mock_dir

In [None]:
# Display the dataset's `private_dir` as seen from the high side.
highside_dataset.private_dir

In [None]:
from syft_rds import init_session

# Open an RDS session whose host is the high-side client's email.
high_rds_client = init_session(host=highside_client.email)

In [None]:
# Print the session's `is_admin` flag, then list the datasets visible through
# the high-side RDS session.
print(f"{high_rds_client.is_admin = }")
high_rds_client.dataset.get_all()

Sync the mock part of the dataset to the low side, and add the dataset names to the runtime's `config.yaml` file.

In [None]:
# Build the default sync configuration between the high and low side.
# `ssh_config` contributes extra keyword arguments (none when it is empty).
# TODO: make this run in the backend when the high side is initialized
sync_config: RsyncConfig = syhl.create_default_sync_config(
    highside_client=highside_client,
    lowside_client=lowside_client,
    highside_identifier=highside_identifier,
    force_overwrite=True,
    **ssh_config,
)

# Print the configuration explicitly: only the last expression of a cell is
# displayed, so a bare `sync_config.model_dump()` here would be discarded.
print(sync_config.model_dump())

# Run the sync from the high side with the configuration created above.
syhl.sync(syftbox_client=highside_client, rsync_config=sync_config)

In [None]:
# Sync the named dataset, with verbose output enabled.
syhl.sync_dataset(
    highside_client=highside_client,  # TODO: we can remove this
    lowside_client=lowside_client,  # TODO: we can remove this
    dataset_name=DATASET_NAME,
    verbose=True,
)

## Sync jobs and done folder

Sync the `jobs` and `done` folders (runs on the high side).

In [None]:
# Sync jobs using the high-side client.
syhl.sync_jobs(syftbox_client=highside_client)

In [None]:
# Sync results using the high-side client.
syhl.sync_results(syftbox_client=highside_client)

# Switch to low-side
We want to check that the low side has the synced dataset (only the mock part).

In [None]:
# Display the config path of the low-side client.
do_stack_1.client.config_path

In [None]:
from syft_core import Client as SyftBoxClient

# Load a SyftBox client explicitly from the low-side config file.
lowside_syftbox_client = SyftBoxClient.load(filepath=do_stack_1.client.config_path)

In [None]:
# We can now see the dataset on the low side!
from syft_datasets import SyftDatasetManager

# Bind the dataset manager to the low-side client explicitly, then list all
# datasets it can see.
lowside_dataset_manager = SyftDatasetManager(syftbox_client=lowside_syftbox_client)
lowside_dataset_manager.get_all()

In [None]:
# Fetch the synced dataset by name through the low-side manager.
dataset = lowside_dataset_manager.get(name=DATASET_NAME)

# Display the dataset's description.
dataset.describe()

In [None]:
# Display the dataset's `mock_dir` as seen from the low side.
dataset.mock_dir

**Sanity check**: the private data is not available on the low side (should raise FileNotFoundError)


In [None]:
# Sanity check: reading `private_dir` on the low side is expected to raise
# FileNotFoundError. The expected exception is caught so that "Run All"
# continues past this cell. If the access succeeds instead, fail loudly,
# because private data would then be reachable from the low side.
# TODO improve error reporting for this case
try:
    exposed_private_dir = dataset.private_dir
except FileNotFoundError as err:
    print(f"OK - private data is not available on the low side: {err}")
else:
    raise AssertionError(
        f"Private data is unexpectedly available on the low side: {exposed_private_dir}"
    )

Sync again (let's say after a job is done / new jobs arrive)

In [None]:
# Run the sync again from the high side.
# NOTE(review): unlike the earlier sync call, no `rsync_config` is passed here;
# presumably `sync` falls back to a previously saved configuration — confirm.
syhl.sync(syftbox_client=highside_client)

## Next Steps

### High side Job Runner
- Manually sync pending jobs and results (just in a notebook for now)
- Run a daemon process on the high side that watches the folder `private/syft_runtimes/highside-1234/jobs` for new jobs arriving from the low side

### Low side submits jobs to the correct runner on the high side. Check for results in the outputs folder
- Low side: submit job to the correct runner (according to the dataset location: `private/<email>/syft_runtimes/<high_low_runtime_name>/jobs`)
- Watch for results in the corresponding `done` dir
