In [1]:
import pathlib
import pickle
import zipfile

import sh
import substra

In [2]:
# Substra client configured with the "docker" backend type.
client = substra.Client(backend_type="docker")

# Permission spec shared by the assets registered in the following cells:
# public, with an empty list of explicitly authorized ids.
permissions = substra.schemas.Permissions(authorized_ids=[], public=True)

In [3]:
# Describe the dataset. The opener script and the description file are given
# as relative paths, and the same permission spec is applied to both the
# dataset itself and its logs.
dataset = substra.schemas.DatasetSpec(
    name="My dataset",
    type="txt",
    data_opener="dataset/opener.py",
    description="dataset/description.md",
    permissions=permissions,
    logs_permission=permissions,
)

# Register the dataset with the client and keep the returned key; later cells
# use it to attach data samples and to wire the "opener" task input.
dataset_key = client.add_dataset(dataset)

In [4]:
# Data sample found under dataset/data, linked to the dataset registered
# earlier and not flagged as test-only.
data_sample = substra.schemas.DataSampleSpec(
    path="dataset/data",
    data_manager_keys=[dataset_key],
    test_only=False,
)

# NOTE(review): local=True presumably tells the client to read the path from
# this machine's filesystem - confirm against the Substra SDK documentation.
datasample_key = client.add_data_sample(data_sample, local=True)

In [7]:
# Folder holding the algo sources, its description and the generated archive.
assets_directory = pathlib.Path("my_substra_files")

# Files that are bundled into the algo archive.
ALGO_TRAIN_DOCKERFILE_FILES = [
    assets_directory / file_name
    for file_name in ("test.R", "python_wrapper.py", "Dockerfile")
]

# Write algo.zip next to the sources. Each file is stored under its base name
# only, so the archive has no directory prefix.
train_archive_path = assets_directory / "algo.zip"
with zipfile.ZipFile(train_archive_path, mode="w") as archive:
    for member in ALGO_TRAIN_DOCKERFILE_FILES:
        archive.write(member, arcname=member.name)

# Inputs declared by the algo: a required, multi-valued "datasamples" input
# and a required, single-valued "opener" input.
train_algo_inputs = [
    substra.schemas.AlgoInputSpec(
        identifier="datasamples",
        kind=substra.schemas.AssetKind.data_sample,
        optional=False,
        multiple=True,
    ),
    substra.schemas.AlgoInputSpec(
        identifier="opener",
        kind=substra.schemas.AssetKind.data_manager,
        optional=False,
        multiple=False,
    ),
]

# Single output declared by the algo: one "model" asset.
train_algo_outputs = [
    substra.schemas.AlgoOutputSpec(
        identifier="model",
        kind=substra.schemas.AssetKind.model,
        multiple=False,
    ),
]

# Assemble the algo spec from the archive, the description file and the
# input/output declarations above.
train_algo = substra.schemas.AlgoSpec(
    name="Run R script",
    description=assets_directory / "description.md",
    file=train_archive_path,
    inputs=train_algo_inputs,
    outputs=train_algo_outputs,
    permissions=permissions,
)

# Register the algo; the returned key is what the train task refers to.
train_algo_key = client.add_algo(train_algo)

print(f"Train algo key {train_algo_key}")

Train algo key aaaa641e-1046-4812-9674-ec49d6bdad00


In [6]:
# Bind each input identifier declared by the algo to a registered asset key.
train_task_inputs = [
    substra.schemas.InputRef(identifier="opener", asset_key=dataset_key),
    substra.schemas.InputRef(identifier="datasamples", asset_key=datasample_key),
]

# The "model" output is given the shared permission spec.
train_task_outputs = {
    "model": substra.schemas.ComputeTaskOutputSpec(permissions=permissions),
}

# The worker is set to the organization id reported by the client itself.
train_task = substra.schemas.TaskSpec(
    algo_key=train_algo_key,
    inputs=train_task_inputs,
    outputs=train_task_outputs,
    worker=client.organization_info().organization_id,
)

# Submit the task and keep its key for the status check and model download.
train_task_key = client.add_task(train_task)

print(f"Train task key {train_task_key}")

Train task key baf1c4de-7c8d-4029-abee-9428379908de


In [7]:
# Look the train task up by its key and print its current status.
# The status is read once here; this cell does not poll or wait for completion.
task = client.get_task(train_task_key)
print(task.status)

Status.done


In [8]:
# Download the "model" output of the train task into the current directory.
# The value returned by the client is used directly as the path to open.
model_path = client.download_model_from_task(train_task_key, "model", ".")

# SECURITY: pickle.load can execute arbitrary code while deserializing. Only
# load models produced by algos and tasks you trust.
with open(model_path, "rb") as f:
    print(pickle.load(f))

42
