Skip to content

Commit

Permalink
feat(core): allow per dataset data directory
Browse files Browse the repository at this point in the history
  • Loading branch information
Panaetius committed Jul 18, 2022
1 parent 41b20b3 commit e172452
Show file tree
Hide file tree
Showing 10 changed files with 135 additions and 69 deletions.
5 changes: 4 additions & 1 deletion renku/core/dataset/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import contextlib
import time
from pathlib import Path
from typing import Optional

from renku.command.command_builder.command import inject
Expand All @@ -41,13 +42,15 @@ def __init__(
create: Optional[bool] = False,
commit_database: Optional[bool] = False,
creator: Optional[Person] = None,
datadir: Optional[Path] = None,
) -> None:
self.name = name
self.create = create
self.commit_database = commit_database
self.creator = creator
self.dataset_provenance = DatasetsProvenance()
self.dataset: Optional[Dataset] = None
self.datadir: Optional[Path] = datadir

def __enter__(self):
"""Enter context."""
Expand All @@ -57,7 +60,7 @@ def __enter__(self):
raise errors.DatasetNotFound(name=self.name)

# NOTE: Don't update provenance when creating here because it will be updated later
self.dataset = create_dataset(name=self.name, update_provenance=False)
self.dataset = create_dataset(name=self.name, update_provenance=False, datadir=self.datadir)
elif self.create:
raise errors.DatasetExistsError('Dataset exists: "{}".'.format(self.name))

Expand Down
59 changes: 34 additions & 25 deletions renku/core/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,11 @@
from renku.core.util.dispatcher import get_client, get_database
from renku.core.util.git import clone_repository, get_cache_directory_for_repository, get_git_user
from renku.core.util.metadata import is_external_file
from renku.core.util.os import delete_file
from renku.core.util.os import delete_file, get_safe_relative_path
from renku.core.util.tabulate import tabulate
from renku.core.util.urls import get_slug
from renku.core.util.util import NO_VALUE, NoValueType
from renku.domain_model.dataset import (
Dataset,
DatasetDetailsJson,
DatasetFile,
RemoteEntity,
get_dataset_data_dir,
is_dataset_name_valid,
)
from renku.domain_model.dataset import Dataset, DatasetDetailsJson, DatasetFile, RemoteEntity, is_dataset_name_valid
from renku.domain_model.provenance.agent import Person
from renku.domain_model.provenance.annotation import Annotation
from renku.infrastructure.immutable import DynamicProxy
Expand Down Expand Up @@ -100,6 +93,7 @@ def create_dataset(
images: Optional[List[ImageRequestModel]] = None,
update_provenance: bool = True,
custom_metadata: Optional[Dict[str, Any]] = None,
datadir: Optional[Path] = None,
):
"""Create a dataset.
Expand All @@ -113,6 +107,7 @@ def create_dataset(
update_provenance(bool, optional): Whether to add this dataset to dataset provenance
(Default value = True).
custom_metadata(Optional[Dict[str, Any]], optional): Custom JSON-LD metadata (Default value = None).
datadir(Optional[Path]): Dataset's data directory (Default value = None).
Returns:
Dataset: The created dataset.
Expand Down Expand Up @@ -145,6 +140,9 @@ def create_dataset(
if custom_metadata:
annotations = [Annotation(id=Annotation.generate_id(), source="renku", body=custom_metadata)]

if datadir:
datadir = get_safe_relative_path(datadir, client.path)

dataset = Dataset(
identifier=None,
name=name,
Expand All @@ -154,6 +152,7 @@ def create_dataset(
keywords=keywords,
project_id=client.project.id,
annotations=annotations,
datadir=datadir,
)

if images:
Expand Down Expand Up @@ -383,7 +382,7 @@ def export_dataset(name, provider_name, tag, client_dispatcher: IClientDispatche
if not dataset:
raise errors.DatasetNotFound(message=f"Cannot find dataset with id: '{selected_tag.dataset_id.value}'")

data_dir = get_dataset_data_dir(client, dataset.name) # type: ignore
data_dir = dataset.get_datadir(client)
dataset = cast(Dataset, DynamicProxy(dataset))
dataset.data_dir = data_dir

Expand Down Expand Up @@ -412,28 +411,32 @@ def export_dataset(name, provider_name, tag, client_dispatcher: IClientDispatche


def import_dataset(
uri,
name="",
extract=False,
yes=False,
previous_dataset=None,
delete=False,
gitlab_token=None,
uri: str,
name: str = "",
extract: bool = False,
yes: bool = False,
datadir: Optional[Path] = None,
previous_dataset: Optional[Dataset] = None,
delete: bool = False,
gitlab_token: Optional[str] = None,
**kwargs,
):
"""Import data from a 3rd party provider or another renku project.
Args:
uri: DOI or URL of dataset to import.
name: Name to give imported dataset (Default value = "").
extract: Whether to extract compressed dataset data (Default value = False).
yes: Whether to skip user confirmation (Default value = False).
previous_dataset: Previously imported dataset version (Default value = None).
delete: Whether to delete files that don't exist anymore (Default value = False).
gitlab_token: Gitlab OAuth2 token (Default value = None).
uri(str): DOI or URL of dataset to import.
name(str): Name to give imported dataset (Default value = "").
extract(bool): Whether to extract compressed dataset data (Default value = False).
yes(bool): Whether to skip user confirmation (Default value = False).
datadir(Optional[Path]): Dataset's data directory (Default value = None).
previous_dataset(Optional[Dataset]): Previously imported dataset version (Default value = None).
delete(bool): Whether to delete files that don't exist anymore (Default value = False).
gitlab_token(Optional[str]): Gitlab OAuth2 token (Default value = None).
"""
from renku.core.dataset.dataset_add import add_to_dataset

client = get_client()

def confirm_download(files):
if yes:
return
Expand Down Expand Up @@ -464,7 +467,7 @@ def remove_files(dataset):
deleted_paths = previous_paths - current_paths

for path in deleted_paths:
delete_file(get_client().path / path, follow_symlinks=True)
delete_file(client.path / path, follow_symlinks=True)

provider = ProviderFactory.get_import_provider(uri)

Expand All @@ -487,6 +490,11 @@ def remove_files(dataset):
except (KeyError, LookupError):
pass

if datadir and previous_dataset:
raise errors.ParameterError("Can't specify datadir when updating a previously imported dataset.")
elif datadir:
datadir = get_safe_relative_path(datadir, client.path)

name = name or provider_dataset.name

new_dataset = add_to_dataset(
Expand All @@ -499,6 +507,7 @@ def remove_files(dataset):
overwrite=True,
total_size=calculate_total_size(importer.provider_dataset_files),
clear_files_before=True,
datadir=datadir,
)

new_dataset.update_metadata_from(provider_dataset)
Expand Down
9 changes: 5 additions & 4 deletions renku/core/dataset/dataset_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from renku.core.util.dispatcher import get_client, get_database
from renku.core.util.git import get_git_user
from renku.core.util.os import delete_file, get_relative_path
from renku.domain_model.dataset import Dataset, DatasetFile, get_dataset_data_dir
from renku.domain_model.dataset import Dataset, DatasetFile

if TYPE_CHECKING:
from renku.core.dataset.providers.models import DatasetAddMetadata
Expand All @@ -57,6 +57,7 @@ def add_to_dataset(
extract: bool = False,
clear_files_before: bool = False,
total_size: Optional[int] = None,
datadir: Optional[Path] = None,
) -> Dataset:
"""Import the data into the data directory."""
client = get_client()
Expand All @@ -65,7 +66,7 @@ def add_to_dataset(
_check_available_space(client, urls, total_size=total_size)

try:
with DatasetContext(name=dataset_name, create=create) as dataset:
with DatasetContext(name=dataset_name, create=create, datadir=datadir) as dataset:
destination_path = _create_destination_directory(client, dataset, destination)

client.check_external_storage() # TODO: This is not required for external storages
Expand Down Expand Up @@ -177,7 +178,7 @@ def _download_files(
revision=revision,
sources=sources,
external=external,
dataset_name=dataset.name,
dataset=dataset,
extract=extract,
)

Expand Down Expand Up @@ -210,7 +211,7 @@ def _create_destination_directory(
client: "LocalClient", dataset: Dataset, destination: Optional[Union[Path, str]] = None
) -> Path:
"""Create directory for dataset add."""
dataset_datadir = client.path / get_dataset_data_dir(client, dataset.name)
dataset_datadir = client.path / dataset.get_datadir(client)
# NOTE: Make sure that dataset's data dir exists because we check for existence of a destination later to decide
# what will be its name
dataset_datadir.mkdir(parents=True, exist_ok=True)
Expand Down
10 changes: 4 additions & 6 deletions renku/core/dataset/providers/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,21 +84,20 @@ def add(
uri: str,
destination: Path,
*,
dataset_name: str = None,
dataset: Optional["Dataset"] = None,
external: bool = False,
**kwargs,
) -> List["DatasetAddMetadata"]:
"""Add files from a URI to a dataset."""
from renku.core.dataset.providers.models import DatasetAddAction, DatasetAddMetadata
from renku.domain_model.dataset import get_dataset_data_dir

assert dataset_name is not None, "Dataset name is not passed"
assert dataset is not None, "Dataset name is not passed"

u = urllib.parse.urlparse(uri)
path = u.path

action = DatasetAddAction.SYMLINK if external else DatasetAddAction.COPY
absolute_dataset_data_dir = (client.path / get_dataset_data_dir(client, dataset_name)).resolve()
absolute_dataset_data_dir = (client.path / dataset.get_datadir(client)).resolve()
source_root = Path(get_absolute_path(path))
is_within_repo = is_subpath(path=source_root, base=client.path)
warnings = []
Expand Down Expand Up @@ -207,7 +206,6 @@ def export(self, client=None, **kwargs) -> str:
"""Execute entire export process."""
from renku.command.schema.dataset import dump_dataset_as_jsonld
from renku.core.util.yaml import write_yaml
from renku.domain_model.dataset import get_dataset_data_dir

if self._path:
dst_root = client.path / self._path
Expand All @@ -222,7 +220,7 @@ def export(self, client=None, **kwargs) -> str:

dst_root.mkdir(parents=True, exist_ok=True)

data_dir = get_dataset_data_dir(client=client, dataset_name=self._dataset.name)
data_dir = self._dataset.get_datadir(client)

with communication.progress("Copying dataset files ...", total=len(self._dataset.files)) as progressbar:
for file in self.dataset.files:
Expand Down
4 changes: 2 additions & 2 deletions renku/core/dataset/providers/renku.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ def datadir_exists(self):
def _fetch_dataset(self, client_dispatcher: IClientDispatcher, database_dispatcher: IDatabaseDispatcher):
from renku.core.dataset.providers.models import ProviderDataset, ProviderDatasetFile
from renku.core.management.client import LocalClient
from renku.domain_model.dataset import Url, get_dataset_data_dir
from renku.domain_model.dataset import Url

repository = None
client = client_dispatcher.current_client
Expand Down Expand Up @@ -514,7 +514,7 @@ def _fetch_dataset(self, client_dispatcher: IClientDispatcher, database_dispatch
database_dispatcher.pop_database()
client_dispatcher.pop_client()

provider_dataset.data_dir = get_dataset_data_dir(self._remote_client, provider_dataset.name)
provider_dataset.data_dir = provider_dataset.get_datadir(self._remote_client)
provider_dataset.derived_from = None
provider_dataset.same_as = Url(url_id=remove_credentials(self.latest_uri))

Expand Down
16 changes: 10 additions & 6 deletions renku/domain_model/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ def __init__(
same_as: Optional[Url] = None,
title: Optional[str] = None,
version: Optional[str] = None,
datadir: Optional[Path] = None,
):
if not name:
assert title, "Either 'name' or 'title' must be set."
Expand Down Expand Up @@ -380,6 +381,7 @@ def __init__(
self.title: Optional[str] = title
self.version: Optional[str] = version
self.annotations: List["Annotation"] = annotations or []
self._datadir: Optional[str] = str(datadir) if datadir else None

@staticmethod
def generate_id(identifier: str) -> str:
Expand Down Expand Up @@ -420,6 +422,13 @@ def keywords_csv(self):
"""Comma-separated list of keywords associated with dataset."""
return ", ".join(self.keywords)

def get_datadir(self, client) -> Path:
    """Return the data directory for this dataset.

    Uses the per-dataset directory stored in ``_datadir`` when one was set;
    otherwise falls back to the default ``<client.data_dir>/<dataset name>``.
    ``getattr`` guards against datasets persisted before ``_datadir`` existed.
    """
    custom_datadir = getattr(self, "_datadir", None)
    if custom_datadir:
        return Path(custom_datadir)
    return Path(os.path.join(client.data_dir, self.name))

def __repr__(self) -> str:
return f"<Dataset {self.identifier} {self.name}>"

Expand Down Expand Up @@ -664,14 +673,9 @@ class ImageObjectRequestJson(marshmallow.Schema):
mirror_locally = marshmallow.fields.Bool(default=False)


def get_dataset_data_dir(client, dataset_name: str) -> str:
    """Return the default data directory path for dataset ``dataset_name``.

    The default location is the dataset's name nested under the client's
    configured data directory.
    """
    base_dir = client.data_dir
    return os.path.join(base_dir, dataset_name)


def get_file_path_in_dataset(client, dataset: Dataset, dataset_file: DatasetFile) -> Path:
    """Return path of a file relative to dataset's data dir.

    Args:
        client: Local client whose ``path`` is the repository root.
        dataset(Dataset): Dataset the file belongs to.
        dataset_file(DatasetFile): File whose dataset-relative path is computed.

    Returns:
        Path: The file's path relative to the dataset's data directory, or the
            file's own entity path when it lies outside that directory.
    """
    try:
        # NOTE: Use the dataset's (possibly custom) data dir, not the old
        # module-level default helper.
        return (client.path / dataset_file.entity.path).relative_to(dataset.get_datadir(client))
    except ValueError:  # NOTE: File is not in the dataset's data dir
        return Path(dataset_file.entity.path)
28 changes: 24 additions & 4 deletions renku/ui/cli/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,13 @@ def list_dataset(format, columns):
help="Custom metadata to be associated with the dataset.",
)
@click.option("-k", "--keyword", default=None, multiple=True, type=click.STRING, help="List of keywords.")
def create(name, title, description, creators, metadata, keyword):
@click.option(
"--datadir",
default=None,
type=click.Path(),
help="Dataset's data directory (defaults to 'data/<dataset name>').",
)
def create(name, title, description, creators, metadata, keyword, datadir):
"""Create an empty dataset in the current repo."""
from renku.command.dataset import create_dataset_command
from renku.core.util.metadata import construct_creators
Expand Down Expand Up @@ -641,6 +647,7 @@ def create(name, title, description, creators, metadata, keyword):
creators=creators,
keywords=keyword,
custom_metadata=custom_metadata,
datadir=datadir,
)
)

Expand Down Expand Up @@ -807,8 +814,14 @@ def add_provider_options(*param_decls, **attrs):
@click.option("-o", "--overwrite", is_flag=True, help="Overwrite existing files.")
@click.option("-c", "--create", is_flag=True, help="Create dataset if it does not exist.")
@click.option("-d", "--destination", default="", help="Destination directory within the dataset path")
@click.option(
"--datadir",
default=None,
type=click.Path(),
help="Dataset's data directory (defaults to 'data/<dataset name>').",
)
@add_provider_options()
def add(name, urls, external, force, overwrite, create, destination, **kwargs):
def add(name, urls, external, force, overwrite, create, destination, datadir, **kwargs):
"""Add data to a dataset."""
from renku.command.dataset import add_to_dataset_command
from renku.ui.cli.utils.callback import ClickCallback
Expand All @@ -822,6 +835,7 @@ def add(name, urls, external, force, overwrite, create, destination, **kwargs):
overwrite=overwrite,
create=create,
destination=destination,
datadir=datadir,
**kwargs,
)
click.secho("OK", fg=color.GREEN)
Expand Down Expand Up @@ -1003,8 +1017,14 @@ def import_provider_options(*param_decls, **attrs):
@click.option("--short-name", "--name", "name", default=None, help="A convenient name for dataset.")
@click.option("-x", "--extract", is_flag=True, help="Extract files before importing to dataset.")
@click.option("-y", "--yes", is_flag=True, help="Bypass download confirmation.")
@click.option(
"--datadir",
default=None,
type=click.Path(),
help="Dataset's data directory (defaults to 'data/<dataset name>').",
)
@import_provider_options()
def import_(uri, name, extract, yes, **kwargs):
def import_(uri, name, extract, yes, datadir, **kwargs):
"""Import data from a 3rd party provider or another renku project.
Supported providers: [Dataverse, Renku, Zenodo]
Expand All @@ -1014,7 +1034,7 @@ def import_(uri, name, extract, yes, **kwargs):

communicator = ClickCallback()
import_dataset_command().with_communicator(communicator).build().execute(
uri=uri, name=name, extract=extract, yes=yes, **kwargs
uri=uri, name=name, extract=extract, yes=yes, datadir=datadir, **kwargs
)

click.secho(" " * 79 + "\r", nl=False)
Expand Down

0 comments on commit e172452

Please sign in to comment.