41 changes: 34 additions & 7 deletions renku/cli/dataset.py
@@ -385,9 +385,34 @@ def dataset(ctx, revision, datadir, format):

@dataset.command()
@click.argument('name')
def create(name):
@click.option(
'--short-name', default='', help='A convenient name for dataset.'
)
@click.option(
'-d', '--description', default='', help='Dataset\'s description.'
)
@click.option(
'-c',
'--creator',
default=None,
multiple=True,
help='Creator\'s name and email ("Name <email>").'
)
def create(name, short_name, description, creator):
"""Create an empty dataset in the current repo."""
create_dataset(name)
creators = creator or ()

dataset = create_dataset(
name=name,
short_name=short_name,
description=description,
creators=creators
)
click.echo(
'Use the name "{}" to refer to this dataset.'.format(
dataset.short_name
)
)
click.secho('OK', fg='green')
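A quick way to exercise the new options is click's test runner; only the flag names below come from the diff, the dataset name, description and creator values are illustrative:

```python
# Minimal sketch using click's CliRunner; values are hypothetical,
# not part of this PR.
from click.testing import CliRunner

from renku.cli import cli

runner = CliRunner()
result = runner.invoke(cli, [
    'dataset', 'create', 'my-dataset',
    '--short-name', 'my_dataset',
    '-d', 'Raw sensor readings',
    '-c', 'Jane Doe <jane@example.com>',
])
print(result.output)  # expected to end with: Use the name "my_dataset" ... / OK
```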


@@ -606,14 +631,16 @@ def export_(id, provider, publish, tag):

@dataset.command('import')
@click.argument('uri')
@click.option('-n', '--name', help='Dataset name.')
@click.option(
'--short-name', default='', help='A convenient name for dataset.'
)
@click.option(
'-x',
'--extract',
is_flag=True,
help='Extract files before importing to dataset.'
)
def import_(uri, name, extract):
def import_(uri, short_name, extract):
"""Import data from a 3rd party provider.

Supported providers: [Zenodo, Dataverse]
@@ -638,9 +665,9 @@ def _init(lock, id_queue):
tqdm.set_lock(lock)

import_dataset(
uri,
name,
extract,
uri=uri,
short_name=short_name,
extract=extract,
with_prompt=True,
pool_init_fn=_init,
pool_init_args=(mp.RLock(), id_queue),
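The `pool_init_fn`/`pool_init_args` pair exists so worker processes share one lock for tqdm output. A simplified, self-contained sketch of the pattern, reduced to the lock only (the real initializer also receives an `id_queue`):

```python
# Each worker inherits the shared lock via the initializer, so concurrent
# tqdm progress bars do not interleave their terminal output.
import multiprocessing as mp

from tqdm import tqdm


def _init(lock):
    tqdm.set_lock(lock)


if __name__ == '__main__':
    with mp.Pool(2, initializer=_init, initargs=(mp.RLock(),)) as pool:
        pool.map(abs, range(4))  # stand-in for the real download jobs
```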
2 changes: 1 addition & 1 deletion renku/core/commands/checks/migration.py
@@ -162,7 +162,7 @@ def migrate_broken_dataset_paths(client):
# migrate the refs
ref = LinkReference.create(
client=client,
name='datasets/{0}'.format(dataset.display_name),
name='datasets/{0}'.format(dataset.short_name),
force=True,
)
ref.set_reference(expected_path / client.METADATA)
48 changes: 33 additions & 15 deletions renku/core/commands/dataset.py
@@ -42,7 +42,7 @@
MigrationRequired, ParameterError, UsageError
from renku.core.management.datasets import DATASET_METADATA_PATHS
from renku.core.management.git import COMMIT_DIFF_STRATEGY
from renku.core.models.datasets import Dataset
from renku.core.models.datasets import Dataset, generate_default_short_name
from renku.core.models.provenance.agents import Person
from renku.core.models.refs import LinkReference
from renku.core.models.tabulate import tabulate
@@ -101,15 +101,26 @@ def dataset_parent(client, revision, datadir, format, ctx=None):
@pass_local_client(
clean=False, commit=True, commit_only=DATASET_METADATA_PATHS
)
def create_dataset(client, name, commit_message=None):
def create_dataset(
client, name, short_name, description, creators, commit_message=None
):
"""Create an empty dataset in the current repo.

:raises: ``renku.core.errors.ParameterError``
"""
with client.with_dataset(name=name, create=True) as dataset:
creator = Person.from_git(client.repo)
if creator not in dataset.creator:
dataset.creator.append(creator)
if not creators:
creators = [Person.from_git(client.repo)]
else:
creators = [Person.from_string(c) for c in creators]

dataset, _, __ = client.create_dataset(
name=name,
short_name=short_name,
description=description,
creators=creators
)

return dataset
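Each creator string follows the `"Name <email>"` convention from the CLI help text. A rough sketch of the parsing `Person.from_string` has to perform — the regex here is illustrative, not the actual implementation:

```python
import re


def parse_creator(value):
    """Illustrative 'Name <email>' parser; Person.from_string may differ."""
    match = re.match(r'\s*(?P<name>[^<]+?)\s*<(?P<email>[^>]+)>\s*$', value)
    if not match:
        raise ValueError('Invalid creator: "{}"'.format(value))
    return match.group('name'), match.group('email')


assert parse_creator('Jane Doe <jane@example.com>') == \
    ('Jane Doe', 'jane@example.com')
```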


@pass_local_client(
Expand Down Expand Up @@ -284,7 +295,7 @@ def dataset_remove(
commit_message=None
):
"""Delete a dataset."""
datasets = {name: client.dataset_path(name) for name in names}
datasets = {name: client.get_dataset_path(name) for name in names}

if not datasets:
raise ParameterError(
@@ -422,8 +433,8 @@ def export_dataset(
def import_dataset(
client,
uri,
name,
extract,
short_name='',
extract=False,
with_prompt=False,
pool_init_fn=None,
pool_init_args=None,
@@ -474,6 +485,15 @@ def import_dataset(
)

if files:
if not short_name:
short_name = generate_default_short_name(
dataset.name, dataset.version
)

dataset.short_name = short_name

client.create_dataset(name=dataset.name, short_name=short_name)

data_folder = tempfile.mkdtemp()

pool_size = min(
@@ -511,20 +531,18 @@ def import_dataset(
))
pool.close()

dataset_name = name or dataset.display_name
dataset.url = remove_credentials(dataset.url)
add_to_dataset(
client,
urls=[str(p) for p in Path(data_folder).glob('*')],
name=dataset_name,
with_metadata=dataset,
create=True
name=short_name,
with_metadata=dataset
)

if dataset.version:
tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
tag_dataset(
client, dataset_name, tag_name,
client, short_name, tag_name,
'Tag {} created by renku import'.format(dataset.version)
)
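When no `--short-name` is given, import falls back to `generate_default_short_name(dataset.name, dataset.version)`. A sketch of what such a generator plausibly does — the actual normalization rules live in renku.core.models.datasets and may differ:

```python
import re


def sketch_default_short_name(name, version=None):
    # Assumed normalization: lowercase, squash runs of non-alphanumerics
    # to '_', then append the version if there is one.
    slug = re.sub(r'[^a-zA-Z0-9]+', '_', name).strip('_').lower()
    return '{0}_{1}'.format(slug, version) if version else slug


print(sketch_default_short_name('My Dataset', '1.0'))  # my_dataset_1.0
```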

@@ -633,7 +651,7 @@ def _filter(client, names=None, creators=None, include=None, exclude=None):

records = []
for path_, dataset in client.datasets.items():
if not names or dataset.name in names:
if not names or dataset.short_name in names:
for file_ in dataset.files:
file_.dataset = dataset.name
path_ = file_.full_path.relative_to(client.path)
2 changes: 1 addition & 1 deletion renku/core/commands/format/datasets.py
@@ -31,7 +31,7 @@ def tabular(client, datasets):
datasets,
headers=OrderedDict((
('uid', 'id'),
('display_name', None),
('short_name', None),
('version', None),
('created', None),
('creators_csv', 'creators'),
2 changes: 1 addition & 1 deletion renku/core/errors.py
@@ -128,7 +128,7 @@ def __init__(self, message=None):
'Please use the "git config" command to configure it.\n\n'
'\tgit config --set user.email "john.doe@example.com"\n'
)
super(MissingUsername, self).__init__(message)
super().__init__(message)


class AuthenticationError(RenkuException):
108 changes: 68 additions & 40 deletions renku/core/management/datasets.py
@@ -36,7 +36,8 @@
from renku.core import errors
from renku.core.management.clone import clone
from renku.core.management.config import RENKU_HOME
from renku.core.models.datasets import Dataset, DatasetFile, DatasetTag
from renku.core.models.datasets import Dataset, DatasetFile, DatasetTag, \
generate_default_short_name, is_dataset_name_valid
from renku.core.models.git import GitURL
from renku.core.models.locals import with_reference
from renku.core.models.provenance.agents import Person
@@ -85,31 +86,35 @@ def datasets(self):
result = {}
paths = (self.path / self.renku_datasets_path).rglob(self.METADATA)
for path in paths:
result[path] = self.get_dataset(path)
result[path] = self.load_dataset_from_path(path)
return result

def get_dataset(self, path, commit=None):
def load_dataset_from_path(self, path, commit=None):
"""Return a dataset from a given path."""
path = Path(path)
if not path.is_absolute():
path = self.path / path
return Dataset.from_yaml(path, client=self, commit=commit)

def dataset_path(self, name):
def get_dataset_path(self, name):
"""Get dataset path from name."""
path = self.renku_datasets_path / name / self.METADATA
if not path.exists():
path = LinkReference(
client=self, name='datasets/' + name
).reference
try:
path = LinkReference(
client=self, name='datasets/' + name
).reference
except errors.ParameterError:
return None

return path

def load_dataset(self, name=None):
"""Load dataset reference file."""
if name:
path = self.dataset_path(name)
if path.exists():
return self.get_dataset(path)
path = self.get_dataset_path(name)
if path and path.exists():
return self.load_dataset_from_path(path)

@contextmanager
def with_dataset(self, name=None, identifier=None, create=False):
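With the rename, lookups no longer raise on a missing reference: `get_dataset_path` tries the direct path first, then the named LinkReference, and returns `None` if neither resolves. An illustrative re-statement of that lookup order (the metadata filename and error type are assumptions):

```python
from pathlib import Path


def sketch_get_dataset_path(datasets_root, name, resolve_ref):
    """Sketch of the fallback above: UUID path, then reference, else None."""
    path = Path(datasets_root) / name / 'metadata.yml'
    if not path.exists():
        try:
            path = resolve_ref('datasets/' + name)
        except ValueError:  # stands in for errors.ParameterError
            return None
    return path


# Usage with a stub resolver that knows no references:
def no_refs(_):
    raise ValueError


print(sketch_get_dataset_path('.renku/datasets', 'my_data', no_refs))  # None
```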
@@ -118,50 +123,25 @@ def with_dataset(self, name=None, identifier=None, create=False):
clean_up_required = False

if dataset is None:
# Avoid nested datasets: name mustn't have '/' in it
if len(Path(name).parts) > 1:
raise errors.ParameterError(
'Dataset name {} is not valid.'.format(name)
)

if not create:
raise errors.DatasetNotFound
clean_up_required = True
dataset_ref = None
identifier = str(uuid.uuid4())
path = (self.renku_datasets_path / identifier / self.METADATA)
try:
path.parent.mkdir(parents=True, exist_ok=False)
except FileExistsError:
raise errors.DatasetExistsError(
'Dataset with reference {} exists'.format(path.parent)
)

with with_reference(path):
dataset = Dataset(
identifier=identifier, name=name, client=self
)

if name:
dataset_ref = LinkReference.create(
client=self, name='datasets/' + name
)
dataset_ref.set_reference(path)

clean_up_required = True
dataset, path, dataset_ref = self.create_dataset(name)
elif create:
raise errors.DatasetExistsError(
'Dataset exists: "{}".'.format(name)
)

dataset_path = self.path / self.datadir / dataset.name
dataset_path = self.path / self.datadir / dataset.short_name
dataset_path.mkdir(parents=True, exist_ok=True)

try:
yield dataset
except Exception:
# TODO use a general clean-up strategy
# https://github.com/SwissDataScienceCenter/renku-python/issues/736
if clean_up_required and dataset_ref:
if clean_up_required:
dataset_ref.delete()
shutil.rmtree(path.parent, ignore_errors=True)
raise
@@ -174,6 +154,54 @@

dataset.to_yaml()

def create_dataset(
self, name, short_name=None, description='', creators=()
):
"""Create a dataset."""
if not name:
raise errors.ParameterError('Dataset name must be provided.')

if not short_name:
short_name = generate_default_short_name(name, None)

if not is_dataset_name_valid(short_name):
raise errors.ParameterError(
'Dataset name "{}" is not valid.'.format(short_name)
)

if self.load_dataset(name=short_name):
raise errors.DatasetExistsError(
'Dataset exists: "{}".'.format(short_name)
)

identifier = str(uuid.uuid4())
path = (self.renku_datasets_path / identifier / self.METADATA)
try:
path.parent.mkdir(parents=True, exist_ok=False)
except FileExistsError:
raise errors.DatasetExistsError(
'Dataset with reference {} exists'.format(path.parent)
)

with with_reference(path):
dataset = Dataset(
client=self,
identifier=identifier,
name=name,
short_name=short_name,
description=description,
creator=creators
)

dataset_ref = LinkReference.create(
client=self, name='datasets/' + short_name
)
dataset_ref.set_reference(path)

dataset.to_yaml()

return dataset, path, dataset_ref
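`create_dataset` validates `short_name` up front. A rough sketch of the kind of check `is_dataset_name_valid` performs — the rules below are assumptions, and the actual validator in renku.core.models.datasets may be stricter:

```python
import re


def sketch_is_dataset_name_valid(name):
    # Assumed: non-empty, no path separators (avoids nested datasets),
    # and restricted to word characters, dots and dashes.
    return bool(name) and '/' not in name \
        and re.match(r'^[A-Za-z0-9][A-Za-z0-9._-]*$', name) is not None


assert sketch_is_dataset_name_valid('my_dataset-1.0')
assert not sketch_is_dataset_name_valid('nested/name')
```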

def add_data_to_dataset(
self,
dataset,
@@ -186,7 +214,7 @@
):
"""Import the data into the data directory."""
warning_message = ''
dataset_path = self.path / self.datadir / dataset.name
dataset_path = self.path / self.datadir / dataset.short_name

destination = destination or Path('.')
destination = self._resolve_path(dataset_path, destination)