From 640dd33b16c8250d5d5694eaabe568b0314efde4 Mon Sep 17 00:00:00 2001 From: Mohammad Alisafaee Date: Sat, 30 Nov 2019 22:47:59 +0100 Subject: [PATCH 01/10] refactor: dataset creation --- renku/core/commands/dataset.py | 6 +-- renku/core/management/datasets.py | 73 +++++++++++++++++++------------ 2 files changed, 47 insertions(+), 32 deletions(-) diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py index 51e78fc256..2b7ed329ff 100644 --- a/renku/core/commands/dataset.py +++ b/renku/core/commands/dataset.py @@ -106,10 +106,8 @@ def create_dataset(client, name, commit_message=None): :raises: ``renku.core.errors.ParameterError`` """ - with client.with_dataset(name=name, create=True) as dataset: - creator = Person.from_git(client.repo) - if creator not in dataset.creator: - dataset.creator.append(creator) + creator = Person.from_git(client.repo) + client.create_dataset(name=name, creators=[creator]) @pass_local_client( diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py index b48af3e039..bca7a24bc4 100644 --- a/renku/core/management/datasets.py +++ b/renku/core/management/datasets.py @@ -118,36 +118,11 @@ def with_dataset(self, name=None, identifier=None, create=False): clean_up_required = False if dataset is None: - # Avoid nested datasets: name mustn't have '/' in it - if len(Path(name).parts) > 1: - raise errors.ParameterError( - 'Dataset name {} is not valid.'.format(name) - ) - if not create: raise errors.DatasetNotFound - clean_up_required = True - dataset_ref = None - identifier = str(uuid.uuid4()) - path = (self.renku_datasets_path / identifier / self.METADATA) - try: - path.parent.mkdir(parents=True, exist_ok=False) - except FileExistsError: - raise errors.DatasetExistsError( - 'Dataset with reference {} exists'.format(path.parent) - ) - - with with_reference(path): - dataset = Dataset( - identifier=identifier, name=name, client=self - ) - - if name: - dataset_ref = LinkReference.create( - client=self, 
name='datasets/' + name - ) - dataset_ref.set_reference(path) + clean_up_required = True + dataset, path, dataset_ref = self.create_dataset(name) elif create: raise errors.DatasetExistsError( 'Dataset exists: "{}".'.format(name) @@ -161,7 +136,7 @@ def with_dataset(self, name=None, identifier=None, create=False): except Exception: # TODO use a general clean-up strategy # https://github.com/SwissDataScienceCenter/renku-python/issues/736 - if clean_up_required and dataset_ref: + if clean_up_required: dataset_ref.delete() shutil.rmtree(path.parent, ignore_errors=True) raise @@ -174,6 +149,48 @@ def with_dataset(self, name=None, identifier=None, create=False): dataset.to_yaml() + def create_dataset(self, name, creators=()): + """Create a dataset.""" + # FIXME create a test for duplicate dataset creation + if self.load_dataset(name=name): + raise errors.DatasetExistsError( + 'Dataset exists: "{}".'.format(name) + ) + + # Avoid nested datasets: name mustn't have '/' in it + if len(Path(name).parts) > 1: + raise errors.ParameterError( + 'Dataset name {} is not valid.'.format(name) + ) + if not name: + raise errors.ParameterError('Dataset name must be provided.') + + identifier = str(uuid.uuid4()) + path = (self.renku_datasets_path / identifier / self.METADATA) + try: + path.parent.mkdir(parents=True, exist_ok=False) + except FileExistsError: + raise errors.DatasetExistsError( + 'Dataset with reference {} exists'.format(path.parent) + ) + + with with_reference(path): + dataset = Dataset( + client=self, + identifier=identifier, + name=name, + creator=creators + ) + + dataset_ref = LinkReference.create( + client=self, name='datasets/' + name + ) + dataset_ref.set_reference(path) + + dataset.to_yaml() + + return dataset, path, dataset_ref + def add_data_to_dataset( self, dataset, From 8642b0bd38d0b1d5c5c296b6569924acc604e0bf Mon Sep 17 00:00:00 2001 From: Mohammad Alisafaee Date: Tue, 3 Dec 2019 10:54:07 +0100 Subject: [PATCH 02/10] refactor: rename methods --- 
renku/core/commands/dataset.py | 2 +- renku/core/management/datasets.py | 11 ++++++----- renku/core/models/provenance/activities.py | 2 +- tests/cli/test_datasets.py | 4 ++-- tests/cli/test_integration_datasets.py | 5 ++--- tests/core/commands/test_serialization.py | 6 ++++-- 6 files changed, 16 insertions(+), 14 deletions(-) diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py index 2b7ed329ff..2750590b15 100644 --- a/renku/core/commands/dataset.py +++ b/renku/core/commands/dataset.py @@ -282,7 +282,7 @@ def dataset_remove( commit_message=None ): """Delete a dataset.""" - datasets = {name: client.dataset_path(name) for name in names} + datasets = {name: client.get_dataset_path(name) for name in names} if not datasets: raise ParameterError( diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py index bca7a24bc4..b399a4e658 100644 --- a/renku/core/management/datasets.py +++ b/renku/core/management/datasets.py @@ -85,16 +85,17 @@ def datasets(self): result = {} paths = (self.path / self.renku_datasets_path).rglob(self.METADATA) for path in paths: - result[path] = self.get_dataset(path) + result[path] = self.load_dataset_from_path(path) return result - def get_dataset(self, path, commit=None): + def load_dataset_from_path(self, path, commit=None): """Return a dataset from a given path.""" + path = Path(path) if not path.is_absolute(): path = self.path / path return Dataset.from_yaml(path, client=self, commit=commit) - def dataset_path(self, name): + def get_dataset_path(self, name): """Get dataset path from name.""" path = self.renku_datasets_path / name / self.METADATA if not path.exists(): @@ -107,9 +108,9 @@ def dataset_path(self, name): def load_dataset(self, name=None): """Load dataset reference file.""" if name: - path = self.dataset_path(name) + path = self.get_dataset_path(name) if path.exists(): - return self.get_dataset(path) + return self.load_dataset_from_path(path) @contextmanager def with_dataset(self, 
name=None, identifier=None, create=False): diff --git a/renku/core/models/provenance/activities.py b/renku/core/models/provenance/activities.py index b658adc0a6..4e474de406 100644 --- a/renku/core/models/provenance/activities.py +++ b/renku/core/models/provenance/activities.py @@ -156,7 +156,7 @@ def default_generated(self): if str(path).startswith( os.path.join(client.renku_home, client.DATASETS) ): - entity = client.get_dataset(Path(path), commit=commit) + entity = client.load_dataset_from_path(path, commit=commit) else: entity = entity_cls( commit=commit, diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py index a7efdb7890..8a0cc18740 100644 --- a/tests/cli/test_datasets.py +++ b/tests/cli/test_datasets.py @@ -861,7 +861,7 @@ def test_dataset_date_created_format(runner, client, project): assert 0 == result.exit_code assert 'OK' in result.output - path = client.dataset_path('dataset') + path = client.get_dataset_path('dataset') assert path.exists() with path.open(mode='r') as fp: @@ -880,7 +880,7 @@ def test_dataset_file_date_created_format(tmpdir, runner, client, project): assert 0 == result.exit_code assert 'OK' in result.output - path = client.dataset_path('dataset') + path = client.get_dataset_path('dataset') assert path.exists() # Create data file. 
diff --git a/tests/cli/test_integration_datasets.py b/tests/cli/test_integration_datasets.py index 06f94c8b29..8743d811fd 100644 --- a/tests/cli/test_integration_datasets.py +++ b/tests/cli/test_integration_datasets.py @@ -545,8 +545,7 @@ def test_add_from_git_copies_metadata(runner, client): ) assert 0 == result.exit_code - path = client.dataset_path('remote') - dataset = client.get_dataset(path) + dataset = client.load_dataset('remote') assert dataset.files[0].name == 'README.rst' assert 'mailto:jiri.kuncar@gmail.com' in str(dataset.files[0].creator) assert 'mailto:rokroskar@gmail.co' in str(dataset.files[0].creator) @@ -596,7 +595,7 @@ def test_usage_error_in_add_from_git(runner, client, params, n_urls, message): def read_dataset_file_metadata(client, dataset_name, filename): """Return metadata from dataset's YAML file.""" with client.with_dataset(dataset_name) as dataset: - assert client.dataset_path(dataset.name).exists() + assert client.get_dataset_path(dataset.name).exists() for file_ in dataset.files: if file_.path.endswith(filename): diff --git a/tests/core/commands/test_serialization.py b/tests/core/commands/test_serialization.py index a48eec81f2..1c29ec6768 100644 --- a/tests/core/commands/test_serialization.py +++ b/tests/core/commands/test_serialization.py @@ -31,7 +31,7 @@ def test_dataset_serialization(client, dataset, data_file): """Test Dataset serialization.""" def load_dataset(name): - with open(str(client.dataset_path(name))) as f: + with open(str(client.get_dataset_path(name))) as f: return yaml.safe_load(f) d_dict = load_dataset('dataset') @@ -54,7 +54,9 @@ def load_dataset(name): def test_dataset_deserialization(client, dataset): """Test Dataset deserialization.""" from renku.core.models.datasets import Dataset - dataset_ = Dataset.from_yaml(client.dataset_path('dataset'), client=client) + dataset_ = Dataset.from_yaml( + client.get_dataset_path('dataset'), client=client + ) dataset_types = { 'created': datetime.datetime, From 
13282da82e9ff53a0002692fe0db09e408f26819 Mon Sep 17 00:00:00 2001 From: Mohammad Alisafaee Date: Tue, 3 Dec 2019 11:33:33 +0100 Subject: [PATCH 03/10] refactor: consistent use of display_name --- renku/cli/dataset.py | 32 +++++++++++---- renku/core/commands/dataset.py | 35 ++++++++++------ renku/core/management/datasets.py | 25 ++++++++---- renku/core/models/datasets.py | 66 +++++++++++++++++++++---------- renku/core/models/refs.py | 2 +- 5 files changed, 112 insertions(+), 48 deletions(-) diff --git a/renku/cli/dataset.py b/renku/cli/dataset.py index 6f7c9918ea..f106109aca 100644 --- a/renku/cli/dataset.py +++ b/renku/cli/dataset.py @@ -385,9 +385,27 @@ def dataset(ctx, revision, datadir, format): @dataset.command() @click.argument('name') -def create(name): +@click.option('--display-name', default='', help='Dataset\'s display name.') +@click.option( + '-d', '--description', default='', help='Dataset\'s description.' +) +@click.option( + '-c', + '--creator', + default=None, + multiple=True, + help='Creator\'s name and <email>.' +) def create(name, display_name, description, creator): """Create an empty dataset in the current repo.""" - create_dataset(name) + creators = creator or () + + create_dataset( + name=name, + display_name=display_name, + description=description, + creators=creators + ) click.secho('OK', fg='green') @@ -606,14 +624,14 @@ def export_(id, provider, publish, tag): @dataset.command('import') @click.argument('uri') -@click.option('-n', '--name', help='Dataset name.') +@click.option('--display-name', default='', help='Alias for dataset\'s name.') @click.option( '-x', '--extract', is_flag=True, help='Extract files before importing to dataset.' ) -def import_(uri, name, extract): +def import_(uri, display_name, extract): """Import data from a 3rd party provider. 
Supported providers: [Zenodo, Dataverse] @@ -638,9 +656,9 @@ def _init(lock, id_queue): tqdm.set_lock(lock) import_dataset( - uri, - name, - extract, + uri=uri, + display_name=display_name, + extract=extract, with_prompt=True, pool_init_fn=_init, pool_init_args=(mp.RLock(), id_queue), diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py index 2750590b15..3fbf7ff2f8 100644 --- a/renku/core/commands/dataset.py +++ b/renku/core/commands/dataset.py @@ -42,7 +42,7 @@ MigrationRequired, ParameterError, UsageError from renku.core.management.datasets import DATASET_METADATA_PATHS from renku.core.management.git import COMMIT_DIFF_STRATEGY -from renku.core.models.datasets import Dataset +from renku.core.models.datasets import Dataset, generate_default_display_name from renku.core.models.provenance.agents import Person from renku.core.models.refs import LinkReference from renku.core.models.tabulate import tabulate @@ -101,13 +101,21 @@ def dataset_parent(client, revision, datadir, format, ctx=None): @pass_local_client( clean=False, commit=True, commit_only=DATASET_METADATA_PATHS ) -def create_dataset(client, name, commit_message=None): +def create_dataset( + client, name, display_name, description, creators, commit_message=None +): """Create an empty dataset in the current repo. 
:raises: ``renku.core.errors.ParameterError`` """ - creator = Person.from_git(client.repo) - client.create_dataset(name=name, creators=[creator]) + if not creators: + creators = [Person.from_git(client.repo)] + client.create_dataset( + name=name, + display_name=display_name, + description=description, + creators=creators + ) @pass_local_client( @@ -420,8 +428,8 @@ def export_dataset( def import_dataset( client, uri, - name, - extract, + display_name='', + extract=False, with_prompt=False, pool_init_fn=None, pool_init_args=None, @@ -472,6 +480,13 @@ def import_dataset( ) if files: + if not display_name: + display_name = generate_default_display_name(dataset) + + dataset.display_name = display_name + + client.create_dataset(name=dataset.name, display_name=display_name) + data_folder = tempfile.mkdtemp() pool_size = min( @@ -509,20 +524,18 @@ def import_dataset( )) pool.close() - dataset_name = name or dataset.display_name dataset.url = remove_credentials(dataset.url) add_to_dataset( client, urls=[str(p) for p in Path(data_folder).glob('*')], - name=dataset_name, - with_metadata=dataset, - create=True + name=display_name, + with_metadata=dataset ) if dataset.version: tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version) tag_dataset( - client, dataset_name, tag_name, + client, display_name, tag_name, 'Tag {} created by renku import'.format(dataset.version) ) diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py index b399a4e658..2fb3763a78 100644 --- a/renku/core/management/datasets.py +++ b/renku/core/management/datasets.py @@ -99,9 +99,12 @@ def get_dataset_path(self, name): """Get dataset path from name.""" path = self.renku_datasets_path / name / self.METADATA if not path.exists(): - path = LinkReference( - client=self, name='datasets/' + name - ).reference + try: + path = LinkReference( + client=self, name='datasets/' + name + ).reference + except errors.ParameterError: + return None return path @@ -109,7 +112,7 @@ def 
load_dataset(self, name=None): """Load dataset reference file.""" if name: path = self.get_dataset_path(name) - if path.exists(): + if path and path.exists(): return self.load_dataset_from_path(path) @contextmanager @@ -129,7 +132,7 @@ def with_dataset(self, name=None, identifier=None, create=False): 'Dataset exists: "{}".'.format(name) ) - dataset_path = self.path / self.datadir / dataset.name + dataset_path = self.path / self.datadir / dataset.display_name dataset_path.mkdir(parents=True, exist_ok=True) try: @@ -150,7 +153,9 @@ def with_dataset(self, name=None, identifier=None, create=False): dataset.to_yaml() - def create_dataset(self, name, creators=()): + def create_dataset( + self, name, display_name='', description='', creators=() + ): """Create a dataset.""" # FIXME create a test for duplicate dataset creation if self.load_dataset(name=name): @@ -180,11 +185,15 @@ def create_dataset(self, name, creators=()): client=self, identifier=identifier, name=name, + display_name=display_name, + description=description, creator=creators ) + reference_name = display_name or name + dataset_ref = LinkReference.create( - client=self, name='datasets/' + name + client=self, name='datasets/' + reference_name ) dataset_ref.set_reference(path) @@ -204,7 +213,7 @@ def add_data_to_dataset( ): """Import the data into the data directory.""" warning_message = '' - dataset_path = self.path / self.datadir / dataset.name + dataset_path = self.path / self.datadir / dataset.display_name destination = destination or Path('.') destination = self._resolve_path(dataset_path, destination) diff --git a/renku/core/models/datasets.py b/renku/core/models/datasets.py index 8ac9989ba2..bc8041dd98 100644 --- a/renku/core/models/datasets.py +++ b/renku/core/models/datasets.py @@ -29,7 +29,7 @@ import attr from attr.validators import instance_of -# from renku.core.models.creators import CreatorsMixin +from renku.core import errors from renku.core.models.entities import Entity from 
renku.core.models.provenance.agents import Person from renku.core.utils.datetime8601 import parse_date @@ -276,7 +276,7 @@ class Dataset(Entity, CreatorMixin): EDITABLE_FIELDS = [ 'creator', 'date_published', 'description', 'in_language', 'keywords', - 'license', 'name', 'url', 'version', 'created', 'files' + 'license', 'name', 'url', 'version', 'created', 'files', 'display_name' ] _id = jsonld.ib(default=None, context='@id', kw_only=True) @@ -350,30 +350,24 @@ class Dataset(Entity, CreatorMixin): same_as = jsonld.ib(context='schema:sameAs', default=None, kw_only=True) + display_name = jsonld.ib( + default=None, context='schema:alternateName', kw_only=True + ) + @created.default def _now(self): """Define default value for datetime fields.""" return datetime.datetime.now(datetime.timezone.utc) - @property - def display_name(self): - """Get dataset display name.""" - name = re.sub(' +', ' ', self.name.lower()[:24]) - - def to_unix(el): - """Parse string to unix friendly name.""" - parsed_ = re.sub('[^a-zA-Z0-9]', '', re.sub(' +', ' ', el)) - parsed_ = re.sub(' .+', '.', parsed_.lower()) - return parsed_ - - short_name = [to_unix(el) for el in name.split()] - - if self.version: - version = to_unix(self.version) - name = '{0}_{1}'.format('_'.join(short_name), version) - return name - - return '.'.join(short_name) + @display_name.validator + def display_name_validator(self, attribute, value): + """Validate display_name.""" + if value: + scaped = urllib.parse.quote(value, safe='%') + if scaped != value or '~' in value: + raise errors.ParameterError( + 'Invalid "display_name": {}'.format(value) + ) @property def uid(self): @@ -527,3 +521,33 @@ def __attrs_post_init__(self): except KeyError: # if with_dataset is used, the dataset is not committed yet pass + + if not self.display_name: + # For compatibility with older versions use name as display_name + # if it is valid; otherwise, use converted name + try: + self.display_name_validator(None, self.name) + except 
errors.ParameterError: + self.display_name = generate_default_display_name(self) + else: + self.display_name = self.name + + +def generate_default_display_name(dataset): + """Get dataset display name.""" + name = re.sub(' +', ' ', dataset.name.lower()[:24]) + + def to_unix(el): + """Parse string to unix friendly name.""" + parsed_ = re.sub('[^a-zA-Z0-9]', '', re.sub(' +', ' ', el)) + parsed_ = re.sub(' .+', '.', parsed_.lower()) + return parsed_ + + short_name = [to_unix(el) for el in name.split()] + + if dataset.version: + version = to_unix(dataset.version) + name = '{0}_{1}'.format('_'.join(short_name), version) + return name + + return '.'.join(short_name) diff --git a/renku/core/models/refs.py b/renku/core/models/refs.py index 23d7f8f0cf..dd6d17141e 100644 --- a/renku/core/models/refs.py +++ b/renku/core/models/refs.py @@ -58,7 +58,7 @@ def check_ref_format(cls, name): def name_validator(self, attribute, value): """Validate reference name.""" if not self.check_ref_format(value): - raise errors.UsageError( + raise errors.ParameterError( 'The reference name "{0}" is not valid.'.format(value) ) From 21367dd22e250ed169a96113d415e7bee8e3cbae Mon Sep 17 00:00:00 2001 From: Mohammad Alisafaee Date: Tue, 3 Dec 2019 12:08:40 +0100 Subject: [PATCH 04/10] refactor: dataset name validation --- renku/core/commands/dataset.py | 6 ++++-- renku/core/management/datasets.py | 23 +++++++++++++++-------- renku/core/models/datasets.py | 31 +++++++++++++++++++------------ renku/core/models/refs.py | 7 ++++--- renku/data/shacl_shape.json | 8 ++++++++ 5 files changed, 50 insertions(+), 25 deletions(-) diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py index 3fbf7ff2f8..a8c2371fa4 100644 --- a/renku/core/commands/dataset.py +++ b/renku/core/commands/dataset.py @@ -485,7 +485,9 @@ def import_dataset( dataset.display_name = display_name - client.create_dataset(name=dataset.name, display_name=display_name) + client.create_dataset( + name=dataset.name, 
display_name=display_name, is_import=True + ) data_folder = tempfile.mkdtemp() @@ -644,7 +646,7 @@ def _filter(client, names=None, creators=None, include=None, exclude=None): records = [] for path_, dataset in client.datasets.items(): - if not names or dataset.name in names: + if not names or dataset.name in names or dataset.display_name in names: for file_ in dataset.files: file_.dataset = dataset.name path_ = file_.full_path.relative_to(client.path) diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py index 2fb3763a78..d48db43b77 100644 --- a/renku/core/management/datasets.py +++ b/renku/core/management/datasets.py @@ -36,7 +36,8 @@ from renku.core import errors from renku.core.management.clone import clone from renku.core.management.config import RENKU_HOME -from renku.core.models.datasets import Dataset, DatasetFile, DatasetTag +from renku.core.models.datasets import Dataset, DatasetFile, DatasetTag, \ + is_dataset_name_valid from renku.core.models.git import GitURL from renku.core.models.locals import with_reference from renku.core.models.provenance.agents import Person @@ -154,22 +155,28 @@ def with_dataset(self, name=None, identifier=None, create=False): dataset.to_yaml() def create_dataset( - self, name, display_name='', description='', creators=() + self, + name, + display_name='', + description='', + creators=(), + is_import=False ): """Create a dataset.""" - # FIXME create a test for duplicate dataset creation - if self.load_dataset(name=name): + if self.load_dataset( + name=name + ) or (is_import and self.load_dataset(name=display_name)): raise errors.DatasetExistsError( 'Dataset exists: "{}".'.format(name) ) - # Avoid nested datasets: name mustn't have '/' in it - if len(Path(name).parts) > 1: + if not name: + raise errors.ParameterError('Dataset name must be provided.') + # Do not validate imported datasets' names + if not is_import and not is_dataset_name_valid(name): raise errors.ParameterError( 'Dataset name {} is not 
valid.'.format(name) ) - if not name: - raise errors.ParameterError('Dataset name must be provided.') identifier = str(uuid.uuid4()) path = (self.renku_datasets_path / identifier / self.METADATA) diff --git a/renku/core/models/datasets.py b/renku/core/models/datasets.py index bc8041dd98..a1ca114a62 100644 --- a/renku/core/models/datasets.py +++ b/renku/core/models/datasets.py @@ -32,6 +32,7 @@ from renku.core import errors from renku.core.models.entities import Entity from renku.core.models.provenance.agents import Person +from renku.core.models.refs import LinkReference from renku.core.utils.datetime8601 import parse_date from renku.core.utils.doi import extract_doi, is_doi @@ -362,12 +363,11 @@ def _now(self): @display_name.validator def display_name_validator(self, attribute, value): """Validate display_name.""" - if value: - scaped = urllib.parse.quote(value, safe='%') - if scaped != value or '~' in value: - raise errors.ParameterError( - 'Invalid "display_name": {}'.format(value) - ) + # display_name might have been scaped and have '%' in it + if value and not is_dataset_name_valid(value, safe='%'): + raise errors.ParameterError( + 'Invalid "display_name": {}'.format(value) + ) @property def uid(self): @@ -524,13 +524,20 @@ def __attrs_post_init__(self): if not self.display_name: # For compatibility with older versions use name as display_name - # if it is valid; otherwise, use converted name - try: - self.display_name_validator(None, self.name) - except errors.ParameterError: - self.display_name = generate_default_display_name(self) - else: + # if it is valid; otherwise, use encoded name + if is_dataset_name_valid(self.name): self.display_name = self.name + else: + self.display_name = generate_default_display_name(self) + + +def is_dataset_name_valid(name, safe=''): + """A valid name is a valid Git reference name with no /.""" + # TODO make name an RFC 3986 compatible name and migrate old projects + return ( + name and LinkReference.check_ref_format(name, 
no_slashes=True) and + '/' not in name + ) def generate_default_display_name(dataset): diff --git a/renku/core/models/refs.py b/renku/core/models/refs.py index dd6d17141e..30e6c3fab5 100644 --- a/renku/core/models/refs.py +++ b/renku/core/models/refs.py @@ -37,7 +37,7 @@ class LinkReference: """Define a name of the folder with references in the Renku folder.""" @classmethod - def check_ref_format(cls, name): + def check_ref_format(cls, name, no_slashes=False): r"""Ensures that a reference name is well formed. It follows Git naming convention: @@ -51,8 +51,9 @@ def check_ref_format(cls, name): - it ends with ".lock", or - it contains a "@{" portion """ - return subprocess.run(('git', 'check-ref-format', name) - ).returncode == 0 + params = ('--allow-onelevel', name) if no_slashes else (name, ) + return subprocess.run(('git', 'check-ref-format') + + params).returncode == 0 @name.validator def name_validator(self, attribute, value): diff --git a/renku/data/shacl_shape.json b/renku/data/shacl_shape.json index 31d53f2277..679e544223 100644 --- a/renku/data/shacl_shape.json +++ b/renku/data/shacl_shape.json @@ -380,6 +380,14 @@ "sh:class": { "@id": "prov:Generation" } + }, + { + "nodeKind": "sh:Literal", + "path": "schema:alternateName", + "datatype": { + "@id": "xsd:string" + }, + "maxCount": 1 } ] }, From 11c25a7b0136e704b3155e50891fb4dd013c32a4 Mon Sep 17 00:00:00 2001 From: Mohammad Alisafaee Date: Tue, 3 Dec 2019 15:22:31 +0100 Subject: [PATCH 05/10] feat: parse creators from string --- renku/cli/dataset.py | 2 +- renku/core/commands/dataset.py | 3 +++ renku/core/errors.py | 2 +- renku/core/models/provenance/agents.py | 18 ++++++++++++++++++ 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/renku/cli/dataset.py b/renku/cli/dataset.py index f106109aca..219adec1ea 100644 --- a/renku/cli/dataset.py +++ b/renku/cli/dataset.py @@ -394,7 +394,7 @@ def dataset(ctx, revision, datadir, format): '--creator', default=None, multiple=True, - help='Creator\'s name 
and <email>.' +    help='Creator\'s name and email ("Name <email>").' ) def create(name, display_name, description, creator): """Create an empty dataset in the current repo.""" diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py index a8c2371fa4..25f9340d0c 100644 --- a/renku/core/commands/dataset.py +++ b/renku/core/commands/dataset.py @@ -110,6 +110,9 @@ def create_dataset( """ if not creators: creators = [Person.from_git(client.repo)] + else: + creators = [Person.from_string(c) for c in creators] + client.create_dataset( name=name, display_name=display_name, diff --git a/renku/core/errors.py b/renku/core/errors.py index 3b143a8671..8c68fde42c 100644 --- a/renku/core/errors.py +++ b/renku/core/errors.py @@ -128,7 +128,7 @@ def __init__(self, message=None): 'Please use the "git config" command to configure it.\n\n' '\tgit config --set user.email "john.doe@example.com"\n' ) - super(MissingUsername, self).__init__(message) + super().__init__(message) class AuthenticationError(RenkuException): diff --git a/renku/core/models/provenance/agents.py b/renku/core/models/provenance/agents.py index 7e731b1c5a..edd82a0680 100644 --- a/renku/core/models/provenance/agents.py +++ b/renku/core/models/provenance/agents.py @@ -127,6 +127,24 @@ def from_git(cls, git): return cls(name=name, email=email) + @classmethod + def from_string(cls, string): + """Create an instance from a 'Name <email>' string.""" + REGEX = r'([^<]*)<{0,1}([^@<>]+@[^@<>]+\.[^@<>]+)*>{0,1}' + name, email = re.search(REGEX, string).groups() + name = name.rstrip() + # Check the git configuration. 
+ if not name: # pragma: no cover + raise errors.ParameterError( + 'Name is not valid: Valid format is "Name <email>"' + ) + if not email: # pragma: no cover + raise errors.ParameterError( + 'Email is not valid: Valid format is "Name <email>"' + ) + + return cls(name=name, email=email) + def __attrs_post_init__(self): """Finish object initialization.""" # handle the case where ids were improperly set From 371d14a516dc81290ec86df87f947cf3e00ac8d5 Mon Sep 17 00:00:00 2001 From: Mohammad Alisafaee Date: Wed, 4 Dec 2019 09:49:37 +0100 Subject: [PATCH 06/10] add more tests --- tests/cli/test_datasets.py | 100 +++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py index 8a0cc18740..b162fbcbc4 100644 --- a/tests/cli/test_datasets.py +++ b/tests/cli/test_datasets.py @@ -56,6 +56,66 @@ def test_datasets_create_clean(runner, project, client): assert 'datasets' not in file_path +def test_datasets_create_with_metadata(runner, client): + """Test creating a dataset with metadata.""" + result = runner.invoke( + cli, [ + 'dataset', 'create', 'dataset', '--display-name', 'dataset-name', + '--description', 'some description here', '-c', + 'John Doe <john.doe@mail.ch>', '-c', + 'John Smiths<john.smiths@mail.ch>' + ] + ) + assert 0 == result.exit_code + assert 'OK' in result.output + + dataset = client.load_dataset(name='dataset-name') + assert dataset.name == 'dataset' + assert dataset.display_name == 'dataset-name' + assert dataset.description == 'some description here' + assert 'John Doe' in [c.name for c in dataset.creator] + assert 'john.doe@mail.ch' in [c.email for c in dataset.creator] + assert 'John Smiths' in [c.name for c in dataset.creator] + assert 'john.smiths@mail.ch' in [c.email for c in dataset.creator] + + +def test_datasets_create_different_display_name(runner, client): + """Test creating datasets with same name but different display name.""" + result = runner.invoke( + cli, ['dataset', 'create', 'dataset', '--display-name', 
'dataset-1'] + ) + assert 0 == result.exit_code + assert 'OK' in result.output + + result = runner.invoke( + cli, ['dataset', 'create', 'dataset', '--display-name', 'dataset-2'] + ) + assert 0 == result.exit_code + assert 'OK' in result.output + + +def test_datasets_create_with_same_name(runner, client): + """Test creating datasets with same name.""" + result = runner.invoke(cli, ['dataset', 'create', 'dataset']) + assert 0 == result.exit_code + assert 'OK' in result.output + + result = runner.invoke(cli, ['dataset', 'create', 'dataset']) + assert 1 == result.exit_code + assert 'Dataset exists: "dataset"' in result.output + + +def test_datasets_display_name_generation(runner, client): + """Test display_name is same as name when not provided.""" + result = runner.invoke(cli, ['dataset', 'create', 'dataset-name']) + assert 0 == result.exit_code + assert 'OK' in result.output + + dataset = client.load_dataset(name='dataset-name') + assert dataset.name == 'dataset-name' + assert dataset.display_name == 'dataset-name' + + def test_datasets_create_dirty(runner, project, client): """Test creating a dataset in dirty repository.""" # Create a file in root of the repository. 
@@ -189,6 +249,18 @@ def test_dataset_name_is_valid(client, runner, project, name): assert 'is not valid' in result.output +@pytest.mark.parametrize( + 'creator,field', [('John Doe', 'Email'), ('John Doe<>', 'Email'), + ('', 'Name'), + ('John Doe', 'Email')] +) +def test_dataset_creator_is_invalid(client, runner, creator, field): + """Test create dataset with invalid creator format.""" + result = runner.invoke(cli, ['dataset', 'create', 'ds', '-c', creator]) + assert 2 == result.exit_code + assert field + ' is not valid' in result.output + + @pytest.mark.parametrize('output_format', DATASETS_FORMATS.keys()) def test_datasets_list_empty(output_format, runner, project): """Test listing without datasets.""" @@ -665,6 +737,34 @@ def test_datasets_ls_files_correct_paths(tmpdir, runner, project): assert Path(record['path']).exists() +def test_datasets_ls_files_with_display_name(directory_tree, runner, project): + """Test listing of data within dataset with include/exclude filters.""" + # create a dataset + result = runner.invoke( + cli, + ['dataset', 'create', 'my-dataset', '--display-name', 'dataset-name'] + ) + assert 0 == result.exit_code + + # add data to dataset + result = runner.invoke( + cli, + ['dataset', 'add', 'dataset-name', directory_tree.strpath], + catch_exceptions=False, + ) + assert 0 == result.exit_code + + # list files with dataset name + result = runner.invoke(cli, ['dataset', 'ls-files', 'my-dataset']) + assert 0 == result.exit_code + assert 'dir2/file2' in result.output + + # list files with dataset display name + result = runner.invoke(cli, ['dataset', 'ls-files', 'dataset-name']) + assert 0 == result.exit_code + assert 'dir2/file2' in result.output + + def test_dataset_unlink_file_not_found(runner, project): """Test unlinking of file from dataset with no files found.""" # create a dataset From afbd67166826febab23df8ad05927059628281c0 Mon Sep 17 00:00:00 2001 From: Mohammad Alisafaee Date: Wed, 11 Dec 2019 14:29:18 +0100 Subject: [PATCH 07/10] fix: 
do not allow display_name at dataset creation --- renku/cli/dataset.py | 16 +++----- renku/core/commands/checks/migration.py | 2 +- renku/core/commands/dataset.py | 22 +++++------ renku/core/commands/format/datasets.py | 2 +- renku/core/management/datasets.py | 38 ++++++++---------- renku/core/models/datasets.py | 33 ++++++++-------- tests/cli/test_datasets.py | 51 ++++--------------------- 7 files changed, 57 insertions(+), 107 deletions(-) diff --git a/renku/cli/dataset.py b/renku/cli/dataset.py index 219adec1ea..f4f3c19eb7 100644 --- a/renku/cli/dataset.py +++ b/renku/cli/dataset.py @@ -385,7 +385,6 @@ def dataset(ctx, revision, datadir, format): @dataset.command() @click.argument('name') -@click.option('--display-name', default='', help='Dataset\'s display name.') @click.option( '-d', '--description', default='', help='Dataset\'s description.' ) @@ -396,16 +395,11 @@ def dataset(ctx, revision, datadir, format): multiple=True, help='Creator\'s name and email ("Name ").' ) -def create(name, display_name, description, creator): +def create(name, description, creator): """Create an empty dataset in the current repo.""" creators = creator or () - create_dataset( - name=name, - display_name=display_name, - description=description, - creators=creators - ) + create_dataset(name=name, description=description, creators=creators) click.secho('OK', fg='green') @@ -624,14 +618,14 @@ def export_(id, provider, publish, tag): @dataset.command('import') @click.argument('uri') -@click.option('--display-name', default='', help='Alias for dataset\'s name.') +@click.option('--name', default='', help='Renku-compatible name for dataset.') @click.option( '-x', '--extract', is_flag=True, help='Extract files before importing to dataset.' ) -def import_(uri, display_name, extract): +def import_(uri, name, extract): """Import data from a 3rd party provider. 
Supported providers: [Zenodo, Dataverse] @@ -657,7 +651,7 @@ def _init(lock, id_queue): import_dataset( uri=uri, - display_name=display_name, + internal_name=name, extract=extract, with_prompt=True, pool_init_fn=_init, diff --git a/renku/core/commands/checks/migration.py b/renku/core/commands/checks/migration.py index 6bec34afce..ca4033fc8b 100644 --- a/renku/core/commands/checks/migration.py +++ b/renku/core/commands/checks/migration.py @@ -162,7 +162,7 @@ def migrate_broken_dataset_paths(client): # migrate the refs ref = LinkReference.create( client=client, - name='datasets/{0}'.format(dataset.display_name), + name='datasets/{0}'.format(dataset.internal_name), force=True, ) ref.set_reference(expected_path / client.METADATA) diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py index 25f9340d0c..a51e0be0b2 100644 --- a/renku/core/commands/dataset.py +++ b/renku/core/commands/dataset.py @@ -42,7 +42,7 @@ MigrationRequired, ParameterError, UsageError from renku.core.management.datasets import DATASET_METADATA_PATHS from renku.core.management.git import COMMIT_DIFF_STRATEGY -from renku.core.models.datasets import Dataset, generate_default_display_name +from renku.core.models.datasets import Dataset, generate_default_internal_name from renku.core.models.provenance.agents import Person from renku.core.models.refs import LinkReference from renku.core.models.tabulate import tabulate @@ -115,7 +115,7 @@ def create_dataset( client.create_dataset( name=name, - display_name=display_name, + internal_name=name, description=description, creators=creators ) @@ -431,7 +431,7 @@ def export_dataset( def import_dataset( client, uri, - display_name='', + internal_name='', extract=False, with_prompt=False, pool_init_fn=None, @@ -483,14 +483,12 @@ def import_dataset( ) if files: - if not display_name: - display_name = generate_default_display_name(dataset) + if not internal_name: + internal_name = generate_default_internal_name(dataset) - dataset.display_name = 
display_name + dataset.internal_name = internal_name - client.create_dataset( - name=dataset.name, display_name=display_name, is_import=True - ) + client.create_dataset(name=dataset.name, internal_name=internal_name) data_folder = tempfile.mkdtemp() @@ -533,14 +531,14 @@ def import_dataset( add_to_dataset( client, urls=[str(p) for p in Path(data_folder).glob('*')], - name=display_name, + name=internal_name, with_metadata=dataset ) if dataset.version: tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version) tag_dataset( - client, display_name, tag_name, + client, internal_name, tag_name, 'Tag {} created by renku import'.format(dataset.version) ) @@ -649,7 +647,7 @@ def _filter(client, names=None, creators=None, include=None, exclude=None): records = [] for path_, dataset in client.datasets.items(): - if not names or dataset.name in names or dataset.display_name in names: + if not names or dataset.internal_name in names: for file_ in dataset.files: file_.dataset = dataset.name path_ = file_.full_path.relative_to(client.path) diff --git a/renku/core/commands/format/datasets.py b/renku/core/commands/format/datasets.py index da3e62e202..d53dcc8a95 100644 --- a/renku/core/commands/format/datasets.py +++ b/renku/core/commands/format/datasets.py @@ -31,7 +31,7 @@ def tabular(client, datasets): datasets, headers=OrderedDict(( ('uid', 'id'), - ('display_name', None), + ('internal_name', None), ('version', None), ('created', None), ('creators_csv', 'creators'), diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py index d48db43b77..74812d3160 100644 --- a/renku/core/management/datasets.py +++ b/renku/core/management/datasets.py @@ -133,7 +133,7 @@ def with_dataset(self, name=None, identifier=None, create=False): 'Dataset exists: "{}".'.format(name) ) - dataset_path = self.path / self.datadir / dataset.display_name + dataset_path = self.path / self.datadir / dataset.internal_name dataset_path.mkdir(parents=True, exist_ok=True) try: @@ -155,27 
+155,23 @@ def with_dataset(self, name=None, identifier=None, create=False): dataset.to_yaml() def create_dataset( - self, - name, - display_name='', - description='', - creators=(), - is_import=False + self, name, internal_name='', description='', creators=() ): """Create a dataset.""" - if self.load_dataset( - name=name - ) or (is_import and self.load_dataset(name=display_name)): - raise errors.DatasetExistsError( - 'Dataset exists: "{}".'.format(name) - ) - if not name: raise errors.ParameterError('Dataset name must be provided.') - # Do not validate imported datasets' names - if not is_import and not is_dataset_name_valid(name): + + if not internal_name: + internal_name = name + + if not is_dataset_name_valid(internal_name): raise errors.ParameterError( - 'Dataset name {} is not valid.'.format(name) + 'Dataset name {} is not valid.'.format(internal_name) + ) + + if self.load_dataset(name=internal_name): + raise errors.DatasetExistsError( + 'Dataset exists: "{}".'.format(internal_name) ) identifier = str(uuid.uuid4()) @@ -192,15 +188,13 @@ def create_dataset( client=self, identifier=identifier, name=name, - display_name=display_name, + internal_name=internal_name, description=description, creator=creators ) - reference_name = display_name or name - dataset_ref = LinkReference.create( - client=self, name='datasets/' + reference_name + client=self, name='datasets/' + internal_name ) dataset_ref.set_reference(path) @@ -220,7 +214,7 @@ def add_data_to_dataset( ): """Import the data into the data directory.""" warning_message = '' - dataset_path = self.path / self.datadir / dataset.display_name + dataset_path = self.path / self.datadir / dataset.internal_name destination = destination or Path('.') destination = self._resolve_path(dataset_path, destination) diff --git a/renku/core/models/datasets.py b/renku/core/models/datasets.py index a1ca114a62..676dcf8ed5 100644 --- a/renku/core/models/datasets.py +++ b/renku/core/models/datasets.py @@ -277,7 +277,8 @@ class 
Dataset(Entity, CreatorMixin): EDITABLE_FIELDS = [ 'creator', 'date_published', 'description', 'in_language', 'keywords', - 'license', 'name', 'url', 'version', 'created', 'files', 'display_name' + 'license', 'name', 'url', 'version', 'created', 'files', + 'internal_name' ] _id = jsonld.ib(default=None, context='@id', kw_only=True) @@ -351,22 +352,20 @@ class Dataset(Entity, CreatorMixin): same_as = jsonld.ib(context='schema:sameAs', default=None, kw_only=True) - display_name = jsonld.ib( - default=None, context='schema:alternateName', kw_only=True - ) + internal_name = jsonld.ib(default='', context=None, kw_only=True) @created.default def _now(self): """Define default value for datetime fields.""" return datetime.datetime.now(datetime.timezone.utc) - @display_name.validator - def display_name_validator(self, attribute, value): - """Validate display_name.""" - # display_name might have been scaped and have '%' in it + @internal_name.validator + def internal_name_validator(self, attribute, value): + """Validate internal_name.""" + # internal_name might have been scaped and have '%' in it if value and not is_dataset_name_valid(value, safe='%'): raise errors.ParameterError( - 'Invalid "display_name": {}'.format(value) + 'Invalid "internal_name": {}'.format(value) ) @property @@ -522,13 +521,13 @@ def __attrs_post_init__(self): # if with_dataset is used, the dataset is not committed yet pass - if not self.display_name: - # For compatibility with older versions use name as display_name + if not self.internal_name: + # For compatibility with older versions use name as internal_name # if it is valid; otherwise, use encoded name if is_dataset_name_valid(self.name): - self.display_name = self.name + self.internal_name = self.name else: - self.display_name = generate_default_display_name(self) + self.internal_name = generate_default_internal_name(self) def is_dataset_name_valid(name, safe=''): @@ -540,13 +539,13 @@ def is_dataset_name_valid(name, safe=''): ) -def 
generate_default_display_name(dataset): - """Get dataset display name.""" - name = re.sub(' +', ' ', dataset.name.lower()[:24]) +def generate_default_internal_name(dataset): + """Get dataset internal_name.""" + name = re.sub(r'\s+', ' ', dataset.name.lower()[:24]) def to_unix(el): """Parse string to unix friendly name.""" - parsed_ = re.sub('[^a-zA-Z0-9]', '', re.sub(' +', ' ', el)) + parsed_ = re.sub('[^a-zA-Z0-9]', '', re.sub(r'\s+', ' ', el)) parsed_ = re.sub(' .+', '.', parsed_.lower()) return parsed_ diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py index b162fbcbc4..6a13c0603d 100644 --- a/tests/cli/test_datasets.py +++ b/tests/cli/test_datasets.py @@ -60,18 +60,17 @@ def test_datasets_create_with_metadata(runner, client): """Test creating a dataset with metadata.""" result = runner.invoke( cli, [ - 'dataset', 'create', 'dataset', '--display-name', 'dataset-name', - '--description', 'some description here', '-c', - 'John Doe ', '-c', + 'dataset', 'create', 'my-dataset', '--description', + 'some description here', '-c', 'John Doe ', '-c', 'John Smiths' ] ) assert 0 == result.exit_code assert 'OK' in result.output - dataset = client.load_dataset(name='dataset-name') - assert dataset.name == 'dataset' - assert dataset.display_name == 'dataset-name' + dataset = client.load_dataset(name='my-dataset') + assert dataset.name == 'my-dataset' + assert dataset.internal_name == 'my-dataset' assert dataset.description == 'some description here' assert 'John Doe' in [c.name for c in dataset.creator] assert 'john.doe@mail.ch' in [c.email for c in dataset.creator] @@ -79,21 +78,6 @@ def test_datasets_create_with_metadata(runner, client): assert 'john.smiths@mail.ch' in [c.email for c in dataset.creator] -def test_datasets_create_different_display_name(runner, client): - """Test creating datasets with same name but different display name.""" - result = runner.invoke( - cli, ['dataset', 'create', 'dataset', '--display-name', 'dataset-1'] - ) - assert 0 == 
result.exit_code - assert 'OK' in result.output - - result = runner.invoke( - cli, ['dataset', 'create', 'dataset', '--display-name', 'dataset-2'] - ) - assert 0 == result.exit_code - assert 'OK' in result.output - - def test_datasets_create_with_same_name(runner, client): """Test creating datasets with same name.""" result = runner.invoke(cli, ['dataset', 'create', 'dataset']) @@ -105,17 +89,6 @@ def test_datasets_create_with_same_name(runner, client): assert 'Dataset exists: "dataset"' in result.output -def test_datasets_display_name_generation(runner, client): - """Test display_name is same as name when not provided.""" - result = runner.invoke(cli, ['dataset', 'create', 'dataset-name']) - assert 0 == result.exit_code - assert 'OK' in result.output - - dataset = client.load_dataset(name='dataset-name') - assert dataset.name == 'dataset-name' - assert dataset.display_name == 'dataset-name' - - def test_datasets_create_dirty(runner, project, client): """Test creating a dataset in dirty repository.""" # Create a file in root of the repository. 
@@ -737,19 +710,16 @@ def test_datasets_ls_files_correct_paths(tmpdir, runner, project): assert Path(record['path']).exists() -def test_datasets_ls_files_with_display_name(directory_tree, runner, project): +def test_datasets_ls_files(directory_tree, runner, project): """Test listing of data within dataset with include/exclude filters.""" # create a dataset - result = runner.invoke( - cli, - ['dataset', 'create', 'my-dataset', '--display-name', 'dataset-name'] - ) + result = runner.invoke(cli, ['dataset', 'create', 'my-dataset']) assert 0 == result.exit_code # add data to dataset result = runner.invoke( cli, - ['dataset', 'add', 'dataset-name', directory_tree.strpath], + ['dataset', 'add', 'my-dataset', directory_tree.strpath], catch_exceptions=False, ) assert 0 == result.exit_code @@ -759,11 +729,6 @@ def test_datasets_ls_files_with_display_name(directory_tree, runner, project): assert 0 == result.exit_code assert 'dir2/file2' in result.output - # list files with dataset display name - result = runner.invoke(cli, ['dataset', 'ls-files', 'dataset-name']) - assert 0 == result.exit_code - assert 'dir2/file2' in result.output - def test_dataset_unlink_file_not_found(runner, project): """Test unlinking of file from dataset with no files found.""" From ae2e6880b0f00b7d66a190d0f80b04ddf45b7042 Mon Sep 17 00:00:00 2001 From: Mohammad Alisafaee Date: Mon, 16 Dec 2019 11:50:58 +0100 Subject: [PATCH 08/10] chore: resolve rebasing conflicts --- renku/core/commands/dataset.py | 4 +--- tests/core/commands/test_dataset.py | 7 ++++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py index a51e0be0b2..cc420a825d 100644 --- a/renku/core/commands/dataset.py +++ b/renku/core/commands/dataset.py @@ -101,9 +101,7 @@ def dataset_parent(client, revision, datadir, format, ctx=None): @pass_local_client( clean=False, commit=True, commit_only=DATASET_METADATA_PATHS ) -def create_dataset( - client, name, display_name, 
description, creators, commit_message=None -): +def create_dataset(client, name, description, creators, commit_message=None): """Create an empty dataset in the current repo. :raises: ``renku.core.errors.ParameterError`` diff --git a/tests/core/commands/test_dataset.py b/tests/core/commands/test_dataset.py index 481b054738..cc01dd2f18 100644 --- a/tests/core/commands/test_dataset.py +++ b/tests/core/commands/test_dataset.py @@ -180,6 +180,11 @@ def test_dataset_serialization(dataset): def test_create_dataset_custom_message(project): """Test create dataset custom message.""" - create_dataset('ds1', commit_message='my awesome dataset') + create_dataset( + 'ds1', + description='', + creators=[], + commit_message='my awesome dataset' + ) last_commit = Repo('.').head.commit assert 'my awesome dataset' == last_commit.message From d482ec2c5071bc25f0ef2d540306e630aefb67f6 Mon Sep 17 00:00:00 2001 From: Mohammad Alisafaee Date: Mon, 16 Dec 2019 14:56:38 +0100 Subject: [PATCH 09/10] feat: allow any character in dataset name --- renku/cli/dataset.py | 9 ++++++++- renku/core/commands/dataset.py | 13 +++++++------ renku/core/management/datasets.py | 6 +++--- renku/core/models/datasets.py | 26 +++++++++++++++----------- tests/cli/test_datasets.py | 28 ++++++++++++++++++++-------- 5 files changed, 53 insertions(+), 29 deletions(-) diff --git a/renku/cli/dataset.py b/renku/cli/dataset.py index f4f3c19eb7..9ff08d5b77 100644 --- a/renku/cli/dataset.py +++ b/renku/cli/dataset.py @@ -399,7 +399,14 @@ def create(name, description, creator): """Create an empty dataset in the current repo.""" creators = creator or () - create_dataset(name=name, description=description, creators=creators) + dataset = create_dataset( + name=name, description=description, creators=creators + ) + click.echo( + 'Use the name "{}" to refer to this dataset.'.format( + dataset.internal_name + ) + ) click.secho('OK', fg='green') diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py index 
cc420a825d..be32567214 100644 --- a/renku/core/commands/dataset.py +++ b/renku/core/commands/dataset.py @@ -111,13 +111,12 @@ def create_dataset(client, name, description, creators, commit_message=None): else: creators = [Person.from_string(c) for c in creators] - client.create_dataset( - name=name, - internal_name=name, - description=description, - creators=creators + dataset, _, __ = client.create_dataset( + name=name, description=description, creators=creators ) + return dataset + @pass_local_client( clean=False, commit=True, commit_only=DATASET_METADATA_PATHS @@ -482,7 +481,9 @@ def import_dataset( if files: if not internal_name: - internal_name = generate_default_internal_name(dataset) + internal_name = generate_default_internal_name( + dataset.name, dataset.version + ) dataset.internal_name = internal_name diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py index 74812d3160..fb0f0d3d86 100644 --- a/renku/core/management/datasets.py +++ b/renku/core/management/datasets.py @@ -37,7 +37,7 @@ from renku.core.management.clone import clone from renku.core.management.config import RENKU_HOME from renku.core.models.datasets import Dataset, DatasetFile, DatasetTag, \ - is_dataset_name_valid + generate_default_internal_name, is_dataset_name_valid from renku.core.models.git import GitURL from renku.core.models.locals import with_reference from renku.core.models.provenance.agents import Person @@ -162,11 +162,11 @@ def create_dataset( raise errors.ParameterError('Dataset name must be provided.') if not internal_name: - internal_name = name + internal_name = generate_default_internal_name(name, None) if not is_dataset_name_valid(internal_name): raise errors.ParameterError( - 'Dataset name {} is not valid.'.format(internal_name) + 'Dataset name "{}" is not valid.'.format(internal_name) ) if self.load_dataset(name=internal_name): diff --git a/renku/core/models/datasets.py b/renku/core/models/datasets.py index 676dcf8ed5..3402b0c08f 100644 --- 
a/renku/core/models/datasets.py +++ b/renku/core/models/datasets.py @@ -522,12 +522,9 @@ def __attrs_post_init__(self): pass if not self.internal_name: - # For compatibility with older versions use name as internal_name - # if it is valid; otherwise, use encoded name - if is_dataset_name_valid(self.name): - self.internal_name = self.name - else: - self.internal_name = generate_default_internal_name(self) + self.internal_name = generate_default_internal_name( + self.name, self.version + ) def is_dataset_name_valid(name, safe=''): @@ -539,9 +536,15 @@ def is_dataset_name_valid(name, safe=''): ) -def generate_default_internal_name(dataset): +def generate_default_internal_name(dataset_name, dataset_version): """Get dataset internal_name.""" - name = re.sub(r'\s+', ' ', dataset.name.lower()[:24]) + # For compatibility with older versions use name as internal_name + # if it is valid; otherwise, use encoded name + if is_dataset_name_valid(dataset_name): + return dataset_name + + name = re.sub(r'\s+', ' ', dataset_name) + name = name.lower()[:24] def to_unix(el): """Parse string to unix friendly name.""" @@ -550,10 +553,11 @@ def to_unix(el): return parsed_ short_name = [to_unix(el) for el in name.split()] + short_name = [el for el in short_name if el] - if dataset.version: - version = to_unix(dataset.version) + if dataset_version: + version = to_unix(dataset_version) name = '{0}_{1}'.format('_'.join(short_name), version) return name - return '.'.join(short_name) + return '_'.join(short_name) diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py index 6a13c0603d..8c638259a6 100644 --- a/tests/cli/test_datasets.py +++ b/tests/cli/test_datasets.py @@ -89,6 +89,26 @@ def test_datasets_create_with_same_name(runner, client): assert 'Dataset exists: "dataset"' in result.output +@pytest.mark.parametrize( + 'name,internal_name', + [('v.a.l.i.d_internal-name', 'v.a.l.i.d_internal-name'), + ('any name /@#$!', 'any_name'), + ('name longer than 24 characters', 
'name_longer_than_24_char'), + ('semi valid-name', 'semi_validname'), ('dataset/new', 'datasetnew'), + ('/dataset', 'dataset'), ('dataset/', 'dataset')] +) +def test_datasets_valid_name(runner, client, name, internal_name): + """Test creating datasets with valid name.""" + result = runner.invoke(cli, ['dataset', 'create', name]) + assert 0 == result.exit_code + assert 'Use the name "{}"'.format(internal_name) in result.output + assert 'OK' in result.output + + dataset = client.load_dataset(name=internal_name) + assert dataset.name == name + assert dataset.internal_name == internal_name + + def test_datasets_create_dirty(runner, project, client): """Test creating a dataset in dirty repository.""" # Create a file in root of the repository. @@ -214,14 +234,6 @@ def test_dataset_create_exception_refs(runner, project, client): assert 'a' in result.output -@pytest.mark.parametrize('name', ['dataset/new', '/dataset', 'dataset/']) -def test_dataset_name_is_valid(client, runner, project, name): - """Test dataset name has no '/' character to avoid nested datasets.""" - result = runner.invoke(cli, ['dataset', 'create', name]) - assert 2 == result.exit_code - assert 'is not valid' in result.output - - @pytest.mark.parametrize( 'creator,field', [('John Doe', 'Email'), ('John Doe<>', 'Email'), ('', 'Name'), From 5b63f584bd4c606567861ea2ea78698108b0fd97 Mon Sep 17 00:00:00 2001 From: Mohammad Alisafaee Date: Mon, 16 Dec 2019 18:00:41 +0100 Subject: [PATCH 10/10] feat: allow short_name for datasets --- renku/cli/dataset.py | 20 +++++++---- renku/core/commands/checks/migration.py | 2 +- renku/core/commands/dataset.py | 27 ++++++++------ renku/core/commands/format/datasets.py | 2 +- renku/core/management/datasets.py | 24 ++++++------- renku/core/models/datasets.py | 27 +++++++------- tests/cli/test_datasets.py | 47 ++++++++++++++++++------- tests/core/commands/test_dataset.py | 1 + 8 files changed, 94 insertions(+), 56 deletions(-) diff --git a/renku/cli/dataset.py 
b/renku/cli/dataset.py index 9ff08d5b77..47d4c973be 100644 --- a/renku/cli/dataset.py +++ b/renku/cli/dataset.py @@ -385,6 +385,9 @@ def dataset(ctx, revision, datadir, format): @dataset.command() @click.argument('name') +@click.option( + '--short-name', default='', help='A convenient name for dataset.' +) @click.option( '-d', '--description', default='', help='Dataset\'s description.' ) @@ -395,16 +398,19 @@ def dataset(ctx, revision, datadir, format): multiple=True, help='Creator\'s name and email ("Name ").' ) -def create(name, description, creator): +def create(name, short_name, description, creator): """Create an empty dataset in the current repo.""" creators = creator or () dataset = create_dataset( - name=name, description=description, creators=creators + name=name, + short_name=short_name, + description=description, + creators=creators ) click.echo( 'Use the name "{}" to refer to this dataset.'.format( - dataset.internal_name + dataset.short_name ) ) click.secho('OK', fg='green') @@ -625,14 +631,16 @@ def export_(id, provider, publish, tag): @dataset.command('import') @click.argument('uri') -@click.option('--name', default='', help='Renku-compatible name for dataset.') +@click.option( + '--short-name', default='', help='A convenient name for dataset.' +) @click.option( '-x', '--extract', is_flag=True, help='Extract files before importing to dataset.' ) -def import_(uri, name, extract): +def import_(uri, short_name, extract): """Import data from a 3rd party provider. 
Supported providers: [Zenodo, Dataverse] @@ -658,7 +666,7 @@ def _init(lock, id_queue): import_dataset( uri=uri, - internal_name=name, + short_name=short_name, extract=extract, with_prompt=True, pool_init_fn=_init, diff --git a/renku/core/commands/checks/migration.py b/renku/core/commands/checks/migration.py index ca4033fc8b..ddb5a69524 100644 --- a/renku/core/commands/checks/migration.py +++ b/renku/core/commands/checks/migration.py @@ -162,7 +162,7 @@ def migrate_broken_dataset_paths(client): # migrate the refs ref = LinkReference.create( client=client, - name='datasets/{0}'.format(dataset.internal_name), + name='datasets/{0}'.format(dataset.short_name), force=True, ) ref.set_reference(expected_path / client.METADATA) diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py index be32567214..fb4fc42e5e 100644 --- a/renku/core/commands/dataset.py +++ b/renku/core/commands/dataset.py @@ -42,7 +42,7 @@ MigrationRequired, ParameterError, UsageError from renku.core.management.datasets import DATASET_METADATA_PATHS from renku.core.management.git import COMMIT_DIFF_STRATEGY -from renku.core.models.datasets import Dataset, generate_default_internal_name +from renku.core.models.datasets import Dataset, generate_default_short_name from renku.core.models.provenance.agents import Person from renku.core.models.refs import LinkReference from renku.core.models.tabulate import tabulate @@ -101,7 +101,9 @@ def dataset_parent(client, revision, datadir, format, ctx=None): @pass_local_client( clean=False, commit=True, commit_only=DATASET_METADATA_PATHS ) -def create_dataset(client, name, description, creators, commit_message=None): +def create_dataset( + client, name, short_name, description, creators, commit_message=None +): """Create an empty dataset in the current repo. 
:raises: ``renku.core.errors.ParameterError`` @@ -112,7 +114,10 @@ def create_dataset(client, name, description, creators, commit_message=None): creators = [Person.from_string(c) for c in creators] dataset, _, __ = client.create_dataset( - name=name, description=description, creators=creators + name=name, + short_name=short_name, + description=description, + creators=creators ) return dataset @@ -428,7 +433,7 @@ def export_dataset( def import_dataset( client, uri, - internal_name='', + short_name='', extract=False, with_prompt=False, pool_init_fn=None, @@ -480,14 +485,14 @@ def import_dataset( ) if files: - if not internal_name: - internal_name = generate_default_internal_name( + if not short_name: + short_name = generate_default_short_name( dataset.name, dataset.version ) - dataset.internal_name = internal_name + dataset.short_name = short_name - client.create_dataset(name=dataset.name, internal_name=internal_name) + client.create_dataset(name=dataset.name, short_name=short_name) data_folder = tempfile.mkdtemp() @@ -530,14 +535,14 @@ def import_dataset( add_to_dataset( client, urls=[str(p) for p in Path(data_folder).glob('*')], - name=internal_name, + name=short_name, with_metadata=dataset ) if dataset.version: tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version) tag_dataset( - client, internal_name, tag_name, + client, short_name, tag_name, 'Tag {} created by renku import'.format(dataset.version) ) @@ -646,7 +651,7 @@ def _filter(client, names=None, creators=None, include=None, exclude=None): records = [] for path_, dataset in client.datasets.items(): - if not names or dataset.internal_name in names: + if not names or dataset.short_name in names: for file_ in dataset.files: file_.dataset = dataset.name path_ = file_.full_path.relative_to(client.path) diff --git a/renku/core/commands/format/datasets.py b/renku/core/commands/format/datasets.py index d53dcc8a95..92a7a3a046 100644 --- a/renku/core/commands/format/datasets.py +++ 
b/renku/core/commands/format/datasets.py @@ -31,7 +31,7 @@ def tabular(client, datasets): datasets, headers=OrderedDict(( ('uid', 'id'), - ('internal_name', None), + ('short_name', None), ('version', None), ('created', None), ('creators_csv', 'creators'), diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py index fb0f0d3d86..d4871bc286 100644 --- a/renku/core/management/datasets.py +++ b/renku/core/management/datasets.py @@ -37,7 +37,7 @@ from renku.core.management.clone import clone from renku.core.management.config import RENKU_HOME from renku.core.models.datasets import Dataset, DatasetFile, DatasetTag, \ - generate_default_internal_name, is_dataset_name_valid + generate_default_short_name, is_dataset_name_valid from renku.core.models.git import GitURL from renku.core.models.locals import with_reference from renku.core.models.provenance.agents import Person @@ -133,7 +133,7 @@ def with_dataset(self, name=None, identifier=None, create=False): 'Dataset exists: "{}".'.format(name) ) - dataset_path = self.path / self.datadir / dataset.internal_name + dataset_path = self.path / self.datadir / dataset.short_name dataset_path.mkdir(parents=True, exist_ok=True) try: @@ -155,23 +155,23 @@ def with_dataset(self, name=None, identifier=None, create=False): dataset.to_yaml() def create_dataset( - self, name, internal_name='', description='', creators=() + self, name, short_name=None, description='', creators=() ): """Create a dataset.""" if not name: raise errors.ParameterError('Dataset name must be provided.') - if not internal_name: - internal_name = generate_default_internal_name(name, None) + if not short_name: + short_name = generate_default_short_name(name, None) - if not is_dataset_name_valid(internal_name): + if not is_dataset_name_valid(short_name): raise errors.ParameterError( - 'Dataset name "{}" is not valid.'.format(internal_name) + 'Dataset name "{}" is not valid.'.format(short_name) ) - if self.load_dataset(name=internal_name): + 
if self.load_dataset(name=short_name): raise errors.DatasetExistsError( - 'Dataset exists: "{}".'.format(internal_name) + 'Dataset exists: "{}".'.format(short_name) ) identifier = str(uuid.uuid4()) @@ -188,13 +188,13 @@ def create_dataset( client=self, identifier=identifier, name=name, - internal_name=internal_name, + short_name=short_name, description=description, creator=creators ) dataset_ref = LinkReference.create( - client=self, name='datasets/' + internal_name + client=self, name='datasets/' + short_name ) dataset_ref.set_reference(path) @@ -214,7 +214,7 @@ def add_data_to_dataset( ): """Import the data into the data directory.""" warning_message = '' - dataset_path = self.path / self.datadir / dataset.internal_name + dataset_path = self.path / self.datadir / dataset.short_name destination = destination or Path('.') destination = self._resolve_path(dataset_path, destination) diff --git a/renku/core/models/datasets.py b/renku/core/models/datasets.py index 3402b0c08f..a7143041e8 100644 --- a/renku/core/models/datasets.py +++ b/renku/core/models/datasets.py @@ -277,8 +277,7 @@ class Dataset(Entity, CreatorMixin): EDITABLE_FIELDS = [ 'creator', 'date_published', 'description', 'in_language', 'keywords', - 'license', 'name', 'url', 'version', 'created', 'files', - 'internal_name' + 'license', 'name', 'url', 'version', 'created', 'files', 'short_name' ] _id = jsonld.ib(default=None, context='@id', kw_only=True) @@ -352,20 +351,22 @@ class Dataset(Entity, CreatorMixin): same_as = jsonld.ib(context='schema:sameAs', default=None, kw_only=True) - internal_name = jsonld.ib(default='', context=None, kw_only=True) + short_name = jsonld.ib( + default=None, context='schema:alternateName', kw_only=True + ) @created.default def _now(self): """Define default value for datetime fields.""" return datetime.datetime.now(datetime.timezone.utc) - @internal_name.validator - def internal_name_validator(self, attribute, value): - """Validate internal_name.""" - # internal_name might 
have been scaped and have '%' in it
+    @short_name.validator
+    def short_name_validator(self, attribute, value):
+        """Validate short_name."""
+        # short_name might have been escaped and have '%' in it
         if value and not is_dataset_name_valid(value, safe='%'):
             raise errors.ParameterError(
-                'Invalid "internal_name": {}'.format(value)
+                'Invalid "short_name": {}'.format(value)
             )
 
     @property
@@ -521,8 +522,8 @@ def __attrs_post_init__(self):
             # if with_dataset is used, the dataset is not committed yet
             pass
 
-        if not self.internal_name:
-            self.internal_name = generate_default_internal_name(
+        if not self.short_name:
+            self.short_name = generate_default_short_name(
                 self.name, self.version
             )
 
@@ -536,9 +537,9 @@ def is_dataset_name_valid(name, safe=''):
     )
 
 
-def generate_default_internal_name(dataset_name, dataset_version):
-    """Get dataset internal_name."""
-    # For compatibility with older versions use name as internal_name
+def generate_default_short_name(dataset_name, dataset_version):
+    """Get dataset short_name."""
+    # For compatibility with older versions use name as short_name
     # if it is valid; otherwise, use encoded name
     if is_dataset_name_valid(dataset_name):
         return dataset_name
diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py
index 8c638259a6..31d34d57af 100644
--- a/tests/cli/test_datasets.py
+++ b/tests/cli/test_datasets.py
@@ -60,17 +60,18 @@ def test_datasets_create_with_metadata(runner, client):
     """Test creating a dataset with metadata."""
     result = runner.invoke(
         cli, [
-            'dataset', 'create', 'my-dataset', '--description',
-            'some description here', '-c', 'John Doe <john.doe@mail.ch>', '-c',
+            'dataset', 'create', 'my-dataset', '--short-name', 'short-name',
+            '--description', 'some description here', '-c',
+            'John Doe <john.doe@mail.ch>', '-c',
             'John Smiths<john.smiths@mail.ch>'
         ]
     )
     assert 0 == result.exit_code
     assert 'OK' in result.output
 
-    dataset = client.load_dataset(name='my-dataset')
+    dataset = client.load_dataset(name='short-name')
     assert dataset.name == 'my-dataset'
-    assert 
dataset.internal_name == 'my-dataset' + assert dataset.short_name == 'short-name' assert dataset.description == 'some description here' assert 'John Doe' in [c.name for c in dataset.creator] assert 'john.doe@mail.ch' in [c.email for c in dataset.creator] @@ -78,6 +79,21 @@ def test_datasets_create_with_metadata(runner, client): assert 'john.smiths@mail.ch' in [c.email for c in dataset.creator] +def test_datasets_create_different_short_names(runner, client): + """Test creating datasets with same name but different short_name.""" + result = runner.invoke( + cli, ['dataset', 'create', 'dataset', '--short-name', 'dataset-1'] + ) + assert 0 == result.exit_code + assert 'OK' in result.output + + result = runner.invoke( + cli, ['dataset', 'create', 'dataset', '--short-name', 'dataset-2'] + ) + assert 0 == result.exit_code + assert 'OK' in result.output + + def test_datasets_create_with_same_name(runner, client): """Test creating datasets with same name.""" result = runner.invoke(cli, ['dataset', 'create', 'dataset']) @@ -90,23 +106,23 @@ def test_datasets_create_with_same_name(runner, client): @pytest.mark.parametrize( - 'name,internal_name', + 'name,short_name', [('v.a.l.i.d_internal-name', 'v.a.l.i.d_internal-name'), ('any name /@#$!', 'any_name'), ('name longer than 24 characters', 'name_longer_than_24_char'), ('semi valid-name', 'semi_validname'), ('dataset/new', 'datasetnew'), ('/dataset', 'dataset'), ('dataset/', 'dataset')] ) -def test_datasets_valid_name(runner, client, name, internal_name): +def test_datasets_valid_name(runner, client, name, short_name): """Test creating datasets with valid name.""" result = runner.invoke(cli, ['dataset', 'create', name]) assert 0 == result.exit_code - assert 'Use the name "{}"'.format(internal_name) in result.output + assert 'Use the name "{}"'.format(short_name) in result.output assert 'OK' in result.output - dataset = client.load_dataset(name=internal_name) + dataset = client.load_dataset(name=short_name) assert dataset.name 
== name - assert dataset.internal_name == internal_name + assert dataset.short_name == short_name def test_datasets_create_dirty(runner, project, client): @@ -722,16 +738,18 @@ def test_datasets_ls_files_correct_paths(tmpdir, runner, project): assert Path(record['path']).exists() -def test_datasets_ls_files(directory_tree, runner, project): +def test_datasets_ls_files_with_display_name(directory_tree, runner, project): """Test listing of data within dataset with include/exclude filters.""" # create a dataset - result = runner.invoke(cli, ['dataset', 'create', 'my-dataset']) + result = runner.invoke( + cli, ['dataset', 'create', 'my-dataset', '--short-name', 'short-name'] + ) assert 0 == result.exit_code # add data to dataset result = runner.invoke( cli, - ['dataset', 'add', 'my-dataset', directory_tree.strpath], + ['dataset', 'add', 'short-name', directory_tree.strpath], catch_exceptions=False, ) assert 0 == result.exit_code @@ -739,6 +757,11 @@ def test_datasets_ls_files(directory_tree, runner, project): # list files with dataset name result = runner.invoke(cli, ['dataset', 'ls-files', 'my-dataset']) assert 0 == result.exit_code + assert 'dir2/file2' not in result.output + + # list files with short_name + result = runner.invoke(cli, ['dataset', 'ls-files', 'short-name']) + assert 0 == result.exit_code assert 'dir2/file2' in result.output diff --git a/tests/core/commands/test_dataset.py b/tests/core/commands/test_dataset.py index cc01dd2f18..bcc4a5842b 100644 --- a/tests/core/commands/test_dataset.py +++ b/tests/core/commands/test_dataset.py @@ -182,6 +182,7 @@ def test_create_dataset_custom_message(project): """Test create dataset custom message.""" create_dataset( 'ds1', + short_name='', description='', creators=[], commit_message='my awesome dataset'