From dfeb1d42015e3cc98ce49d0c1f59fe6af139f4f0 Mon Sep 17 00:00:00 2001 From: Ralf Grubenmann Date: Wed, 8 Sep 2021 09:05:54 +0200 Subject: [PATCH] feat(core): add custom dataset metadata (#2310) --- renku/cli/dataset.py | 48 ++++++++++++++- renku/core/commands/dataset.py | 7 +++ renku/core/management/datasets.py | 61 +++++++++++++------- renku/core/metadata/database.py | 46 +++++++-------- renku/core/models/dataset.py | 16 ++++- renku/core/models/provenance/annotation.py | 12 ++++ renku/service/controllers/datasets_create.py | 1 + renku/service/controllers/datasets_edit.py | 1 + renku/service/serializers/datasets.py | 3 + tests/cli/test_datasets.py | 43 +++++++++++++- tests/service/views/test_dataset_views.py | 61 ++++++++++++++++++++ 11 files changed, 249 insertions(+), 50 deletions(-) diff --git a/renku/cli/dataset.py b/renku/cli/dataset.py index ab9a088074..a958f18641 100644 --- a/renku/cli/dataset.py +++ b/renku/cli/dataset.py @@ -45,6 +45,9 @@ | -k, --keyword | Dataset's keywords. Pass multiple times for a list | | | of keywords. | +-------------------+------------------------------------------------------+ +| -m, --metadata | Path to file containing custom JSON-LD metadata to | +| | be added to the dataset. | ++-------------------+------------------------------------------------------+ Editing a dataset's metadata @@ -398,6 +401,9 @@ only the dataset record. """ +import json +from pathlib import Path + import click import requests from rich.console import Console @@ -461,17 +467,36 @@ def list_dataset(format, columns): multiple=True, help="Creator's name, email, and affiliation. Accepted format is 'Forename Surname [affiliation]'.", ) +@click.option( + "-m", + "--metadata", + default=None, + type=click.Path(exists=True, dir_okay=False), + help="Custom metadata to be associated with the dataset.", +) @click.option("-k", "--keyword", default=None, multiple=True, type=click.STRING, help="List of keywords or tags.") -def create(name, title, description, creators, keyword): +def create(name, title, description, creators, metadata, keyword): """Create an empty dataset in the current repo.""" communicator = ClickCallback() creators = creators or () + custom_metadata = None + + if metadata: + custom_metadata = json.loads(Path(metadata).read_text()) + result = ( create_dataset() .with_communicator(communicator) .build() - .execute(name=name, title=title, description=description, creators=creators, keywords=keyword) + .execute( + name=name, + title=title, + description=description, + creators=creators, + keywords=keyword, + custom_metadata=custom_metadata, + ) ) new_dataset = result.output @@ -492,12 +517,24 @@ def create(name, title, description, creators, keyword): multiple=True, help="Creator's name, email, and affiliation. 
" "Accepted format is 'Forename Surname [affiliation]'.", ) +@click.option( + "-m", + "--metadata", + default=None, + type=click.Path(exists=True, dir_okay=False), + help="Custom metadata to be associated with the dataset.", +) @click.option("-k", "--keyword", default=None, multiple=True, type=click.STRING, help="List of keywords or tags.") -def edit(name, title, description, creators, keyword): +def edit(name, title, description, creators, metadata, keyword): """Edit dataset metadata.""" creators = creators or () keywords = keyword or () + custom_metadata = None + + if metadata: + custom_metadata = json.loads(Path(metadata).read_text()) + result = ( edit_dataset() .build() @@ -508,6 +545,7 @@ def edit(name, title, description, creators, keyword): creators=creators, keywords=keywords, skip_image_update=True, + custom_metadata=custom_metadata, ) ) @@ -551,6 +589,10 @@ def show(name): if ds["version"]: click.echo(click.style("Version: ", bold=True, fg="magenta") + ds.get("version", "")) + if ds["annotations"]: + click.echo(click.style("Annotations: ", bold=True, fg="magenta")) + click.echo(json.dumps(ds.get("annotations", ""), indent=2)) + click.echo(click.style("Title: ", bold=True, fg="magenta") + click.style(ds.get("title", ""), bold=True)) click.echo(click.style("Description: ", bold=True, fg="magenta")) diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py index 714fcd8e6f..2a3fc934b5 100644 --- a/renku/core/commands/dataset.py +++ b/renku/core/commands/dataset.py @@ -93,6 +93,7 @@ def create_dataset_helper( keywords=None, images=None, safe_image_paths=None, + custom_metadata=None, ): """Create a dataset in the repository.""" client = client_dispatcher.current_client @@ -110,6 +111,7 @@ def create_dataset_helper( keywords=keywords, images=images, safe_image_paths=safe_image_paths, + custom_metadata=custom_metadata, ) return dataset @@ -132,6 +134,7 @@ def _edit_dataset( images=None, skip_image_update=False, safe_image_paths=None, + custom_metadata=None, ): """Edit dataset metadata.""" client = client_dispatcher.current_client @@ -162,6 +165,10 @@ def _edit_dataset( if images_updated: updated["images"] = [{"content_url": i.content_url, "position": i.position} for i in dataset.images] + if custom_metadata: + client.update_dataset_custom_metadata(dataset, custom_metadata) + updated["custom_metadata"] = custom_metadata + if not updated: return [], no_email_warnings diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py index d00e43e868..7fa497c6ba 100644 --- a/renku/core/management/datasets.py +++ b/renku/core/management/datasets.py @@ -51,9 +51,16 @@ from renku.core.management.interface.database_dispatcher import IDatabaseDispatcher from renku.core.management.repository import RepositoryApiMixin from renku.core.metadata.immutable import DynamicProxy -from renku.core.models import dataset as new_datasets -from renku.core.models.dataset import get_dataset_data_dir, is_dataset_name_valid +from renku.core.models.dataset import ( + Dataset, + DatasetFile, + ImageObject, + RemoteEntity, + get_dataset_data_dir, + is_dataset_name_valid, +) from renku.core.models.provenance.agent import Person +from renku.core.models.provenance.annotation import Annotation from renku.core.models.refs import LinkReference from renku.core.utils import communication from renku.core.utils.git import ( @@ -98,13 +105,13 @@ def renku_pointers_path(self): return path @property - def datasets(self) -> Dict[str, new_datasets.Dataset]: + def datasets(self) -> Dict[str, 
Dataset]: """A map from datasets name to datasets.""" datasets_provenance = DatasetsProvenance() return {d.name: d for d in datasets_provenance.datasets} @staticmethod - def get_dataset(name, strict=False, immutable=False) -> Optional[new_datasets.Dataset]: + def get_dataset(name, strict=False, immutable=False) -> Optional[Dataset]: """Load dataset reference file.""" return get_dataset(name=name, strict=strict, immutable=immutable) @@ -151,6 +158,7 @@ def create_dataset( images=None, safe_image_paths=None, update_provenance=True, + custom_metadata=None, ): """Create a dataset.""" if not name: @@ -171,7 +179,12 @@ def create_dataset( keywords = keywords or () - dataset = new_datasets.Dataset( + annotations = None + + if custom_metadata: + annotations = [Annotation(id=Annotation.generate_id(), source="renku", body=custom_metadata)] + + dataset = Dataset( identifier=None, name=name, title=title, @@ -179,6 +192,7 @@ def create_dataset( creators=creators, keywords=keywords, project_id=self.project.id, + annotations=annotations, ) if images: @@ -192,7 +206,16 @@ def create_dataset( return dataset - def set_dataset_images(self, dataset: new_datasets.Dataset, images, safe_image_paths=None): + def update_dataset_custom_metadata(self, dataset: Dataset, custom_metadata: Dict): + """Update custom metadata on a dataset.""" + + existing_metadata = [a for a in dataset.annotations if a.source != "renku"] + + existing_metadata.append(Annotation(id=Annotation.generate_id(), body=custom_metadata, source="renku")) + + dataset.annotations = existing_metadata + + def set_dataset_images(self, dataset: Dataset, images, safe_image_paths=None): """Set the images on a dataset.""" safe_image_paths = safe_image_paths or [] @@ -228,10 +251,10 @@ def set_dataset_images(self, dataset: new_datasets.Dataset, images, safe_image_p # NOTE: absolute url if not img.get("mirror_locally", False): dataset.images.append( - new_datasets.ImageObject( + ImageObject( content_url=content_url, position=position, - id=new_datasets.ImageObject.generate_id(dataset, position), + id=ImageObject.generate_id(dataset, position), ) ) images_updated = True @@ -271,10 +294,10 @@ def set_dataset_images(self, dataset: new_datasets.Dataset, images, safe_image_p img_path = path dataset.images.append( - new_datasets.ImageObject( + ImageObject( content_url=str(img_path.relative_to(self.path)), position=position, - id=new_datasets.ImageObject.generate_id(dataset=dataset, position=position), + id=ImageObject.generate_id(dataset=dataset, position=position), ) ) images_updated = True @@ -450,7 +473,7 @@ def add_data_to_dataset( # Generate the DatasetFiles dataset_files = [] for data in files: - dataset_file = new_datasets.DatasetFile.from_path( + dataset_file = DatasetFile.from_path( client=self, path=data["path"], source=data["source"], based_on=data.get("based_on") ) dataset_files.append(dataset_file) @@ -647,7 +670,7 @@ def _add_from_git(self, url, sources, destination, ref, repository=None): operation = (src, dst, "move") checksum = get_object_hash(repo=remote_client.repo, revision="HEAD", path=path) - based_on = new_datasets.RemoteEntity(checksum=checksum, path=path, url=url) + based_on = RemoteEntity(checksum=checksum, path=path, url=url) results.append( { @@ -777,7 +800,7 @@ def move_files(self, files, to_dataset): datasets = [d.copy() for d in self.datasets.values()] if to_dataset: # NOTE: Use the same dataset object or otherwise a race happens if dataset is in both source and destination - to_dataset: new_datasets.Dataset = next(d for d in datasets 
if d.name == to_dataset) + to_dataset: Dataset = next(d for d in datasets if d.name == to_dataset) modified_datasets = {} progress_name = "Updating dataset metadata" @@ -787,7 +810,7 @@ def move_files(self, files, to_dataset): src = src.relative_to(self.path) dst = dst.relative_to(self.path) # NOTE: Files are moved at this point, so, we use can use dst - new_dataset_file = new_datasets.DatasetFile.from_path(self, dst) + new_dataset_file = DatasetFile.from_path(self, dst) for dataset in datasets: removed = dataset.unlink_file(src, missing_ok=True) if removed: @@ -856,7 +879,7 @@ def _update_datasets_metadata( modified_datasets = {} for file in updated_files: - new_file = new_datasets.DatasetFile.from_path( + new_file = DatasetFile.from_path( client=self, path=file.entity.path, based_on=file.based_on, source=file.source ) modified_datasets[file.dataset.name] = file.dataset @@ -926,9 +949,7 @@ def update_dataset_git_files(self, files: List[DynamicProxy], ref, delete=False) self._create_external_file(src.resolve(), dst) else: shutil.copy(src, dst) - file.based_on = new_datasets.RemoteEntity( - checksum=checksum, path=based_on.path, url=based_on.url - ) + file.based_on = RemoteEntity(checksum=checksum, path=based_on.path, url=based_on.url) updated_files.append(file) else: # File was removed or renamed @@ -1025,9 +1046,7 @@ def update_external_files(self, records: List[DynamicProxy]): for dataset in updated_datasets.values(): for file in dataset.files: if str(self.path / file.entity.path) in updated_files_paths: - new_file = new_datasets.DatasetFile.from_path( - client=self, path=file.entity.path, source=file.source - ) + new_file = DatasetFile.from_path(client=self, path=file.entity.path, source=file.source) dataset.add_or_update_files(new_file) datasets_provenance.add_or_update(dataset, creator=Person.from_client(self)) diff --git a/renku/core/metadata/database.py b/renku/core/metadata/database.py index aad1abc821..4376fd8d1d 100644 --- a/renku/core/metadata/database.py +++ b/renku/core/metadata/database.py @@ -598,10 +598,10 @@ def serialize(self, object: persistent.Persistent): is_dict = isinstance(data, dict) if not is_dict or (is_dict and not was_dict): - data = {"@value": data} + data = {"@renku_data_value": data} - data["@type"] = get_type_name(object) - data["@oid"] = object._p_oid + data["@renku_data_type"] = get_type_name(object) + data["@renku_oid"] = object._p_oid return data @@ -623,13 +623,13 @@ def _serialize_helper(self, object): # NOTE: Index objects are not stored as references and are included in their parent object (i.e. 
root) state = object.__getstate__() state = self._serialize_helper(state) - return {"@type": get_type_name(object), "@oid": object._p_oid, **state} + return {"@renku_data_type": get_type_name(object), "@renku_oid": object._p_oid, **state} elif isinstance(object, persistent.Persistent): if not object._p_oid: object._p_oid = Database.generate_oid(object) if object._p_state not in [GHOST, UPTODATE] or (object._p_state == UPTODATE and object._p_serial == NEW): self._database.register(object) - return {"@type": get_type_name(object), "@oid": object._p_oid, "@reference": True} + return {"@renku_data_type": get_type_name(object), "@renku_oid": object._p_oid, "@renku_reference": True} elif isinstance(object, datetime.datetime): value = object.isoformat() elif isinstance(object, tuple): @@ -637,18 +637,18 @@ def _serialize_helper(self, object): elif isinstance(object, (InterfaceClass)): # NOTE: Zope interfaces are weird, they're a class with type InterfaceClass, but need to be deserialized # as the class (without instantiation) - return {"@type": TYPE_TYPE, "@value": f"{object.__module__}.{object.__name__}"} + return {"@renku_data_type": TYPE_TYPE, "@renku_data_value": f"{object.__module__}.{object.__name__}"} elif isinstance(object, type): # NOTE: We're storing a type, not an instance - return {"@type": TYPE_TYPE, "@value": get_type_name(object)} + return {"@renku_data_type": TYPE_TYPE, "@renku_data_value": get_type_name(object)} elif isinstance(object, (FunctionType, BuiltinFunctionType)): name = object.__name__ module = getattr(object, "__module__", None) - return {"@type": FUNCTION_TYPE, "@value": f"{module}.{name}"} + return {"@renku_data_type": FUNCTION_TYPE, "@renku_data_value": f"{module}.{name}"} elif hasattr(object, "__getstate__"): if id(object) in self._serialization_cache: # NOTE: We already serialized this -> circular/repeat reference. - return {"@type": REFERENCE_TYPE, "@value": self._serialization_cache[id(object)]} + return {"@renku_data_type": REFERENCE_TYPE, "@renku_data_value": self._serialization_cache[id(object)]} # NOTE: The reference used for circular reference is just the position in the serialization cache, # as the order is deterministic. So the order in which objects are encoutered is their id for referencing. @@ -661,7 +661,7 @@ def _serialize_helper(self, object): else: if id(object) in self._serialization_cache: # NOTE: We already serialized this -> circular/repeat reference - return {"@type": REFERENCE_TYPE, "@value": self._serialization_cache[id(object)]} + return {"@renku_data_type": REFERENCE_TYPE, "@renku_data_value": self._serialization_cache[id(object)]} # NOTE: The reference used for circular reference is just the position in the serialization cache, # as the order is deterministic So the order in which objects are encoutered is their id for referencing. 
@@ -671,7 +671,7 @@ def _serialize_helper(self, object): value = {k: v for k, v in value.items() if not k.startswith("_v_")} value = self._serialize_helper(value) - return {"@type": get_type_name(object), "@value": value} + return {"@renku_data_type": get_type_name(object), "@renku_data_value": value} class ObjectReader: @@ -703,7 +703,7 @@ def set_ghost_state(self, object: persistent.Persistent, data: Dict): def deserialize(self, data): """Convert JSON to Persistent object.""" - oid = data["@oid"] + oid = data["@renku_oid"] self._deserialization_cache = [] @@ -724,36 +724,36 @@ def _deserialize_helper(self, data, create=True): else: assert isinstance(data, dict), f"Data must be a dict: '{type(data)}'" - if "@type" not in data: # NOTE: A normal dict value - assert "@oid" not in data + if "@renku_data_type" not in data: # NOTE: A normal dict value + assert "@renku_oid" not in data items = sorted(data.items(), key=lambda x: x[0]) for key, value in items: data[key] = self._deserialize_helper(value) return data - object_type = data.pop("@type") + object_type = data.pop("@renku_data_type") if object_type in (TYPE_TYPE, FUNCTION_TYPE): # NOTE: if we stored a type (not instance), return the type - return self._get_class(data["@value"]) + return self._get_class(data["@renku_data_value"]) elif object_type == REFERENCE_TYPE: # NOTE: we had a circular reference, we return the (not yet finalized) class here - return self._deserialization_cache[data["@value"]] + return self._deserialization_cache[data["@renku_data_value"]] cls = self._get_class(object_type) if issubclass(cls, datetime.datetime): assert create - data = data["@value"] + data = data["@renku_data_value"] return datetime.datetime.fromisoformat(data) elif issubclass(cls, tuple): - data = data["@value"] + data = data["@renku_data_value"] return tuple(self._deserialize_helper(value) for value in data) - oid: str = data.pop("@oid", None) + oid: str = data.pop("@renku_oid", None) if oid: assert isinstance(oid, str) - if "@reference" in data and data["@reference"]: # A reference + if "@renku_reference" in data and data["@renku_reference"]: # A reference assert create, f"Cannot deserialize a reference without creating an instance {data}" new_object = self._database.get_cached(oid) if new_object is not None: @@ -771,8 +771,8 @@ def _deserialize_helper(self, data, create=True): self.set_ghost_state(new_object, data) return new_object - if "@value" in data: - data = data["@value"] + if "@renku_data_value" in data: + data = data["@renku_data_value"] if not create: data = self._deserialize_helper(data) diff --git a/renku/core/models/dataset.py b/renku/core/models/dataset.py index ffb73887b1..f0f4846c7d 100644 --- a/renku/core/models/dataset.py +++ b/renku/core/models/dataset.py @@ -31,9 +31,10 @@ from renku.core import errors from renku.core.metadata.database import Persistent from renku.core.metadata.immutable import Immutable, Slots -from renku.core.models.calamus import DateTimeList, JsonLDSchema, Nested, Uri, fields, prov, renku, schema +from renku.core.models.calamus import DateTimeList, JsonLDSchema, Nested, Uri, fields, oa, prov, renku, schema from renku.core.models.entity import CollectionSchema, Entity, EntitySchema from renku.core.models.provenance.agent import Person, PersonSchema, SoftwareAgent +from renku.core.models.provenance.annotation import Annotation, AnnotationSchema from renku.core.utils.datetime8601 import fix_timezone, local_now, parse_date from renku.core.utils.git import get_path from renku.core.utils.urls import get_slug @@ 
-294,6 +295,7 @@ class Dataset(Persistent): def __init__( self, *, + annotations: List[Annotation] = None, creators: List[Person] = None, dataset_files: List[DatasetFile] = None, date_created: datetime = None, @@ -349,6 +351,7 @@ def __init__( self.same_as: Url = same_as self.title: str = title self.version: str = version + self.annotations: List[Annotation] = annotations or [] @classmethod def from_jsonld(cls, data, schema_class=None): @@ -399,6 +402,7 @@ def keywords_csv(self): def copy(self) -> "Dataset": """Return a clone of this dataset.""" return Dataset( + annotations=[a.copy() for a in self.annotations], creators=self.creators.copy(), dataset_files=[f.copy() for f in self.dataset_files], date_created=self.date_created, @@ -675,6 +679,7 @@ class Meta: model = Dataset unknown = EXCLUDE + annotations = Nested(oa.hasTarget, AnnotationSchema, reverse=True, many=True) creators = Nested(schema.creator, PersonSchema, many=True) date_created = fields.DateTime(schema.dateCreated, missing=None, format="iso", extra_formats=("%Y-%m-%d",)) date_removed = fields.DateTime(prov.invalidatedAtTime, missing=None, format="iso") @@ -706,6 +711,13 @@ class DatasetCreatorsJson(marshmallow.Schema): affiliation = marshmallow.fields.String() +class AnnotationJson(marshmallow.Schema): + """Schema for Annotations.""" + + source = marshmallow.fields.String() + body = marshmallow.fields.Dict() + + class DatasetDetailsJson(marshmallow.Schema): """Serialize a dataset to a response object.""" @@ -719,6 +731,8 @@ class DatasetDetailsJson(marshmallow.Schema): keywords = marshmallow.fields.List(marshmallow.fields.String()) identifier = marshmallow.fields.String() + annotations = marshmallow.fields.List(marshmallow.fields.Nested(AnnotationJson)) + class DatasetFileDetailsJson(marshmallow.Schema): """Serialize dataset files to a response object.""" diff --git a/renku/core/models/provenance/annotation.py b/renku/core/models/provenance/annotation.py index 6ba04c4f39..6a482ea5b2 100644 --- a/renku/core/models/provenance/annotation.py +++ b/renku/core/models/provenance/annotation.py @@ -17,6 +17,9 @@ # limitations under the License. 
"""Represent an annotation for a workflow.""" +import copy +from uuid import uuid4 + from marshmallow import EXCLUDE from renku.core.models.calamus import JsonLDSchema, dcterms, fields, oa @@ -30,6 +33,15 @@ def __init__(self, *, id: str, body=None, source=None): self.body = body self.source = source + def copy(self): + """Return a copy of this annotation.""" + return copy.copy(self) + + @staticmethod + def generate_id(): + """Generate an id for an annotation.""" + return f"/annotations/{uuid4().hex}" + class AnnotationSchema(JsonLDSchema): """Annotation schema.""" diff --git a/renku/service/controllers/datasets_create.py b/renku/service/controllers/datasets_create.py index 3c03ade2cd..01e883c5c3 100644 --- a/renku/service/controllers/datasets_create.py +++ b/renku/service/controllers/datasets_create.py @@ -62,6 +62,7 @@ def renku_op(self): description=self.ctx.get("description"), keywords=self.ctx.get("keywords"), images=self.ctx.get("images"), + custom_metadata=self.ctx.get("custom_metadata"), safe_image_paths=[user_cache_dir], ) ) diff --git a/renku/service/controllers/datasets_edit.py b/renku/service/controllers/datasets_edit.py index d7ae782897..a191e2bb1a 100644 --- a/renku/service/controllers/datasets_edit.py +++ b/renku/service/controllers/datasets_edit.py @@ -62,6 +62,7 @@ def renku_op(self): self.ctx.get("creators"), keywords=self.ctx.get("keywords"), images=self.ctx.get("images"), + custom_metadata=self.ctx.get("custom_metadata"), safe_image_paths=[user_cache_dir], ) ) diff --git a/renku/service/serializers/datasets.py b/renku/service/serializers/datasets.py index 18c4e19013..587ec9d6a2 100644 --- a/renku/service/serializers/datasets.py +++ b/renku/service/serializers/datasets.py @@ -50,6 +50,8 @@ class DatasetDetailsRequest(DatasetDetails): images = fields.List(fields.Nested(ImageObjectRequest)) + custom_metadata = fields.Dict() + class DatasetCreateRequest( AsyncSchema, DatasetDetailsRequest, DatasetRefSchema, LocalRepositorySchema, RemoteRepositorySchema, MigrateSchema @@ -202,6 +204,7 @@ class DatasetEditRequest( description = fields.String(default=None) creators = fields.List(fields.Nested(DatasetCreators)) keywords = fields.List(fields.String()) + custom_metadata = fields.Dict(default=None) class DatasetEditResponse(RenkuSyncSchema): diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py index e340058bd3..46866478da 100644 --- a/tests/cli/test_datasets.py +++ b/tests/cli/test_datasets.py @@ -59,6 +59,14 @@ def test_datasets_create_clean(runner, project, client, load_dataset_with_inject def test_dataset_show(runner, client, subdirectory): """Test creating a dataset with metadata.""" + metadata = { + "@id": "https://example.com/annotation1", + "@type": "https://schema.org/specialType", + "https://schema.org/specialProperty": "some_unique_value", + } + metadata_path = client.path / "metadata.json" + metadata_path.write_text(json.dumps(metadata)) + result = runner.invoke( cli, [ @@ -77,6 +85,8 @@ def test_dataset_show(runner, client, subdirectory): "keyword-1", "-k", "keyword-2", + "--metadata", + str(metadata_path), ], ) assert 0 == result.exit_code, format_result_exception(result) @@ -91,6 +101,10 @@ def test_dataset_show(runner, client, subdirectory): assert "Created: " in result.output assert "Name: my-dataset" in result.output assert "John Doe " in result.output + assert "some_unique_value" in result.output + assert "https://schema.org/specialProperty" in result.output + assert "https://example.com/annotation1" in result.output + assert 
"https://schema.org/specialType" in result.output assert "##" not in result.output @@ -806,7 +820,6 @@ def test_datasets_ls_files_tabular_patterns(runner, project, directory_tree): def test_datasets_ls_files_tabular_creators(runner, client, directory_tree, load_dataset_with_injection): """Test listing of data within dataset with creators filters.""" assert 0 == runner.invoke(cli, ["dataset", "add", "my-dataset", "-c", str(directory_tree)]).exit_code - creator = load_dataset_with_injection("my-dataset", client).creators[0].name assert creator is not None @@ -994,7 +1007,18 @@ def test_dataset_edit(runner, client, project, dirty, subdirectory, load_dataset if dirty: (client.path / "README.md").write_text("Make repo dirty.") - result = runner.invoke(cli, ["dataset", "create", "dataset", "-t", "original title", "-k", "keyword-1"]) + metadata = { + "@id": "https://example.com/annotation1", + "@type": "https://schema.org/specialType", + "https://schema.org/specialProperty": "some_unique_value", + } + metadata_path = client.path / "metadata.json" + metadata_path.write_text(json.dumps(metadata)) + + result = runner.invoke( + cli, + ["dataset", "create", "dataset", "-t", "original title", "-k", "keyword-1", "--metadata", str(metadata_path)], + ) assert 0 == result.exit_code, format_result_exception(result) creator1 = "Forename1 Surname1 [Affiliation 1]" @@ -1025,11 +1049,26 @@ def test_dataset_edit(runner, client, project, dirty, subdirectory, load_dataset assert 0 == result.exit_code, format_result_exception(result) assert "Successfully updated: keywords." in result.output + new_metadata = { + "@id": "https://example.com/annotation1", + "@type": "https://schema.org/specialType", + "https://schema.org/specialProperty": "some_other_unique_value", + } + metadata_path.write_text(json.dumps(new_metadata)) + + result = runner.invoke( + cli, ["dataset", "edit", "dataset", "--metadata", str(metadata_path)], catch_exceptions=False + ) + assert 0 == result.exit_code, format_result_exception(result) + assert "Successfully updated: custom_metadata." 
in result.output + dataset = load_dataset_with_injection("dataset", client) assert " new description " == dataset.description assert "new title" == dataset.title assert {creator1, creator2}.issubset({c.full_identity for c in dataset.creators}) assert {"keyword-2", "keyword-3"} == set(dataset.keywords) + assert 1 == len(dataset.annotations) + assert new_metadata == dataset.annotations[0].body @pytest.mark.parametrize("dirty", [False, True]) diff --git a/tests/service/views/test_dataset_views.py b/tests/service/views/test_dataset_views.py index 7ec3c985b5..9f14b5e843 100644 --- a/tests/service/views/test_dataset_views.py +++ b/tests/service/views/test_dataset_views.py @@ -298,6 +298,52 @@ def test_create_dataset_with_images(svc_client_with_repo): assert img2["content_url"].endswith("/2.png") +@pytest.mark.service +@pytest.mark.integration +@retry_failed +def test_create_dataset_with_custom_metadata(svc_client_with_repo): + """Create a new dataset with metadata.""" + svc_client, headers, project_id, _ = svc_client_with_repo + + payload = { + "project_id": project_id, + "name": uuid.uuid4().hex, + "title": "my little dataset", + "creators": [{"name": "name123", "email": "name123@ethz.ch", "affiliation": "ethz"}], + "description": "my little description", + "custom_metadata": { + "@id": "http://example.com/metadata12", + "@type": "https://schema.org/myType", + "https://schema.org/property1": 1, + "https://schema.org/property2": "test", + }, + } + + response = svc_client.post("/datasets.create", data=json.dumps(payload), headers=headers) + + assert response + assert_rpc_response(response) + + assert {"name", "remote_branch"} == set(response.json["result"].keys()) + assert payload["name"] == response.json["result"]["name"] + + params = { + "project_id": project_id, + } + response = svc_client.get("/datasets.list", query_string=params, headers=headers) + + assert response + assert_rpc_response(response) + + ds = next(ds for ds in response.json["result"]["datasets"] if ds["name"] == payload["name"]) + + assert payload["title"] == ds["title"] + assert payload["name"] == ds["name"] + assert payload["description"] == ds["description"] + assert payload["creators"] == ds["creators"] + assert payload["custom_metadata"] == ds["annotations"][0]["body"] + + @pytest.mark.parametrize( "img_url", ["https://raw.githubusercontent.com/SwissDataScienceCenter/calamus/master/docs/reed.png", "https://bit.ly/2ZoutNn"], @@ -628,6 +674,7 @@ def test_list_datasets_view(svc_client_with_repo): "title", "creators", "keywords", + "annotations", } == set(response.json["result"]["datasets"][0].keys()) @@ -700,6 +747,7 @@ def test_list_datasets_view_remote(svc_client_with_repo, it_remote_repo_url): "title", "creators", "keywords", + "annotations", } == set(response.json["result"]["datasets"][0].keys()) @@ -830,6 +878,7 @@ def test_create_and_list_datasets_view(svc_client_with_repo): "description", "created_at", "keywords", + "annotations", } == set(response.json["result"]["datasets"][0].keys()) assert payload["name"] in [ds["name"] for ds in response.json["result"]["datasets"]] @@ -1264,6 +1313,12 @@ def test_edit_datasets_view(svc_client_with_repo): "title": "my new title", "keywords": ["keyword1"], "creators": [{"name": "name123", "email": "name123@ethz.ch", "affiliation": "ethz"}], + "custom_metadata": { + "@id": "http://example.com/metadata12", + "@type": "https://schema.org/myType", + "https://schema.org/property1": 1, + "https://schema.org/property2": "test", + }, } response = svc_client.post("/datasets.edit", 
data=json.dumps(edit_payload), headers=headers) @@ -1275,6 +1330,12 @@ def test_edit_datasets_view(svc_client_with_repo): "title": "my new title", "keywords": ["keyword1"], "creators": [{"name": "name123", "email": "name123@ethz.ch", "affiliation": "ethz"}], + "custom_metadata": { + "@id": "http://example.com/metadata12", + "@type": "https://schema.org/myType", + "https://schema.org/property1": 1, + "https://schema.org/property2": "test", + }, } == response.json["result"]["edited"]
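
Usage note (editorial addendum, not part of the patch): on the CLI, the new flag is exercised as `renku dataset create my-dataset --metadata metadata.json` or `renku dataset edit my-dataset --metadata metadata.json`, where `metadata.json` holds the JSON-LD object to attach; `renku dataset show my-dataset` then prints it under "Annotations". Below is a minimal sketch of the equivalent core-service call, modeled directly on `test_create_dataset_with_custom_metadata` above. The deployment URL, access token, and project id are placeholders, not values from this patch, and service-level error handling is elided.

```python
import json
import uuid

import requests

SERVICE_URL = "https://<renku-deployment>/api/renku"  # placeholder deployment URL
headers = {
    "Authorization": "Bearer <token>",  # placeholder access token
    "Content-Type": "application/json",
}

payload = {
    "project_id": "<project-id>",  # placeholder project identifier
    "name": uuid.uuid4().hex,
    "title": "my little dataset",
    "creators": [{"name": "name123", "email": "name123@ethz.ch", "affiliation": "ethz"}],
    "description": "my little description",
    # Arbitrary JSON-LD, stored verbatim as the body of a renku-sourced Annotation.
    "custom_metadata": {
        "@id": "http://example.com/metadata12",
        "@type": "https://schema.org/myType",
        "https://schema.org/property1": 1,
        "https://schema.org/property2": "test",
    },
}

response = requests.post(f"{SERVICE_URL}/datasets.create", data=json.dumps(payload), headers=headers)
response.raise_for_status()
print(response.json()["result"]["name"])  # name of the created dataset
```

`/datasets.edit` accepts the same `custom_metadata` dict; per `update_dataset_custom_metadata` in the diff, the new renku-sourced annotation replaces any existing annotation with `source == "renku"`, while annotations from other sources are preserved.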