feat(core): add custom dataset metadata (#2310)
Panaetius committed Sep 8, 2021
1 parent 3ab3ac1 commit dfeb1d4
Showing 11 changed files with 249 additions and 50 deletions.
48 changes: 45 additions & 3 deletions renku/cli/dataset.py
@@ -45,6 +45,9 @@
| -k, --keyword | Dataset's keywords. Pass multiple times for a list |
| | of keywords. |
+-------------------+------------------------------------------------------+
| -m, --metadata | Path to file containing custom JSON-LD metadata to |
| | be added to the dataset. |
+-------------------+------------------------------------------------------+
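The new option expects a path to a file containing plain JSON-LD; the CLI reads it with json.loads, so any valid JSON document works. A minimal sketch of producing such a file, assuming a hypothetical metadata.json that uses schema.org terms (the commit stores the payload verbatim as the annotation body, so the vocabulary is up to the user):

import json
from pathlib import Path

# Hypothetical JSON-LD payload; stored as-is in the dataset's annotations.
custom = {
    "@context": "https://schema.org",
    "@type": "Dataset",
    "https://schema.org/measurementTechnique": "mass spectrometry",
}
Path("metadata.json").write_text(json.dumps(custom, indent=2))

The file is then passed at creation time, e.g. renku dataset create my-dataset --metadata metadata.json.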
Editing a dataset's metadata
@@ -398,6 +401,9 @@
only the dataset record.
"""

import json
from pathlib import Path

import click
import requests
from rich.console import Console
@@ -461,17 +467,36 @@ def list_dataset(format, columns):
multiple=True,
help="Creator's name, email, and affiliation. Accepted format is 'Forename Surname <email> [affiliation]'.",
)
@click.option(
"-m",
"--metadata",
default=None,
type=click.Path(exists=True, dir_okay=False),
help="Custom metadata to be associated with the dataset.",
)
@click.option("-k", "--keyword", default=None, multiple=True, type=click.STRING, help="List of keywords or tags.")
-def create(name, title, description, creators, keyword):
+def create(name, title, description, creators, metadata, keyword):
"""Create an empty dataset in the current repo."""
communicator = ClickCallback()
creators = creators or ()

custom_metadata = None

if metadata:
custom_metadata = json.loads(Path(metadata).read_text())

result = (
create_dataset()
.with_communicator(communicator)
.build()
-.execute(name=name, title=title, description=description, creators=creators, keywords=keyword)
+.execute(
+name=name,
+title=title,
+description=description,
+creators=creators,
+keywords=keyword,
+custom_metadata=custom_metadata,
+)
)

new_dataset = result.output
@@ -492,12 +517,24 @@ def create(name, title, description, creators, keyword):
multiple=True,
help="Creator's name, email, and affiliation. " "Accepted format is 'Forename Surname <email> [affiliation]'.",
)
@click.option(
"-m",
"--metadata",
default=None,
type=click.Path(exists=True, dir_okay=False),
help="Custom metadata to be associated with the dataset.",
)
@click.option("-k", "--keyword", default=None, multiple=True, type=click.STRING, help="List of keywords or tags.")
-def edit(name, title, description, creators, keyword):
+def edit(name, title, description, creators, metadata, keyword):
"""Edit dataset metadata."""
creators = creators or ()
keywords = keyword or ()

custom_metadata = None

if metadata:
custom_metadata = json.loads(Path(metadata).read_text())

result = (
edit_dataset()
.build()
@@ -508,6 +545,7 @@ def edit(name, title, description, creators, keyword):
creators=creators,
keywords=keywords,
skip_image_update=True,
custom_metadata=custom_metadata,
)
)
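On edit, the same file format applies (e.g. renku dataset edit my-dataset -m metadata.json); the parsed JSON is handed down as custom_metadata and, via update_dataset_custom_metadata in renku/core/management/datasets.py below, replaces any custom metadata renku added previously.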

@@ -551,6 +589,10 @@ def show(name):
if ds["version"]:
click.echo(click.style("Version: ", bold=True, fg="magenta") + ds.get("version", ""))

if ds["annotations"]:
click.echo(click.style("Annotations: ", bold=True, fg="magenta"))
click.echo(json.dumps(ds.get("annotations", ""), indent=2))

click.echo(click.style("Title: ", bold=True, fg="magenta") + click.style(ds.get("title", ""), bold=True))

click.echo(click.style("Description: ", bold=True, fg="magenta"))
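Both create and edit parse the file with json.loads(Path(metadata).read_text()) and do not guard the call, so a malformed file would surface as an unhandled json.JSONDecodeError unless caught further up the stack. A small pre-flight check, as a sketch rather than part of this commit:

import json
from pathlib import Path

def load_custom_metadata(path: str) -> dict:
    """Parse a JSON-LD metadata file, failing with a readable message."""
    try:
        return json.loads(Path(path).read_text())
    except json.JSONDecodeError as exc:
        raise SystemExit(f"{path} is not valid JSON: {exc}")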
7 changes: 7 additions & 0 deletions renku/core/commands/dataset.py
@@ -93,6 +93,7 @@ def create_dataset_helper(
keywords=None,
images=None,
safe_image_paths=None,
custom_metadata=None,
):
"""Create a dataset in the repository."""
client = client_dispatcher.current_client
@@ -110,6 +111,7 @@
keywords=keywords,
images=images,
safe_image_paths=safe_image_paths,
custom_metadata=custom_metadata,
)

return dataset
@@ -132,6 +134,7 @@ def _edit_dataset(
images=None,
skip_image_update=False,
safe_image_paths=None,
custom_metadata=None,
):
"""Edit dataset metadata."""
client = client_dispatcher.current_client
@@ -162,6 +165,10 @@
if images_updated:
updated["images"] = [{"content_url": i.content_url, "position": i.position} for i in dataset.images]

if custom_metadata:
client.update_dataset_custom_metadata(dataset, custom_metadata)
updated["custom_metadata"] = custom_metadata

if not updated:
return [], no_email_warnings

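The helper threads custom_metadata through to the client layer; the same path can be driven from Python via the command builder. A sketch, assuming create_dataset is importable from renku.core.commands.dataset (its import in the CLI module lies outside the hunks shown):

from renku.core.commands.dataset import create_dataset  # assumed import path

custom_metadata = {"@context": "https://schema.org", "@type": "Dataset"}

result = (
    create_dataset()
    .build()
    .execute(
        name="my-dataset",
        title="My dataset",
        description="",
        creators=(),
        keywords=(),
        custom_metadata=custom_metadata,
    )
)
new_dataset = result.output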
61 changes: 40 additions & 21 deletions renku/core/management/datasets.py
@@ -51,9 +51,16 @@
from renku.core.management.interface.database_dispatcher import IDatabaseDispatcher
from renku.core.management.repository import RepositoryApiMixin
from renku.core.metadata.immutable import DynamicProxy
-from renku.core.models import dataset as new_datasets
-from renku.core.models.dataset import get_dataset_data_dir, is_dataset_name_valid
+from renku.core.models.dataset import (
+Dataset,
+DatasetFile,
+ImageObject,
+RemoteEntity,
+get_dataset_data_dir,
+is_dataset_name_valid,
+)
from renku.core.models.provenance.agent import Person
from renku.core.models.provenance.annotation import Annotation
from renku.core.models.refs import LinkReference
from renku.core.utils import communication
from renku.core.utils.git import (
@@ -98,13 +105,13 @@ def renku_pointers_path(self):
return path

@property
-def datasets(self) -> Dict[str, new_datasets.Dataset]:
+def datasets(self) -> Dict[str, Dataset]:
"""A map from datasets name to datasets."""
datasets_provenance = DatasetsProvenance()
return {d.name: d for d in datasets_provenance.datasets}

@staticmethod
-def get_dataset(name, strict=False, immutable=False) -> Optional[new_datasets.Dataset]:
+def get_dataset(name, strict=False, immutable=False) -> Optional[Dataset]:
"""Load dataset reference file."""
return get_dataset(name=name, strict=strict, immutable=immutable)

@@ -151,6 +158,7 @@ def create_dataset(
images=None,
safe_image_paths=None,
update_provenance=True,
custom_metadata=None,
):
"""Create a dataset."""
if not name:
@@ -171,14 +179,20 @@

keywords = keywords or ()

-dataset = new_datasets.Dataset(
+annotations = None
+
+if custom_metadata:
+annotations = [Annotation(id=Annotation.generate_id(), source="renku", body=custom_metadata)]
+
+dataset = Dataset(
identifier=None,
name=name,
title=title,
description=description,
creators=creators,
keywords=keywords,
project_id=self.project.id,
annotations=annotations,
)

if images:
@@ -192,7 +206,16 @@

return dataset

-def set_dataset_images(self, dataset: new_datasets.Dataset, images, safe_image_paths=None):
+def update_dataset_custom_metadata(self, dataset: Dataset, custom_metadata: Dict):
+"""Update custom metadata on a dataset."""
+
+existing_metadata = [a for a in dataset.annotations if a.source != "renku"]
+
+existing_metadata.append(Annotation(id=Annotation.generate_id(), body=custom_metadata, source="renku"))
+
+dataset.annotations = existing_metadata
+
+def set_dataset_images(self, dataset: Dataset, images, safe_image_paths=None):
"""Set the images on a dataset."""
safe_image_paths = safe_image_paths or []

@@ -228,10 +251,10 @@ def set_dataset_images(self, dataset: new_datasets.Dataset, images, safe_image_p
# NOTE: absolute url
if not img.get("mirror_locally", False):
dataset.images.append(
-new_datasets.ImageObject(
+ImageObject(
content_url=content_url,
position=position,
-id=new_datasets.ImageObject.generate_id(dataset, position),
+id=ImageObject.generate_id(dataset, position),
)
)
images_updated = True
@@ -271,10 +294,10 @@ def set_dataset_images(self, dataset: new_datasets.Dataset, images, safe_image_p
img_path = path

dataset.images.append(
-new_datasets.ImageObject(
+ImageObject(
content_url=str(img_path.relative_to(self.path)),
position=position,
-id=new_datasets.ImageObject.generate_id(dataset=dataset, position=position),
+id=ImageObject.generate_id(dataset=dataset, position=position),
)
)
images_updated = True
@@ -450,7 +473,7 @@ def add_data_to_dataset(
# Generate the DatasetFiles
dataset_files = []
for data in files:
-dataset_file = new_datasets.DatasetFile.from_path(
+dataset_file = DatasetFile.from_path(
client=self, path=data["path"], source=data["source"], based_on=data.get("based_on")
)
dataset_files.append(dataset_file)
@@ -647,7 +670,7 @@ def _add_from_git(self, url, sources, destination, ref, repository=None):
operation = (src, dst, "move")

checksum = get_object_hash(repo=remote_client.repo, revision="HEAD", path=path)
-based_on = new_datasets.RemoteEntity(checksum=checksum, path=path, url=url)
+based_on = RemoteEntity(checksum=checksum, path=path, url=url)

results.append(
{
@@ -777,7 +800,7 @@ def move_files(self, files, to_dataset):
datasets = [d.copy() for d in self.datasets.values()]
if to_dataset:
# NOTE: Use the same dataset object or otherwise a race happens if dataset is in both source and destination
-to_dataset: new_datasets.Dataset = next(d for d in datasets if d.name == to_dataset)
+to_dataset: Dataset = next(d for d in datasets if d.name == to_dataset)
modified_datasets = {}

progress_name = "Updating dataset metadata"
@@ -787,7 +810,7 @@
src = src.relative_to(self.path)
dst = dst.relative_to(self.path)
# NOTE: Files are moved at this point, so, we use can use dst
-new_dataset_file = new_datasets.DatasetFile.from_path(self, dst)
+new_dataset_file = DatasetFile.from_path(self, dst)
for dataset in datasets:
removed = dataset.unlink_file(src, missing_ok=True)
if removed:
@@ -856,7 +879,7 @@ def _update_datasets_metadata(
modified_datasets = {}

for file in updated_files:
-new_file = new_datasets.DatasetFile.from_path(
+new_file = DatasetFile.from_path(
client=self, path=file.entity.path, based_on=file.based_on, source=file.source
)
modified_datasets[file.dataset.name] = file.dataset
@@ -926,9 +949,7 @@ def update_dataset_git_files(self, files: List[DynamicProxy], ref, delete=False)
self._create_external_file(src.resolve(), dst)
else:
shutil.copy(src, dst)
-file.based_on = new_datasets.RemoteEntity(
-checksum=checksum, path=based_on.path, url=based_on.url
-)
+file.based_on = RemoteEntity(checksum=checksum, path=based_on.path, url=based_on.url)
updated_files.append(file)
else:
# File was removed or renamed
@@ -1025,9 +1046,7 @@ def update_external_files(self, records: List[DynamicProxy]):
for dataset in updated_datasets.values():
for file in dataset.files:
if str(self.path / file.entity.path) in updated_files_paths:
-new_file = new_datasets.DatasetFile.from_path(
-client=self, path=file.entity.path, source=file.source
-)
+new_file = DatasetFile.from_path(client=self, path=file.entity.path, source=file.source)
dataset.add_or_update_files(new_file)

datasets_provenance.add_or_update(dataset, creator=Person.from_client(self))
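update_dataset_custom_metadata keeps annotations that other tools attached to the dataset and only swaps out the renku-sourced one. A standalone sketch of that replacement rule, with a simplified stand-in for the Annotation model:

from dataclasses import dataclass
from typing import Any, List
from uuid import uuid4

@dataclass
class Annotation:
    """Simplified stand-in for renku.core.models.provenance.annotation.Annotation."""
    id: str
    source: str
    body: Any

def replace_renku_metadata(annotations: List[Annotation], body: dict) -> List[Annotation]:
    # Keep annotations from other sources; drop and re-add the renku one.
    kept = [a for a in annotations if a.source != "renku"]
    kept.append(Annotation(id=f"_:annotation/{uuid4()}", source="renku", body=body))
    return kept

existing = [
    Annotation(id="_:1", source="external-tool", body={"keep": "me"}),
    Annotation(id="_:2", source="renku", body={"old": "metadata"}),
]
updated = replace_renku_metadata(existing, {"@type": "Dataset"})
assert [a.source for a in updated] == ["external-tool", "renku"]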
