feat(svc): add support for adding images to datasets (#1850)
Panaetius committed Feb 8, 2021
1 parent 1e35958 commit c3caafd
Showing 16 changed files with 739 additions and 28 deletions.
15 changes: 14 additions & 1 deletion conftest.py
@@ -983,8 +983,16 @@ def integration_repo(headers, project_id, url_components):

with chdir(integration_repo_path(headers, project_id, url_components)):
repo = Repo(".")
repo.heads.master.checkout()

yield repo

if integration_repo_path(headers, project_id, url_components).exists():
repo.git.reset("--hard")
repo.heads.master.checkout()
repo.git.reset("--hard")
repo.git.clean("-xdf")


@pytest.fixture(scope="module")
def identity_headers():
@@ -1062,7 +1070,12 @@ def svc_client_setup(integration_lifecycle):
current = repo.create_head(new_branch)
current.checkout()

yield svc_client, deepcopy(headers), project_id, url_components

if integration_repo_path(headers, project_id, url_components).exists():
# NOTE: Some tests delete the repo
repo.git.checkout("master")
repo.git.branch("-D", current)


@pytest.fixture
11 changes: 9 additions & 2 deletions renku/cli/dataset.py
@@ -484,7 +484,14 @@ def edit(name, title, description, creators, keyword):
result = (
edit_dataset()
.build()
.execute(name=name, title=title, description=description, creators=creators, keywords=keywords,)
.execute(
name=name,
title=title,
description=description,
creators=creators,
keywords=keywords,
skip_image_update=True,
)
)

updated, no_email_warnings = result.output
@@ -498,7 +505,7 @@ def edit(name, title, description, creators, keyword):
)
)
else:
click.echo("Successfully updated: {}.".format(", ".join(updated)))
click.echo("Successfully updated: {}.".format(", ".join(updated.keys())))
if no_email_warnings:
click.echo(ClickCallback.WARNING + "No email or wrong format for: " + ", ".join(no_email_warnings))
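
The edit command now reports updated fields as a dict (hence updated.keys() above) and passes skip_image_update=True, since the CLI exposes no image options yet. A minimal sketch of driving the same command programmatically, assuming edit_dataset is the command builder exported by renku.core.commands.dataset as the wiring above suggests; the dataset name and field values are illustrative:

from renku.core.commands.dataset import edit_dataset

# Illustrative call, not part of this commit; mirrors the CLI wiring above.
result = (
    edit_dataset()
    .build()
    .execute(
        name="my-dataset",  # hypothetical dataset name
        title="A new title",
        description="",
        creators=[],
        keywords=None,
        skip_image_update=True,  # leave dataset images untouched
    )
)
updated, no_email_warnings = result.output
print(", ".join(updated.keys()))  # e.g. "title"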

55 changes: 46 additions & 9 deletions renku/core/commands/dataset.py
@@ -65,14 +65,22 @@ def list_datasets():
return Command().command(_list_datasets).lock_dataset()


def _create_dataset(client, name, title=None, description="", creators=None, keywords=None):
def _create_dataset(
    client, name, title=None, description="", creators=None, keywords=None, images=None, safe_image_paths=None
):
    safe_image_paths = safe_image_paths or []
if not creators:
creators = [Person.from_git(client.repo)]
else:
creators, _ = _construct_creators(creators)

dataset, _, _ = client.create_dataset(
name=name, title=title, description=description, creators=creators, keywords=keywords
name=name,
title=title,
description=description,
creators=creators,
keywords=keywords,
images=images,
safe_image_paths=safe_image_paths,
)

client.update_datasets_provenance(dataset)
@@ -86,19 +94,40 @@ def create_dataset():
return command.require_migration().with_commit(commit_only=DATASET_METADATA_PATHS)


def _edit_dataset(client, name, title, description, creators, keywords=None):
def _edit_dataset(
    client, name, title, description, creators, keywords=None, images=None, skip_image_update=False, safe_image_paths=None
):
    """Edit dataset metadata."""
    safe_image_paths = safe_image_paths or []
creators, no_email_warnings = _construct_creators(creators, ignore_email=True)
creator_objs, no_email_warnings = _construct_creators(creators, ignore_email=True)
title = title.strip() if isinstance(title, str) else ""

possible_updates = {"creators": creators, "description": description, "keywords": keywords, "title": title}
updated = [k for k, v in possible_updates.items() if v]
possible_updates = {
"creators": creators,
"description": description,
"keywords": keywords,
"title": title,
}

dataset = client.load_dataset(name=name)

updated = {k: v for k, v in possible_updates.items() if v}

if updated:
dataset.update_metadata(creators=creator_objs, description=description, keywords=keywords, title=title)

if skip_image_update:
images_updated = False
else:
safe_image_paths.append(client.path)
images_updated = client.set_dataset_images(dataset, images, safe_image_paths)

if images_updated:
updated["images"] = [{"content_url": i.content_url, "position": i.position} for i in dataset.images]

if not updated:
return {}, no_email_warnings

with client.with_dataset(name=name) as dataset:
dataset.update_metadata(creators=creators, description=description, keywords=keywords, title=title)
dataset.to_yaml()

client.update_datasets_provenance(dataset)

@@ -495,7 +524,11 @@ def _import_dataset(client, uri, name="", extract=False, yes=False, previous_dat
if not dataset.data_dir:
raise OperationError(f"Data directory for dataset must be set: {dataset.name}")

sources = [f"{dataset.data_dir}/**"]
sources = []

if record.datadir_exists:
sources = [f"{dataset.data_dir}/**"]

for file_ in dataset.files:
try:
Path(file_.path).relative_to(dataset.data_dir)
@@ -514,6 +547,10 @@
if previous_dataset:
_update_previous_dataset(client, dataset, previous_dataset, new_files, delete)

if provider.supports_images:
with client.with_dataset(name=name):
record.import_images(client, dataset)
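
Image import is gated on a new provider capability flag. A self-contained sketch of the pattern, with illustrative classes rather than renku's real ones:

import abc


class Provider(abc.ABC):
    """Base provider: no image support by default."""

    @property
    def supports_images(self):
        return False


class RenkuLikeProvider(Provider):
    """A provider that opts in, as RenkuProvider does."""

    @property
    def supports_images(self):
        return True


def finish_import(provider):
    if provider.supports_images:
        print("importing images")  # the real code calls record.import_images(client, dataset)
    else:
        print("provider has no image support")


finish_import(RenkuLikeProvider())  # prints "importing images"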


def import_dataset():
"""Create a command for importing datasets."""
5 changes: 5 additions & 0 deletions renku/core/commands/providers/api.py
@@ -45,6 +45,11 @@ def is_git_based(self):
"""True if provider is a git repository."""
return False

@property
def supports_images(self):
"""True if provider is a git repository."""
return False


class ExporterApi(abc.ABC):
"""Interface defining exporter methods."""
30 changes: 30 additions & 0 deletions renku/core/commands/providers/renku.py
@@ -18,6 +18,7 @@
"""Renku dataset provider."""
import os
import re
import shutil
import urllib
from pathlib import Path
from subprocess import PIPE, SubprocessError, run
@@ -114,6 +115,11 @@ def is_git_based(self):
"""True if provider is git-based."""
return True

@property
def supports_images(self):
"""True if provider is a git repository."""
return True

def _migrate_project(self, client):
if is_project_unsupported(client):
return
@@ -194,6 +200,7 @@ def __init__(self, dataset, project_url, remote_client, uri):
self._dataset = dataset
self._project_url = project_url
self._uri = uri
self.remote_client = remote_client

for file_ in dataset.files:
file_.checksum = remote_client.repo.git.hash_object(file_.path)
@@ -246,6 +253,23 @@ def as_dataset(self, client):
self._dataset.same_as = Url(url_id=remove_credentials(same_as))
return self._dataset

def import_images(self, client, dataset):
"""Add images from remote dataset."""
if not self._dataset.images:
return

for img in self._dataset.images:
    if img.is_absolute:
        # NOTE: absolute URLs point at external resources, so there is nothing to copy
        continue

remote_image_path = self.remote_client.path / img.content_url
local_image_path = client.path / img.content_url
local_image_path.parent.mkdir(exist_ok=True, parents=True)

shutil.copy(remote_image_path, local_image_path)

dataset.images = self._dataset.images

def is_last_version(self, uri):
"""Check if dataset is at last possible version."""
return True
@@ -270,3 +294,9 @@ def version(self):
def latest_uri(self):
"""Get uri of latest version."""
return self._dataset._id

@property
def datadir_exists(self):
"""Whether the dataset datadir exists (might be missing in git if empty)."""

return (self.remote_client.path / self._dataset.data_dir).exists()
4 changes: 4 additions & 0 deletions renku/core/errors.py
@@ -458,3 +458,7 @@ class CommandFinalizedError(RenkuException):

class RenkuSaveError(RenkuException):
"""Raised when renku save doesn't work."""


class DatasetImageError(RenkuException):
"""Raised when a local dataset image is not accessible."""
97 changes: 95 additions & 2 deletions renku/core/management/datasets.py
@@ -30,7 +30,7 @@
from pathlib import Path
from subprocess import PIPE, SubprocessError, run
from urllib import error, parse
from urllib.parse import ParseResult
from urllib.parse import ParseResult, urlparse

import attr
import patoolib
@@ -46,6 +46,7 @@
Dataset,
DatasetFile,
DatasetTag,
ImageObject,
generate_dataset_file_url,
is_dataset_name_valid,
)
@@ -70,6 +71,9 @@ class DatasetsApiMixin(object):
CACHE = "cache"
"""Directory to cache transient data."""

DATASET_IMAGES = "dataset_images"
"""Directory for dataset images."""

DATASETS_PROVENANCE = "dataset.json"
"""File for storing datasets' provenance."""

@@ -84,6 +88,14 @@ def renku_datasets_path(self):

return self.path / self.renku_home / self.DATASETS

@property
def renku_dataset_images_path(self):
"""Return a ``Path`` instance of Renku dataset metadata folder."""
if self._temporary_datasets_path:
return self._temporary_datasets_path

return self.path / self.renku_home / self.DATASET_IMAGES

@property
def datasets_provenance_path(self):
"""Path to store activity files."""
@@ -235,7 +247,9 @@ def with_dataset(self, name=None, create=False):

dataset.to_yaml()

def create_dataset(self, name=None, title=None, description=None, creators=None, keywords=None):
def create_dataset(
    self, name=None, title=None, description=None, creators=None, keywords=None, images=None, safe_image_paths=None
):
    """Create a dataset."""
    safe_image_paths = safe_image_paths or []
if not name:
raise errors.ParameterError("Dataset name must be provided.")
@@ -277,13 +291,91 @@ def create_dataset(self, name=None, title=None, description=None, creators=None,
immutable=True, # No mutation required when first creating a dataset
)

if images:
safe_image_paths.append(self.path)
self.set_dataset_images(dataset, images, safe_image_paths)

dataset_ref = LinkReference.create(client=self, name="datasets/" + name)
dataset_ref.set_reference(metadata_path)

dataset.to_yaml(path=metadata_path)

return dataset, metadata_path, dataset_ref
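
For illustration (not part of the diff), a sketch of creating a dataset with an image through the new keyword arguments; the names and paths are hypothetical, and the repository root is appended to the safe paths automatically when images are given:

# `client` is assumed to be a LocalClient rooted in a Renku project.
dataset, metadata_path, dataset_ref = client.create_dataset(
    name="my-dataset",
    title="My Dataset",
    images=[{"content_url": "images/cover.png", "position": 1}],  # relative to the repo root
)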

def set_dataset_images(self, dataset, images, safe_image_paths=None):
    """Set the images on a dataset."""
    safe_image_paths = safe_image_paths or []
    images = images or []

(self.renku_dataset_images_path / dataset.identifier).mkdir(exist_ok=True, parents=True)

previous_images = dataset.images or []

dataset.images = []

images_updated = False

for img in images:
position = img["position"]
content_url = img["content_url"]

if any(i.position == position for i in dataset.images):
    raise errors.DatasetImageError(f"Duplicate dataset image specified for position {position}")

existing = next(
    (i for i in previous_images if i.position == position and i.content_url == content_url), None
)

if existing:
dataset.images.append(existing)
continue

if urlparse(content_url).netloc:
# NOTE: absolute url
dataset.images.append(ImageObject(content_url, position, id=ImageObject.generate_id(dataset, position)))
images_updated = True
continue

path = content_url
if not os.path.isabs(path):
path = os.path.normpath(os.path.join(self.path, path))

if not os.path.exists(path) or not any(os.path.commonpath([path, str(p)]) == str(p) for p in safe_image_paths):
    # NOTE: make sure the file exists and prevent path traversal outside the safe image paths
    raise errors.DatasetImageError(f"Dataset image with relative path {content_url} not found")

image_folder = self.renku_dataset_images_path / dataset.identifier

if not path.startswith(str(image_folder)):
    # NOTE: only copy the image if it is not already inside .renku/dataset_images/<identifier>/
_, ext = os.path.splitext(content_url)
img_path = image_folder / f"{position}{ext}"
shutil.copy(path, img_path)

dataset.images.append(
ImageObject(
str(img_path.relative_to(self.path)), position, id=ImageObject.generate_id(dataset, position)
)
)
images_updated = True

new_urls = [i.content_url for i in dataset.images]

for prev in previous_images:
# NOTE: Delete images if they were removed
if prev.content_url in new_urls or urlparse(prev.content_url).netloc:
continue

path = prev.content_url
if not os.path.isabs(path):
path = os.path.normpath(os.path.join(self.path, path))

os.remove(path)

return images_updated or dataset.images != previous_images
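
A sketch of the expected input shape, with illustrative values: each image is a dict carrying a position and a content_url that is either an absolute URL, stored as-is, or a repository-relative path, which must resolve under one of the safe_image_paths and is copied into .renku/dataset_images/<identifier>/:

# `client` and `dataset` are assumed to exist, as in the sketch above.
images_updated = client.set_dataset_images(
    dataset,
    images=[
        {"content_url": "https://example.com/banner.png", "position": 1},  # kept as a URL
        {"content_url": "images/preview.jpg", "position": 2},  # copied into the repo
    ],
    safe_image_paths=[client.path],
)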

def add_data_to_dataset(
self,
dataset,
@@ -1217,6 +1309,7 @@ def _check_url(url):

DATASET_METADATA_PATHS = [
Path(RENKU_HOME) / DatasetsApiMixin.DATASETS,
Path(RENKU_HOME) / DatasetsApiMixin.DATASET_IMAGES,
Path(RENKU_HOME) / DatasetsApiMixin.POINTERS,
Path(RENKU_HOME) / LinkReference.REFS,
Path(RENKU_HOME) / DatasetsApiMixin.DATASETS_PROVENANCE,
