fix(dataset): same_as and DatasetFile id corrections
m-alisafaee committed Sep 23, 2021
1 parent 0a3aab1 commit 956b77e
Showing 5 changed files with 121 additions and 26 deletions.
65 changes: 42 additions & 23 deletions renku/core/management/migrations/utils/conversion.py
@@ -27,58 +27,56 @@
 from renku.core.models.provenance import agent as new_agents


-def convert_url(url: Optional[old_datasets.Url]) -> Optional[Url]:
-    """Convert old Url to new Url."""
-    if not url:
-        return
-    return Url(url=url.url, url_id=url.url_id, url_str=url.url_str)
+def _convert_dataset_identifier(identifier: str) -> str:
+    """Remove - from a dataset identifier."""
+    return identifier.replace("-", "")


-def convert_dataset_tag(tag: Optional[old_datasets.DatasetTag]) -> Optional[DatasetTag]:
+def _convert_dataset_tag(tag: Optional[old_datasets.DatasetTag]) -> Optional[DatasetTag]:
     """Convert old DatasetTag to new DatasetTag."""
     if not tag:
         return
     return DatasetTag(dataset_id="dummy-id", date_created=tag.created, description=tag.description, name=tag.name)


-def convert_language(language: Optional[old_datasets.Language]) -> Optional[Language]:
+def _convert_language(language: Optional[old_datasets.Language]) -> Optional[Language]:
     """Convert old Language to new Language."""
     if not language:
         return
     return Language(alternate_name=language.alternate_name, name=language.name)


-def convert_image_object(image_object: Optional[old_datasets.ImageObject]) -> Optional[ImageObject]:
+def _convert_image_object(image_object: Optional[old_datasets.ImageObject]) -> Optional[ImageObject]:
     """Create from old ImageObject instance."""
     if not image_object:
         return
     return ImageObject(content_url=image_object.content_url, position=image_object.position, id=image_object.id)


-def create_remote_entity(dataset_file: Optional[old_datasets.DatasetFile]) -> Optional[RemoteEntity]:
+def _create_remote_entity(dataset_file: Optional[old_datasets.DatasetFile]) -> Optional[RemoteEntity]:
     """Create RemoteEntity from old DatasetFile."""
     if not dataset_file:
         return
     commit_sha = dataset_file._label.rsplit("@", maxsplit=1)[-1]
     return RemoteEntity(commit_sha=commit_sha, path=dataset_file.path, url=dataset_file.url)


-def convert_dataset_file(dataset_file: old_datasets.DatasetFile, client, revision: str) -> Optional[DatasetFile]:
+def _convert_dataset_file(dataset_file: old_datasets.DatasetFile, client, revision: str) -> Optional[DatasetFile]:
     """Convert old DatasetFile to new DatasetFile if available at revision."""
     entity = Entity.from_revision(client=client, path=dataset_file.path, revision=revision)
     if not entity:
         return

     return DatasetFile(
-        based_on=create_remote_entity(dataset_file.based_on),
+        based_on=_create_remote_entity(dataset_file.based_on),
         date_added=dataset_file.added,
         entity=entity,
         is_external=dataset_file.external,
         source=dataset_file.source,
     )


-def convert_person(person: Optional[old_datasets.Person]) -> Optional[new_agents.Person]:
+def _convert_person(person: Optional[old_datasets.Person]) -> Optional[new_agents.Person]:
     """Create a Person from an old Person."""
     if not person:
         return
@@ -92,15 +90,36 @@ def convert_person(person: Optional[old_datasets.Person]) -> Optional[new_agents
     )


-def convert_agent(
+def _convert_agent(
     agent: Optional[Union[old_datasets.Person, old_datasets.SoftwareAgent]]
 ) -> Optional[Union[new_agents.Person, new_agents.SoftwareAgent]]:
     """Create an instance from Person/SoftwareAgent."""
     if isinstance(agent, old_datasets.SoftwareAgent):
         return new_agents.SoftwareAgent(id=agent.id, name=agent.label)

     assert not agent or isinstance(agent, old_datasets.Person), f"Invalid type {type(agent)}"
-    return convert_person(agent)
+    return _convert_person(agent)


+def _convert_same_as(url: Optional[old_datasets.Url]) -> Optional[Url]:
+    """Convert old Url to new Url."""
+    if not url:
+        return
+
+    if url.url_str:
+        parsed_url = urlparse(url.url_str)
+    else:
+        assert url.url_id
+        parsed_url = urlparse(url.url_id)
+
+    # NOTE: Test if dataset is imported from a renku deployment. This is good enough for all current deployments.
+    if "renku" in parsed_url.netloc:
+        path = _convert_dataset_identifier(parsed_url.path)
+        parsed_url = parsed_url._replace(path=path)
+
+    url_str = parsed_url.geturl()
+
+    return Url(url_str=url_str) if url.url_str else Url(url_id=url_str)
+
+
 def convert_dataset(dataset: old_datasets.Dataset, client, revision: str) -> Tuple[Dataset, List[DatasetTag]]:
@@ -112,7 +131,7 @@ def convert_dataset_files(files: List[old_datasets.DatasetFile]) -> List[Dataset
         files = {f.path: f for f in files}  # NOTE: To make sure there are no duplicate paths

         for file in files.values():
-            new_file = convert_dataset_file(dataset_file=file, client=client, revision=revision)
+            new_file = _convert_dataset_file(dataset_file=file, client=client, revision=revision)
             if not new_file:
                 continue

@@ -126,31 +145,31 @@ def convert_derived_from(derived_from: Optional[old_datasets.Url]) -> Optional[U
             return

         url = derived_from.url.get("@id")
-        path = urlparse(url).path
+        path = _convert_dataset_identifier(urlparse(url).path)

         return Url(url_id=Dataset.generate_id(identifier=Path(path).name))

-    tags = [convert_dataset_tag(tag) for tag in (dataset.tags or [])]
+    tags = [_convert_dataset_tag(tag) for tag in (dataset.tags or [])]

     return (
         Dataset(
-            creators=[convert_agent(creator) for creator in dataset.creators],
+            creators=[_convert_agent(creator) for creator in dataset.creators],
             dataset_files=convert_dataset_files(dataset.files),
             date_created=dataset.date_created,
             date_published=dataset.date_published,
             date_removed=None,
             derived_from=convert_derived_from(dataset.derived_from),
             description=dataset.description,
             id=None,
-            identifier=dataset.identifier.replace("-", ""),
-            images=[convert_image_object(image) for image in (dataset.images or [])],
-            in_language=convert_language(dataset.in_language),
+            identifier=_convert_dataset_identifier(dataset.identifier),
+            images=[_convert_image_object(image) for image in (dataset.images or [])],
+            in_language=_convert_language(dataset.in_language),
             keywords=dataset.keywords,
             license=dataset.license,
             name=dataset.name,
             project_id=client.project.id,
-            initial_identifier=dataset.initial_identifier.replace("-", ""),
-            same_as=convert_url(dataset.same_as),
+            initial_identifier=_convert_dataset_identifier(dataset.initial_identifier),
+            same_as=_convert_same_as(dataset.same_as),
             title=dataset.title,
             version=dataset.version,
         ),
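
For orientation (not part of the commit): a minimal, runnable sketch of the normalization that the new _convert_same_as applies to a same_as URL pointing at a Renku deployment. urlparse, _replace, and geturl come from Python's standard library; the sample URL below is hypothetical.

    from urllib.parse import urlparse

    def _convert_dataset_identifier(identifier: str) -> str:
        """Remove - from a dataset identifier."""
        return identifier.replace("-", "")

    # Hypothetical same_as value with a dashed dataset identifier in its path
    url = "https://dev.renku.ch/datasets/860f6b5b-4636-4c83-b6a9-b38ef198bcc0"

    parsed_url = urlparse(url)
    if "renku" in parsed_url.netloc:  # heuristic: dataset imported from a Renku deployment
        parsed_url = parsed_url._replace(path=_convert_dataset_identifier(parsed_url.path))

    print(parsed_url.geturl())
    # https://dev.renku.ch/datasets/860f6b5b46364c83b6a9b38ef198bcc0

Only the path component is rewritten, so query strings and the host are left untouched for non-Renku sources.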
20 changes: 18 additions & 2 deletions renku/core/models/dataset.py
@@ -73,13 +73,15 @@ def __init__(self, *, id: str = None, url: str = None, url_str: str = None, url_
         self.url_id: str = url_id

         if not self.url:
-            self.url = self.get_default_url()
+            self.url = self._get_default_url()
         elif isinstance(self.url, dict):
             if "_id" in self.url:
                 self.url["@id"] = self.url.pop("_id")
             self.url_id = self.url["@id"]
+            self.url_str = None
         elif isinstance(self.url, str):
             self.url_str = self.url
+            self.url_id = None

         if not self.id or self.id.startswith("_:"):
             self.id = Url.generate_id(url_str=self.url_str, url_id=self.url_id)
@@ -93,7 +95,12 @@ def generate_id(url_str, url_id):

         return f"/urls/{id}"

-    def get_default_url(self):
+    @property
+    def value(self):
+        """Returns the url value as string."""
+        return self.url_str or self.url_id
+
+    def _get_default_url(self):
         """Define default value for url field."""
         if self.url_str:
             return self.url_str
@@ -257,6 +264,14 @@ def generate_id():
         """
         return f"/dataset-files/{uuid4().hex}"

+    @classmethod
+    def from_dataset_file(cls, other: "DatasetFile") -> "DatasetFile":
+        """Return a copy with a different id."""
+        self = other.copy()
+        self.id = DatasetFile.generate_id()
+
+        return self
+
     def copy(self) -> "DatasetFile":
         """Return a clone of this object."""
         return copy.copy(self)
@@ -493,6 +508,7 @@ def update_files_from(self, current_dataset: "Dataset", date: datetime = None):

         # NOTE: Whatever remains in `current_files` are removed in the newer version
         for removed_file in current_files.values():
+            removed_file = DatasetFile.from_dataset_file(removed_file)
             removed_file.remove(date)
             files.append(removed_file)
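
A standalone sketch of the copy-with-a-fresh-id pattern that the new from_dataset_file classmethod introduces and that update_files_from now uses, so a removed file is recorded as a new DatasetFile instead of mutating the existing one. The class below is a simplified stand-in for renku.core.models.dataset.DatasetFile, not the real model.

    import copy
    from uuid import uuid4

    class DatasetFile:
        """Simplified stand-in for renku.core.models.dataset.DatasetFile."""

        def __init__(self):
            self.id = DatasetFile.generate_id()

        @staticmethod
        def generate_id():
            return f"/dataset-files/{uuid4().hex}"

        @classmethod
        def from_dataset_file(cls, other: "DatasetFile") -> "DatasetFile":
            """Return a shallow copy with a different id."""
            new = copy.copy(other)
            new.id = cls.generate_id()
            return new

    current = DatasetFile()
    removed = DatasetFile.from_dataset_file(current)
    assert removed.id != current.id  # the removed record gets its own identity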
40 changes: 39 additions & 1 deletion tests/cli/test_datasets.py
@@ -36,7 +36,7 @@
 from renku.core.models.refs import LinkReference
 from renku.core.utils.git import get_object_hash
 from renku.core.utils.urls import get_slug
-from tests.utils import assert_dataset_is_mutated, format_result_exception
+from tests.utils import assert_dataset_is_mutated, format_result_exception, write_and_commit_file


 def test_datasets_create_clean(runner, project, client, load_dataset_with_injection):
@@ -2011,6 +2011,44 @@ def test_datasets_provenance_add_file(runner, client, directory_tree, load_datas
     assert {"file1", "file2", "file3"} == {Path(f.entity.path).name for f in dataset.files}


+def test_immutability_of_dataset_files(runner, client, directory_tree, load_dataset_with_injection):
+    """Test DatasetFiles are generated when their Entity changes."""
+    assert 0 == runner.invoke(cli, ["dataset", "add", "my-data", "-c", str(directory_tree / "file1")]).exit_code
+
+    file1 = os.path.join(DATA_DIR, "my-data", "file1")
+
+    v1 = load_dataset_with_injection("my-data", client).find_file(file1)
+
+    # DatasetFile changes when Entity is changed
+    write_and_commit_file(client.repo, file1, "changed content")
+    assert 0 == runner.invoke(cli, ["dataset", "update"]).exit_code
+    v2 = load_dataset_with_injection("my-data", client).find_file(file1)
+
+    assert v1.id != v2.id
+
+    # DatasetFile doesn't change when Entity is unchanged
+    assert 0 == runner.invoke(cli, ["dataset", "add", "my-data", str(directory_tree / "dir1" / "file2")]).exit_code
+    v3 = load_dataset_with_injection("my-data", client).find_file(file1)
+
+    assert v2.id == v3.id
+
+    # DatasetFile changes when Entity is unchanged but is overwritten
+    assert (
+        0 == runner.invoke(cli, ["dataset", "add", "my-data", "--overwrite", str(directory_tree / "file1")]).exit_code
+    )
+    v4 = load_dataset_with_injection("my-data", client).find_file(file1)
+
+    assert v3.id != v4.id
+
+    # DatasetFile changes if the file is removed
+    assert 0 == runner.invoke(cli, ["dataset", "unlink", "my-data", "--include", "file1"], input="y").exit_code
+    dataset = load_dataset_with_injection("my-data", client)
+    v5 = next(f for f in dataset.dataset_files if f.is_removed())
+
+    assert "file1" in v5.entity.path
+    assert v4.id != v5.id
+
+
 @pytest.mark.serial
 def test_unauthorized_import(mock_kg, client, runner):
     """Test importing without a valid token."""
22 changes: 22 additions & 0 deletions tests/cli/test_migrate.py
@@ -231,6 +231,28 @@ def test_comprehensive_dataset_migration(

     assert file_.based_on is None


+@pytest.mark.migration
+def test_migrate_renku_dataset_same_as(isolated_runner, old_client_before_database, load_dataset_with_injection):
+    """Test migration of imported renku datasets removes dashes from the same_as field."""
+    result = isolated_runner.invoke(cli, ["migrate", "--strict"])
+    assert 0 == result.exit_code, format_result_exception(result)
+
+    dataset = load_dataset_with_injection("renku-dataset", old_client_before_database)
+
+    assert "https://dev.renku.ch/datasets/860f6b5b46364c83b6a9b38ef198bcc0" == dataset.same_as.value
+
+
+@pytest.mark.migration
+def test_migrate_renku_dataset_derived_from(isolated_runner, old_client_before_database, load_dataset_with_injection):
+    """Test migration of datasets removes dashes from the derived_from field."""
+    result = isolated_runner.invoke(cli, ["migrate", "--strict"])
+    assert 0 == result.exit_code, format_result_exception(result)
+
+    dataset = load_dataset_with_injection("local", old_client_before_database)
+
+    assert "/datasets/535b6e1ddb85442a897b2b3c72aec0c6" == dataset.derived_from.url_id
+
+
 @pytest.mark.migration
 def test_no_blank_node_after_dataset_migration(isolated_runner, old_dataset_project, load_dataset_with_injection):
     """Test migration of datasets with blank nodes creates IRI identifiers."""
Binary file modified tests/data/old-datasets-v0.16.0.git.tar.gz
Binary file not shown.
