diff --git a/conftest.py b/conftest.py
index ac7281276d..9f3dda7a42 100644
--- a/conftest.py
+++ b/conftest.py
@@ -362,6 +362,22 @@ def request_callback(request):
         yield rsps


+@pytest.fixture
+def missing_kg_project_responses():
+    """KG project query responses for missing project."""
+    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
+
+        def request_callback(request):
+            return (404, {"Content-Type": "application/text"}, json.dumps({"message": "no project found"}))
+
+        rsps.add_callback(
+            responses.GET, re.compile("http(s)*://dev.renku.ch/knowledge-graph/projects/.*"), callback=request_callback
+        )
+        rsps.add_passthru(re.compile("http(s)*://dev.renku.ch/datasets/.*"))
+        rsps.add_passthru(re.compile("http(s)*://dev.renku.ch/knowledge-graph/datasets/.*"))
+        yield rsps
+
+
 @pytest.fixture()
 def directory_tree(tmp_path):
     """Create a test directory tree."""
diff --git a/renku/cli/exception_handler.py b/renku/cli/exception_handler.py
index 1c53e92bf8..31ca6fb4db 100644
--- a/renku/cli/exception_handler.py
+++ b/renku/cli/exception_handler.py
@@ -109,7 +109,7 @@ def __init__(self, *args, **kwargs):
         if HAS_SENTRY:
             import sentry_sdk

-            sentry_sdk.init()
+            sentry_sdk.init(dsn=os.getenv("SENTRY_DSN"), environment=os.getenv("SENTRY_ENV"))

     def main(self, *args, **kwargs):
         """Catch all exceptions."""
diff --git a/renku/core/commands/graph.py b/renku/core/commands/graph.py
index 04835f96d9..46a5f1df61 100644
--- a/renku/core/commands/graph.py
+++ b/renku/core/commands/graph.py
@@ -31,6 +31,7 @@
 from renku.core.models.provenance.activities import Activity, ProcessRun, Usage, WorkflowRun
 from renku.core.models.provenance.qualified import Generation
 from renku.core.models.workflow.run import Run
+from renku.core.utils.scm import git_unicode_unescape


 def _safe_path(filepath, can_be_cwl=False):
@@ -527,7 +528,7 @@ def build_graph(client, revision, no_output, paths):
         commit = client.repo.rev_parse(stop)

         paths = (
-            str(client.path / item.a_path)
+            str(client.path / git_unicode_unescape(item.a_path))
             for item in commit.diff(commit.parents or NULL_TREE)
             # if not item.deleted_file
         )
diff --git a/renku/core/commands/providers/renku.py b/renku/core/commands/providers/renku.py
index 224fd807d1..1c456a9821 100644
--- a/renku/core/commands/providers/renku.py
+++ b/renku/core/commands/providers/renku.py
@@ -55,9 +55,17 @@ def find_record(self, uri, client=None):
         same_as, kg_urls = self._get_dataset_info(uri)
         project_url = None
         failed_urls = []
+        non_existing_projects = []

         for kg_url in kg_urls:
-            kg_datasets_url, ssh_url, https_url = self._get_project_urls(kg_url)
+            try:
+                kg_datasets_url, ssh_url, https_url = self._get_project_urls(kg_url)
+            except errors.OperationError as e:
+                # NOTE: Project was likely deleted, but still referenced in the KG
+                if "project not found" not in str(e):
+                    raise
+                non_existing_projects.append(kg_url)
+                continue

             # Check if the project contains the dataset
             if same_as is None:  # Dataset is in the project
@@ -85,6 +93,12 @@ def find_record(self, uri, client=None):
         if project_url is None:
             if failed_urls:
                 message = "Cannot clone remote projects:\n\t" + "\n\t".join(failed_urls)
+            elif non_existing_projects:
+                raise errors.ProjectNotFound(
+                    "Cannot find these projects in the knowledge graph:\n\t{}".format(
+                        "\n\t".join(non_existing_projects)
+                    )
+                )
             else:
                 message = "Cannot find any project for the dataset."
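An aside on the hunk above: the new control flow amounts to "skip knowledge-graph entries whose project lookup reports 'project not found', and only fail once no usable project remains". A minimal, self-contained Python sketch of that pattern follows; the exception classes and the get_project_urls callable are hypothetical stand-ins for illustration, not the actual renku APIs.

class OperationError(Exception):
    """Stand-in for renku.core.errors.OperationError."""


class ProjectNotFound(Exception):
    """Stand-in for the new renku.core.errors.ProjectNotFound."""


def find_first_usable_project(kg_urls, get_project_urls):
    """Return project URLs for the first KG entry whose project still exists."""
    non_existing_projects = []
    for kg_url in kg_urls:
        try:
            return get_project_urls(kg_url)  # raises OperationError for deleted projects
        except OperationError as e:
            if "project not found" not in str(e):
                raise  # a genuine failure, not a stale KG entry
            non_existing_projects.append(kg_url)  # remember it and keep trying the rest
    raise ProjectNotFound(
        "Cannot find these projects in the knowledge graph:\n\t" + "\n\t".join(non_existing_projects)
    )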
@@ -178,11 +192,12 @@ def _query_knowledge_graph(url): try: response = requests.get(url) except urllib.error.HTTPError as e: - raise errors.OperationError("Cannot access knowledge graph: {}".format(url)) from e + raise errors.OperationError(f"Cannot access knowledge graph: {url}") from e if response.status_code != 200: - raise errors.OperationError( - "Cannot access knowledge graph: {}\nResponse code: {}".format(url, response.status_code) - ) + if response.status_code == 404: + raise errors.OperationError(f"Cannot access knowledge graph: {url}, project not found") + + raise errors.OperationError(f"Cannot access knowledge graph: {url}\nResponse code: {response.status_code}") return response.json() diff --git a/renku/core/errors.py b/renku/core/errors.py index 781f2b8690..0d7faad524 100644 --- a/renku/core/errors.py +++ b/renku/core/errors.py @@ -196,6 +196,10 @@ def __init__(self): ) +class ProjectNotFound(RenkuException): + """Raise when one or more projects couldn't be found in the KG.""" + + class NothingToCommit(RenkuException): """Raise when there is nothing to commit.""" diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py index 3625611ff2..e17a3d62fa 100644 --- a/renku/core/management/datasets.py +++ b/renku/core/management/datasets.py @@ -20,6 +20,7 @@ import concurrent.futures import os import re +import shlex import shutil import tempfile import time @@ -677,23 +678,19 @@ def _get_src_and_dst(path, repo_path, sources, dst_root, used_sources): def _fetch_lfs_files(repo_path, paths): """Fetch and checkout paths that are tracked by Git LFS.""" repo_path = str(repo_path) - try: - output = run(("git", "lfs", "ls-files", "--name-only"), stdout=PIPE, cwd=repo_path, universal_newlines=True) - except SubprocessError: - return - - lfs_files = set(output.stdout.split("\n")) - files = lfs_files & paths - if not files: - return try: - for path in files: - run(["git", "lfs", "pull", "--include", path], cwd=repo_path) + includes = ",".join(shlex.quote(p) for p in paths) + status = run( + ["git", "lfs", "pull", "--include", includes], stderr=PIPE, cwd=repo_path, universal_newlines=True + ) + if status.returncode != 0: + message = "\n\t".join(status.stderr.split("\n")) + raise errors.GitError(f"Cannot pull LFS objects from server: {message}") except KeyboardInterrupt: raise - except SubprocessError: - pass + except SubprocessError as e: + raise errors.GitError(f"Cannot pull LFS objects from server: {e}") @staticmethod def _fetch_files_metadata(client, paths): @@ -955,7 +952,8 @@ def _update_pointer_file(self, pointer_file_path): os.remove(pointer_file_path) return self._create_pointer_file(target, checksum=checksum) - def remove_file(self, filepath): + @staticmethod + def remove_file(filepath): """Remove a file/symlink and its pointer file (for external files).""" path = Path(filepath) try: @@ -1003,7 +1001,7 @@ def prepare_git_repo(self, url, ref=None): if not url: raise errors.GitError("Invalid URL.") - RENKU_BRANCH = "renku-default-branch" + renku_branch = "renku-default-branch" def checkout(repo, ref): try: @@ -1011,7 +1009,7 @@ def checkout(repo, ref): except GitCommandError: raise errors.ParameterError('Cannot find reference "{}" in Git repository: {}'.format(ref, url)) - ref = ref or RENKU_BRANCH + ref = ref or renku_branch u = GitURL.parse(url) path = u.pathname if u.hostname == "localhost": @@ -1048,7 +1046,7 @@ def checkout(repo, ref): # Because the name of the default branch is not always 'master', we # create an alias of the default branch when cloning the 
repo. It # is used to refer to the default branch later. - renku_ref = "refs/heads/" + RENKU_BRANCH + renku_ref = "refs/heads/" + renku_branch try: repo.git.execute(["git", "symbolic-ref", renku_ref, repo.head.reference.path]) checkout(repo, ref) diff --git a/renku/core/management/git.py b/renku/core/management/git.py index 1eb24da9fa..149e1d9c38 100644 --- a/renku/core/management/git.py +++ b/renku/core/management/git.py @@ -33,6 +33,7 @@ import git from renku.core import errors +from renku.core.utils.scm import git_unicode_unescape from renku.core.utils.urls import remove_credentials COMMIT_DIFF_STRATEGY = "DIFF" @@ -213,8 +214,9 @@ def ensure_unstaged(self, path): staged = self.repo.index.diff("HEAD") for file_path in staged: - is_parent = str(file_path.a_path).startswith(path) - is_equal = path == file_path.a_path + unescaped_path = git_unicode_unescape(file_path.a_path) + is_parent = str(unescaped_path).startswith(path) + is_equal = path == unescaped_path if is_parent or is_equal: raise errors.DirtyRenkuDirectory(self.repo) @@ -240,9 +242,9 @@ def commit(self, commit_only=None, commit_empty=True, raise_if_empty=False, comm diff_before = set() if commit_only == COMMIT_DIFF_STRATEGY: - staged = {item.a_path for item in self.repo.index.diff(None)} + staged = {git_unicode_unescape(item.a_path) for item in self.repo.index.diff(None)} - modified = {item.a_path for item in self.repo.index.diff("HEAD")} + modified = {git_unicode_unescape(item.a_path) for item in self.repo.index.diff("HEAD")} if staged or modified: self.repo.git.reset() @@ -269,10 +271,12 @@ def commit(self, commit_only=None, commit_empty=True, raise_if_empty=False, comm if commit_only == COMMIT_DIFF_STRATEGY: # Get diff generated in command. - change_types = {item.a_path: item.change_type for item in self.repo.index.diff(None)} + change_types = {git_unicode_unescape(item.a_path): item.change_type for item in self.repo.index.diff(None)} staged_after = set(change_types.keys()) - modified_after_change_types = {item.a_path: item.change_type for item in self.repo.index.diff("HEAD")} + modified_after_change_types = { + git_unicode_unescape(item.a_path): item.change_type for item in self.repo.index.diff("HEAD") + } modified_after = set(modified_after_change_types.keys()) @@ -294,7 +298,7 @@ def commit(self, commit_only=None, commit_empty=True, raise_if_empty=False, comm diffs = [] try: - diffs = [d.a_path for d in self.repo.index.diff("HEAD")] + diffs = [git_unicode_unescape(d.a_path) for d in self.repo.index.diff("HEAD")] if project_metadata_path in diffs: diffs.remove(project_metadata_path) except git.exc.BadName: diff --git a/renku/core/management/migrate.py b/renku/core/management/migrate.py index 21670d40aa..37cb8b0c0c 100644 --- a/renku/core/management/migrate.py +++ b/renku/core/management/migrate.py @@ -37,7 +37,7 @@ from renku.core.errors import MigrationRequired, ProjectNotSupported from renku.core.utils.migrate import read_project_version -SUPPORTED_PROJECT_VERSION = 7 +SUPPORTED_PROJECT_VERSION = 8 def check_for_migration(client): diff --git a/renku/core/management/migrations/m_0003__1_jsonld.py b/renku/core/management/migrations/m_0003__1_jsonld.py index f6cd51b730..92f7809ca7 100644 --- a/renku/core/management/migrations/m_0003__1_jsonld.py +++ b/renku/core/management/migrations/m_0003__1_jsonld.py @@ -79,6 +79,10 @@ def _apply_on_the_fly_jsonld_migrations( ): data = read_yaml(path) + if not isinstance(data, dict) and not isinstance(data, list): + # NOTE: metadata file is probably not an actual renku file + return + if 
jsonld_translate: # perform the translation data = pyld.jsonld.expand(data) diff --git a/renku/core/management/migrations/m_0005__2_cwl.py b/renku/core/management/migrations/m_0005__2_cwl.py index 5dcdcca18b..7c8a8b0f6c 100644 --- a/renku/core/management/migrations/m_0005__2_cwl.py +++ b/renku/core/management/migrations/m_0005__2_cwl.py @@ -36,6 +36,7 @@ from renku.core.models.provenance.agents import Person, SoftwareAgent from renku.core.models.workflow.parameters import CommandArgument, CommandInput, CommandOutput, MappedIOStream from renku.core.models.workflow.run import Run +from renku.core.utils.scm import git_unicode_unescape from renku.version import __version__, version_url default_missing_software_agent = SoftwareAgent( @@ -365,7 +366,7 @@ def _invalidations_from_commit(client, commit): # in this backwards diff if file_.change_type != "A": continue - path_ = Path(file_.a_path) + path_ = Path(git_unicode_unescape(file_.a_path)) entity = _get_activity_entity(client, commit, path_, collections, deleted=True) results.append(entity) diff --git a/renku/core/management/migrations/m_0008__blank_node_id.py b/renku/core/management/migrations/m_0008__blank_node_id.py new file mode 100644 index 0000000000..ab3e294f49 --- /dev/null +++ b/renku/core/management/migrations/m_0008__blank_node_id.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2020 - Swiss Data Science Center (SDSC) +# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and +# Eidgenössische Technische Hochschule Zürich (ETHZ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Dataset metadata migrations.""" + +from renku.core.management.migrations.models.v8 import get_client_datasets + + +def migrate(client): + """Migration function.""" + _fix_dataset_metadata(client) + + +def _fix_dataset_metadata(client): + for dataset in get_client_datasets(client): + dataset.to_yaml() diff --git a/renku/core/management/migrations/models/v3.py b/renku/core/management/migrations/models/v3.py index 45c57ff528..b7abb7800e 100644 --- a/renku/core/management/migrations/models/v3.py +++ b/renku/core/management/migrations/models/v3.py @@ -22,7 +22,8 @@ from marshmallow import EXCLUDE, post_load, pre_load from renku.core.models import jsonld -from renku.core.models.calamus import JsonLDSchema, fields, prov, rdfs, renku, schema, wfprov +from renku.core.models.calamus import JsonLDSchema, Uri, fields, prov, rdfs, renku, schema, wfprov +from renku.core.models.datasets import generate_dataset_tag_id, generate_url_id from renku.core.models.git import get_user_info from renku.core.models.projects import generate_project_id from renku.core.models.provenance.agents import generate_person_id @@ -33,6 +34,8 @@ class Base: def __init__(self, **kwargs): """Initialize an instance.""" + self.client = None + for k, v in kwargs.items(): setattr(self, k, v) @@ -40,15 +43,17 @@ def __init__(self, **kwargs): class Person(Base): """Person migration model.""" - client = None + affiliation = None + email = None + name = None @staticmethod def _fix_person_id(person, client=None): """Fixes the id of a Person if it is not set.""" - if not person._id or "mailto:None" in person._id: + if not person._id or "mailto:None" in person._id or person._id.startswith("_:"): if not client and person.client: client = person.client - person._id = generate_person_id(email=person.email, client=client) + person._id = generate_person_id(client=client, email=person.email, full_identity=person.full_identity) return person @@ -67,6 +72,13 @@ def __init__(self, **kwargs): kwargs.setdefault("_id", None) super().__init__(**kwargs) + @property + def full_identity(self): + """Return name, email, and affiliation.""" + email = f" <{self.email}>" if self.email else "" + affiliation = f" [{self.affiliation}]" if self.affiliation else "" + return f"{self.name}{email}{affiliation}" + class Project(Base): """Project migration model.""" @@ -98,6 +110,16 @@ class DatasetFile(Base): class DatasetTag(Base): """DatasetTag migration model.""" + commit = None + name = None + + def __init__(self, **kwargs): + """Initialize an instance.""" + super().__init__(**kwargs) + + if not self._id or self._id.startswith("_:"): + self._id = generate_dataset_tag_id(client=self.client, name=self.name, commit=self.commit) + class Language(Base): """Language migration model.""" @@ -106,6 +128,22 @@ class Language(Base): class Url(Base): """Url migration model.""" + url = None + url_id = None + url_str = None + + def __init__(self, **kwargs): + """Initialize an instance.""" + super().__init__(**kwargs) + + if isinstance(self.url, dict): + self.url_id = self.url["@id"] + elif isinstance(self.url, str): + self.url_str = self.url + + if not self._id or self._id.startswith("_:"): + self._id = generate_url_id(client=self.client, url_str=self.url_str, url_id=self.url_id) + class Dataset(Base): """Dataset migration model.""" @@ -212,7 +250,7 @@ class Meta: external = fields.Boolean(renku.external, missing=False) -class LanguageSchemaV5(JsonLDSchema): +class LanguageSchemaV3(JsonLDSchema): """Language schema.""" class Meta: @@ -226,7 +264,7 @@ class Meta: name = 
fields.String(schema.name) -class DatasetTagSchemaV5(JsonLDSchema): +class DatasetTagSchemaV3(JsonLDSchema): """DatasetTag schema.""" class Meta: @@ -244,7 +282,7 @@ class Meta: name = fields.String(schema.name) -class UrlSchemaV5(JsonLDSchema): +class UrlSchemaV3(JsonLDSchema): """Url schema.""" class Meta: @@ -255,7 +293,7 @@ class Meta: unknown = EXCLUDE _id = fields.Id(missing=None) - url = fields.Uri(schema.url, missing=None) + url = Uri(schema.url, missing=None) class DatasetSchemaV3(CreatorMixinSchemaV3, EntitySchemaV3): @@ -274,12 +312,12 @@ class Meta: description = fields.String(schema.description, missing=None) files = fields.Nested(schema.hasPart, DatasetFileSchemaV3, many=True) identifier = fields.String(schema.identifier) - in_language = fields.Nested(schema.inLanguage, LanguageSchemaV5, missing=None) + in_language = fields.Nested(schema.inLanguage, LanguageSchemaV3, missing=None) keywords = fields.List(schema.keywords, fields.String()) - license = fields.Uri(schema.license, missing=None, allow_none=True) + license = Uri(schema.license, missing=None, allow_none=True) name = fields.String(schema.alternateName, missing=None) - same_as = fields.Nested(schema.sameAs, UrlSchemaV5, missing=None) - tags = fields.Nested(schema.subjectOf, DatasetTagSchemaV5, many=True) + same_as = fields.Nested(schema.sameAs, UrlSchemaV3, missing=None) + tags = fields.Nested(schema.subjectOf, DatasetTagSchemaV3, many=True) title = fields.String(schema.name) url = fields.String(schema.url) version = fields.String(schema.version, missing=None) diff --git a/renku/core/management/migrations/models/v6.py b/renku/core/management/migrations/models/v6.py deleted file mode 100644 index c2995ae891..0000000000 --- a/renku/core/management/migrations/models/v6.py +++ /dev/null @@ -1,127 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright 2017-2020 - Swiss Data Science Center (SDSC) -# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and -# Eidgenössische Technische Hochschule Zürich (ETHZ). -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Migration models V6.""" - -import os - -from marshmallow import EXCLUDE - -from renku.core.models import jsonld -from renku.core.models.calamus import JsonLDSchema, fields, schema - -from .v3 import Base, DatasetSchemaV3 - - -class DatasetTag(Base): - """DatasetTag migration model.""" - - -class Language(Base): - """Language migration model.""" - - -class Url(Base): - """Url migration model.""" - - -class Dataset(Base): - """Dataset migration model.""" - - @classmethod - def from_yaml(cls, path, client): - """Read content from YAML file.""" - data = jsonld.read_yaml(path) - self = DatasetSchemaV6(client=client).load(data) - self.__reference__ = path - return self - - def to_yaml(self, path=None): - """Write content to a YAML file.""" - from renku.core.management import LocalClient - - data = DatasetSchemaV6().dump(self) - path = path or self.__reference__ or os.path.join(self.path, LocalClient.METADATA) - jsonld.write_yaml(path=path, data=data) - - -class LanguageSchemaV6(JsonLDSchema): - """Language schema.""" - - class Meta: - """Meta class.""" - - rdf_type = schema.Language - model = Language - unknown = EXCLUDE - - alternate_name = fields.String(schema.alternateName) - name = fields.String(schema.name) - - -class DatasetTagSchemaV6(JsonLDSchema): - """DatasetTag schema.""" - - class Meta: - """Meta class.""" - - rdf_type = schema.PublicationEvent - model = DatasetTag - unknown = EXCLUDE - - _id = fields.Id() - commit = fields.String(schema.location) - created = fields.DateTime(schema.startDate, missing=None) - dataset = fields.String(schema.about) - description = fields.String(schema.description) - name = fields.String(schema.name) - - -class UrlSchemaV6(JsonLDSchema): - """Url schema.""" - - class Meta: - """Meta class.""" - - rdf_type = schema.URL - model = Url - unknown = EXCLUDE - - _id = fields.Id(missing=None) - url = fields.Uri(schema.url, missing=None) - - -class DatasetSchemaV6(DatasetSchemaV3): - """Dataset schema.""" - - class Meta: - """Meta class.""" - - rdf_type = schema.Dataset - model = Dataset - unknown = EXCLUDE - - in_language = fields.Nested(schema.inLanguage, LanguageSchemaV6, missing=None) - keywords = fields.List(schema.keywords, fields.String()) - same_as = fields.Nested(schema.sameAs, UrlSchemaV6, missing=None) - tags = fields.Nested(schema.subjectOf, DatasetTagSchemaV6, many=True) - - -def get_client_datasets(client): - """Return Dataset migration models for a client.""" - paths = client.renku_datasets_path.rglob(client.METADATA) - return [Dataset.from_yaml(path, client=client) for path in paths] diff --git a/renku/core/management/migrations/models/v8.py b/renku/core/management/migrations/models/v8.py new file mode 100644 index 0000000000..f26ec17a71 --- /dev/null +++ b/renku/core/management/migrations/models/v8.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017-2020 - Swiss Data Science Center (SDSC) +# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and +# Eidgenössische Technische Hochschule Zürich (ETHZ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Migration models V8.""" + +import os + +from marshmallow import EXCLUDE, pre_dump + +from renku.core.models import jsonld +from renku.core.models.calamus import Uri, fields, schema + +from .v3 import CreatorMixinSchemaV3, DatasetTagSchemaV3, EntitySchemaV3, LanguageSchemaV3, PersonSchemaV3, UrlSchemaV3 +from .v7 import Base, DatasetFileSchemaV7 + + +class Dataset(Base): + """Dataset migration model.""" + + @classmethod + def from_yaml(cls, path, client=None, commit=None): + """Read content from YAML file.""" + data = jsonld.read_yaml(path) + self = DatasetSchemaV8(client=client, commit=commit, flattened=True).load(data) + self.__reference__ = path + return self + + def to_yaml(self, path=None): + """Write content to a YAML file.""" + from renku.core.management import LocalClient + + data = DatasetSchemaV8(flattened=True).dump(self) + path = path or self.__reference__ or os.path.join(self.path, LocalClient.METADATA) + jsonld.write_yaml(path=path, data=data) + + +class DatasetSchemaV8(CreatorMixinSchemaV3, EntitySchemaV3): + """Dataset schema.""" + + class Meta: + """Meta class.""" + + rdf_type = schema.Dataset + model = Dataset + unknown = EXCLUDE + + creators = fields.Nested(schema.creator, PersonSchemaV3, many=True) + date_created = fields.DateTime(schema.dateCreated, missing=None) + date_published = fields.DateTime(schema.datePublished, missing=None) + description = fields.String(schema.description, missing=None) + files = fields.Nested(schema.hasPart, DatasetFileSchemaV7, many=True) + identifier = fields.String(schema.identifier) + in_language = fields.Nested(schema.inLanguage, LanguageSchemaV3, missing=None) + keywords = fields.List(schema.keywords, fields.String()) + license = Uri(schema.license, missing=None, allow_none=True) + name = fields.String(schema.alternateName, missing=None) + same_as = fields.Nested(schema.sameAs, UrlSchemaV3, missing=None) + tags = fields.Nested(schema.subjectOf, DatasetTagSchemaV3, many=True) + title = fields.String(schema.name) + url = fields.String(schema.url) + version = fields.String(schema.version, missing=None) + + @pre_dump + def fix_license(self, data, **kwargs): + """Fix license to be a string.""" + if isinstance(data.license, dict): + data.license = data.license.get("http://schema.org/url", "") + + return data + + +def get_client_datasets(client): + """Return Dataset migration models for a client.""" + paths = client.renku_datasets_path.rglob(client.METADATA) + return [Dataset.from_yaml(path=path, client=client) for path in paths] diff --git a/renku/core/models/calamus.py b/renku/core/models/calamus.py index 30d32fc1ae..75fe18c513 100644 --- a/renku/core/models/calamus.py +++ b/renku/core/models/calamus.py @@ -116,9 +116,6 @@ def _deserialize(self, value, attr, data, **kwargs): raise ValueError("Invalid type for field {}: {}".format(self.name, type(value))) -fields.Uri = Uri - - class StringList(fields._JsonLDField, marshmallow.fields.String, marshmallow.fields.List): """A String field that might be a list when deserializing.""" diff --git a/renku/core/models/cwl/command_line_tool.py b/renku/core/models/cwl/command_line_tool.py index 88b7f8171a..434bfeeb6a 100644 --- a/renku/core/models/cwl/command_line_tool.py +++ b/renku/core/models/cwl/command_line_tool.py @@ -30,6 +30,7 @@ from renku.core import errors from renku.core.commands.echo import INFO +from renku.core.utils.scm import git_unicode_unescape from renku.version import 
__version__, version_url from ...management.config import RENKU_HOME @@ -182,7 +183,7 @@ def watch(self, client, no_output=False): candidates |= {file_ for file_ in repo.untracked_files} # Capture modified files through redirects. - candidates |= {o.a_path for o in repo.index.diff(None) if not o.deleted_file} + candidates |= {git_unicode_unescape(o.a_path) for o in repo.index.diff(None) if not o.deleted_file} # Include explicit outputs candidates |= {str(path.relative_to(self.working_dir)) for path in self.explicit_outputs} diff --git a/renku/core/models/datasets.py b/renku/core/models/datasets.py index 39bf3ee371..42deccbfd0 100644 --- a/renku/core/models/datasets.py +++ b/renku/core/models/datasets.py @@ -31,7 +31,7 @@ from renku.core import errors from renku.core.models import jsonld as jsonld -from renku.core.models.calamus import JsonLDSchema, Nested, fields, rdfs, renku, schema +from renku.core.models.calamus import JsonLDSchema, Nested, Uri, fields, rdfs, renku, schema from renku.core.models.entities import Entity, EntitySchema from renku.core.models.locals import ReferenceMixin from renku.core.models.provenance.agents import Person, PersonSchema @@ -57,21 +57,7 @@ class Url: def default_id(self): """Define default value for id field.""" - if self.url_str: - parsed_result = urlparse(self.url_str) - id_ = ParseResult("", *parsed_result[1:]).geturl() - elif self.url_id: - parsed_result = urlparse(self.url_id) - id_ = ParseResult("", *parsed_result[1:]).geturl() - else: - id_ = str(uuid.uuid4()) - - host = "localhost" - if self.client: - host = self.client.remote.get("host") or host - host = os.environ.get("RENKU_DOMAIN") or host - - return urljoin("https://{host}".format(host=host), pathlib.posixpath.join("/urls", quote(id_, safe=""))) + return generate_url_id(client=self.client, url_str=self.url_str, url_id=self.url_id) def default_url(self): """Define default value for url field.""" @@ -96,8 +82,12 @@ def __attrs_post_init__(self): """Post-initialize attributes.""" if not self.url: self.url = self.default_url() + elif isinstance(self.url, dict): + self.url_id = self.url["@id"] + elif isinstance(self.url, str): + self.url_str = self.url - if not self._id: + if not self._id or self._id.startswith("_:"): self._id = self.default_id() @classmethod @@ -176,19 +166,11 @@ def _now(self): def default_id(self): """Define default value for id field.""" - - host = "localhost" - if self.client: - host = self.client.remote.get("host") or host - host = os.environ.get("RENKU_DOMAIN") or host - - name = "{0}@{1}".format(self.name, self.commit) - - return urljoin("https://{host}".format(host=host), pathlib.posixpath.join("/datasettags", quote(name, safe=""))) + return generate_dataset_tag_id(client=self.client, name=self.name, commit=self.commit) def __attrs_post_init__(self): """Post-Init hook.""" - if not self._id: + if not self._id or self._id.startswith("_:"): self._id = self.default_id() @classmethod @@ -645,7 +627,7 @@ class Meta: model = Url unknown = EXCLUDE - url = fields.Uri(schema.url, missing=None) + url = Uri(schema.url, missing=None) _id = fields.Id(init_name="id", missing=None) @@ -738,7 +720,7 @@ class Meta: identifier = fields.String(schema.identifier) in_language = Nested(schema.inLanguage, LanguageSchema, missing=None) keywords = fields.List(schema.keywords, fields.String(), missing=None, allow_none=True) - license = fields.Uri(schema.license, missing=None, allow_none=True) + license = Uri(schema.license, missing=None, allow_none=True) title = fields.String(schema.name) url = 
fields.String(schema.url) version = fields.String(schema.version, missing=None) @@ -792,6 +774,37 @@ def to_unix(el): return "_".join(name) +def generate_url_id(client, url_str, url_id): + """Generate @id field for Url.""" + if url_str: + parsed_result = urlparse(url_str) + id_ = ParseResult("", *parsed_result[1:]).geturl() + elif url_id: + parsed_result = urlparse(url_id) + id_ = ParseResult("", *parsed_result[1:]).geturl() + else: + id_ = str(uuid.uuid4()) + + host = "localhost" + if client: + host = client.remote.get("host") or host + host = os.environ.get("RENKU_DOMAIN") or host + + return urljoin("https://{host}".format(host=host), pathlib.posixpath.join("/urls", quote(id_, safe=""))) + + +def generate_dataset_tag_id(client, name, commit): + """Generate @id field for DatasetTag.""" + host = "localhost" + if client: + host = client.remote.get("host") or host + host = os.environ.get("RENKU_DOMAIN") or host + + name = "{0}@{1}".format(name, commit) + + return urljoin("https://{host}".format(host=host), pathlib.posixpath.join("/datasettags", quote(name, safe=""))) + + def generate_dataset_id(client, identifier): """Generate @id field.""" # Determine the hostname for the resource URIs. diff --git a/renku/core/models/provenance/activities.py b/renku/core/models/provenance/activities.py index 8ece669d2f..236cb6cc70 100644 --- a/renku/core/models/provenance/activities.py +++ b/renku/core/models/provenance/activities.py @@ -19,7 +19,6 @@ import os import urllib -import uuid import weakref from collections import OrderedDict from pathlib import Path, posixpath @@ -42,6 +41,7 @@ from renku.core.models.locals import ReferenceMixin from renku.core.models.refs import LinkReference from renku.core.models.workflow.run import Run +from renku.core.utils.scm import git_unicode_unescape from .agents import Person, PersonSchema, SoftwareAgentSchema, renku_agent from .qualified import Association, AssociationSchema, Generation, GenerationSchema, Usage, UsageSchema @@ -139,15 +139,27 @@ def get_output_paths(self): # in this backwards diff if file_.change_type == "A": continue - path_ = Path(file_.a_path) - is_dataset = self.client.DATASETS in str(path_) + path_ = Path(git_unicode_unescape(file_.a_path)) + + is_dataset = any( + [ + path_.resolve() == (self.client.path / f.path).resolve() + for d in self.client.datasets.values() + for f in d.files + ] + ) not_refs = LinkReference.REFS not in str(path_) does_not_exists = not path_.exists() if all([is_dataset, not_refs, does_not_exists]): - uid = uuid.UUID(path_.parent.name) - path_ = Path(self.client.renku_home) / self.client.DATASETS / str(uid) / self.client.METADATA + dataset = next( + d + for d in self.client.datasets.values() + for f in d.files + if path_.resolve() == (self.client.path / f.path).resolve() + ) + path_ = self.client.path / dataset.path / self.client.METADATA index.add(str(path_)) @@ -234,15 +246,26 @@ def paths(self): # in this backwards diff if file_.change_type == "A": continue - path_ = Path(file_.a_path) - - is_dataset = self.client.DATASETS in str(path_) + path_ = Path(git_unicode_unescape(file_.a_path)) + + is_dataset = any( + [ + path_.resolve() == (self.client.path / f.path).resolve() + for d in self.client.datasets.values() + for f in d.files + ] + ) not_refs = LinkReference.REFS not in str(path_) does_not_exists = not (path_.exists() or (path_.is_symlink() and os.path.lexists(path_))) if all([is_dataset, not_refs, does_not_exists]): - uid = uuid.UUID(path_.parent.name) - path_ = Path(self.client.renku_home) / self.client.DATASETS / 
str(uid) / self.client.METADATA + dataset = next( + d + for d in self.client.datasets + for f in d.files + if path_.resolve() == (self.client.path / f.path).resolve() + ) + path_ = self.client.path / dataset.path / self.client.METADATA index.add(str(path_)) diff --git a/renku/core/models/provenance/agents.py b/renku/core/models/provenance/agents.py index 43634a99f2..6e208cb69b 100644 --- a/renku/core/models/provenance/agents.py +++ b/renku/core/models/provenance/agents.py @@ -49,7 +49,7 @@ class Person: def default_id(self): """Set the default id.""" - return generate_person_id(email=self.email, client=self.client) + return generate_person_id(email=self.email, client=self.client, full_identity=self.full_identity) @email.validator def check_email(self, attribute, value): @@ -124,7 +124,7 @@ def from_jsonld(cls, data): def __attrs_post_init__(self): """Finish object initialization.""" # handle the case where ids were improperly set - if self._id == "mailto:None" or self._id is None: + if self._id == "mailto:None" or not self._id or self._id.startswith("_:"): self._id = self.default_id() if self.label is None: @@ -187,7 +187,7 @@ def as_jsonld(self): renku_agent = SoftwareAgent(label="renku {0}".format(__version__), id=version_url) -def generate_person_id(email, client=None): +def generate_person_id(client, email, full_identity): """Generate Person default id.""" if email: return "mailto:{email}".format(email=email) @@ -197,7 +197,7 @@ def generate_person_id(email, client=None): host = client.remote.get("host") or host host = os.environ.get("RENKU_DOMAIN") or host - id_ = str(uuid.uuid4()) + id_ = full_identity or str(uuid.uuid4()) return urllib.parse.urljoin( "https://{host}".format(host=host), pathlib.posixpath.join("/persons", quote(id_, safe="")) diff --git a/renku/core/models/provenance/qualified.py b/renku/core/models/provenance/qualified.py index dfc55ed9cc..cb772c69ea 100644 --- a/renku/core/models/provenance/qualified.py +++ b/renku/core/models/provenance/qualified.py @@ -18,6 +18,7 @@ """Represent elaborated information about relations.""" import weakref +from urllib.parse import quote import attr from marshmallow import EXCLUDE @@ -134,8 +135,8 @@ def activity(self): def default_id(self): """Configure calculated ID.""" if self.role: - return "{self.activity._id}/{self.role}".format(self=self,) - return "{self.activity._id}/tree/{self.entity.path}".format(self=self,) + return f"{self.activity._id}/{self.role}" + return f"{self.activity._id}/tree/{quote(str(self.entity.path))}" @classmethod def from_jsonld(cls, data): diff --git a/renku/core/utils/migrate.py b/renku/core/utils/migrate.py index 1c35caf910..0e06bbea4c 100644 --- a/renku/core/utils/migrate.py +++ b/renku/core/utils/migrate.py @@ -57,7 +57,7 @@ def get_pre_0_3_4_datasets_metadata(client): project_is_pre_0_3 = int(read_project_version(client)) < 2 if project_is_pre_0_3: - return (client.path / DATA_DIR).rglob(client.METADATA) + return (client.path / DATA_DIR).glob(f"*/{client.METADATA}") return [] diff --git a/renku/core/utils/scm.py b/renku/core/utils/scm.py index 334c5280b2..3b6558870c 100644 --- a/renku/core/utils/scm.py +++ b/renku/core/utils/scm.py @@ -22,3 +22,10 @@ def strip_and_lower(input): """Adjust chars to make the input compatible as scm source.""" return re.sub(r"\s", r"-", input.strip()).lower() + + +def git_unicode_unescape(s, encoding="utf-8"): + """Undoes git/gitpython unicode encoding.""" + if s.startswith('"'): + return 
s.strip('"').encode("latin1").decode("unicode-escape").encode("latin1").decode(encoding) + return s diff --git a/renku/service/.env-example b/renku/service/.env-example index 5a6b2969b8..963bee14a2 100644 --- a/renku/service/.env-example +++ b/renku/service/.env-example @@ -22,3 +22,7 @@ DEPLOYMENT_LOG_LEVEL=INFO # Scheduler RENKU_SVC_CLEANUP_INTERVAL=60 + +# Sentry +SENTRY_DSN= +SENTRY_ENV= diff --git a/renku/service/worker.py b/renku/service/worker.py index 017940672d..a76dfd8d06 100644 --- a/renku/service/worker.py +++ b/renku/service/worker.py @@ -28,7 +28,9 @@ from renku.service.logger import DEPLOYMENT_LOG_LEVEL, worker_log if os.getenv("SENTRY_DSN"): - sentry_sdk.init(os.getenv("SENTRY_DSN"), integrations=[RqIntegration()]) + sentry_sdk.init( + dsn=os.getenv("SENTRY_DSN"), environment=os.getenv("SENTRY_ENV"), integrations=[RqIntegration()], + ) @contextmanager diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py index 7309d17831..28dead27d8 100644 --- a/tests/cli/test_datasets.py +++ b/tests/cli/test_datasets.py @@ -410,6 +410,26 @@ def test_add_to_dirty_repo(directory_tree, runner, project, client): assert ["untracked"] == client.repo.untracked_files +def test_add_unicode_file(tmpdir, runner, project, client): + """Test adding files with unicode special characters in their names.""" + # create a dataset + result = runner.invoke(cli, ["dataset", "create", "my-dataset"]) + assert 0 == result.exit_code + assert "OK" in result.output + + filename = "filéàèû爱ಠ_ಠ.txt" + new_file = tmpdir.join(filename) + new_file.write(str("test")) + + # add data + result = runner.invoke(cli, ["dataset", "add", "my-dataset", str(new_file)],) + assert 0 == result.exit_code + + result = runner.invoke(cli, ["log", "--format", "json-ld", "--strict", f"data/my-dataset/{filename}"]) + assert 0 == result.exit_code + assert filename in result.output.encode("latin1").decode("unicode-escape") + + def test_multiple_file_to_dataset(tmpdir, runner, project, client): """Test importing multiple data into a dataset at once.""" # create a dataset diff --git a/tests/cli/test_integration_datasets.py b/tests/cli/test_integration_datasets.py index aad31fc7f9..4401b679f0 100644 --- a/tests/cli/test_integration_datasets.py +++ b/tests/cli/test_integration_datasets.py @@ -305,7 +305,7 @@ def test_dataset_import_renkulab_dataset(runner, project, client, url): @pytest.mark.integration -@flaky(max_runs=1, min_passes=1) +@flaky(max_runs=10, min_passes=1) def test_import_renku_dataset_preserves_directory_hierarchy(runner, project, client): """Test dataset imported from Renku projects have correct directory hierarchy.""" url = "https://dev.renku.ch/datasets/1a637fd1-a7a6-4d1f-b9aa-157e7033cd1c" @@ -336,6 +336,16 @@ def prepare_git_repo(*_): assert "Cannot clone remote projects:" in result.output +@pytest.mark.integration +@flaky(max_runs=10, min_passes=1) +@pytest.mark.parametrize("url", ["https://dev.renku.ch/datasets/e3e1beba-0559-4fdd-8e46-82963cec9fe2",]) +def test_dataset_import_renku_missing_project(runner, client, missing_kg_project_responses, url): + """Test dataset import fails if cannot clone repo.""" + result = runner.invoke(cli, ["dataset", "import", url], input="y") + assert 1 == result.exit_code + assert "Cannot find these projects in the knowledge graph" in result.output + + @pytest.mark.integration @flaky(max_runs=10, min_passes=1) @pytest.mark.parametrize( @@ -369,6 +379,19 @@ def test_dataset_reimport_renkulab_dataset(runner, project, url): assert "Dataset exists" in result.output 
+@pytest.mark.integration
+@flaky(max_runs=10, min_passes=1)
+def test_renku_dataset_import_missing_lfs_objects(runner, project):
+    """Test importing a dataset with missing LFS objects fails."""
+    result = runner.invoke(
+        cli, ["dataset", "import", "--yes", "https://dev.renku.ch/datasets/5c11e321-2bea-458c-94ce-abccf4257a54"]
+    )
+
+    assert 1 == result.exit_code
+    assert "Error: Cannot pull LFS objects from server" in result.output
+    assert "[404] Object does not exist on the server or you don't have permissions to access it" in result.output
+
+
 @pytest.mark.integration
 @flaky(max_runs=10, min_passes=1)
 @pytest.mark.parametrize(
diff --git a/tests/cli/test_migrate.py b/tests/cli/test_migrate.py
index c36baa0474..0de35843ff 100644
--- a/tests/cli/test_migrate.py
+++ b/tests/cli/test_migrate.py
@@ -253,7 +253,7 @@ def test_comprehensive_dataset_migration(isolated_runner, old_dataset_project):
     assert "https://doi.org/10.7910/DVN/EV6KLF" == dataset.same_as.url
     assert "1" == dataset.tags[0].name
     assert "Tag 1 created by renku import" == dataset.tags[0].description
-    assert isinstance(dataset.license, dict)
+    assert isinstance(dataset.license, str)
     assert "https://creativecommons.org/publicdomain/zero/1.0/" in str(dataset.license)

     file_ = dataset.find_file("data/dataverse/copy.sh")
@@ -286,3 +286,16 @@ def test_comprehensive_dataset_migration(isolated_runner, old_dataset_project):
     assert "README.md" == file_.source
     assert file_.based_on is None
     assert file_.url.endswith("/projects/mohammad.alisafaee/old-datasets-v0.9.1/files/blob/README.md")
+
+
+@pytest.mark.migration
+def test_no_blank_node_after_dataset_migration(isolated_runner, old_dataset_project):
+    """Test migration of datasets with blank nodes creates IRI identifiers."""
+    assert 0 == isolated_runner.invoke(cli, ["migrate"]).exit_code
+
+    dataset = LocalClient(path=old_dataset_project.working_dir).load_dataset("201901_us_flights_1")
+
+    assert not dataset.creators[0]._id.startswith("_:")
+    assert not dataset.same_as._id.startswith("_:")
+    assert not dataset.tags[0]._id.startswith("_:")
+    assert isinstance(dataset.license, str)
diff --git a/tests/core/models/test_calamus.py b/tests/core/models/test_calamus.py
index 4da5f95a2f..f52a313dd4 100644
--- a/tests/core/models/test_calamus.py
+++ b/tests/core/models/test_calamus.py
@@ -19,7 +19,7 @@

 import pytest

-from renku.core.models.calamus import JsonLDSchema, fields
+from renku.core.models.calamus import JsonLDSchema, Uri, fields


 @pytest.mark.parametrize("value", [{"field": "http://datascience.ch"}, "http://datascience.ch"])
@@ -33,7 +33,7 @@ def __init__(self, field):
     schema = fields.Namespace("http://schema.org/")

     class EntitySchema(JsonLDSchema):
-        field = fields.Uri(schema.field, allow_none=True)
+        field = Uri(schema.field, allow_none=True)

         class Meta:
             rdf_type = schema.Entity
@@ -60,7 +60,7 @@ def __init__(self, field):
     schema = fields.Namespace("http://schema.org/")

     class EntitySchema(JsonLDSchema):
-        field = fields.Uri(schema.field, allow_none=True)
+        field = Uri(schema.field, allow_none=True)

         class Meta:
             rdf_type = schema.Entity
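Closing note on the identifier changes above: taken together, generate_url_id, generate_dataset_tag_id, and the extended generate_person_id replace blank-node identifiers ("_:...") left in old metadata with stable IRIs, which is exactly what the new test_no_blank_node_after_dataset_migration asserts. Below is a simplified, standard-library-only sketch of the person-id scheme; the host parameter and the "unknown" fallback are assumptions for illustration, not the renku implementation (the real code falls back to a random UUID and resolves the host from the client or RENKU_DOMAIN).

import posixpath
from urllib.parse import quote, urljoin


def generate_person_id(host, email, full_identity):
    """Prefer a mailto: IRI; otherwise derive a /persons/ IRI from the readable identity."""
    if email:
        return "mailto:{email}".format(email=email)
    id_ = full_identity or "unknown"  # real code: full_identity or str(uuid.uuid4())
    return urljoin("https://{host}".format(host=host), posixpath.join("/persons", quote(id_, safe="")))


# A creator without an email no longer serializes as a blank node:
assert (
    generate_person_id("dev.renku.ch", None, "Jane Doe [ETHZ]")
    == "https://dev.renku.ch/persons/Jane%20Doe%20%5BETHZ%5D"
)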