Skip to content

Commit

Permalink
fix(core): adapt to zenodo jsonld changes. Send referer on Zenodo req…
Browse files Browse the repository at this point in the history
…uest (#3643)
  • Loading branch information
Panaetius committed Dec 5, 2023
1 parent e4c06cb commit 41816d8
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 28 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ add_ignore = ["D105", "D107", "D202", "D401"]

[tool.bandit]
skips = ["B101", "B603", "B607", "B404"]
exclude_dirs = ["tests"]

[tool.isort]
multi_line_output = 3
Expand Down
52 changes: 36 additions & 16 deletions renku/core/dataset/providers/zenodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from renku.core.dataset.providers.repository import RepositoryImporter, make_request
from renku.core.util import communication
from renku.core.util.doi import is_doi
from renku.core.util.requests import get_redirect_url
from renku.core.util.urls import remove_credentials
from renku.domain_model.project_context import project_context

Expand Down Expand Up @@ -80,7 +81,9 @@ def supports(uri):
@staticmethod
def get_record_id(uri):
"""Extract record id from URI."""
return urlparse(uri).path.split("/")[-1]
parts = urlparse(uri).path.split("/")
parts = [p for p in parts if p.isdigit()]
return parts[-1]

@staticmethod
def get_export_parameters() -> List["ProviderParameter"]:
Expand Down Expand Up @@ -121,7 +124,7 @@ def __init__(self, *, uri: str, original_uri, json: Dict[str, Any]):

metadata = self._json.pop("metadata", {})
self._json["metadata"] = ZenodoMetadataSerializer.from_metadata(metadata) if metadata is not None else None
record_id = self._json.pop("record_id", None)
record_id = self._json.pop("record_id", None) or self._json.pop("recid", None)
self._json["record_id"] = str(record_id) if record_id is not None else None

# NOTE: Make sure that these properties have a default value
Expand All @@ -136,11 +139,11 @@ def version(self):
@property
def latest_uri(self):
"""Get URI of latest version."""
return self._json["links"].get("latest_html")
return get_redirect_url(self._json["links"].get("latest"))

def is_latest_version(self):
"""Check if this record is the latest version."""
return ZenodoProvider.get_record_id(self._json["links"].get("latest_html")) == self._json["record_id"]
return ZenodoProvider.get_record_id(self.latest_uri) == self._json["record_id"]

def get_jsonld(self):
"""Get record metadata as jsonld."""
Expand Down Expand Up @@ -173,18 +176,19 @@ def fetch_provider_dataset(self) -> "ProviderDataset":
from renku.domain_model.dataset import Url, generate_default_slug

class ZenodoDatasetSchema(ProviderDatasetSchema):
"""Schema for Dataverse datasets."""
"""Schema for Zenodo datasets."""

@pre_load
def fix_data(self, data, **kwargs):
"""Fix data that is received from Dataverse."""
"""Fix data that is received from Zenodo."""
# Fix context
context = data.get("@context")
if context and isinstance(context, str):
if not context.endswith("/"):
context = f"{context}/"
if context == "https://schema.org/":
context = "http://schema.org/"
data["@context"] = {"@base": context, "@vocab": context}

# Add type to creators
creators = data.get("creator", [])
for c in creators:
Expand All @@ -194,6 +198,10 @@ def fix_data(self, data, **kwargs):
license = data.get("license")
if license and isinstance(license, dict):
data["license"] = license.get("url", "")
# fix keywords to be a list
keywords = data.get("keywords")
if keywords and isinstance(keywords, str):
data["keywords"] = [k.strip() for k in keywords.split(",")]

# Delete existing isPartOf
data.pop("isPartOf", None)
Expand Down Expand Up @@ -228,17 +236,17 @@ def fix_data(self, data, **kwargs):
class ZenodoFileSerializer:
"""Zenodo record file."""

def __init__(self, *, id=None, checksum=None, links=None, filename=None, filesize=None):
def __init__(self, *, id=None, checksum=None, links=None, key=None, size=None, **kwargs):
self.id = id
self.checksum = checksum
self.links = links
self.filename = filename
self.filesize = filesize
self.filename = key
self.filesize = size

@property
def remote_url(self):
"""Get remote URL as ``urllib.ParseResult``."""
return urllib.parse.urlparse(self.links["download"])
return urllib.parse.urlparse(self.links["self"])

@property
def type(self):
Expand Down Expand Up @@ -325,7 +333,10 @@ def from_metadata(cls, metadata: Dict[str, Any]) -> "ZenodoMetadataSerializer":
class ZenodoExporter(ExporterApi):
"""Zenodo export manager."""

HEADERS = {"Content-Type": "application/json"}
HEADERS = {
"Content-Type": "application/json",
"Referer": f"https://{os.environ.get('RENKU_DOMAIN', 'zenodo.org')}",
}

def __init__(self, dataset, publish, tag):
super().__init__(dataset)
Expand Down Expand Up @@ -503,7 +514,9 @@ def publish_deposition(self):
"""Publish existing deposition."""
from renku.core.util import requests

response = requests.post(url=self.publish_url, params=self.exporter.default_params)
response = requests.post(
url=self.publish_url, params=self.exporter.default_params, headers=self.exporter.HEADERS
)
self._check_response(response)

return response
Expand All @@ -517,14 +530,21 @@ def _check_response(response):
except errors.RequestError:
if response.status_code == 400:
err_response = response.json()
messages = [
'"{}" failed with "{}"'.format(err["field"], err["message"]) for err in err_response["errors"]
]
if "errors" in err_response:
messages = [
'"{}" failed with "{}"'.format(err["field"], ", ".join(err["messages"]))
for err in err_response["errors"]
]
elif "message" in err_response:
messages = [err_response["message"]]
else:
messages = [response.text()]

raise errors.ExportError(
"\n" + "\n".join(messages) + "\nSee `renku dataset edit -h` for details on how to edit" " metadata"
)
else:
print(response.status_code)
raise errors.ExportError(response.content)


Expand Down
24 changes: 13 additions & 11 deletions tests/cli/test_integration_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,29 +397,27 @@ def test_dataset_import_renku_provider_with_subgroups(runner, project, uri):
@pytest.mark.vcr
def test_dataset_import_renkulab_dataset_with_image(runner, project, with_injection):
"""Test dataset import from Renkulab projects."""
# dataset is https://dev.renku.ch/projects/renku-python-integration-tests/lego-datasets/datasets/colors/
result = runner.invoke(
cli, ["dataset", "import", "https://dev.renku.ch/datasets/4f36f891bb7c4b2bab137633cc270a40"], input="y"
cli, ["dataset", "import", "https://dev.renku.ch/datasets/5952ea58de934fe188680a0e626a259c"], input="y"
)

assert 0 == result.exit_code, format_result_exception(result)
assert "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" in result.output
assert "158c016d7338e9874a2a70972ed62ca22e2ce7ae" in result.output

assert "0" in result.output
assert "OK" in result.output

result = runner.invoke(cli, ["dataset", "ls-files"])
assert 0 == result.exit_code, format_result_exception(result)
assert "bla" in result.output
assert "colors.csv" in result.output

with with_injection():
dataset = [d for d in DatasetGateway().get_all_active_datasets()][0]
assert 2 == len(dataset.images)
img1 = next(i for i in dataset.images if i.position == 1)
img2 = next(i for i in dataset.images if i.position == 2)
assert 1 == len(dataset.images)
img1 = next(i for i in dataset.images if i.position == 0)

assert img1.content_url == "https://example.com/image1.jpg"
assert img2.content_url.endswith("/2.png")
assert os.path.exists(project.path / img2.content_url)
assert img1.content_url == ".renku/dataset_images/41033ca2758944678718dde9140431f1/0.png"


@pytest.mark.integration
Expand Down Expand Up @@ -822,7 +820,6 @@ def test_dataset_export_upload_failure(runner, tmpdir, project, zenodo_sandbox):
result = runner.invoke(cli, ["dataset", "export", "my-dataset", "zenodo"])

assert 1 == result.exit_code, result.output + str(result.stderr_bytes)
assert "metadata.creators.0.affiliation" in result.output
assert "metadata.description" in result.output


Expand Down Expand Up @@ -940,7 +937,10 @@ def test_export_dataset_unauthorized(
result = runner.invoke(cli, ["dataset", "export", "my-dataset", provider] + params)

assert 1 == result.exit_code, result.output + str(result.stderr_bytes)
assert "Access unauthorized - update access token." in result.output, format_result_exception(result)
# Note: Zenodo returns a referer error when a wrong token is supplied for some reason
assert (
"Access unauthorized - update access token." in result.output or "Referer checking failed" in result.output
), format_result_exception(result)

secret = get_value("zenodo", "secret")
assert secret is None
Expand Down Expand Up @@ -1232,6 +1232,7 @@ def test_dataset_update_zenodo(project, runner, doi):
commit_sha_after_file1_delete = project.repository.head.commit.hexsha

before_dataset = get_dataset_with_injection("imported_dataset")
assert before_dataset is not None

result = runner.invoke(cli, ["dataset", "update", "--all", "--dry-run"])

Expand All @@ -1245,6 +1246,7 @@ def test_dataset_update_zenodo(project, runner, doi):
assert 0 == result.exit_code, format_result_exception(result) + str(result.stderr_bytes)

after_dataset = get_dataset_with_injection("imported_dataset")
assert after_dataset is not None
assert after_dataset.version != before_dataset.version
assert after_dataset.id != before_dataset.id
assert after_dataset.derived_from is None
Expand Down
2 changes: 1 addition & 1 deletion tests/service/views/test_cache_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -956,7 +956,7 @@ def test_cache_gets_synchronized(local_remote_repository, directory_tree, quick_

assert response
assert 200 == response.status_code
assert {"slug", "remote_branch"} == set(response.json["result"].keys())
assert {"git_url", "slug", "remote_branch"} == set(response.json["result"].keys())

remote_repo_checkout.pull()

Expand Down

0 comments on commit 41816d8

Please sign in to comment.