feat(dataset): export dataset keywords (#3454)
m-alisafaee committed May 16, 2023
1 parent a355ac8 commit feb3f14
Showing 7 changed files with 84 additions and 14 deletions.
23 changes: 22 additions & 1 deletion renku/command/schema/agent.py
@@ -16,7 +16,7 @@
"""Agents JSON-LD schemes."""

from calamus.schema import JsonLDSchema
from marshmallow import EXCLUDE
from marshmallow import EXCLUDE, pre_load

from renku.command.schema.calamus import StringList, fields, prov, schema
from renku.domain_model.provenance.agent import Person, SoftwareAgent
@@ -32,6 +32,27 @@ class Meta:
model = Person
unknown = EXCLUDE

@pre_load
def fix_affiliation(self, data, **kwargs):
"""Fix affiliation to be a string."""
affiliations = []
affiliation = data.get("http://schema.org/affiliation")
if affiliation:
if not isinstance(affiliation, list):
affiliation = [affiliation]
for a in affiliation:
if isinstance(a, dict):
name = a.get("http://schema.org/name", "")
if isinstance(name, list):
name = name[0]
else:
name = str(a)
affiliations.append(name)

data["http://schema.org/affiliation"] = affiliations

return data

affiliation = StringList(schema.affiliation, load_default=None)
alternate_name = StringList(schema.alternateName, load_default=None)
email = fields.String(schema.email, load_default=None)
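For context, a minimal standalone sketch of what the new fix_affiliation pre_load hook does: JSON-LD affiliations can arrive either as plain strings or as nested schema:Organization objects, and both forms are reduced to a flat list of name strings before Calamus deserializes the Person. The function name and the sample input below are illustrative only, not part of the commit.

def normalize_affiliation(data: dict) -> dict:
    """Coerce ``http://schema.org/affiliation`` to a list of plain name strings."""
    affiliations = []
    affiliation = data.get("http://schema.org/affiliation")
    if affiliation:
        if not isinstance(affiliation, list):
            affiliation = [affiliation]
        for entry in affiliation:
            if isinstance(entry, dict):
                # Nested organization object: keep only its name (first one if it is a list).
                name = entry.get("http://schema.org/name", "")
                if isinstance(name, list):
                    name = name[0]
            else:
                name = str(entry)
            affiliations.append(name)
    data["http://schema.org/affiliation"] = affiliations
    return data

# Hypothetical JSON-LD input: the nested organization collapses to its name.
print(normalize_affiliation({"http://schema.org/affiliation": {"http://schema.org/name": ["ETH Zurich"]}}))
# -> {'http://schema.org/affiliation': ['ETH Zurich']}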
15 changes: 15 additions & 0 deletions renku/core/dataset/providers/dataverse.py
@@ -37,6 +37,7 @@
AUTHOR_METADATA_TEMPLATE,
CONTACT_METADATA_TEMPLATE,
DATASET_METADATA_TEMPLATE,
KEYWORDS_METADATA_TEMPLATE,
)
from renku.core.dataset.providers.doi import DOIProvider
from renku.core.dataset.providers.repository import RepositoryImporter, make_request
@@ -311,6 +312,7 @@ def __init__(
name=None,
parent_url=None,
type=None,
encoding_format=None,
):
self.content_size = content_size
self.content_url = content_url
@@ -321,6 +323,7 @@ def __init__(
self.name = name
self.parent_url = parent_url
self.type = type
self.encoding_format = encoding_format

@property
def remote_url(self):
@@ -384,13 +387,15 @@ def export(self, **kwargs):
def _get_dataset_metadata(self):
authors, contacts = self._get_creators()
subject = self._get_subject()
keywords = self._get_keywords()
metadata_template = Template(DATASET_METADATA_TEMPLATE)
metadata = metadata_template.substitute(
name=_escape_json_string(self.dataset.title),
authors=json.dumps(authors),
contacts=json.dumps(contacts),
description=_escape_json_string(self.dataset.description),
subject=subject,
keywords=json.dumps(keywords),
)
return json.loads(metadata)

@@ -425,6 +430,16 @@ def _get_creators(self):

return authors, contacts

def _get_keywords(self):
keywords = []

for keyword in self.dataset.keywords:
keyword_template = Template(KEYWORDS_METADATA_TEMPLATE)
keyword_str = keyword_template.substitute(keyword=_escape_json_string(keyword))
keywords.append(json.loads(keyword_str))

return keywords


class _DataverseDeposition:
"""Dataverse record for deposit."""
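A self-contained sketch of what the new _get_keywords helper produces: each dataset keyword is substituted into KEYWORDS_METADATA_TEMPLATE (the template added below in dataverse_metadata_templates.py) and parsed back into the compound entry Dataverse expects under the "keyword" citation field. The template string is copied from this commit; the sample keywords are made up, and the JSON escaping done by _escape_json_string in the real code is omitted for brevity.

import json
from string import Template

# Copied from renku/core/dataset/providers/dataverse_metadata_templates.py (added in this commit).
KEYWORDS_METADATA_TEMPLATE = """
{
    "keywordValue": {
        "typeName": "keywordValue",
        "multiple": false,
        "typeClass": "primitive",
        "value": "${keyword}"
    }
}
"""

def get_keywords(keywords):
    """Render each keyword as a Dataverse ``keyword`` compound entry."""
    template = Template(KEYWORDS_METADATA_TEMPLATE)
    return [json.loads(template.substitute(keyword=keyword)) for keyword in keywords]

print(get_keywords(["linguistics", "machine learning"])[0])
# -> {'keywordValue': {'typeName': 'keywordValue', 'multiple': False, 'typeClass': 'primitive', 'value': 'linguistics'}}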
17 changes: 17 additions & 0 deletions renku/core/dataset/providers/dataverse_metadata_templates.py
@@ -38,6 +38,12 @@
"multiple": true,
"typeName": "datasetContact"
},
{
"value": ${keywords},
"typeClass": "compound",
"multiple": true,
"typeName": "keyword"
},
{
"value": [
{
@@ -99,3 +105,14 @@
}
}
"""

KEYWORDS_METADATA_TEMPLATE = """
{
"keywordValue": {
"typeName": "keywordValue",
"multiple": false,
"typeClass": "primitive",
"value": "${keyword}"
}
}
"""
16 changes: 8 additions & 8 deletions renku/core/dataset/providers/zenodo.py
@@ -70,7 +70,7 @@ def __init__(self, uri: str, is_doi: bool = False):

@staticmethod
def supports(uri):
"""Whether or not this provider supports a given URI."""
"""Whether this provider supports a given URI."""
if "zenodo" in uri.lower():
return True

@@ -335,10 +335,7 @@ def __init__(self, dataset, publish, tag):
@property
def zenodo_url(self):
"""Returns correct Zenodo URL based on environment."""
if "ZENODO_USE_SANDBOX" in os.environ:
return ZENODO_SANDBOX_URL

return ZENODO_BASE_URL
return ZENODO_SANDBOX_URL if "ZENODO_USE_SANDBOX" in os.environ else ZENODO_BASE_URL

def set_access_token(self, access_token):
"""Set access token."""
@@ -482,6 +479,7 @@ def attach_metadata(self, dataset, tag):
{"name": creator.name, "affiliation": creator.affiliation if creator.affiliation else None}
for creator in dataset.creators
],
"keywords": dataset.keywords,
}
}

@@ -532,12 +530,12 @@ def _check_response(response):
def _make_request(uri, accept: str = "application/json"):
"""Execute network request."""
record_id = ZenodoProvider.get_record_id(uri)
url = make_records_url(record_id)
url = make_records_url(record_id, uri=uri)

return make_request(url=url, accept=accept)


def make_records_url(record_id):
def make_records_url(record_id, uri: str):
"""Create URL to access record by ID.
Args:
@@ -546,4 +544,6 @@ def make_records_url(record_id):
Returns:
str: Full URL for the record.
"""
return urllib.parse.urljoin(ZENODO_BASE_URL, posixpath.join(ZENODO_API_PATH, "records", record_id))
url = ZENODO_SANDBOX_URL if "sandbox.zenodo.org" in uri.lower() else ZENODO_BASE_URL

return urllib.parse.urljoin(url, posixpath.join(ZENODO_API_PATH, "records", record_id))
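The Zenodo changes mirror the existing sandbox switch when resolving record URLs: make_records_url now receives the original URI and targets the sandbox API when that URI points at sandbox.zenodo.org, which lets a dataset exported to the sandbox be re-imported. A minimal sketch; the constant values below are assumptions, since their definitions are outside this diff.

import posixpath
import urllib.parse

# Assumed values; the real constants live in renku/core/dataset/providers/zenodo.py.
ZENODO_BASE_URL = "https://zenodo.org"
ZENODO_SANDBOX_URL = "https://sandbox.zenodo.org"
ZENODO_API_PATH = "api"

def make_records_url(record_id: str, uri: str) -> str:
    """Build the records API URL, honoring the sandbox when the URI targets it."""
    base = ZENODO_SANDBOX_URL if "sandbox.zenodo.org" in uri.lower() else ZENODO_BASE_URL
    return urllib.parse.urljoin(base, posixpath.join(ZENODO_API_PATH, "records", record_id))

print(make_records_url("1234567", "https://sandbox.zenodo.org/record/1234567"))
# -> https://sandbox.zenodo.org/api/records/1234567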
2 changes: 1 addition & 1 deletion renku/core/migration/models/v9.py
@@ -1544,7 +1544,7 @@ def creators_csv(self):
@property
def keywords_csv(self):
"""Comma-separated list of keywords associated with dataset."""
return ", ".join(self.keywords)
return ", ".join(self.keywords or [])

@property
def tags_csv(self):
2 changes: 1 addition & 1 deletion renku/domain_model/dataset.py
@@ -500,7 +500,7 @@ def creators_full_csv(self):
@property
def keywords_csv(self):
"""Comma-separated list of keywords associated with dataset."""
return ", ".join(self.keywords)
return ", ".join(self.keywords or [])

def get_datadir(self) -> Path:
"""Return dataset's data directory relative to project's root."""
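Both keywords_csv properties now tolerate keywords being None (e.g. older datasets with no keywords recorded); a quick illustration of the guarded join:

for keywords in (None, ["data", "keyword"]):
    print(repr(", ".join(keywords or [])))
# -> ''               (no TypeError when keywords is None)
# -> 'data, keyword'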
23 changes: 20 additions & 3 deletions tests/cli/test_integration_datasets.py
@@ -61,12 +61,20 @@
"name": "pyndl_naive_discr_v0.6.4",
"creator": "Konstantin Sering, Marc Weitz, David-Elias Künstle, Lennart Schneider",
"version": "v0.6.4",
"keywords": {
"naive discriminative learning",
"linguistics",
"python",
"cognitive science",
"machine learning",
},
},
{
"doi": "10.7910/DVN/F4NUMR",
"name": "replication_data_for_2.2",
"creator": "James Druckman, Martin Kifer, Michael Parkin",
"version": "2",
"keywords": {"Social Sciences"},
},
],
)
@@ -104,6 +112,7 @@ def test_dataset_import_real_doi(runner, project, doi, prefix, sleep_after):
assert doi["doi"] in dataset.same_as.url
assert dataset.date_created is None
assert dataset.date_published is not None
assert doi["keywords"] == set(dataset.keywords)

result = runner.invoke(cli, ["graph", "export", "--format", "json-ld", "--strict"])
assert 0 == result.exit_code, format_result_exception(result)
@@ -825,10 +834,10 @@ def test_dataset_export_upload_failure(runner, tmpdir, project, zenodo_sandbox):
[("zenodo", [], "zenodo.org/record"), ("dataverse", ["--dataverse-name", "sdsc-published-test-dataverse"], "doi:")],
)
def test_dataset_export_published_url(
runner, tmpdir, project, zenodo_sandbox, dataverse_demo, provider, params, output
runner, tmpdir, project, zenodo_sandbox, dataverse_demo, with_injection, provider, params, output
):
"""Test publishing of dataset."""
result = runner.invoke(cli, ["dataset", "create", "my-dataset"])
result = runner.invoke(cli, ["dataset", "create", "my-dataset", "-k", "keyword", "-k", "data"])

assert 0 == result.exit_code, format_result_exception(result) + str(result.stderr_bytes)
assert "OK" in result.output
@@ -841,7 +850,7 @@ def test_dataset_export_published_url(
result = runner.invoke(cli, ["dataset", "add", "--copy", "my-dataset", str(new_file)])
assert 0 == result.exit_code, format_result_exception(result) + str(result.stderr_bytes)

with with_dataset(name="my-dataset", commit_database=True) as dataset:
with with_injection(), with_dataset(name="my-dataset", commit_database=True) as dataset:
dataset.description = "awesome dataset"
dataset.creators[0].affiliation = "eth"

@@ -854,6 +863,14 @@
assert "Exported to:" in result.output
assert output in result.output

m = re.search(r"Exported to:\s*(\S*)$", result.output, flags=re.MULTILINE)
doi = m.group(1)
result = runner.invoke(cli, ["dataset", "import", doi, "--name", "imported"], input="y")
assert 0 == result.exit_code, format_result_exception(result)

dataset = get_dataset_with_injection("imported")
assert {"data", "keyword"} == set(dataset.keywords)


@pytest.mark.integration
@retry_failed
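The round trip added to test_dataset_export_published_url pulls the exported record identifier out of the CLI output with a multiline regex before re-importing it and checking that the keywords survive. A small illustration of that extraction; the sample output line is hypothetical.

import re

output = "Exported to: https://sandbox.zenodo.org/record/1234567\n"  # hypothetical CLI output
match = re.search(r"Exported to:\s*(\S*)$", output, flags=re.MULTILINE)
print(match.group(1))
# -> https://sandbox.zenodo.org/record/1234567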
