Skip to content

Commit 52b2769

Browse files
authored
feat(datasets): import data from zenodo (#509)
1 parent 691644d commit 52b2769

File tree

8 files changed

+506
-1
lines changed

8 files changed

+506
-1
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ __pycache__/
66
.pytest_cache/
77
*.py[cod]
88

9+
# VSCode
10+
.vscode/
11+
912
# Idea software family
1013
.idea/
1114

renku/cli/_providers/__init__.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright 2019 - Swiss Data Science Center (SDSC)
4+
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
5+
# Eidgenössische Technische Hochschule Zürich (ETHZ).
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
"""Third party data registry integration."""
19+
from urllib.parse import urlparse
20+
21+
from renku.cli._providers.doi import DOIProvider
22+
from renku.cli._providers.zenodo import ZenodoProvider
23+
24+
25+
class ProviderFactory:
    """Create a provider type from URI."""

    @staticmethod
    def from_uri(uri):
        """Get provider type based on uri.

        :param uri: DOI (bare, ``doi:`` prefixed or doi.org URL) or a plain
            URL pointing at a record.
        :return: ``(provider, error)`` tuple. ``provider`` is ``None`` when
            no supported provider matches ``uri``; ``error`` is a message
            string for unparsable URLs and unsupported DOI providers,
            otherwise ``None``.
        """
        # ``DOIProvider.is_doi`` may hand back a match object / ``None``
        # rather than a strict boolean. Normalise it: an identity test
        # against ``False`` never fires for ``None`` and would silently
        # skip the URL validation below.
        is_doi = bool(DOIProvider.is_doi(uri))

        if not is_doi:
            url = urlparse(uri)
            if not (url.scheme and url.netloc and url.params == ''):
                return None, 'Cannot parse URL.'

        provider = None
        if 'zenodo' in uri:
            provider = ZenodoProvider(is_doi=is_doi)

        if is_doi and provider is None:
            # Get DOI provider name: first dotted token of the segment that
            # follows the "10.<registrant>" prefix. Starting the search at
            # "10." keeps this working for ``doi:`` and doi.org URL forms,
            # where the first "/" belongs to the URL and not to the DOI.
            name = uri[uri.find('10.'):].split('/')[1].split('.')[0]
            return None, (
                'Provider {} not found. '.format(name) +
                'Currently supporting following providers: (Zenodo, )'
            )

        return provider, None

renku/cli/_providers/api.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Copyright 2019 - Swiss Data Science Center (SDSC)
2+
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
3+
# Eidgenössische Technische Hochschule Zürich (ETHZ).
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
"""API for providers."""
17+
import abc
18+
19+
20+
class ProviderApi(abc.ABC):
    """Abstract contract that every data registry provider implements."""

    @abc.abstractmethod
    def find_record(self, uri):
        """Look up the record identified by ``uri``.

        Concrete providers must override this method; the base
        implementation has no body and returns ``None``.
        """

renku/cli/_providers/doi.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright 2019 - Swiss Data Science Center (SDSC)
4+
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
5+
# Eidgenössische Technische Hochschule Zürich (ETHZ).
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
"""DOI API integration."""
19+
import re
20+
import urllib
21+
22+
import attr
23+
import requests
24+
25+
from renku.cli._providers.api import ProviderApi
26+
27+
# Matches a DOI, optionally prefixed with ``doi:`` or a doi.org URL.
#   group(1) -- the optional prefix,
#   group(2) -- the bare DOI ("10.<registrant>[.<sub>...]/<suffix>"),
#   group(3) -- the last dotted sub-registrant, if any.
# The dot of the sub-registrant part is escaped so that only a literal "."
# is accepted between the numeric parts of the registrant code.
# See http://en.wikipedia.org/wiki/Digital_object_identifier.
doi_regexp = re.compile(
    r'(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(\.\d+)*/.+)$',
    flags=re.I
)

DOI_BASE_URL = 'https://dx.doi.org'


def make_doi_url(doi):
    """Create URL to access DOI metadata.

    :param doi: bare DOI, ``doi:`` prefixed DOI, scheme-less ``doi.org/...``
        reference or an absolute URL.
    :return: URL string. Absolute ``http(s)`` URLs are returned unchanged;
        every other recognised DOI form is resolved against
        ``DOI_BASE_URL``.
    """
    # Imported locally so the function does not depend on another module
    # having loaded the ``urllib.parse`` submodule as a side effect.
    from urllib.parse import urljoin

    match = doi_regexp.match(doi)
    if match and not doi.lower().startswith(('http://', 'https://')):
        # Strip "doi:" / "doi.org/" prefixes: ``urljoin`` would treat
        # "doi:" as a URL scheme and return the input untouched, and would
        # append "doi.org/..." as a relative path.
        doi = match.group(2)

    return urljoin(DOI_BASE_URL, doi)
39+
40+
41+
@attr.s(kw_only=True)
class DOIMetadata:
    """Metadata document returned by doi.org for a single DOI.

    Every attribute is keyword-only, so an instance is built by unpacking
    the decoded response mapping.
    """

    # Mandatory fields.
    id = attr.ib()
    DOI = attr.ib()
    URL = attr.ib()

    # Optional fields; ``None`` when absent.
    type = attr.ib(default=None)
    categories = attr.ib(default=None)
    author = attr.ib(default=None)
    version = attr.ib(default=None)
    issued = attr.ib(default=None)
    title = attr.ib(default=None)
    abstract = attr.ib(default=None)
    language = attr.ib(default=None)
    publisher = attr.ib(default=None)
57+
58+
59+
@attr.s
class DOIProvider(ProviderApi):
    """doi.org registry API provider."""

    # Request headers asking doi.org for CSL JSON metadata. Built by a
    # factory so that instances do not share (and mutate) one dict.
    headers = attr.ib(
        default=attr.Factory(
            lambda: {'accept': 'application/vnd.citationstyles.csl+json'}
        )
    )
    # Timeout, in seconds, applied to the metadata request.
    timeout = attr.ib(default=3)

    @staticmethod
    def is_doi(uri):
        """Check if uri is DOI.

        :param uri: candidate string (bare DOI, ``doi:`` prefixed DOI or
            doi.org URL).
        :return: ``True`` when ``uri`` matches ``doi_regexp``, else
            ``False``.
        """
        # Return a real boolean so that callers may compare the result
        # against ``True`` / ``False`` as well as test its truthiness.
        return doi_regexp.match(uri) is not None

    def _query(self, doi):
        """Retrieve metadata for given doi.

        :param doi: DOI or absolute URL to query.
        :raises: ``LookupError`` when the response status is not 200;
            errors raised by ``requests`` (including timeouts) propagate.
        :return: the ``requests`` response object.
        """
        url = doi
        if not doi.startswith('http'):
            url = make_doi_url(doi)

        # Pass the configured timeout so that an unresponsive server
        # cannot block the caller indefinitely.
        response = requests.get(
            url, headers=self.headers, timeout=self.timeout
        )
        if response.status_code != 200:
            raise LookupError('record not found')

        return response

    def find_record(self, uri):
        """Finds DOI record.

        :param uri: DOI or absolute URL.
        :raises: ``LookupError`` when the record cannot be retrieved.
        :return: ``DOIMetadata``
        """
        payload = self._query(uri).json()

        # Keep only the keys ``DOIMetadata`` declares: any additional key
        # in the response would otherwise abort construction with a
        # ``TypeError`` for an unexpected keyword argument.
        known = {field.name for field in attr.fields(DOIMetadata)}
        return DOIMetadata(
            **{key: value for key, value in payload.items() if key in known}
        )

renku/cli/_providers/zenodo.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright 2019 - Swiss Data Science Center (SDSC)
4+
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
5+
# Eidgenössische Technische Hochschule Zürich (ETHZ).
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
"""Zenodo API integration."""
19+
import pathlib
20+
import re
21+
import urllib
22+
from urllib.parse import urlparse
23+
24+
import attr
25+
import requests
26+
27+
from renku.cli._providers.api import ProviderApi
28+
from renku.cli._providers.doi import DOIProvider
29+
30+
ZENODO_BASE_URL = 'https://zenodo.org'
ZENODO_BASE_PATH = 'api'


def make_records_url(record_id):
    """Create URL to access record by ID.

    :param record_id: Zenodo record identifier; converted with ``str`` so
        that integer ids are accepted too.
    :return: URL of the record below ``ZENODO_BASE_URL``.
    """
    # Join the path segments by hand instead of going through
    # ``pathlib.posixpath``: that attribute is only an import side effect
    # of ``pathlib`` and not part of its documented interface.
    path = '/'.join((ZENODO_BASE_PATH, 'records', str(record_id)))
    return urllib.parse.urljoin(ZENODO_BASE_URL, path)
40+
41+
42+
@attr.s
class ZenodoFile:
    """Single file entry attached to a Zenodo record."""

    checksum = attr.ib()
    links = attr.ib()
    bucket = attr.ib()
    key = attr.ib()
    size = attr.ib()
    type = attr.ib()

    @property
    def remote_url(self):
        """Parse ``links['self']`` into a ``urllib.parse.ParseResult``."""
        self_link = self.links['self']
        return urlparse(self_link)

    @property
    def name(self):
        """File name: last ``/`` separated segment of the remote URL path."""
        _, _, basename = self.remote_url.path.rpartition('/')
        return basename
62+
63+
64+
@attr.s
class ZenodoRecord:
    """Zenodo record.

    Attributes mirror the top-level keys of a record response, so an
    instance is built by unpacking the decoded response mapping.
    """

    id = attr.ib()
    conceptrecid = attr.ib()

    doi = attr.ib()
    files = attr.ib()
    links = attr.ib()
    metadata = attr.ib()
    owners = attr.ib()
    revision = attr.ib()
    stats = attr.ib()

    created = attr.ib()
    updated = attr.ib()

    conceptdoi = attr.ib(default=None)
    # Provider that fetched this record; passed as ``zenodo=...``.
    _zenodo = attr.ib(kw_only=True, default=None)

    @property
    def last_version(self):
        """Check if record is at last possible version."""
        return self.version['is_last']

    @property
    def version(self):
        """Get record version.

        Reads the first entry of ``metadata['relations']['version']``.
        """
        return self.metadata['relations']['version'][0]

    @property
    def display_version(self):
        """Get display version, i.e. ``v<index>``."""
        return 'v{0}'.format(self.version['index'])

    @property
    def display_name(self):
        """Get record display name.

        The title is stripped of non-word characters, lower-cased and cut
        to 16 characters, then suffixed with the display version.
        """
        return '{0}_{1}'.format(
            re.sub(r'\W+', '', self.metadata['title']).lower()[:16],
            self.display_version
        )

    def get_files(self):
        """Get Zenodo files metadata as ``ZenodoFile``.

        :raises: ``LookupError`` when the record has no files.
        :return: list of ``ZenodoFile``
        """
        # Truthiness also covers ``files`` being ``None``, for which
        # ``len()`` would raise ``TypeError`` instead of ``LookupError``.
        if not self.files:
            raise LookupError('no files have been found')

        # Ignore keys ``ZenodoFile`` does not declare so that an extra key
        # in a file entry does not abort construction with ``TypeError``.
        known = {field.name for field in attr.fields(ZenodoFile)}
        return [
            ZenodoFile(
                **{key: value for key, value in file_.items() if key in known}
            )
            for file_ in self.files
        ]
114+
115+
116+
@attr.s
class ZenodoProvider(ProviderApi):
    """zenodo.org registry API provider."""

    # Truthy when the URIs handed to ``find_record`` are DOIs that must be
    # resolved first.
    is_doi = attr.ib(default=False)

    @staticmethod
    def record_id(uri):
        """Extract record id from uri.

        :param uri: record URL or a bare record id.
        :return: last non-empty segment of the URI path.
        """
        # Drop trailing slashes first: ".../record/123/" must yield "123"
        # and not an empty id.
        return urlparse(uri).path.rstrip('/').split('/')[-1]

    def find_record(self, uri):
        """Retrieves a record from Zenodo.

        :raises: ``LookupError``
        :param uri: DOI or URL
        :return: ``ZenodoRecord``
        """
        if self.is_doi:
            return self.find_record_by_doi(uri)

        return self.get_record(uri)

    def find_record_by_doi(self, doi):
        """Resolve the DOI and make a record for the retrieved record id.

        :raises: ``LookupError``
        :param doi: DOI to resolve through ``DOIProvider``.
        :return: ``ZenodoRecord``
        """
        metadata = DOIProvider().find_record(doi)
        # ``get_record`` extracts the record id from the URL itself.
        return self.get_record(metadata.URL)

    def get_record(self, uri):
        """Retrieve record metadata and return ``ZenodoRecord``.

        :raises: ``LookupError`` when no record id can be extracted or the
            response status is not 200.
        :param uri: record URL or bare record id.
        :return: ``ZenodoRecord``
        """
        record_id = ZenodoProvider.record_id(uri)
        if not record_id:
            # An empty id would address the records collection instead of
            # a single record.
            raise LookupError('record not found')

        # NOTE(review): no timeout is passed here, so an unresponsive
        # server blocks the caller -- consider adding one.
        response = requests.get(make_records_url(record_id))
        if response.status_code != 200:
            raise LookupError('record not found')

        # Keep only the keys ``ZenodoRecord`` declares; anything else would
        # abort construction with ``TypeError``. The private ``_zenodo``
        # attribute is excluded because it is supplied explicitly below.
        known = {
            field.name for field in attr.fields(ZenodoRecord)
            if not field.name.startswith('_')
        }
        payload = {
            key: value
            for key, value in response.json().items() if key in known
        }
        return ZenodoRecord(zenodo=self, **payload)

0 commit comments

Comments
 (0)