Skip to content

Commit 4273d2a

Browse files
authored
feat: integrate metadata from zenodo (#545)
1 parent 8c697fb commit 4273d2a

File tree

20 files changed

+717
-216
lines changed

20 files changed

+717
-216
lines changed

.travis.yml

Lines changed: 54 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,19 @@ env:
3434
- REQUIREMENTS=lowest
3535
- REQUIREMENTS=release
3636

37+
dist: xenial
38+
3739
python:
3840
- "3.5"
3941
- "3.6"
42+
- "3.7"
4043

4144
stages:
4245
- name: test
46+
- name: integration
47+
if: branch = master
48+
- name: test OSX
49+
if: branch = master
4350
- name: publish 🐍 🐳
4451
if: type = push AND (branch = master OR tag IS present)
4552
- name: brew 🍺
@@ -84,7 +91,51 @@ after_success:
8491

8592
jobs:
8693
include:
87-
- stage: test
94+
- stage: integration
95+
os: linux
96+
dist: xenial
97+
language: python
98+
env:
99+
- REQUIREMENTS=release
100+
script: pytest -m integration -v
101+
- stage: integration
102+
python: "3.6"
103+
os: linux
104+
dist: xenial
105+
language: python
106+
env:
107+
- REQUIREMENTS=release
108+
- stage: integration
109+
python: "3.7"
110+
os: linux
111+
dist: xenial
112+
language: python
113+
env:
114+
- REQUIREMENTS=release
115+
- stage: integration
116+
os: linux
117+
dist: xenial
118+
language: python
119+
env:
120+
- REQUIREMENTS=lowest
121+
script: pytest -m integration -v
122+
- stage: integration
123+
python: "3.6"
124+
os: linux
125+
dist: xenial
126+
language: python
127+
env:
128+
- REQUIREMENTS=lowest
129+
- stage: integration
130+
python: "3.7"
131+
os: linux
132+
dist: xenial
133+
language: python
134+
env:
135+
- REQUIREMENTS=lowest
136+
137+
- stage: test OSX
138+
if: branch = master
88139
language: generic
89140
sudo: true
90141
os: osx
@@ -93,12 +144,6 @@ jobs:
93144
sudo: true
94145
os: osx
95146
osx_image: xcode9.2
96-
- python: "3.7"
97-
os: linux
98-
dist: xenial
99-
language: python
100-
env:
101-
- REQUIREMENTS=release
102147

103148
- stage: publish 🐍 🐳
104149
python: 3.6
@@ -110,6 +155,7 @@ jobs:
110155
distributions: "sdist bdist_wheel"
111156
on:
112157
all_branches: true
158+
113159
- # stage: publish
114160
sudo: required
115161
services:
@@ -121,7 +167,7 @@ jobs:
121167
- sudo apt-get -y install docker-ce
122168
script: make docker-login docker-push
123169

124-
- stage: brew 🍺
170+
- # stage: brew 🍺
125171
language: generic
126172
sudo: true
127173
os: osx

docs/models/datasets.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ Manage files in the dataset.
3939
:inherited-members:
4040

4141

42-
Author
43-
------
42+
Creator
43+
-------
4444

45-
.. autoclass:: renku.models.datasets.Author
45+
.. autoclass:: renku.models.datasets.Creator
4646
:members:
4747
:inherited-members:

renku/api/datasets.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
from renku import errors
3434
from renku._compat import Path
3535
from renku.models._git import GitURL
36-
from renku.models.datasets import Author, Dataset, DatasetFile, NoneType
36+
from renku.models.datasets import Creator, Dataset, DatasetFile, NoneType
3737

3838

3939
@attr.s
@@ -249,7 +249,7 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
249249
DatasetFile(
250250
path=result,
251251
url=url,
252-
author=dataset.author,
252+
creator=dataset.creator,
253253
dataset=dataset.name,
254254
)
255255
}
@@ -290,7 +290,7 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
290290
DatasetFile(
291291
path=result,
292292
url=url,
293-
author=dataset.author,
293+
creator=dataset.creator,
294294
dataset=dataset.name,
295295
)
296296
}
@@ -377,13 +377,13 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
377377

378378
os.symlink(os.path.relpath(str(src), str(dst.parent)), str(dst))
379379

380-
# grab all the authors from the commit history
380+
# grab all the creators from the commit history
381381
git_repo = Repo(str(submodule_path.absolute()))
382-
authors = []
382+
creators = []
383383
for commit in git_repo.iter_commits(paths=target):
384-
author = Author.from_commit(commit)
385-
if author not in authors:
386-
authors.append(author)
384+
creator = Creator.from_commit(commit)
385+
if creator not in creators:
386+
creators.append(creator)
387387

388388
dataset_path = self.renku_datasets_path / dataset.name
389389
result = os.path.relpath(str(dst), start=str(dataset_path))
@@ -398,7 +398,7 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
398398
DatasetFile(
399399
path=result,
400400
url=url,
401-
author=authors,
401+
creator=creators,
402402
dataset=dataset.name, # TODO detect original dataset
403403
)
404404
}

renku/api/repository.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -350,9 +350,9 @@ def init_repository(self, name=None, force=False):
350350

351351
self.repo.description = name or path.name
352352

353-
# Check that an author can be determined from Git.
354-
from renku.models.datasets import Author
355-
Author.from_git(self.repo)
353+
# Check that a creator can be determined from Git.
354+
from renku.models.datasets import Creator
355+
Creator.from_git(self.repo)
356356

357357
# TODO read existing gitignore and create a unique set of rules
358358
import pkg_resources

renku/cli/_exc.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,9 @@ def _handle_sentry(self):
117117
with capture_internal_exceptions():
118118
from git import Repo
119119
from renku.cli._git import get_git_home
120-
from renku.models.datasets import Author
120+
from renku.models.datasets import Creator
121121

122-
user = Author.from_git(Repo(get_git_home()))
122+
user = Creator.from_git(Repo(get_git_home()))
123123

124124
scope.user = {'name': user.name, 'email': user.email}
125125

renku/cli/_format/dataset_files.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def tabular(client, records):
3535
records,
3636
headers=OrderedDict((
3737
('added', None),
38-
('authors_csv', 'authors'),
38+
('creators_csv', 'creators'),
3939
('dataset', None),
4040
('full_path', 'path'),
4141
)),

renku/cli/_format/datasets.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ def tabular(client, datasets):
3232
datasets,
3333
headers=OrderedDict((
3434
('short_id', 'id'),
35-
('name', None),
35+
('display_name', None),
36+
('version', None),
3637
('created', None),
37-
('authors_csv', 'authors'),
38+
('creators_csv', 'creators'),
3839
)),
3940
)
4041
)

renku/cli/_providers/__init__.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
"""Third party data registry integration."""
1919
from urllib.parse import urlparse
2020

21-
from renku.cli._providers.doi import DOIProvider
2221
from renku.cli._providers.zenodo import ZenodoProvider
22+
from renku.utils.doi import is_doi
2323

2424

2525
class ProviderFactory:
@@ -28,17 +28,17 @@ class ProviderFactory:
2828
@staticmethod
2929
def from_uri(uri):
3030
"""Get provider type based on uri."""
31-
is_doi = DOIProvider.is_doi(uri)
32-
if is_doi is False:
31+
is_doi_ = is_doi(uri)
32+
if is_doi_ is False:
3333
url = urlparse(uri)
3434
if bool(url.scheme and url.netloc and url.params == '') is False:
3535
return None, 'Cannot parse URL.'
3636

3737
provider = None
3838
if 'zenodo' in uri:
39-
provider = ZenodoProvider(is_doi=is_doi)
39+
provider = ZenodoProvider(is_doi=is_doi_)
4040

41-
if is_doi and provider is None:
41+
if is_doi_ and provider is None:
4242
return None, (
4343
'Provider {} not found. '.format(
4444
uri.split('/')[1].split('.')[0] # Get DOI provider name.

renku/cli/_providers/doi.py

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,20 +16,13 @@
1616
# See the License for the specific language governing permissions and
1717
# limitations under the License.
1818
"""DOI API integration."""
19-
import re
2019
import urllib
2120

2221
import attr
2322
import requests
2423

2524
from renku.cli._providers.api import ProviderApi
2625

27-
doi_regexp = re.compile(
28-
r'(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(.\d+)*/.+)$',
29-
flags=re.I
30-
)
31-
"""See http://en.wikipedia.org/wiki/Digital_object_identifier."""
32-
3326
DOI_BASE_URL = 'https://dx.doi.org'
3427

3528

@@ -39,22 +32,35 @@ def make_doi_url(doi):
3932

4033

4134
@attr.s
42-
class DOIMetadata:
35+
class DOIMetadataSerializer:
4336
"""Response from doi.org for DOI metadata."""
4437

4538
id = attr.ib(kw_only=True)
46-
DOI = attr.ib(kw_only=True)
47-
URL = attr.ib(kw_only=True)
39+
40+
doi = attr.ib(kw_only=True)
41+
42+
url = attr.ib(kw_only=True)
43+
4844
type = attr.ib(kw_only=True, default=None)
45+
4946
categories = attr.ib(kw_only=True, default=None)
47+
5048
author = attr.ib(kw_only=True, default=None)
49+
5150
version = attr.ib(kw_only=True, default=None)
51+
5252
issued = attr.ib(kw_only=True, default=None)
53+
5354
title = attr.ib(kw_only=True, default=None)
55+
5456
abstract = attr.ib(kw_only=True, default=None)
57+
5558
language = attr.ib(kw_only=True, default=None)
59+
5660
publisher = attr.ib(kw_only=True, default=None)
5761

62+
container_title = attr.ib(kw_only=True, default=None)
63+
5864

5965
@attr.s
6066
class DOIProvider(ProviderApi):
@@ -66,9 +72,14 @@ class DOIProvider(ProviderApi):
6672
timeout = attr.ib(default=3)
6773

6874
@staticmethod
69-
def is_doi(uri):
70-
"""Check if uri is DOI."""
71-
return doi_regexp.match(uri)
75+
def _serialize(response):
76+
"""Serialize HTTP response for DOI."""
77+
return DOIMetadataSerializer(
78+
**{
79+
key.replace('-', '_').lower(): value
80+
for key, value in response.items()
81+
}
82+
)
7283

7384
def _query(self, doi):
7485
"""Retrieve metadata for given doi."""
@@ -85,4 +96,4 @@ def _query(self, doi):
8596
def find_record(self, uri):
8697
"""Finds DOI record."""
8798
response = self._query(uri).json()
88-
return DOIMetadata(**response)
99+
return DOIProvider._serialize(response)

0 commit comments

Comments
 (0)