/
doi.py
159 lines (129 loc) · 4.94 KB
/
doi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# -*- coding: utf-8 -*-
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DOI API integration."""
import urllib
from pathlib import Path
from typing import Type
from renku.core import errors
from renku.core.dataset.providers.api import ImporterApi, ProviderApi, ProviderPriority
from renku.core.plugin import hookimpl
from renku.core.util.doi import extract_doi, is_doi
from renku.domain_model.dataset_provider import IDatasetProviderPlugin
# Base URL that ``make_doi_url`` joins a DOI onto to build the metadata request URL.
DOI_BASE_URL = "https://dx.doi.org"
class DOIProvider(ProviderApi, IDatasetProviderPlugin):
    """`doi.org <http://doi.org>`_ registry API provider.

    Looks up the metadata of a DOI over HTTP and wraps it in a ``DOIImporter``.
    """

    priority = ProviderPriority.HIGHER
    name = "DOI"

    def __init__(self, headers=None, timeout=3):
        """Create the provider.

        Args:
            headers: HTTP headers sent with the metadata request. When ``None``, an ``accept`` header asking
                for ``application/vnd.citationstyles.csl+json`` is used.
            timeout: Value stored on the instance as ``timeout`` (Default value = 3).
        """
        # NOTE(review): ``timeout`` is stored but not forwarded to ``requests.get`` in ``get_importer``;
        # confirm whether the project's requests helper applies its own timeout.
        self.timeout = timeout
        self.headers = headers if headers is not None else {"accept": "application/vnd.citationstyles.csl+json"}

    @staticmethod
    def supports(uri) -> bool:
        """Whether or not this provider supports a given URI.

        Args:
            uri: URI to check.

        Returns:
            bool: Truthiness of ``is_doi(uri)``.
        """
        return bool(is_doi(uri))

    def get_importer(self, uri, **kwargs) -> "DOIImporter":
        """Get import manager.

        Args:
            uri: DOI (or URI containing one) whose metadata should be fetched.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            DOIImporter: Importer built from the JSON metadata of the DOI.

        Raises:
            LookupError: If the metadata request does not answer with HTTP status 200.
            errors.ImportError: If the returned metadata cannot be passed to ``DOIImporter``.
        """
        from renku.core.util import requests

        def query(doi):
            """Retrieve metadata for given doi."""
            doi = extract_doi(doi)
            url = make_doi_url(doi)

            response = requests.get(url, headers=self.headers)
            if response.status_code != 200:
                raise LookupError("record not found. Status: {}".format(response.status_code))

            return response

        def serialize(response):
            """Serialize HTTP response for DOI."""
            json_data = response.json()
            # Normalize keys (e.g. ``container-title`` -> ``container_title``) so they can be used as keyword arguments.
            data = {key.replace("-", "_").lower(): value for key, value in json_data.items()}

            try:
                return DOIImporter(**data)
            except TypeError as error:
                # Chain the original error so the offending/missing argument stays visible in the traceback.
                raise errors.ImportError("doi metadata could not be serialized") from error

        query_response = query(uri)
        return serialize(query_response)

    @classmethod
    @hookimpl
    def dataset_provider(cls) -> "Type[DOIProvider]":
        """The definition of the provider."""
        return cls
class DOIImporter(ImporterApi):
    """Importer holding the metadata that `doi.org <http://doi.org>`_ returns for a DOI."""

    def __init__(
        self,
        id,
        doi,
        url,
        abstract=None,
        author=None,
        categories=None,
        container_title=None,
        contributor=None,
        copyright=None,
        issued=None,
        language=None,
        publisher=None,
        title=None,
        type=None,
        version=None,
    ):
        """Store the metadata fields of a DOI record.

        ``url`` is forwarded to the base class as both ``uri`` and ``original_uri``. Every other argument is kept
        on the instance under its own name, except ``version`` which is exposed through the ``version`` property.
        """
        super().__init__(uri=url, original_uri=url)

        # Identifiers of the record.
        self.id, self.doi = id, doi

        # Descriptive metadata.
        self.title = title
        self.abstract = abstract
        self.type = type
        self.language = language
        self.categories = categories
        self.container_title = container_title

        # People and organisations attached to the record.
        self.author = author
        self.contributor = contributor
        self.publisher = publisher

        # Publication details.
        self.issued = issued
        self.copyright = copyright
        self._version = version

    @property
    def version(self) -> str:
        """Version that was passed in when the record was created."""
        return self._version

    @property
    def latest_uri(self) -> str:
        """URI of the latest version; this importer returns its own ``uri``."""
        return self.uri

    def fetch_provider_dataset(self):
        """Deserialize this record to a ``ProviderDataset`` (not implemented for this importer)."""
        raise NotImplementedError()

    def is_latest_version(self) -> bool:
        """Report whether the record is at its last possible version; always ``True`` here."""
        return True

    def download_files(self, client, destination: Path, extract: bool):
        """Download dataset files from the remote provider (not implemented for this importer)."""
        raise NotImplementedError()

    def tag_dataset(self, name: str) -> None:
        """Create a tag for the dataset ``name`` (not implemented for this importer)."""
        raise NotImplementedError()

    def copy_extra_metadata(self, new_dataset) -> None:
        """Copy provider specific metadata to ``new_dataset`` (not implemented for this importer)."""
        raise NotImplementedError()
def make_doi_url(doi):
    """Create URL to access DOI metadata.

    Args:
        doi: A DOI, optionally written with a leading ``doi:`` scheme.

    Returns:
        str: Result of joining ``doi`` (with any ``doi:`` scheme removed) onto ``DOI_BASE_URL``.
    """
    # ``urllib.parse`` is a submodule and must be imported explicitly; importing only the ``urllib``
    # package does not load it, so relying on that would only work if another module imported it first.
    import urllib.parse

    parsed_url = urllib.parse.urlparse(doi)
    if parsed_url.scheme == "doi":
        # Drop the ``doi:`` prefix so that only the identifier itself is joined onto the base URL.
        doi = parsed_url._replace(scheme="").geturl()

    return urllib.parse.urljoin(DOI_BASE_URL, doi)