-
Notifications
You must be signed in to change notification settings - Fork 36
/
job.py
385 lines (306 loc) · 15.1 KB
/
job.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
import datetime
import logging
import time
import typing
from pathlib import Path
from typing import List, Union, Dict, Optional
from deprecated.sphinx import deprecated
from requests import ConnectionError, Response
from openeo.api.logs import LogEntry
from openeo.internal.jupyter import render_component, render_error, VisualDict, VisualList
from openeo.rest import OpenEoClientException, JobFailedException, OpenEoApiError
from openeo.util import ensure_dir
if hasattr(typing, 'TYPE_CHECKING') and typing.TYPE_CHECKING:
# Imports for type checking only (circular import issue at runtime). `hasattr` is Python 3.5 workaround #210
from openeo.rest.connection import Connection
logger = logging.getLogger(__name__)
class RESTJob:
"""
Handle for an openEO batch job, allowing it to describe, start, cancel, inspect results, etc.
"""
# TODO: rename this to BatchJob?
def __init__(self, job_id: str, connection: 'Connection'):
self.job_id = job_id
"""Unique identifier of the batch job (string)."""
self.connection = connection
def __repr__(self):
return '<{c} job_id={i!r}>'.format(c=self.__class__.__name__, i=self.job_id)
def _repr_html_(self):
data = self.describe_job()
currency = self.connection.capabilities().currency()
return render_component('job', data=data, parameters={'currency': currency})
def describe_job(self) -> dict:
""" Get all job information."""
# GET /jobs/{job_id}
return self.connection.get("/jobs/{}".format(self.job_id), expected_status=200).json()
def update_job(self, process_graph=None, output_format=None,
output_parameters=None, title=None, description=None,
plan=None, budget=None, additional=None):
""" Update a job."""
# PATCH /jobs/{job_id}
raise NotImplementedError
def delete_job(self):
""" Delete a job."""
# DELETE /jobs/{job_id}
self.connection.delete("/jobs/{}".format(self.job_id), expected_status=204)
def estimate_job(self):
""" Calculate an time/cost estimate for a job."""
# GET /jobs/{job_id}/estimate
data = self.connection.get("/jobs/{}/estimate".format(self.job_id), expected_status=200).json()
currency = self.connection.capabilities().currency()
return VisualDict('job-estimate', data=data, parameters={'currency': currency})
def start_job(self):
""" Start / queue a job for processing."""
# POST /jobs/{job_id}/results
self.connection.post("/jobs/{}/results".format(self.job_id), expected_status=202)
def stop_job(self):
""" Stop / cancel job processing."""
# DELETE /jobs/{job_id}/results
self.connection.delete("/jobs/{}/results".format(self.job_id), expected_status=204)
@deprecated("Use :py:meth:`~RESTJOB.get_results` instead.", version="0.4.10")
def list_results(self) -> dict:
"""Get batch job results metadata."""
return self.get_results().get_metadata()
def download_result(self, target: Union[str, Path] = None) -> Path:
"""
Download single job result to the target file path or into folder (current working dir by default).
Fails if there are multiple result files.
:param target: String or path where the file should be downloaded to.
"""
return self.get_results().download_file(target=target)
@deprecated(
"Instead use :py:meth:`RESTJob.get_results` and the more flexible download functionality of :py:class:`JobResults`",
version="0.4.10")
def download_results(self, target: Union[str, Path] = None) -> Dict[Path, dict]:
"""
Download all job result files into given folder (current working dir by default).
The names of the files are taken directly from the backend.
:param target: String/path, folder where to put the result files.
:return: file_list: Dict containing the downloaded file path as value and asset metadata
"""
return self.get_result().download_files(target)
@deprecated("Use :py:meth:`RESTJob.get_results` instead.", version="0.4.10")
def get_result(self):
return _Result(self)
def get_results(self) -> "JobResults":
"""
Get handle to batch job results for result metadata inspection or downloading resulting assets.
.. versionadded:: 0.4.10
"""
return JobResults(self)
def status(self):
""" Returns the status of the job."""
return self.describe_job().get("status", "N/A")
def logs(self, offset=None) -> List[LogEntry]:
""" Retrieve job logs."""
url = "/jobs/{}/logs".format(self.job_id)
logs = self.connection.get(url, params={'offset': offset}, expected_status=200).json()["logs"]
entries = [LogEntry(log) for log in logs]
return VisualList('logs', data=entries)
def run_synchronous(self, outputfile: Union[str, Path, None] = None,
print=print, max_poll_interval=60, connection_retry_interval=30) -> 'RESTJob':
"""Start the job, wait for it to finish and download result"""
self.start_and_wait(
print=print, max_poll_interval=max_poll_interval, connection_retry_interval=connection_retry_interval
)
# TODO #135 support multi file result sets too?
if outputfile is not None:
self.download_result(outputfile)
return self
def start_and_wait(self, print=print, max_poll_interval: int = 60, connection_retry_interval: int = 30) -> "RESTJob":
"""
Start the batch job, poll its status and wait till it finishes (or fails)
:param print: print/logging function to show progress/status
:param max_poll_interval: maximum number of seconds to sleep between status polls
:param connection_retry_interval: how long to wait when status poll failed due to connection issue
:return:
"""
start_time = time.time()
def elapsed() -> str:
return str(datetime.timedelta(seconds=time.time() - start_time)).rsplit(".")[0]
def print_status(msg: str):
print("{t} Job {i!r}: {m}".format(t=elapsed(), i=self.job_id, m=msg))
# TODO: make `max_poll_interval`, `connection_retry_interval` class constants or instance properties?
print_status("send 'start'")
self.start_job()
# TODO: also add `wait` method so you can track a job that already has started explicitly
# or just rename this method to `wait` and automatically do start if not started yet?
# Start with fast polling.
poll_interval = min(5, max_poll_interval)
status = None
while True:
# TODO: also allow a hard time limit on this infinite poll loop?
try:
job_info = self.describe_job()
except ConnectionError as e:
print_status("Connection error while querying status: {e}".format(e=e))
time.sleep(connection_retry_interval)
continue
status = job_info.get("status", "N/A")
progress = '{p}%'.format(p=job_info["progress"]) if "progress" in job_info else "N/A"
print_status("{s} (progress {p})".format(s=status, p=progress))
if status not in ('submitted', 'created', 'queued', 'running'):
break
# Sleep for next poll (and adaptively make polling less frequent)
time.sleep(poll_interval)
poll_interval = min(1.25 * poll_interval, max_poll_interval)
if status != "finished":
raise JobFailedException("Batch job {i} didn't finish properly. Status: {s} (after {t}).".format(
i=self.job_id, s=status, t=elapsed()
), job=self)
return self
class ResultAsset:
"""
Result asset of a batch job (e.g. a GeoTIFF or JSON file)
.. versionadded:: 0.4.10
"""
def __init__(self, job: RESTJob, name: str, href: str, metadata: dict):
self.job = job
self.name = name
"""Asset name as advertised by the backend."""
self.href = href
"""Download URL of the asset."""
self.metadata = metadata
"""Asset metadata provided by the backend, possibly containing keys "type" (for media type), "roles", "title", "description"."""
def __repr__(self):
return "<ResultAsset {n!r} (type {t}) at {h!r}>".format(
n=self.name, t=self.metadata.get("type", "unknown"), h=self.href
)
def download(self, target: Optional[Union[Path, str]] = None, chunk_size=None) -> Path:
"""
Download asset to given location
:param target: download target path. Can be an existing folder
(in which case the filename advertised by backend will be used)
or full file name. By default, the working directory will be used.
"""
target = Path(target or Path.cwd())
if target.is_dir():
target = target / self.name
ensure_dir(target.parent)
logger.info("Downloading Job result asset {n!r} from {h!s} to {t!s}".format(n=self.name, h=self.href, t=target))
with target.open("wb") as f:
response = self._get_response(stream=True)
for block in response.iter_content(chunk_size=chunk_size):
f.write(block)
return target
def _get_response(self, stream=True) -> Response:
return self.job.connection.get(self.href, stream=stream)
def load_json(self) -> dict:
"""Load asset in memory and parse as JSON."""
if not (self.name.lower().endswith(".json") or self.metadata.get("type") == "application/json"):
logger.warning("Asset might not be JSON")
return self._get_response().json()
def load_bytes(self) -> bytes:
"""Load asset in memory as raw bytes."""
return self._get_response().content
# TODO: more `load` methods e.g.: load GTiff asset directly as numpy array
class MultipleAssetException(OpenEoClientException):
pass
class JobResults:
"""
Results of a batch job: listing of one or more output files (assets)
and some metadata.
.. versionadded:: 0.4.10
"""
def __init__(self, job: RESTJob):
self._job = job
self._results_url = "/jobs/{j}/results".format(j=self._job.job_id)
self._results = None
def __repr__(self):
return "<JobResults for job {j!r}>".format(j=self._job.job_id)
def _repr_html_(self):
try:
response = self.get_metadata()
return render_component("batch-job-result", data = response)
except OpenEoApiError as error:
return render_error(error)
def get_metadata(self, force=False) -> dict:
"""Get batch job results metadata (parsed JSON)"""
if self._results is None or force:
self._results = self._job.connection.get(self._results_url, expected_status=200).json()
return self._results
# TODO: provide methods for `stac_version`, `id`, `geometry`, `properties`, `links`, ...?
def get_assets(self) -> List[ResultAsset]:
"""
Get all assets from the job results.
"""
# TODO: add arguments to filter on metadata, e.g. to only get assets of type "image/tiff"
metadata = self.get_metadata()
if "assets" in metadata:
# API 1.0 style: dictionary mapping filenames to metadata dict (with at least a "href" field)
assets = metadata["assets"]
else:
# Best effort translation of on old style to "assets" style (#134)
assets = {a["href"].split("/")[-1]: a for a in metadata["links"]}
return [
ResultAsset(job=self._job, name=name, href=asset["href"], metadata=asset)
for name, asset in assets.items()
]
def get_asset(self, name: str = None) -> ResultAsset:
"""
Get single asset by name or without name if there is only one.
"""
# TODO: also support getting a single asset by type or role?
assets = self.get_assets()
if len(assets) == 0:
raise OpenEoClientException("No assets in result.")
if name is None:
if len(assets) == 1:
return assets[0]
else:
raise MultipleAssetException("Multiple result assets for job {j}: {a}".format(
j=self._job.job_id, a=[a.name for a in assets]
))
else:
try:
return next(a for a in assets if a.name == name)
except StopIteration:
raise OpenEoClientException(
"No asset {n!r} in: {a}".format(n=name, a=[a.name for a in assets])
)
def download_file(self, target: Union[Path, str] = None, name: str = None) -> Path:
"""
Download single asset. Can be used when there is only one asset in the
:py:class:`JobResults`, or when the desired asset name is given explicitly.
:param target: path to download to. Can be an existing directory
(in which case the filename advertised by backend will be used)
or full file name. By default, the working directory will be used.
:param name: asset name to download (not required when there is only one asset)
:return: path of downloaded asset
"""
try:
return self.get_asset(name=name).download(target=target)
except MultipleAssetException:
raise OpenEoClientException(
"Can not use `download_file` with multiple assets. Use `download_files` instead.")
def download_files(self, target: Union[Path, str] = None) -> List[Path]:
"""
Download all assets to given folder.
:param target: path to folder to download to (must be a folder if it already exists)
:return: list of paths to the downloaded assets.
"""
target = Path(target or Path.cwd())
if target.exists() and not target.is_dir():
raise OpenEoClientException("The target argument must be a folder. Got {t!r}".format(t=str(target)))
ensure_dir(target)
return [a.download(target) for a in self.get_assets()]
@deprecated(reason="Use :py:class:`JobResults` instead", version="0.4.10")
class _Result:
"""
Wrapper around `JobResults` to adapt old deprecated "Result" API.
.. deprecated:: 0.4.10
"""
# TODO: deprecated: remove this
def __init__(self, job):
self.results = JobResults(job=job)
def download_file(self, target: Union[str, Path] = None) -> Path:
return self.results.download_file(target=target)
def download_files(self, target: Union[str, Path] = None) -> Dict[Path, dict]:
target = Path(target or Path.cwd())
if target.exists() and not target.is_dir():
raise OpenEoClientException("The target argument must be a folder. Got {t!r}".format(t=str(target)))
return {a.download(target): a.metadata for a in self.results.get_assets()}
def load_json(self) -> dict:
return self.results.get_asset().load_json()
def load_bytes(self) -> bytes:
return self.results.get_asset().load_bytes()