mdio.py
"""Conversion from to MDIO various other formats."""

from __future__ import annotations

import uuid
from os import path

import numpy as np
from dask.array.core import Array
from dask.base import compute_as_if_collection
from dask.core import flatten
from dask.highlevelgraph import HighLevelGraph
from tqdm.dask import TqdmCallback

from mdio import MDIOReader
from mdio.segy._workers import write_block_to_segy
from mdio.segy.creation import mdio_spec_to_segy
from mdio.segy.creation import merge_partial_segy

try:
    import distributed
except ImportError:
    distributed = None
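
# `distributed` is an optional dependency; it is only needed when a Dask
# client is passed to `mdio_to_segy` below.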


def mdio_to_segy(
    mdio_path_or_buffer: str,
    output_segy_path: str,
    endian: str = "big",
    access_pattern: str = "012",
    out_sample_format: str = "ibm32",
    storage_options: dict = None,
    new_chunks: tuple[int, ...] = None,
    selection_mask: np.ndarray = None,
    client: distributed.Client = None,
) -> None:
"""Convert MDIO file to SEG-Y format.
MDIO allows exporting multidimensional seismic data back to the flattened
seismic format SEG-Y, to be used in data transmission.
The input headers are preserved as is, and will be transferred to the
output file.
The user has control over the endianness, and the floating point data
type. However, by default we export as Big-Endian IBM float, per the
SEG-Y format's default.
The input MDIO can be local or cloud based. However, the output SEG-Y
will be generated locally.
A `selection_mask` can be provided (in the shape of the spatial grid)
to export a subset of the seismic data.
Args:
mdio_path_or_buffer: Input path where the MDIO is located
output_segy_path: Path to the output SEG-Y file
endian: Endianness of the input SEG-Y. Rev.2 allows little
endian. Default is 'big'.
access_pattern: This specificies the chunk access pattern. Underlying
zarr.Array must exist. Examples: '012', '01'
out_sample_format: Output sample format.
Currently support: {'ibm32', 'ieee32'}. Default is 'ibm32'.
storage_options: Storage options for the cloud storage backend.
Default: None (will assume anonymous access)
new_chunks: Set manual chunksize. For development purposes only.
selection_mask: Array that lists the subset of traces
client: Dask client. If `None` we will use local threaded scheduler.
If `auto` is used we will create multiple processes (with
8 threads each).
Raises:
ImportError: if distributed package isn't installed but requested.
ValueError: if cut mask is empty, i.e. no traces will be written.
Examples:
To export an existing local MDIO file to SEG-Y we use the code
snippet below. This will export the full MDIO (without padding) to
SEG-Y format using IBM floats and big-endian byte order.
>>> from mdio import mdio_to_segy
>>>
>>>
>>> mdio_to_segy(
... mdio_path_or_buffer="prefix2/file.mdio",
... output_segy_path="prefix/file.segy",
... )
If we want to export this as an IEEE big-endian, using a selection
mask, we would run:
>>> mdio_to_segy(
... mdio_path_or_buffer="prefix2/file.mdio",
... output_segy_path="prefix/file.segy",
... selection_mask=boolean_mask,
... out_sample_format="ieee32",
... )
"""
backend = "dask"
mdio = MDIOReader(
mdio_path_or_buffer=mdio_path_or_buffer,
access_pattern=access_pattern,
storage_options=storage_options,
)
ndim = mdio.n_dim
# We flatten the z-axis (time or depth); so ieee2ibm, and byte-swaps etc
# can run on big chunks of data.
auto_chunk = (None,) * (ndim - 2) + ("100M",) + (-1,)
new_chunks = new_chunks if new_chunks is not None else auto_chunk
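
    # For example, on a 3-D (inline, crossline, sample) dataset the automatic
    # chunking above is (None, "100M", -1): keep the existing inline chunks,
    # target roughly 100MB chunks on crosslines, and use a single chunk
    # along the sample axis.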

    creation_args = [
        mdio_path_or_buffer,
        output_segy_path,
        endian,
        access_pattern,
        out_sample_format,
        storage_options,
        new_chunks,
        selection_mask,
        backend,
    ]
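
    # These positional arguments are forwarded to `mdio_spec_to_segy`, which
    # (given backend="dask") returns the reader re-opened with Dask arrays
    # plus the resolved output sample format.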
    if client is not None:
        if distributed is not None:
            # This is in case we work with big data
            future = client.submit(mdio_spec_to_segy, *creation_args)
            mdio, sample_format = future.result()
        else:
            msg = "Distributed client was provided, but `distributed` is not installed."
            raise ImportError(msg)
    else:
        mdio, sample_format = mdio_spec_to_segy(*creation_args)

    num_samp = mdio.shape[-1]

    live_mask = mdio.live_mask.compute()

    if selection_mask is not None:
        live_mask = live_mask & selection_mask

    # Handle the case where the mask leaves nothing to write out.
    if live_mask.sum() == 0:
        raise ValueError("No traces will be written out. Live mask is empty.")

    # Find rough dim limits, so we don't unnecessarily hit disk / cloud store.
    # Typically gets triggered when there is a selection mask.
    dim_slices = ()
    live_nonzeros = live_mask.nonzero()
    for dim_nonzeros in live_nonzeros:
        start = np.min(dim_nonzeros)
        stop = np.max(dim_nonzeros) + 1
        dim_slices += (slice(start, stop),)
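
    # E.g., for a 3-D dataset where the live (or selected) region covers rows
    # 100-199 and columns 50-79 of the spatial grid, `dim_slices` becomes
    # (slice(100, 200), slice(50, 80)) and only that window is read below.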

    # Lazily pull the data with limits now, and crop the masks so they have
    # the same shape.
    live_mask, headers, traces = mdio[dim_slices]

    if selection_mask is not None:
        selection_mask = selection_mask[dim_slices]
        live_mask = live_mask & selection_mask

    # Now we flatten the data along the slowest changing axis (i.e. 0).
    # TODO: Add support for flipping these, if user wants
    axis = 0

    # Get new chunk sizes for the sequential array
    seq_trc_chunks = tuple(
        (dim_chunks if idx == axis else (sum(dim_chunks),))
        for idx, dim_chunks in enumerate(traces.chunks)
    )
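
    # This keeps the existing chunking along axis 0 and merges every other
    # axis into a single chunk, so each task receives complete traces for a
    # contiguous block of the slowest dimension.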

    # We must unify chunks with `seq_trc_chunks` here because the headers and
    # the live mask may have different chunking. We don't take the sample
    # axis for the headers / live mask. Still lazy computation.
    traces_seq = traces.rechunk(seq_trc_chunks)
    headers_seq = headers.rechunk(seq_trc_chunks[:-1])
    live_seq = live_mask.rechunk(seq_trc_chunks[:-1])

    # Build a Dask graph to do the computation.
    # Using uuid1 in the task name is important because these could
    # potentially be generated from different machines.
    task_name = "block-to-sgy-part-" + str(uuid.uuid1())

    trace_keys = flatten(traces_seq.__dask_keys__())
    header_keys = flatten(headers_seq.__dask_keys__())
    live_keys = flatten(live_seq.__dask_keys__())

    all_keys = zip(trace_keys, header_keys, live_keys)
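
    # Zipping the flattened block keys pairs each trace block with its
    # matching header and live-mask blocks, since all three arrays now share
    # the same chunking along axis 0.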

    # Temporary file root
    out_dir = path.dirname(output_segy_path)

    task_graph_dict = {}
    block_file_paths = []
    for idx, (trace_key, header_key, live_key) in enumerate(all_keys):
        block_file_name = f".{idx}_{uuid.uuid1()}._segyblock"
        block_file_path = path.join(out_dir, block_file_name)
        block_file_paths.append(block_file_path)

        block_args = (
            block_file_path,
            trace_key,
            header_key,
            live_key,
            num_samp,
            sample_format,
            endian,
        )

        task_graph_dict[(task_name, idx)] = (write_block_to_segy,) + block_args
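
    # At this point `task_graph_dict` maps each (task_name, idx) key to a
    # `write_block_to_segy` call that writes one partial SEG-Y file per block.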

    # Make the actual graph
    task_graph = HighLevelGraph.from_collections(
        task_name,
        task_graph_dict,
        dependencies=[traces_seq, headers_seq, live_seq],
    )
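
    # Declaring the rechunked arrays as dependencies pulls their tasks into
    # the same graph, so reading, rechunking, and block writing all run in
    # one pass.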

    # Note: the progress bar doesn't work with the distributed scheduler.
    tqdm_kw = dict(unit="block", dynamic_ncols=True)
    block_progress = TqdmCallback(desc="Step 1 / 2 Writing Blocks", **tqdm_kw)

    with block_progress:
        block_exists = compute_as_if_collection(
            cls=Array,
            dsk=task_graph,
            keys=list(task_graph_dict),
            scheduler=client,
        )
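
    # Step 2 / 2: stitch the partial block files into the final SEG-Y file.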
    merge_args = [output_segy_path, block_file_paths, block_exists]

    if client is not None:
        _ = client.submit(merge_partial_segy, *merge_args).result()
    else:
        merge_partial_segy(*merge_args)