-
Notifications
You must be signed in to change notification settings - Fork 3
/
prepare_footprints.py
320 lines (266 loc) · 11.1 KB
/
prepare_footprints.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
"""
This python script takes footprint files for the Ice Wedge Polygon (IWP)
project and prepares them for use in the deduplication step of the PDG
viz-workflow. For details on deduplication, see:
https://github.com/PermafrostDiscoveryGateway/viz-staging/blob/develop/docs/deduplication.md
Although this process is very specific to the IWP project, it is possible to
configure certain parameters to work with other projects, should they have a
similar footprint file naming convention.
INPUT DATA:
------------
The footprint filenames that are prepared by this script are provided with the
following naming convention: `selection_<directory_id>.shp`. Where
`directory_id` gives a partial name for the subdirectory where the
associated IWP input data can be found. The matching IWP subdirectory name
excludes the `selection_` prefix, but includes the `_iwp` suffix, e.g.
`<directory_id>_iwp`. This script searches recursively for IWP input
subdirectories that correspond to the footprint file name as described.
The input footprint files contain one feature for each footprint. Each of these
features corresponds to a particular IWP input file, and each feature has a
'Name' property that is a partial match to the IWP input file name. The IWP
input file name is this 'Name' (file ID) property plus a variable two part
suffix following the format `<file_id>_suffix1_suffix2.shp`, e.g.
`<file_id>_u16rf3413_pansh.shp`. This script matches the file_ids in the
footprint file to the IWP input file names.
The second part of the file ID (when ID is split by underscore), is the
Date & Time that the IWP image was captured.
Here is an example:
/footprint_input_dir/
|
|- selection_<directory_id>.shp
| --------------------------
| geometry | Name | ...
| ... | <file_id1> | ... <- **FOOTPRINT 1**
| ... | <file_id2> | ... <- **FOOTPRINT 2**
...
... MATCHES TO ...
/IWP_input_dir/
|
| subdir1/
| subdir2/
...
| <directory_id>_iwp/
|- <file_id1>_suffix1_suffix2.shp <- **IWP INPUT 1**
|- <file_id2>_suffix1_suffix2.shp <- **IWP INPUT 2**
...
OUTPUT DATA:
-------------
The script matches footprint features to IWP input files as described above,
then re-saves each footprint as a single file with the same name and directory
structure as the IWP input files, as required by the PDG viz-workflow. It also
parses the date from the file ID and saves it as a new property, 'Date', in the
footprint file.
For details on the required format of footprints, see documentation here:
https://github.com/PermafrostDiscoveryGateway/viz-staging/blob/develop/docs/footprints.md.
Details of footprint matches or lack thereof are saved to several JSON files.
"""
import os
import warnings
import json
import geopandas as gpd
from pdgstaging import TileStager
# Some options that could change on a per-project basis. Every path-like value
# is written relative to 'base_dir' here and is expanded to include base_dir
# immediately after the dict is defined.
options = {
    # Root directory that all of the other paths are joined onto
    'base_dir': '/home/pdg/data/ice-wedge-polygon-data/version 01',
    # Suffix that marks a directory as an IWP input sub-directory
    'subdir_suffix': '_iwp',
    # Prefix of footprint file names; removed to get the sub-directory ID
    'footprint_file_prefix': 'selection_',
    # Location of the IWP input data ('' means base_dir itself)
    'dir_vector_input_data': '',
    # Where the original footprints are read from, and where the staged
    # (one file per footprint) footprints are written to
    'dir_footprints_in': 'footprints/original_footprints',
    'dir_footprints_out': 'footprints/staged_footprints',
    # File extensions of the original and the staged footprint files
    'ext_footprints_in': '.shp',
    'ext_footprints_out': '.gpkg',
    # Footprint property holding the file ID, and the property that the
    # parsed date is saved to
    'prop_file_id': 'Name',
    'prop_date': 'Date',
    # Directories to skip when searching for IWP input data that matches
    # footprint file names
    'dirs_skip': [
        'web_tiles_1TB_testrun',
        'high_ice/russia/russia',
        'high_ice/medium_ice/alaska_m',
        'high_ice/medium_ice/russia_m',
        'footprints'
    ],
    # Where to save records of footprint matches and failures
    'filename_unmatched_footprints':
        'footprints/footprint_files_unmatched_to_subdirs.json',
    'filename_matched_footprints':
        'footprints/footprint_files_matched_to_subdirs.json',
    'filename_multimatch_footprints':
        'footprints/footprint_files_multimatch_to_subdirs.json',
    'filename_unmatched_footprint_features':
        'footprints/footprint_features_unmatched_to_files.json',
    'filename_matched_footprint_features':
        'footprints/footprint_features_matched_to_files.json'
}

# Names of the options that hold a single path relative to base_dir
path_opts = [
    'dir_vector_input_data',
    'dir_footprints_in',
    'dir_footprints_out',
    'filename_unmatched_footprints',
    'filename_matched_footprints',
    'filename_multimatch_footprints',
    'filename_unmatched_footprint_features',
    'filename_matched_footprint_features']

# Add base directory to each single-path option
for o in path_opts:
    options[o] = os.path.join(options['base_dir'], options[o])

# Add base directory to every entry of the skip list, so that the entries can
# be compared against paths that are rooted at base_dir
options['dirs_skip'] = [
    os.path.join(options['base_dir'], skip_dir)
    for skip_dir in options['dirs_skip']
]
def get_base_name(path):
    """
    Return the file name portion of a path, cut off at the first '.'

    Everything from the first period onwards is dropped, so a name with
    several periods (e.g. 'a.b.shp') is reduced to the text before the first
    one ('a').
    """
    filename = os.path.basename(path)
    stem, _, _ = filename.partition('.')
    return stem
def date_from_id(string):
    """
    Extract the date & time portion of an IWP file ID

    The ID is treated as a series of underscore-separated fields; the second
    field is the date & time. Raises an IndexError when the ID contains no
    underscore.
    """
    return string.split('_')[1]
def id_from_input_path(input):
    """
    Reduce the full path of an IWP input file to its file 'ID' code

    The directory and extension are removed first, then the trailing
    two-part suffix (the last two underscore-separated fields) is dropped.
    A name with two or fewer fields gives an empty string.
    """
    name_fields = get_base_name(input).split('_')
    return '_'.join(name_fields[:-2])
def subdir_from_footprint_path(footprint_path):
    """
    Work out the name of the IWP sub-directory that a footprint file
    belongs to.

    The sub-directory name is the footprint's file name (no directory, no
    extension) with the configured footprint prefix ('selection_') removed
    and the configured sub-directory suffix ('_iwp') added to the end.
    """
    footprint_name = get_base_name(footprint_path)
    directory_id = footprint_name.removeprefix(
        options['footprint_file_prefix'])
    return directory_id + options['subdir_suffix']
def get_input_subdirs(rootdir='.', dirs_skip=None, recursive=True,
                      suffix=None):
    """
    List all subdirectories in a directory whose names end with a suffix,
    excluding the ones contained in the dirs_skip list.

    Parameters
    ----------
    rootdir : str
        The directory to search.
    dirs_skip : list of str, optional
        Directory paths to ignore. A path is compared, as a plain string,
        to the path of each directory found (rootdir joined with the
        directory names below it). A skipped directory is neither listed
        nor searched. Defaults to skipping nothing.
    recursive : bool
        When True, also search inside every directory that is not skipped.
    suffix : str, optional
        The ending a directory name must have in order to be listed.
        Defaults to options['subdir_suffix'].

    Returns
    -------
    list of str
        Paths of the matching directories, in the order they were found.
    """
    # Use None as the default rather than a list, so that no mutable object
    # is shared between calls
    if dirs_skip is None:
        dirs_skip = []
    if suffix is None:
        suffix = options['subdir_suffix']

    dirs = []
    # The with statement closes the directory iterator once the search of
    # this directory (including everything below it) is done
    with os.scandir(rootdir) as entries:
        for item in entries:
            if not item.is_dir() or item.path in dirs_skip:
                continue
            if item.name.endswith(suffix):
                dirs.append(item.path)
            # A matching directory is still searched, since it may itself
            # contain further matching directories
            if recursive:
                dirs.extend(
                    get_input_subdirs(item.path, dirs_skip, recursive, suffix)
                )
    return dirs
# Create a tile stager with the location of the input IWP data files, and the
# *DESIRED* location of the footprint files. dir_footprints is where we will
# save the footprints that are prepared for staging.
# NOTE: 'ext_input' is set from the *footprint* input extension option
# (ext_footprints_in), so this assumes the IWP input files share that
# extension ('.shp') - TODO confirm.
tileStager = TileStager({
    'dir_input': options['dir_vector_input_data'],
    'dir_footprints': options['dir_footprints_out'],
    'ext_input': options['ext_footprints_in'],
    'ext_footprints': options['ext_footprints_out']
})
# To help create and parse paths (used below to register directories, list
# the files in them, and create parent directories)
pathManager = tileStager.tiles
# To get options from the configuration for the workflow (also used below to
# build the staged footprint path for each input file)
config = tileStager.config
# Add the directory where the footprints are currently stored, registered
# under the name 'footprints_original'
pathManager.add_base_dir(
    name='footprints_original',
    dir_path=options['dir_footprints_in'],
    ext=options['ext_footprints_in'],
)
# Get the paths to all of the original footprint files
footprint_paths = pathManager.get_filenames_from_dir('footprints_original')
# Read the input directory and extension back from the workflow config
# (presumably the values passed to TileStager above - verify against
# pdgstaging)
dir_input = config.get('dir_input')
ext_input = config.get('ext_input')
# Get a list of all possible input data sub-directories (searched
# recursively, ignoring the directories listed in options['dirs_skip'])
all_input_subdirs = get_input_subdirs(dir_input, options['dirs_skip'])
# For record keeping. directory_matches gets one entry per footprint file,
# describing whether it was matched to an IWP input sub-directory.
# file_matches gets one entry per footprint ID found in a matched footprint
# file, describing whether it was matched to an IWP input file.
directory_matches = []
file_matches = []

# The names of the footprint properties come from the options, so that they
# can be changed for other projects along with the rest of the configuration
prop_file_id = options['prop_file_id']
prop_date = options['prop_date']

for footprint_path in footprint_paths:

    # Get the name of the dir that contains the corresponding input files
    subdir_name = subdir_from_footprint_path(footprint_path)

    # Find the dirs in all_input_subdirs whose final path component matches
    # the subdir_name
    matching_subdirs = [
        subdir for subdir in all_input_subdirs
        if os.path.basename(subdir) == subdir_name
    ]

    # Record the match (or lack thereof)
    directory_match = {
        'original_footprint_path': footprint_path,
        'status': 'no_match',
        'match': None
    }
    if len(matching_subdirs) > 1:
        directory_match['status'] = 'multiple_matches'
        directory_match['match'] = matching_subdirs
    elif len(matching_subdirs) == 1:
        directory_match['status'] = 'matched'
        directory_match['match'] = matching_subdirs[0]
    directory_matches.append(directory_match)

    # Stop here if not 1 and only 1 match
    if directory_match['status'] != 'matched':
        continue
    subdir_input = matching_subdirs[0]

    # Get the paths & IDs of input files in the matched sub-directory. The
    # sub-directory path doubles as the name it is registered under.
    pathManager.add_base_dir(subdir_input, subdir_input, ext_input)
    subset_iwp_files = pathManager.get_filenames_from_dir(subdir_input)
    subset_ids = [id_from_input_path(i) for i in subset_iwp_files]

    # Read the footprint GDF, split into 1 GeoDataFrame per file ID. Rows
    # that share a file ID stay together in the same GeoDataFrame.
    fp = gpd.read_file(footprint_path)
    fp_list = [
        group for _, group in fp.groupby(prop_file_id, as_index=False)
    ]

    # For each GDF (file ID)
    for fp_row in fp_list:

        # Work on a re-indexed copy, rather than modifying a slice of the
        # original GeoDataFrame in place
        fp_row = fp_row.reset_index(drop=True)
        fp_id = fp_row[prop_file_id].iloc[0]

        # Set the date
        date = date_from_id(fp_id)
        fp_row[prop_date] = date

        file_match = {
            'original_footprint_path': footprint_path,
            'footprint_id': fp_id,
            'date': date,
            'status': 'no_match',
            'match': None
        }

        # Find the corresponding IWP file
        try:
            # Match footprint ID (from GDF) to IWP ID (from filename).
            # index() raises a ValueError when the ID is not in the list.
            id_index = subset_ids.index(fp_id)
            matching_file = subset_iwp_files[id_index]
            # Create the file path expected by the viz-workflow
            footprint_path_staging = config.footprint_path_from_input(
                matching_file, check_exists=False)
            # Make any necessary parent directories
            pathManager.create_dirs(footprint_path_staging)
            # Save the footprint vector file to the filepath
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', FutureWarning)
                fp_row.to_file(footprint_path_staging)
            # Record the match
            file_match['status'] = 'matched'
            file_match['match'] = matching_file
            file_matches.append(file_match)
        except ValueError:
            # No IWP input file has this ID, so record the footprint as
            # unmatched.
            # NOTE(review): a ValueError raised while building the path or
            # saving the file is recorded as 'no_match' too - confirm that
            # this is intended.
            file_matches.append(file_match)
# Describe each output file as: the option holding its filename, the list of
# records it is drawn from, and the status a record needs in order to be
# included in it
record_outputs = [
    ('filename_unmatched_footprints', directory_matches, 'no_match'),
    ('filename_matched_footprints', directory_matches, 'matched'),
    ('filename_multimatch_footprints', directory_matches,
     'multiple_matches'),
    ('filename_unmatched_footprint_features', file_matches, 'no_match'),
    ('filename_matched_footprint_features', file_matches, 'matched'),
]

# Filter records and map to output filenames
record_map = {
    options[option_name]: [
        record for record in source if record['status'] == status
    ]
    for option_name, source, status in record_outputs
}

# Write all of the records to the output files as JSON
for filename in record_map:
    with open(filename, 'w') as json_file:
        json.dump(record_map[filename], json_file, indent=2)