This repository has been archived by the owner on Jun 3, 2020. It is now read-only.
/
__init__.py
511 lines (456 loc) · 16 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
"""Dataset modules
Each considered dataset is represented by its own module, and its own class
that inherits from the generic Dataset class.
"""
import abc
import json
from multiprocessing import Pool
import os
import cv2
import daiquiri
import geopandas as gpd
import numpy as np
from osgeo import gdal
from PIL import Image
from deeposlandia import geometries
# Module-level logger, named after this module.
logger = daiquiri.getLogger(__name__)
# Names of the datasets declared as available by this package.
AVAILABLE_DATASETS = ("shapes", "mapillary", "aerial", "tanzania")
# Subset of the dataset names above that are declared as geographic
# (presumably the ones handled through `GeoreferencedDataset` -- to confirm).
GEOGRAPHIC_DATASETS = ("aerial", "tanzania")
class Dataset(metaclass=abc.ABCMeta):
    """Generic class that describes the behavior of a Dataset object

    A dataset is initialized at least with an image size, its labels are
    always added in the same manner, and it can be serialized (`save`) to and
    deserialized (`load`) from a `.json` file.

    Attributes
    ----------
    image_size : int
        Size of considered images (height=width), raw images will be resized
        during the preprocessing
    label_info : list
        Label descriptions, one dict per label (see `add_label`)
    image_info : list
        Image descriptions, one dict per image
    """

    def __init__(self, image_size):
        if image_size % 16 != 0:
            raise ValueError(
                "The chosen image size is not divisible "
                "per 16. To train a neural network with "
                "such an input size may fail."
            )
        self.image_size = image_size
        self.label_info = []
        self.image_info = []

    @property
    def label_ids(self):
        """Return the list of labels ids taken into account in the dataset

        They can be grouped. The returned ids are the positions of the
        evaluated labels within `label_info`.

        Returns
        -------
        list
            List of label ids
        """
        return [
            label_id
            for label_id, attr in enumerate(self.label_info)
            if attr["is_evaluate"]
        ]

    @property
    def labels(self):
        """Return the description of the labels that will be evaluated during
        the process

        Returns
        -------
        list
            Label dicts for which `is_evaluate` is True
        """
        return [label for label in self.label_info if label["is_evaluate"]]

    def get_nb_labels(self, see_all=False):
        """Return the number of labels

        Parameters
        ----------
        see_all : boolean
            If True, consider all labels, otherwise consider only labels for
            which `is_evaluate` is True

        Returns
        -------
        int
            Number of labels
        """
        if see_all:
            return len(self.label_info)
        return len(self.label_ids)

    def get_nb_images(self):
        """`image_info` getter, return the size of `image_info`, i.e. the
        number of images in the dataset
        """
        return len(self.image_info)

    def get_label_popularity(self):
        """Return the label popularity in the current dataset, *i.e.* the
        proportion of images that contain corresponding object

        Returns
        -------
        numpy.array or None
            Per-label proportions rounded to three decimals, or None (after
            logging an error) if the dataset does not contain any image
        """
        nb_images = self.get_nb_images()
        if nb_images == 0:
            logger.error("No images in the dataset.")
            return None
        # One row per image, one column per label.
        occurrences = np.array(
            [list(img["labels"].values()) for img in self.image_info]
        )
        return np.round(np.divide(occurrences.sum(axis=0), nb_images), 3)

    def add_label(
        self,
        label_id,
        label_name,
        color,
        is_evaluate,
        category=None,
        aggregated_label_ids=None,
        contained_labels=None,
    ):
        """Add a new label to the dataset with label id `label_id`

        If a label with the same id is already stored, an error is logged and
        the dataset is left unchanged.

        Parameters
        ----------
        label_id : integer
            Id of the new label
        label_name : str
            String designing the new label name
        color : list
            List of three integers (between 0 and 255) that characterizes the
            label (useful for semantic segmentation result printing)
        is_evaluate : bool
            True if the label must be taken into account by `label_ids` and
            `labels`
        category : str
            String designing the category of the dataset label; defaults to
            `label_name`
        aggregated_label_ids : list (optional)
            List of label ids aggregated by the current label_id
        contained_labels : list
            List of raw labels aggregated by the current label; defaults to
            `label_name`
        """
        # `label_info` stores dicts, hence ids have to be compared explicitly.
        if any(label["id"] == label_id for label in self.label_info):
            logger.error(
                "Label %s already stored into the label set.", label_id
            )
            return None
        category = label_name if category is None else category
        contains = label_name if contained_labels is None else contained_labels
        self.label_info.append(
            {
                "name": label_name,
                "id": label_id,
                "category": category,
                "is_evaluate": is_evaluate,
                "aggregate": aggregated_label_ids,
                "contains": contains,
                "color": color,
            }
        )

    def save(self, filename):
        """Save dataset in a json file indicated by `filename`

        Parameters
        ----------
        filename : str
            String designing the relative path where the dataset must be saved
        """
        with open(filename, "w") as fp:
            json.dump(
                {
                    "image_size": self.image_size,
                    "labels": self.label_info,
                    "images": self.image_info,
                },
                fp,
            )
        logger.info("The dataset has been saved into %s", filename)

    def load(self, filename, nb_images=None):
        """Load a dataset from a json file indicated by `filename`

        The current image size, labels and images are replaced by the file
        content. The decoded structures are stored as they are: no key
        conversion is done, so dict keys come back as strings, as always with
        json.

        Parameters
        ----------
        filename : str
            String designing the relative path from where the dataset must be
            loaded
        nb_images : integer
            Number of images that must be loaded (if None, the whole dataset is
            loaded)
        """
        with open(filename) as fp:
            ds = json.load(fp)
        self.image_size = ds["image_size"]
        self.label_info = ds["labels"]
        # Slicing with `None` as upper bound keeps every image.
        self.image_info = ds["images"][:nb_images]
        logger.info("The dataset has been loaded from %s", filename)

    @abc.abstractmethod
    def populate(self):
        """Fill the dataset with its images; must be implemented by each
        concrete dataset class
        """
class GeoreferencedDataset(Dataset):
    """Generic class that describes the behavior of "Dataset" objects
    corresponding to aerial images

    `populate` relies on a `_preprocess_for_training` method that is not
    defined in this class: it has to be provided by the concrete dataset
    classes when labelled data are preprocessed.
    """

    def _generate_preprocessed_filenames(
        self, image_filename, output_dir, x, y, suffix=None
    ):
        """Generate preprocessed image and label filenames on the file system,
        starting from a raw image filename

        Parameters
        ----------
        image_filename : str
            Original image filename
        output_dir : str
            Output folder for preprocessed material
        x : int
            Extracted image west coordinates
        y : int
            Extracted image north coordinates
        suffix : str
            Preprocessed filename complement

        Returns
        -------
        dict
            Preprocessed image and corresponding label filenames
        """
        basename = os.path.splitext(os.path.basename(image_filename))[0]
        id_parts = [self.image_size, self.image_size, x, y]
        if suffix is not None:
            id_parts.append(suffix)
        new_filename = (
            basename + "_" + "_".join(str(part) for part in id_parts) + ".png"
        )
        # Both paths are built explicitly: deriving the label path through a
        # string replacement would also alter any "images" occurrence located
        # in the output folder or in the raw file name.
        return {
            "image": os.path.join(output_dir, "images", new_filename),
            "labels": os.path.join(output_dir, "labels", new_filename),
        }

    def _serialize(
        self,
        tile_image,
        labelled_image,
        label_dict,
        image_filename,
        output_dir,
        x,
        y,
        suffix=None,
    ):
        """Serialize a tiled image generated from an original high-resolution
        raster as well as the labelled version of the tile

        The method returns a dict that contains image-related file paths.

        Parameters
        ----------
        tile_image : PIL.Image
            Tile extracted from the raw image
        labelled_image : PIL.Image
            Labelled version of the tile
        label_dict : dict
            Labels associated to the tile, stored as is in the output
        image_filename : str
            Raw image filename
        output_dir : str
            Output folder for preprocessed material
        x : int
            Tile west coordinates
        y : int
            Tile north coordinates
        suffix : str
            Preprocessed filename complement

        Returns
        -------
        dict
            Information related to the serialized tile (file paths, encountered
            labels), or None if a `SyntaxError` is raised while the images are
            verified or saved
        """
        dirs = self._generate_preprocessed_filenames(
            image_filename, output_dir, x, y, suffix
        )
        try:
            # NOTE(review): the Pillow documentation states that an image read
            # from a file must be reopened after `verify()`; this assumes
            # in-memory tiles -- to confirm against the callers.
            tile_image.verify()
            labelled_image.verify()
            tile_image.save(dirs["image"])
            labelled_image.save(dirs["labels"])
            return {
                "raw_filename": image_filename,
                "image_filename": dirs["image"],
                "label_filename": dirs["labels"],
                "labels": label_dict,
            }
        except SyntaxError:
            logger.error(
                "The image %s is corrupt, hence not serialized.",
                image_filename
            )
            return None

    def _preprocess_tile(
        self, x, y, image_filename, output_dir, raster, labels=None
    ):
        """Preprocess one single tile built from `image_filename`, with respect
        to pixel coordinates `(x, y)`

        The tile is written on the file system as a PNG file through
        `gdal.Translate`.

        Parameters
        ----------
        x : int
            Horizontal pixel coordinate (*i.e.* west bound)
        y : int
            Vertical pixel coordinate (*i.e.* north bound)
        image_filename : str
            Full path towards the image on the disk
        output_dir : str
            Output path where preprocessed image must be saved
        raster : osgeo.gdal.Dataset
            Original georeferenced raster
        labels : geopandas.GeoDataFrame
            Raw image labels (*i.e.* georeferenced buildings); not used by this
            implementation

        Returns
        -------
        dict
            Key/values with the raw and preprocessed image filenames
        """
        dirs = self._generate_preprocessed_filenames(
            image_filename, output_dir, x, y
        )
        gdal.Translate(
            dirs["image"],
            raster,
            format="PNG",
            srcWin=[x, y, self.image_size, self.image_size],
        )
        return {
            "raw_filename": image_filename,
            "image_filename": dirs["image"],
        }

    def _preprocess_for_inference(self, image_filename, output_dir):
        """Split a raw image into tiles of `image_size` pixels and save them

        The raster is covered column by column, with a step of `image_size`
        pixels in both directions. No label image is produced.

        Parameters
        ----------
        image_filename : str
            Full path towards the image on the disk
        output_dir : str
            Output path where preprocessed image must be saved

        Returns
        -------
        list
            One dict per tile, with the raw and preprocessed image filenames
        """
        raster = gdal.Open(image_filename)
        raw_img_width = raster.RasterXSize
        raw_img_height = raster.RasterYSize
        logger.info("Image filename: %s", image_filename)
        logger.info("Raw image size: %s, %s", raw_img_width, raw_img_height)
        result_dicts = [
            self._preprocess_tile(x, y, image_filename, output_dir, raster)
            for x in range(0, raw_img_width, self.image_size)
            for y in range(0, raw_img_height, self.image_size)
        ]
        # Drop the reference to the GDAL dataset once every tile is extracted.
        del raster
        return result_dicts

    def load_mask(self, buildings, raster_features, min_x, min_y):
        """Translate georeferenced buildings as numpy arrays in order to
        prepare image analysis

        Parameters
        ----------
        buildings : geopandas.GeoDataFrame
            Georeferenced building labels, with a `condition` column that
            contains building type and a `geometry` column that describe the
            geolocalization
        raster_features : dict
            Geographical features of raw original image
        min_x : int
            Minimal tile x-coordinates (west bound)
        min_y : int
            Minimal tile y-coordinates (north bound)

        Returns
        -------
        numpy.array
            Array-versionned building labels, that link each pixel to the
            specified class, *i.e.* `B(x, y)=i` if pixel `(x, y)` belongs to
            class `i`. In this dataset, the labels are `complete`,
            `incomplete` or `foundation`.

        Raises
        ------
        IndexError
            If a building condition does not match any evaluated label name
        """
        mask = np.zeros(
            shape=(self.image_size, self.image_size), dtype=np.uint8
        )
        if buildings.shape[0] == 0:
            return mask
        # Build the name/id mapping once; the first label wins if a name is
        # duplicated.
        label_ids_by_name = {}
        for label in self.labels:
            label_ids_by_name.setdefault(label["name"], label["id"])
        for _, row in buildings.iterrows():
            points = geometries.extract_points_from_polygon(
                row["geometry"], raster_features, min_x, min_y
            )
            condition = row["condition"].lower()
            if condition not in label_ids_by_name:
                raise IndexError(
                    "No evaluated label is named '%s'." % condition
                )
            mask = cv2.fillPoly(mask, [points], label_ids_by_name[condition])
        return mask

    def populate(
        self,
        output_dir,
        input_dir,
        nb_images=0,
        nb_tiles_per_image=0,
        labelling=True,
        nb_processes=1,
    ):
        """Populate the dataset with images contained into the `images`
        subfolder of `input_dir`

        The images to preprocess are randomly picked (without replacement)
        amongst the non-hidden files of the folder. The resulting tile
        descriptions are appended to `image_info`.

        Parameters
        ----------
        output_dir : str
            Path of the directory where the preprocessed image must be saved
        input_dir : str
            Path of the directory that contains input images
        nb_images : integer
            Number of images to be considered in the dataset; if None,
            consider the whole repository
        nb_tiles_per_image : integer
            Number of tiles that must be picked into the raw image, for
            labelled datasets
        labelling : boolean
            If True the images are preprocessed with
            `_preprocess_for_training`, otherwise with
            `_preprocess_for_inference`
        nb_processes : int
            Number of processes on which to run the preprocessing
        """
        image_dir = os.path.join(input_dir, "images")
        image_list_longname = [
            os.path.join(image_dir, name)
            for name in os.listdir(image_dir)
            if not name.startswith(".")
        ]
        nb_image_files = len(image_list_longname)
        if nb_images is None:
            nb_images = nb_image_files
        elif nb_image_files < nb_images:
            logger.warning(
                "Asking to preprocess %s images, but only got %s files",
                nb_images, nb_image_files)
            nb_images = nb_image_files
            logger.warning("Preprocessing %s images..", nb_images)
        image_list_longname = np.random.choice(
            image_list_longname, nb_images, replace=False
        )
        logger.info("Getting %s images to preprocess...", nb_images)
        if labelling:
            preprocess = self._preprocess_for_training
            arguments = [
                (x, output_dir, nb_tiles_per_image)
                for x in image_list_longname
            ]
        else:
            preprocess = self._preprocess_for_inference
            arguments = [(x, output_dir) for x in image_list_longname]
        if nb_processes == 1:
            tile_batches = [preprocess(*args) for args in arguments]
        else:
            with Pool(processes=nb_processes) as p:
                tile_batches = p.starmap(preprocess, arguments)
        # Each raw image gives a list of tiles: only the new results are
        # flattened, so that already stored items are left untouched.
        self.image_info.extend(
            tile for batch in tile_batches for tile in batch
        )
        logger.info(
            "Saved %s images in the preprocessed dataset.",
            len(self.image_info),
        )