-
Notifications
You must be signed in to change notification settings - Fork 3
/
Deduplicator.py
731 lines (617 loc) · 29.5 KB
/
Deduplicator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
import operator
import geopandas as gpd
import pandas as pd
import uuid
import itertools
import warnings
import os
from datetime import datetime
import numpy as np
from filelock import FileLock
import logging
from . import logging_config
# NOTE: DO NOT IMPORT ConfigManager, TilePathManager, Grid
# because causes config import error for rasterization step
logger = logging_config.logger
def keep_rules_to_sort_order(keep_rules):
    """
    Translate keep rules into arguments for pandas ``sort_values``.

    Parameters
    ----------
    keep_rules : list
        Tuples of the form (property, operator), as used by the
        deduplication methods. Only the first two items of each tuple are
        read.

    Returns
    -------
    tuple
        ``(properties, ascending)``. ``properties`` lists the first item of
        every rule, to be passed as ``by``. ``ascending`` holds one boolean
        per rule that is True only when the rule's operator equals
        'smaller', to be passed as ``ascending``.
    """
    # First item of each rule: the column to sort on.
    properties = list(map(operator.itemgetter(0), keep_rules))
    # Second item of each rule: 'smaller' sorts ascending, anything else
    # sorts descending.
    ascending = [
        direction == 'smaller'
        for direction in map(operator.itemgetter(1), keep_rules)
    ]
    return properties, ascending
def clip_gdf(gdf = None, boundary = None, method = 'intersects', prop_duplicated = None):
    """
    Flag polygons in a GeoDataFrame that fall outside of some boundary.

    A spatial join (sjoin) against the boundary, using the given predicate,
    decides whether each polygon is inside the boundary. No polygon is
    dropped: every input row is returned once, with a boolean flag.

    Parameters
    ----------
    gdf : GeoDataFrame
        The GeoDataFrame to check against the boundary.
    boundary : GeoDataFrame
        A GeoDataFrame that contains polygons representing the boundaries.
        Only its 'geometry' column is used.
    method : str
        The predicate to use for the sjoin operation. Can be one of:
        'contains', 'contains_properly', 'covers', 'crosses', 'intersects',
        'overlaps', 'touches', 'within' (any option listed by
        geopandas.GeoDataFrame.sindex.valid_query_predicates)
    prop_duplicated : str, optional
        Name of the column that receives the flag. When None, the column
        name 'staging_duplicated' is used.

    Returns
    -------
    GeoDataFrame
        The rows of `gdf` with a fresh 0..n-1 index. Rows that did not
        match the boundary come first and have `prop_duplicated` set to
        True; rows that matched the boundary follow and have it set to
        False.
    """
    if prop_duplicated is None:
        # Same default column name the deduplication methods use.
        prop_duplicated = 'staging_duplicated'

    # Temporary marker column; the uuid keeps it from clashing with any
    # existing property.
    prop_in_fp_temp = 'WITHIN_BOUNDARY_' + uuid.uuid4().hex

    # Keep only the boundary geometry and mark every boundary row.
    boundary = boundary.copy().filter(['geometry'])
    boundary[prop_in_fp_temp] = True

    # Give the polygons a unique positional index so that repeated sjoin
    # matches can be collapsed below. The returned index is reset anyway.
    gdf = gdf.reset_index(drop=True)

    # Left join: polygons without a match get a null marker.
    joined = gdf.sjoin(boundary, how='left', predicate=method) \
        .drop(['index_right'], axis=1)

    # A polygon that matches several boundary polygons appears once per
    # match in the join result; keep a single row per input polygon.
    joined = joined[~joined.index.duplicated(keep='first')]

    is_within = joined[prop_in_fp_temp].notnull()
    joined = joined.drop([prop_in_fp_temp], axis=1)

    # Copy the subsets so the flag is written to independent frames rather
    # than to views of `joined`.
    outside = joined[~is_within].copy()
    within = joined[is_within].copy()
    outside[prop_duplicated] = True
    within[prop_duplicated] = False

    # Stack the two subsets; ignore_index already yields a clean index.
    return pd.concat([outside, within], ignore_index=True)
def deduplicate_neighbors(
    gdf,
    split_by=None,
    prop_area=None,
    prop_centroid_x=None,
    prop_centroid_y=None,
    keep_rules=None,
    overlap_tolerance=0.5,
    overlap_both=True,
    centroid_tolerance=None,
    distance_crs='EPSG:3857',
    return_intersections=False,
    prop_duplicated='staging_duplicated'
    # defaults to these options only if aren't already specified in config
):
    """
    Flag duplicated polygons from different sources in one GeoDataFrame,
    even when geometries are not identical.

    Two polygons are considered duplicates of one another when they:
        1. have a different value for the `split_by` property
           (e.g. 'source_file'), AND
        2. intersect, AND
        3. if `overlap_tolerance` is set: the intersection covers more than
           that proportion of both polygons (`overlap_both=True`) or of at
           least one of them (`overlap_both=False`), AND
        4. if `centroid_tolerance` is set: their centroids are no further
           apart than that distance.

    Either tolerance can be set to None to skip that check. When both are
    set, both checks must pass.

    Of each duplicated pair, one polygon is kept and the other is flagged.
    Which one is kept is determined by `keep_rules`: the pair is sorted by
    the given properties and the first polygon is kept. For example, to
    keep the newest polygon use a rule such as ('Date', 'larger').

    Parameters
    ----------
    gdf : GeoDataFrame
        The GeoDataFrame to deduplicate. It is not modified.
    split_by : str
        The name of the property in the GeoDataFrame to use to identify
        duplicated polygons. Two polygons must have a different value for
        this property to be considered duplicated. e.g. 'source_file'.
        Every pairwise combination of unique values for this property is
        compared, so ideally this property should not have too many unique
        values, or the deduplication will be very slow.
    prop_area : str, optional
        If the area has already been calculated for the GeoDataFrame, give
        the name of the property containing the area. Area must be in the
        same unit as the CRS of the GeoDataFrame. If None and
        `overlap_tolerance` is set, the area is calculated.
    prop_centroid_x : str, optional
        Name of a property holding the centroid x-coordinate, in the CRS of
        the GeoDataFrame. If either centroid property is None and
        `centroid_tolerance` is set, both are calculated.
    prop_centroid_y : str, optional
        Name of a property holding the centroid y-coordinate, in the CRS of
        the GeoDataFrame. See `prop_centroid_x`.
    keep_rules : list
        Rules that define which polygon to keep when two are duplicates. A
        list of tuples of the form (property, operator), where property is
        the name of a property in the GeoDataFrame and operator is either
        'larger' or 'smaller'. With 'larger' the polygon with the largest
        value is kept, with 'smaller' the one with the smallest value. When
        two values are equal, the next rule in the list is checked. Must
        contain at least one rule.
    overlap_tolerance : float, optional
        The minimum proportion of a polygon's area that must be overlapped
        by another polygon to be considered a duplicate. Default is 0.5.
        Set to None to ignore overlap proportions.
    overlap_both : bool, optional
        If True, the overlap proportion must exceed `overlap_tolerance` for
        both intersecting polygons. If False, exceeding it for one of them
        is enough. Default is True.
    centroid_tolerance : float, optional
        The maximum distance between the centroids of two polygons to be
        considered a duplicate. Default is None, which ignores centroid
        distances. The unit is that of `distance_crs` (e.g. meters for
        EPSG:3857), or of the GeoDataFrame if `distance_crs` is None.
    distance_crs : str, optional
        The CRS that centroid points are re-projected to before their
        distance is calculated. Default is EPSG:3857. Set to None to skip
        the re-projection and use the CRS of the GeoDataFrame.
    return_intersections : bool, optional
        Accepted for backward compatibility and currently ignored: the
        function returns a single labelled GeoDataFrame, so intersection
        geometries are not returned.
    prop_duplicated : str, optional
        Defaults to "staging_duplicated". The column name / property used
        to flag duplicates.

    Returns
    -------
    GeoDataFrame or None
        The result of `label_duplicates`: the polygons of the input with
        duplicates marked as True in the `prop_duplicated` column. None is
        returned when the input GeoDataFrame has no CRS.

    Raises
    ------
    ValueError
        If `split_by` is None, or `keep_rules` is None or empty.
    """

    if split_by is None:
        raise ValueError('An overlap property must be specified for '
                         'deduplication.')
    if (keep_rules is None) or (len(keep_rules) == 0):
        raise ValueError('A list of keep rules must be specified for '
                         'deduplication.')

    # The CRS is needed to build the centroid points; without one the
    # function gives up and returns None (historic behaviour).
    crs = gdf.crs
    if not crs:
        logger.warning("GeoDataFrame has no CRS; neighbor deduplication "
                       "skipped.")
        return None

    # Use uuid for the properties that will be temporarily created so they
    # do not conflict with existing properties
    rand_id = uuid.uuid4().hex
    prop_int_area = 'intersect_area_' + rand_id
    prop_int_id = 'intersect_id_' + rand_id
    prop_id = 'temp_id_' + rand_id

    # Add an ID that we can refer to later, and keep a full copy of the
    # original columns (the working copy is stripped down below).
    gdf = gdf.copy()
    gdf[prop_id] = list(range(len(gdf)))
    gdf_original = gdf.copy()

    # Calculate centroids if needed
    if (prop_centroid_x is None) or (prop_centroid_y is None):
        if centroid_tolerance is not None:
            prop_centroid_x = 'cent_x_' + rand_id
            prop_centroid_y = 'cent_y_' + rand_id
            centroids = gdf.centroid
            gdf[prop_centroid_x] = centroids.x
            gdf[prop_centroid_y] = centroids.y

    # Calculate area if needed
    if prop_area is None:
        if overlap_tolerance is not None:
            prop_area = 'area_' + rand_id
            gdf[prop_area] = gdf.area

    # Organize keep rules for pandas sort_values
    sort_props, sort_order = keep_rules_to_sort_order(keep_rules)

    # Remove all unnecessary columns
    working_cols = [
        'geometry',
        prop_id,
        split_by,
        prop_area,
        prop_centroid_x,
        prop_centroid_y] + sort_props
    working_cols = [i for i in working_cols if i is not None]
    gdf.drop(gdf.columns.difference(working_cols), axis=1, inplace=True)

    # Split the gdf by the overlap property (e.g. source file)
    gdf_grouped = gdf.groupby(split_by)
    gdfs = [gdf_grouped.get_group(x) for x in gdf_grouped.groups]

    # If there is only one group, then there is nothing to deduplicate
    if len(gdfs) < 2:
        parts = {
            'to_keep': gdf_original.drop(columns=[prop_id]),
            'intersections': None,
            'to_remove': None
        }
        return label_duplicates(parts, prop_duplicated)

    # Set will hold all of the polygon IDs that we will flag
    ids_to_remove = set()

    # For every pairwise combination of two (subsetted) GDFs
    for g1, g2 in itertools.combinations(gdfs, 2):

        # Skip polygons that are already flagged (so we don't process twice)
        g1 = g1[~g1[prop_id].isin(ids_to_remove)]
        g2 = g2[~g2[prop_id].isin(ids_to_remove)]

        if (len(g1) == 0) or (len(g2) == 0):
            continue

        # Polygons must intersect to be considered duplicated. Columns that
        # exist in both inputs come back suffixed with '_1' and '_2'.
        duplicates = g1.overlay(g2, how='intersection')

        # Add a unique ID for each new intersection geometry
        duplicates[prop_int_id] = list(range(0, len(duplicates)))

        # Identify a polygon as duplicated if the area of overlap is >
        # threshold proportion
        if overlap_tolerance is not None:
            # Catch the UserWarning that is raised if area is calculated
            # using a non-projected coordinate system.
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', UserWarning)
                duplicates[prop_int_area] = duplicates.area
            # Proportion of each polygon covered by the intersection
            duplicates[prop_int_area + '_1'] = duplicates[prop_int_area] / \
                duplicates[prop_area + '_1']
            duplicates[prop_int_area + '_2'] = duplicates[prop_int_area] / \
                duplicates[prop_area + '_2']
            # Must the overlap proportion be > threshold for both or just
            # one polygon
            op = operator.and_ if overlap_both else operator.or_
            # Drop intersections below the threshold overlap proportion.
            duplicates = duplicates[op(
                (duplicates[prop_int_area + '_1'] > overlap_tolerance),
                (duplicates[prop_int_area + '_2'] > overlap_tolerance)
            )]
            # Remove columns no longer needed
            duplicates = duplicates.drop(
                columns=[
                    prop_int_area + '_1',
                    prop_int_area + '_2',
                    prop_int_area
                ])

        # Identify a polygon as duplicated when the distance between two
        # centroids is <= the threshold distance
        if centroid_tolerance is not None:
            # Build the centroid points of both polygons, in the distance
            # CRS when one is given
            points = []
            for suffix in ('1', '2'):
                x = duplicates[prop_centroid_x + '_' + suffix]
                y = duplicates[prop_centroid_y + '_' + suffix]
                p = gpd.GeoSeries(gpd.points_from_xy(x, y, crs=crs))
                if distance_crs:
                    p = p.to_crs(distance_crs)
                points.append(p)
            centroid_distances = points[0].distance(points[1], align=False)
            # A plain list is used as the mask so it is applied by
            # position; the point series carry their own fresh index.
            duplicated_by_centroid = \
                (centroid_distances <= centroid_tolerance).tolist()
            # Drop intersections whose centroids are too far apart
            duplicates = duplicates[duplicated_by_centroid]

        # Nothing in this pair qualifies as a duplicate
        if len(duplicates) == 0:
            continue

        # Can treat as a regular DF now
        duplicates = duplicates.drop(['geometry'], axis=1)

        # Reshape from wide to long. Match the exact '_1'/'_2' column pair
        # for each working column: a prefix match would also pick up the
        # columns of any other property whose name merely starts the same.
        reshape_dict = {}
        for col in working_cols:
            pair_cols = [f'{col}_1', f'{col}_2']
            if all(c in duplicates.columns for c in pair_cols):
                reshape_dict[col] = pair_cols
        duplicates = pd.lreshape(duplicates, reshape_dict)

        # Sort so the preferred polygon of every intersection comes first
        duplicates.sort_values(
            by=sort_props,
            ascending=sort_order,
            inplace=True)

        # Every polygon after the first one of an intersection is flagged
        is_loser = duplicates.duplicated(prop_int_id, keep='first')
        ids_to_remove.update(duplicates.loc[is_loser, prop_id])

    # Split the original gdf on whether the ID is in the remove list
    remove = gdf_original[prop_id].isin(ids_to_remove)
    # Remove the prop_id column, no longer needed
    gdf_original.drop([prop_id], axis=1, inplace=True)

    # Copies, so that labelling never writes into views of gdf_original
    parts = {
        'to_keep': gdf_original[~remove].copy(),
        'intersections': None,
        'to_remove': gdf_original[remove].copy()
    }
    labeled = label_duplicates(parts, prop_duplicated)

    if prop_duplicated in labeled.columns:
        sum_true = int(labeled[prop_duplicated].eq(True).sum())
        logger.info(f"Sum of True values in the {prop_duplicated} col is: {sum_true}")
    else:
        logger.info(f"{prop_duplicated} is not a column present after labeling.")

    return labeled
def _stack_frames(frames, template):
    """Concatenate the non-empty frames with a fresh index; when there are
    none, return an empty copy of `template` instead of raising."""
    non_empty = [frame for frame in frames if len(frame) > 0]
    if not non_empty:
        return template.iloc[:0].copy()
    return pd.concat(non_empty, ignore_index=True)


def deduplicate_by_footprint(
    gdf,
    split_by,
    footprints,
    keep_rules=None,
    return_intersections=False,
    prop_duplicated='staging_duplicated'
    # defaults to these options only if aren't already specified in config
):
    """
    Label polygons as duplicates if they occur within an area where
    associated footprints overlap. This method can be used to flag polygons
    that originate from different overlapping files.

    Parameters
    ----------
    gdf : GeoDataFrame
        The GeoDataFrame to deduplicate. It is not modified.
    split_by : str
        The name of the property in the GeoDataFrame that links the polygon
        to the footprint, e.g. 'source_file'. This property is used to
        divide the GeoDataFrame into groups. Polygons from the less
        preferred group of a pair are flagged when they intersect the area
        where the two footprints overlap. Every pairwise combination of
        unique values for this property is compared, so ideally this
        property should not have too many unique values, or the
        deduplication will be very slow.
    footprints : dict
        A dictionary that maps the unique values of `split_by` to a
        GeoDataFrame of, or path (string) to, the footprint of the
        associated file. Paths are read with geopandas; a path that cannot
        be read produces a warning and that footprint is skipped. The
        dictionary and the GeoDataFrames in it are not modified.
    keep_rules : list, optional
        One or more rules that define which footprint is preferred, given
        as a list of tuples in the form (property, operator). The property
        is the name of a property in the footprint file. The operator is
        either 'larger' or 'smaller'. If the rule is 'larger', polygons
        from the footprint with the largest value for the property are
        kept, and vice versa for 'smaller'. When two properties are equal,
        the next rule in the list is checked. Without rules, footprints
        are ranked in the order they appear in `footprints`.
    return_intersections : bool, optional
        Accepted for backward compatibility and currently ignored: the
        function returns a single labelled GeoDataFrame, so the footprint
        intersections are not returned.
    prop_duplicated : str, optional
        Defaults to "staging_duplicated". The column name / property used
        to flag duplicates. Rows of `gdf` where this column already equals
        True are treated as known duplicates: they are not compared
        against the footprints and stay flagged.

    Returns
    -------
    GeoDataFrame
        The rows of the input, each appearing once, with a fresh index and
        the `prop_duplicated` column set to True for duplicates and False
        otherwise. Rows are ordered as: polygons to keep, polygons flagged
        in this function, then the known duplicates.
    """

    logger.info("Executing footprint deduplication.")

    gdf = gdf.copy()
    # Footprints are re-projected to this CRS before they are compared
    crs = gdf.crs

    # Polygons flagged earlier (e.g. because they fell outside of their
    # footprint) are set aside so that each of them appears in the result
    # exactly once.
    if prop_duplicated in gdf.columns:
        already_flagged = \
            gdf[prop_duplicated].eq(True).fillna(False).astype(bool)
    else:
        already_flagged = pd.Series(False, index=gdf.index)
    known_dups = gdf[already_flagged]
    candidates = gdf[~already_flagged]
    logger.info(f"Length of known_dups is {len(known_dups)}.")

    # Divide the remaining polygons by the split_by property: one entry per
    # unique value (e.g. per input file).
    gdf_grouped = candidates.groupby(split_by)
    gdf_dict = {g: gdf_grouped.get_group(g) for g in gdf_grouped.groups}
    names = list(gdf_dict)

    # Temporary column that records which file a footprint row belongs to
    prop_filename_temp = 'filename_' + uuid.uuid4().hex

    # Build a private mapping of usable footprints so that the caller's
    # dictionary and GeoDataFrames are left untouched.
    usable_footprints = {}
    for name, footprint in footprints.items():
        if isinstance(footprint, str):
            # Strings are assumed to be paths to footprint files
            try:
                footprint = gpd.read_file(footprint)
            except Exception:
                warnings.warn(f'Footprint missing for {name}')
                continue
        if footprint is None:
            continue
        # to_crs returns a new frame, in the same CRS as the polygons
        footprint = footprint.to_crs(crs)
        footprint[prop_filename_temp] = name
        usable_footprints[name] = footprint

    # Rank the footprints according to the keep_rules to determine which
    # file's polygons to keep where two footprints overlap. A lower
    # position means more preferred.
    rank_of = {}
    if usable_footprints:
        footprints_concat = gpd.GeoDataFrame(pd.concat(
            usable_footprints.values(), ignore_index=True))
        sort_props, sort_order = keep_rules_to_sort_order(keep_rules or [])
        if sort_props:
            footprints_concat.sort_values(
                by=sort_props,
                ascending=sort_order,
                inplace=True)
        for position, name in enumerate(
                footprints_concat[prop_filename_temp].tolist()):
            # A footprint with several rows is ranked by its best row
            rank_of.setdefault(name, position)

    # Polygons flagged in this function, one frame per processed pair
    removed = []

    # For every pairwise combination of two groups
    for name1, name2 in itertools.combinations(names, 2):
        footprint1 = usable_footprints.get(name1)
        footprint2 = usable_footprints.get(name2)

        if footprint1 is None or footprint2 is None:
            continue
        if name1 not in rank_of or name2 not in rank_of:
            # A footprint without rows has no rank and cannot overlap
            continue

        # Get overlap between two footprints
        overlap = gpd.GeoDataFrame(
            geometry=footprint1.intersection(footprint2))

        # Identify which group is not preferred in the pair. This is the
        # group whose polygons get flagged.
        least_preferred = name1
        if rank_of[name2] > rank_of[name1]:
            least_preferred = name2
        to_reduce = gdf_dict[least_preferred]

        # Find polygons in the non-preferred group that intersect with the
        # overlap. The inner join keeps the polygon index, so a membership
        # test gives one boolean per polygon even when a polygon matches
        # more than one overlap geometry.
        matched = to_reduce.sjoin(overlap, how='inner').index
        in_overlap = to_reduce.index.isin(matched)

        # Keep only the polygons outside of the overlap in the group, and
        # save the others as duplicates
        gdf_dict[least_preferred] = to_reduce[~in_overlap]
        removed.append(to_reduce[in_overlap])

    # Recombine the groups: everything left in them is not a duplicate
    keep = _stack_frames(gdf_dict.values(), gdf)
    keep[prop_duplicated] = False

    # Newly flagged polygons plus the ones flagged before this function
    to_remove_all = _stack_frames(removed + [known_dups], gdf)
    to_remove_all[prop_duplicated] = True

    # Combine the keep and to_remove_all into 1 gdf; they have the same
    # columns, so they are stacked
    to_return = _stack_frames([keep, to_remove_all], keep)

    sum_true = int(to_return[prop_duplicated].eq(True).sum())
    logger.info(f"Sum of True values in the {prop_duplicated}"
                f" col is: {sum_true}")

    return to_return
def label_duplicates(deduplicate_output, prop_duplicated):
    """Recombine the keep & removed GDFs and mark the removed as duplicates

    The frames passed in are copied before they are labelled, so the
    caller's objects are left unchanged.

    Parameters
    ----------
    deduplicate_output : dict
        Must have the keys 'to_keep' and 'to_remove'. Each value is a
        (Geo)DataFrame or None. Other keys are ignored.
    prop_duplicated : str
        The column name to use to flag duplicate polygons

    Returns
    -------
    GeoDataFrame
        The 'to_keep' rows followed by the 'to_remove' rows, with a fresh
        index. In `prop_duplicated`, 'to_remove' rows are True. 'to_keep'
        rows keep an existing value that is exactly True and are False
        otherwise.

    Raises
    ------
    ValueError
        Raised by pandas.concat when both 'to_keep' and 'to_remove' are
        None.
    """
    not_duplicates = deduplicate_output['to_keep']
    duplicates = deduplicate_output['to_remove']

    if duplicates is not None:
        # Work on a copy: the input is often a slice of a larger frame
        duplicates = duplicates.copy()
        duplicates[prop_duplicated] = True

    if not_duplicates is not None:
        not_duplicates = not_duplicates.copy()
        if prop_duplicated in not_duplicates.columns:
            # Polygons flagged in an earlier step stay flagged; anything
            # that is not exactly True (False, None, NaN, ...) becomes
            # False.
            not_duplicates[prop_duplicated] = \
                not_duplicates[prop_duplicated].apply(lambda x: x is True)
        else:
            # The column is not present yet: nothing here is a duplicate
            not_duplicates[prop_duplicated] = False

    # None entries are skipped by concat; ignore_index gives a clean index
    combined = pd.concat([not_duplicates, duplicates], ignore_index=True)

    if combined[prop_duplicated].isna().any():
        logger.info(f"{prop_duplicated} column has NA values.")

    return combined
# def plot_duplicates(deduplicate_output, split_by):
# """
# Plot the output of deduplication (useful for testing & choosing thresholds)
# """
# do = deduplicate_output
# ax = do['to_keep'].plot(column=split_by, cmap='Dark2', alpha=0.6)
# do['to_remove'].plot(
# ax=ax,
# facecolor='none',
# edgecolor='k',
# alpha=0.6,
# linewidth=0.5)
# do['intersections'].plot(
# ax=ax,
# facecolor='none',
# edgecolor='red',
# alpha=0.6,
# linewidth=0.5)
# return ax