In [None]:
import os
import operator
from collections import Counter
from collections import namedtuple

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from skimage.util import montage
import cv2

from IPython.display import display
from ipywidgets import Button, Layout, Box, VBox, Output

from sdcdup.utils import overlap_tag_pairs
from sdcdup.utils import generate_overlap_tag_slices
from sdcdup.utils import boundingbox_corners
from sdcdup.utils import generate_tag_pair_lookup
from sdcdup.utils import channel_shift
from sdcdup.utils import load_duplicate_truth
from sdcdup.utils import update_duplicate_truth
from sdcdup.utils import update_tile_cliques

from sdcdup.features.image_features import SDCImageContainer
from sdcdup.features.image_features import load_image_overlap_properties

%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Machine epsilon for float32.
EPS = np.finfo(np.float32).eps

# RGB colours used when drawing overlap bounding boxes.
RED = (244, 67, 54)         # F44336
GREEN = (76, 175, 80)       # 4CAF50
LIGHT_BLUE = (3, 169, 244)  # 03A9F4

# Font sizes shared by every matplotlib figure in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE, BIGGEST_SIZE = 10, 12, 16, 20

plt.rc('font', size=BIGGEST_SIZE)                                # default text size
plt.rc('axes', titlesize=BIGGEST_SIZE, labelsize=BIGGEST_SIZE)   # axes title and x/y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)                           # x tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)                           # y tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)                           # legend text
plt.rc('figure', titlesize=BIGGEST_SIZE)                         # figure title


def montage_rgb(x):
    """Montage each slice along the last axis of a 4-D array separately, then restack them on a new last axis."""
    per_channel = [montage(x[:, :, :, chan]) for chan in range(x.shape[3])]
    return np.stack(per_channel, -1)


def montage_pad(x, *args, **kwargs):
    """Call ``montage`` with ``padding_width=10``, forwarding any other arguments."""
    return montage(x, padding_width=10, *args, **kwargs)


# Input locations: raw training images and the pickled per-image grid files.
train_image_dir = 'data/raw/train_768/'
image_md5hash_grids_file = 'data/interim/image_md5hash_grids.pkl'
image_bm0hash_grids_file = 'data/interim/image_bm0hash_grids.pkl'
image_cm0hash_grids_file = 'data/interim/image_cm0hash_grids.pkl'
image_greycop_grids_file = 'data/interim/image_greycop_grids.pkl'
image_entropy_grids_file = 'data/interim/image_entropy_grids.pkl'
image_issolid_grids_file = 'data/interim/image_issolid_grids.pkl'
image_shipcnt_grids_file = 'data/interim/image_shipcnt_grids.pkl'

# Lookups indexed by overlap tag; used below to slice images and to pair up tile indices.
overlap_tag_slices = generate_overlap_tag_slices()
img_overlap_index_maps = generate_tag_pair_lookup()

# Axis tick positions: every `dtick` pixels from 0 to 768 inclusive.
dtick = 256
n_ticks = 768 // dtick + 1
ticks = list(range(0, n_ticks * dtick, dtick))

In [None]:
# SENDTOMODULE
class ImgMod:
    """
    Reads a single image to be modified by hls.

    The image is loaded lazily with ``cv2.imread`` on first access and every
    derived representation (HLS, BGR, RGB) is cached on the instance.
    ``parent_*`` attributes always refer to the unmodified image; ``cv2_*``
    attributes reflect the most recent ``brightness_shift`` request (or the
    unmodified image if none was made).
    """

    def __init__(self, filename):
        """Remember the path; no file I/O happens until a pixel property is read."""
        self.filename = filename
        # Last '/'-separated component of the path.
        self.img_id = filename.split('/')[-1]

        # Parameters of the pending channel shift (None means "no shift").
        self._hls_chan = None
        self._hls_gain = None

        # Lazily populated caches.
        self._parent_bgr = None
        self._parent_hls = None
        self._parent_rgb = None
        self._cv2_hls = None
        self._cv2_bgr = None
        self._cv2_rgb = None

    def brightness_shift(self, chan, gain):
        """
        Apply ``channel_shift(parent_hls, chan, gain)`` and return the result as RGB.

        All three derived caches are invalidated: clearing only the HLS cache
        would make a second call return the RGB image of the previous shift.
        """
        self._hls_chan = chan
        self._hls_gain = gain
        self._cv2_hls = None
        self._cv2_bgr = None
        self._cv2_rgb = None
        return self.cv2_rgb

    def scale(self, minval, maxval):
        """
        Linearly map ``[minval, maxval]`` of the parent BGR image onto ``[0, 255]``.

        Values outside the range are clipped. A zero-width range yields an
        all-zero image. Returns a new ``uint8`` array; the cached parent image
        is left untouched.
        """
        span = maxval - minval
        if span == 0:
            return np.zeros_like(self.parent_bgr, dtype=np.uint8)
        # Work in float so that pixels below `minval` do not wrap around in uint8.
        shifted = self.parent_bgr.astype(np.float64) - minval
        res = (255.0 / span) * shifted
        return np.clip(np.around(res), 0, 255).astype(np.uint8)

    @property
    def shape(self):
        """Shape of the parent BGR array."""
        return self.parent_bgr.shape

    @property
    def parent_bgr(self):
        """Unmodified image as loaded by ``cv2.imread`` (cached).

        Raises:
            OSError: if ``cv2.imread`` returns ``None`` for ``self.filename``.
        """
        if self._parent_bgr is None:
            img = cv2.imread(self.filename)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise OSError(f'Unable to read image: {self.filename}')
            self._parent_bgr = img
        return self._parent_bgr

    @property
    def parent_hls(self):
        """Unmodified image converted to HLS (cached)."""
        if self._parent_hls is None:
            self._parent_hls = self.to_hls(self.parent_bgr)
        return self._parent_hls

    @property
    def parent_rgb(self):
        """Unmodified image converted to RGB (cached; callers receive the cached array itself)."""
        if self._parent_rgb is None:
            self._parent_rgb = self.to_rgb(self.parent_bgr)
        return self._parent_rgb

    @property
    def cv2_hls(self):
        """HLS image after the pending channel shift, or ``parent_hls`` when no gain is set."""
        if self._cv2_hls is None:
            if self._hls_gain is None:
                self._cv2_hls = self.parent_hls
            else:
                self._cv2_hls = channel_shift(self.parent_hls, self._hls_chan, self._hls_gain)
        return self._cv2_hls

    @property
    def cv2_bgr(self):
        """``cv2_hls`` converted back to BGR (cached)."""
        if self._cv2_bgr is None:
            self._cv2_bgr = self.to_bgr(self.cv2_hls)
        return self._cv2_bgr

    @property
    def cv2_rgb(self):
        """``cv2_bgr`` converted to RGB (cached)."""
        if self._cv2_rgb is None:
            self._cv2_rgb = self.to_rgb(self.cv2_bgr)
        return self._cv2_rgb

    def to_hls(self, bgr):
        """Convert BGR to HLS using ``cv2.COLOR_BGR2HLS_FULL``."""
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2HLS_FULL)

    def to_bgr(self, hls):
        """Convert HLS to BGR using ``cv2.COLOR_HLS2BGR_FULL``."""
        return cv2.cvtColor(hls, cv2.COLOR_HLS2BGR_FULL)

    def to_rgb(self, bgr):
        """Convert BGR to RGB using ``cv2.COLOR_BGR2RGB``."""
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

In [None]:
# Build the image container and feed it the pickled grid files defined in the config cell.
sdcic = SDCImageContainer()

# Order matters: the files are passed positionally.
image_property_grid_files = (
    image_md5hash_grids_file,
    image_bm0hash_grids_file,
    image_cm0hash_grids_file,
    image_greycop_grids_file,
    image_entropy_grids_file,
    image_issolid_grids_file,
)
sdcic.preprocess_image_properties(*image_property_grid_files)
sdcic.preprocess_label_properties(image_shipcnt_grids_file)

In [None]:
# Load the existing truth labels. Later cells treat this as a mapping from
# (img1_id, img2_id, img1_overlap_tag) to an is-duplicate flag.
dup_truth = load_duplicate_truth()
print(len(dup_truth))

In [None]:
# Load the overlap scores for the listed numbers of matching tiles.
# Later cells index the result as overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag]
# and read the .bmh, .pix, .px0 and .shp fields of each entry.
# score_types = ['bmh', 'cmh', 'con', 'hom', 'eng', 'cor', 'epy', 'enp', 'pix', 'px0', 'shp']
n_matching_tiles_list = [9, 6, 4, 3, 2, 1]
overlap_image_maps = load_image_overlap_properties(n_matching_tiles_list)
print(len(overlap_image_maps))

## Search for reasonable thresholds

In [None]:
# Pool the per-tile scores of every overlap that has no truth label yet.
bmh_arr, pix_arr, px0_arr, shp_arr = [], [], [], []

for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    for img1_overlap_tag, scores in overlap_maps.items():
        already_labelled = (img1_id, img2_id, img1_overlap_tag) in dup_truth
        if not already_labelled:
            bmh_arr.extend(scores.bmh)
            pix_arr.extend(scores.pix)
            px0_arr.extend(scores.px0)
            shp_arr.extend(scores.shp)

# One column per metric, added in a fixed order.
overlap_limits_df = pd.DataFrame()
for column_name, column_values in (('bmh', bmh_arr),
                                   ('pix', pix_arr),
                                   ('px0', px0_arr),
                                   ('shp', shp_arr)):
    overlap_limits_df[column_name] = pd.Series(column_values)

# Percentile summary used to pick the thresholds in the next section.
overlap_limits_df.describe(percentiles=[.01, .02, .05, .1, .25, .5, .75, .9, .95, .98, .99])

## Filter overlap candidates by score thresholds

In [None]:
# Select unlabelled overlaps whose scores fall outside the [lower, upper] band
# for at least two of the eight (metric, bound) checks below.
#
#  |-----|--------------|-----|
# min  lower          upper  max

metric_tags = ['bmh', 'pix', 'px0', 'shp']
Overlap_Scores_Lower_Limit = namedtuple('overlap_scores_lower_limit', metric_tags)
Overlap_Scores_Upper_Limit = namedtuple('overlap_scores_upper_limit', metric_tags)

# Thresholds, one per metric in `metric_tags` order.
# NOTE(review): with lower limits of 0 for pix/px0/shp, the `< lower` checks can
# only fire on negative scores -- confirm that is intended.
osl_lower = Overlap_Scores_Lower_Limit(0.5, 0, 0, 0)
osl_upper = Overlap_Scores_Upper_Limit(1.0, 1e7, 1e5, 1e4)

# Record stored for each selected overlap: its key plus the per-metric min and max.
Overlap_Idx_Scores = namedtuple('overlap_idx_scores', [
    'idx', 
    'bmh_min', 'pix_min', 'px0_min', 'shp_min', 
    'bmh_max', 'pix_max', 'px0_max', 'shp_max'])

# Initial values; each of these names is reassigned inside the loop below for
# every overlap that is processed.
bmh_min = 0
pix_min = 0
px0_min = 0
shp_min = 0

# presumably the theoretical maxima for a 256x256 tile -- TODO confirm
bmh_max = 1
pix_max = 256*256*3*255
px0_max = 256*256
shp_max = 256*256

# Number of overlaps that fell below each lower limit.
bmh_min_hits = 0
pix_min_hits = 0
px0_min_hits = 0
shp_min_hits = 0

# Number of overlaps that exceeded each upper limit.
bmh_max_hits = 0
pix_max_hits = 0
px0_max_hits = 0
shp_max_hits = 0

overlap_candidates = []
for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    for img1_overlap_tag, scores in overlap_maps.items():
        # Overlaps that already have a truth label are not candidates.
        if (img1_id, img2_id, img1_overlap_tag) in dup_truth:
            continue

        # How many of the eight limit checks this overlap trips.
        constraint_hits = 0
        
        # --- lower-limit checks (minimum over the overlap's scores) ---
        bmh_min = np.min(scores.bmh)
        if bmh_min < osl_lower.bmh:
            bmh_min_hits += 1
            constraint_hits += 1
            
        pix_min = np.min(scores.pix)
        if pix_min < osl_lower.pix:
            pix_min_hits += 1
            constraint_hits += 1

        px0_min = np.min(scores.px0)
        if px0_min < osl_lower.px0:
            px0_min_hits += 1
            constraint_hits += 1

        shp_min = np.min(scores.shp)
        if shp_min < osl_lower.shp:
            shp_min_hits += 1
            constraint_hits += 1

            
        # --- upper-limit checks (maximum over the overlap's scores) ---
        bmh_max = np.max(scores.bmh)
        if bmh_max > osl_upper.bmh:
            bmh_max_hits += 1
            constraint_hits += 1
            
        pix_max = np.max(scores.pix)
        if pix_max > osl_upper.pix:
            pix_max_hits += 1
            constraint_hits += 1

        px0_max = np.max(scores.px0)
        if px0_max > osl_upper.px0:
            px0_max_hits += 1
            constraint_hits += 1

        shp_max = np.max(scores.shp)
        if shp_max > osl_upper.shp:
            shp_max_hits += 1
            constraint_hits += 1

        # Keep only overlaps that trip two or more checks.
        if constraint_hits < 2:
            continue
            
        overlap_scores = Overlap_Idx_Scores(
            (img1_id, img2_id, img1_overlap_tag), 
            bmh_min, pix_min, px0_min, shp_min, 
            bmh_max, pix_max, px0_max, shp_max)
        overlap_candidates.append(overlap_scores)
        
# Summary: number of candidates, then per-metric lower-limit and upper-limit hit counts.
print(len(overlap_candidates))
print(bmh_min_hits, pix_min_hits, px0_min_hits, shp_min_hits)
print(bmh_max_hits, pix_max_hits, px0_max_hits, shp_max_hits)

In [None]:
# Collect the md5 hash of every tile whose issolid entries are all >= 0.
solid_hashes = set()
for img_id, tile_issolid_grid in sdcic.tile_issolid_grids.items():
    rows_with_nonneg = np.unique(np.where(tile_issolid_grid >= 0)[0])
    solid_hashes.update(
        sdcic.tile_md5hash_grids[img_id][row]
        for row in rows_with_nonneg
        if np.all(tile_issolid_grid[row] >= 0)
    )

print('solid hashes', solid_hashes)

# Two graphs over tile hashes, seeded from the truth labels:
#   dup graph - built by update_tile_cliques for pairs labelled as duplicates
#   dif graph - one edge per pair of distinct hashes labelled as different
tile_hash_dup_cliques = nx.Graph()
tile_hash_dif_cliques = nx.Graph()

for (img1_id, img2_id, img1_overlap_tag), is_dup in dup_truth.items():
    for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:
        tile1_hash = sdcic.tile_md5hash_grids[img1_id][idx1]
        tile2_hash = sdcic.tile_md5hash_grids[img2_id][idx2]
        if is_dup:
            # Pairs involving a solid tile are left out of the duplicate graph.
            if tile1_hash not in solid_hashes and tile2_hash not in solid_hashes:
                update_tile_cliques(tile_hash_dup_cliques, tile1_hash, tile2_hash)
        elif tile1_hash != tile2_hash:
            # Identical hashes cannot be recorded as different.
            tile_hash_dif_cliques.add_edge(tile1_hash, tile2_hash)

print(tile_hash_dup_cliques.number_of_nodes(), tile_hash_dif_cliques.number_of_nodes())

# Histogram of duplicate-component sizes: {component size: number of components}.
neighbor_counts = Counter(
    len(tile_hashes) for tile_hashes in nx.connected_components(tile_hash_dup_cliques)
)

sorted(neighbor_counts.items())

In [None]:
# Order the candidates by (shp_max, px0_max), largest first, and keep only their keys.
ranked_candidates = sorted(
    overlap_candidates,
    key=lambda cand: (cand.shp_max, cand.px0_max),
    reverse=True,
)
duplicate_candidates = [cand.idx for cand in tqdm_notebook(ranked_candidates)]
print(len(duplicate_candidates))

# Create an interactive widget for tagging duplicate overlaps.

In [None]:
# Shared state for the labelling widget.
# `candidates_iter` is consumed by get_next_img_pair(); `candidates_idx` counts
# how many candidates have been pulled from it so far.
candidates_iter = iter(duplicate_candidates)
n_candidates = len(duplicate_candidates)
candidates_idx = 0

# Labels assigned by button clicks, and labels inferred from the clique graphs.
# Both are keyed by (img1_id, img2_id, img1_overlap_tag) with values 1 (same) / 0 (different).
overlap_labels = {}
auto_overlap_labels = {}

# Layout for the row of buttons below the figure.
box_layout = Layout(display='flex',
                    flex_flow='row',
                    align_items='stretch',
                    width='100%')

# NOTE(review): not referenced anywhere else in the visible cells.
white_tile_hash = '0139c6c3'

def get_next_img_pair(n_skip=0):
    """
    Advance `candidates_iter` to the next candidate that needs a manual label.

    Candidates are skipped when they already appear in `overlap_labels` or
    `dup_truth`. Candidates that the clique graphs can decide on their own are
    recorded in `auto_overlap_labels`, merged into the matching graph, and
    skipped as well:

    * any tile pair connected in `tile_hash_dif_cliques`  -> label 0
    * every tile pair connected in `tile_hash_dup_cliques` -> label 1

    Args:
        n_skip: number of candidates to discard unconditionally before the
            checks above start (default 0).

    Returns:
        The (img1_id, img2_id, img1_overlap_tag) key of the candidate to show.

    Raises:
        StopIteration: when `candidates_iter` is exhausted.

    Side effects:
        Rebinds the globals `img1_id`, `img2_id`, `img1_overlap_tag` and
        increments `candidates_idx` once per candidate consumed.
    """
    global img1_id, img2_id, img1_overlap_tag, candidates_idx
    i_skip = 0

    while True:
        img1_id, img2_id, img1_overlap_tag = next(candidates_iter)
        candidates_idx += 1

        if i_skip < n_skip:
            i_skip += 1
            continue

        assert img1_id < img2_id
        overlap_key = (img1_id, img2_id, img1_overlap_tag)

        # Make sure we don't revisit anything we've already seen (and labelled) before
        if overlap_key in overlap_labels:
            continue

        # If a candidate already has a label, skip it.
        if overlap_key in dup_truth:
            continue

        # 1 = all tile pairs are known duplicates, 0 = some pair is known to
        # differ, -1 = at least one pair is unknown (needs a human).
        is_dup = 1
        for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:

            tile1_hash = sdcic.tile_md5hash_grids[img1_id][idx1]
            tile2_hash = sdcic.tile_md5hash_grids[img2_id][idx2]

            # has_edge() is False when either node is missing, so it replaces
            # the membership test plus a neighbour set built per tile pair.
            if tile_hash_dif_cliques.has_edge(tile1_hash, tile2_hash):
                is_dup = 0
                break
            elif tile_hash_dup_cliques.has_edge(tile1_hash, tile2_hash):
                continue
            else:
                # Keep scanning: a later pair may still prove the overlap different.
                is_dup = -1

        if is_dup == 1:
            auto_overlap_labels[overlap_key] = 1
            update_tile_hash_dup_cliques(img1_id, img2_id, img1_overlap_tag)
            continue
        elif is_dup == 0:
            auto_overlap_labels[overlap_key] = 0
            update_tile_hash_dif_cliques(img1_id, img2_id, img1_overlap_tag)
            continue

        break

    return img1_id, img2_id, img1_overlap_tag


def draw_images(img1_id, img2_id, img1_overlap_tag):
    """
    Plot a 2x2 comparison of two images for one overlap.

    Top row: both images as loaded, with the overlap region outlined.
    Bottom row: the same images with the overlap region replaced by the image
    minus the per-channel median of the two overlap regions; the titles show
    the minimum bmh/shp and px0/pix scores of the overlap.

    The outline is green or red when the overlap has a truthy or falsy entry in
    `dup_truth`, and light blue when it has no entry.

    Args:
        img1_id, img2_id: image file names, joined onto `train_image_dir`.
        img1_overlap_tag: overlap tag of the first image; the second image uses
            `overlap_tag_pairs[img1_overlap_tag]`.

    Returns:
        The 2x2 array of matplotlib axes.
    """
    overlap_key = (img1_id, img2_id, img1_overlap_tag)
    img2_overlap_tag = overlap_tag_pairs[img1_overlap_tag]

    scores = overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag]
    imgmod1 = ImgMod(os.path.join(train_image_dir, img1_id))
    imgmod2 = ImgMod(os.path.join(train_image_dir, img2_id))

    slice1 = overlap_tag_slices[img1_overlap_tag]
    slice2 = overlap_tag_slices[img2_overlap_tag]

    # Per-channel median over both overlap regions.
    m12 = np.median(np.vstack([imgmod1.parent_rgb[slice1], imgmod2.parent_rgb[slice2]]), axis=(0, 1), keepdims=True).astype(np.uint8)
    # NOTE(review): if parent_rgb is uint8 this subtraction wraps modulo 256 for
    # pixels darker than the median -- confirm that is the intended rendering.
    img1_drop = imgmod1.parent_rgb - m12
    img2_drop = imgmod2.parent_rgb - m12

    if overlap_key in dup_truth:
        bbox_color = GREEN if dup_truth[overlap_key] else RED
    else:
        bbox_color = LIGHT_BLUE

    bbox_thickness = 5
    offset = (bbox_thickness // 2) + 1
    # Shift the first corner by +offset and the second by -offset.
    offset_array = np.array([[offset], [-offset]])
    img1_corners = boundingbox_corners[img1_overlap_tag] + offset_array
    img2_corners = boundingbox_corners[img2_overlap_tag] + offset_array

    def _draw_bbox(img, corners):
        """Outline the overlap on `img` in place; points are passed as plain ints."""
        pt1, pt2 = (tuple(int(v) for v in corner) for corner in corners)
        cv2.rectangle(img, pt1, pt2, bbox_color, bbox_thickness)
        return img

    # Draw on copies so the arrays cached inside ImgMod are never modified and
    # each panel owns its pixels, whether or not imshow copies its input.
    img1_top = _draw_bbox(imgmod1.parent_rgb.copy(), img1_corners)
    img2_top = _draw_bbox(imgmod2.parent_rgb.copy(), img2_corners)

    # Bottom panels start from the outlined top panels, then replace the
    # overlap region with the median-subtracted pixels and redraw the outline.
    img1_bottom = img1_top.copy()
    img2_bottom = img2_top.copy()
    img1_bottom[slice1] = img1_drop[slice1]
    img2_bottom[slice2] = img2_drop[slice2]
    _draw_bbox(img1_bottom, img1_corners)
    _draw_bbox(img2_bottom, img2_corners)

    fig, ax = plt.subplots(2, 2, figsize=(16, 16))
    panels = (
        (ax[0][0], img1_top, f'{img1_id}'),
        (ax[0][1], img2_top, f'{img2_id}'),
        (ax[1][0], img1_bottom, f'bmh: {np.min(scores.bmh):7.5f} shp: {np.min(scores.shp)}'),
        (ax[1][1], img2_bottom, f'px0: {np.min(scores.px0)} pix: {np.min(scores.pix)}'),
    )
    for panel_ax, panel_img, panel_title in panels:
        panel_ax.imshow(panel_img)
        panel_ax.set_title(panel_title)
        panel_ax.set_xticks(ticks)
        panel_ax.set_yticks(ticks)

    return ax


def redraw(img1_id, img2_id, img1_overlap_tag):
    """Clear the `out` widget and render the comparison figure for the given overlap inside it."""
    out.clear_output(wait=True)
    with out:
        draw_images(img1_id, img2_id, img1_overlap_tag)
        plt.show()
    

def update_tile_hash_dup_cliques(img1_id, img2_id, img1_overlap_tag):
    """
    Merge the tile-hash pairs of one overlap into `tile_hash_dup_cliques`.

    Pairs in which either hash is in `solid_hashes` are ignored.
    """
    for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:
        hash_pair = (
            sdcic.tile_md5hash_grids[img1_id][idx1],
            sdcic.tile_md5hash_grids[img2_id][idx2],
        )
        if any(tile_hash in solid_hashes for tile_hash in hash_pair):
            continue
        update_tile_cliques(tile_hash_dup_cliques, *hash_pair)


def update_tile_hash_dif_cliques(img1_id, img2_id, img1_overlap_tag):
    """
    Add an edge to `tile_hash_dif_cliques` for each tile-hash pair of one overlap.

    Pairs whose two hashes are equal are ignored.
    """
    for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:
        hash_a = sdcic.tile_md5hash_grids[img1_id][idx1]
        hash_b = sdcic.tile_md5hash_grids[img2_id][idx2]
        if hash_a != hash_b:
            tile_hash_dif_cliques.add_edge(hash_a, hash_b)


def _show_next_img_pair():
    """
    Move the widget to the next candidate and redraw it.

    When `candidates_iter` is exhausted, the three buttons are disabled and a
    message is shown in `out`, instead of letting StopIteration escape from the
    click handler.
    """
    global img1_id, img2_id, img1_overlap_tag
    try:
        img1_id, img2_id, img1_overlap_tag = get_next_img_pair()
    except StopIteration:
        # Nothing left to label: stop further clicks from relabelling whatever
        # the globals currently point at.
        for button in (same_button, diff_button, skip_button):
            button.disabled = True
        out.clear_output(wait=True)
        with out:
            print('No more candidates to label.')
        return
    redraw(img1_id, img2_id, img1_overlap_tag)


def on_same_button_clicked(b):
    """Label the displayed overlap as a duplicate (1), update the dup graph, then advance."""
    overlap_labels[(img1_id, img2_id, img1_overlap_tag)] = 1
    update_tile_hash_dup_cliques(img1_id, img2_id, img1_overlap_tag)
    _show_next_img_pair()


def on_diff_button_clicked(b):
    """Label the displayed overlap as different (0), update the dif graph, then advance."""
    overlap_labels[(img1_id, img2_id, img1_overlap_tag)] = 0
    update_tile_hash_dif_cliques(img1_id, img2_id, img1_overlap_tag)
    _show_next_img_pair()


def on_skip_button_clicked(b):
    """Advance to the next candidate without recording a label."""
    _show_next_img_pair()


# The three labelling buttons. `icon` takes a Font Awesome icon name without the
# 'fa-' prefix; 'x' and '?' do not appear to be valid names, so those two buttons
# now use 'times' and 'question'.
same_button = Button(
    description='Same',
    disabled=False,
    layout=Layout(flex='1 1 auto', width='auto'),
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Overlays are the same',
    icon='check'
)

diff_button = Button(
    description='Diff',
    disabled=False,
    layout=Layout(flex='1 1 auto', width='auto'),
    button_style='danger', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Overlays are different',
    icon='times'
)

skip_button = Button(
    description='Skip',
    disabled=False,
    layout=Layout(flex='1 1 auto', width='auto'),
    button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Not sure.  Skip for now.',
    icon='question'
)

same_button.on_click(on_same_button_clicked)
diff_button.on_click(on_diff_button_clicked)
skip_button.on_click(on_skip_button_clicked)

# Figure output area stacked above the row of buttons.
out = Output()
buttons_3 = Box(children=[same_button, diff_button, skip_button], layout=box_layout)
display(VBox([out, buttons_3]))

# Show the first candidate. This raises StopIteration if there is nothing to label.
img1_id, img2_id, img1_overlap_tag = get_next_img_pair()
with out:
    ax = draw_images(img1_id, img2_id, img1_overlap_tag)
    plt.show()

In [None]:
# Show the stored scores for the overlap the widget is currently pointing at
# (read from the img1_id / img2_id / img1_overlap_tag globals).
overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag]

In [None]:
# Progress so far: manual labels, automatic labels, candidates consumed,
# total candidates, and manual labels as a percentage of all candidates.
len(overlap_labels), len(auto_overlap_labels), candidates_idx, n_candidates, 100*len(overlap_labels)/n_candidates

In [None]:
# Undo the most recent manual label.
# Run this cell if you clicked 'Diff' when you meant 'Same' (or vice versa); it
# removes the entry most recently added to overlap_labels. 'Skip' never adds
# anything to overlap_labels, so an accidental 'Skip' needs no undo -- the
# example will most likely come up again the next time you run the tool.
#
# NOTE: only the label is removed. Edges that the click added to
# tile_hash_dup_cliques / tile_hash_dif_cliques are not reverted.
if overlap_labels:
    # popitem() removes and returns the last-inserted (key, value) pair.
    k, undone_label = overlap_labels.popitem()
    print(k)
else:
    print('overlap_labels is empty; nothing to undo.')

In [None]:
# Merge the manual labels in overlap_labels into the truth.
# Do not shut down this notebook before running this cell, otherwise the
# labelling work done above is lost. This is deliberate: the widget does not
# write anything out when a button is clicked, so the truth is only updated
# once you are sure you want to keep the labels.
# Prints the number of truth entries before and after the merge.
print(len(dup_truth))
dup_truth = update_duplicate_truth(overlap_labels, auto=False)
print(len(dup_truth))

In [None]:
# Auto1
# Label as different (0) every candidate with at least one tile pair that is
# connected in tile_hash_dif_cliques. Manually labelled candidates are skipped.
#
# The loop uses its own variable names so it does not rebind img1_id / img2_id /
# img1_overlap_tag, which the labelling widget reads on the next button click.
auto_overlap_labels_0 = {}

for candidate in overlap_candidates:

    overlap_key = candidate.idx
    cand_img1_id, cand_img2_id, cand_overlap_tag = overlap_key
    assert cand_img1_id < cand_img2_id
    if overlap_key in overlap_labels or overlap_key in auto_overlap_labels_0:
        continue

    # has_edge() is False when either hash is not in the graph, so no separate
    # membership test or per-pair neighbour set is needed.
    has_known_difference = any(
        tile_hash_dif_cliques.has_edge(
            sdcic.tile_md5hash_grids[cand_img1_id][idx1],
            sdcic.tile_md5hash_grids[cand_img2_id][idx2])
        for idx1, idx2 in img_overlap_index_maps[cand_overlap_tag]
    )
    if has_known_difference:
        auto_overlap_labels_0[overlap_key] = 0

print(len(auto_overlap_labels_0))

In [None]:
# Auto2
# Label as duplicate (1) every candidate whose tile pairs are all connected in
# tile_hash_dup_cliques. Manually labelled candidates are skipped.
#
# The loop uses its own variable names so it does not rebind img1_id / img2_id /
# img1_overlap_tag, which the labelling widget reads on the next button click.
auto_overlap_labels_1 = {}

for candidate in overlap_candidates:

    overlap_key = candidate.idx
    cand_img1_id, cand_img2_id, cand_overlap_tag = overlap_key
    assert cand_img1_id < cand_img2_id
    if overlap_key in overlap_labels or overlap_key in auto_overlap_labels_1:
        continue

    # has_edge() is False when either hash is not in the graph. all() over an
    # empty index map is True, matching a for/else that never breaks.
    all_pairs_known_duplicates = all(
        tile_hash_dup_cliques.has_edge(
            sdcic.tile_md5hash_grids[cand_img1_id][idx1],
            sdcic.tile_md5hash_grids[cand_img2_id][idx2])
        for idx1, idx2 in img_overlap_index_maps[cand_overlap_tag]
    )
    if all_pairs_known_duplicates:
        auto_overlap_labels_1[overlap_key] = 1

print(len(auto_overlap_labels_1))

In [None]:
# Auto3
# Combine the Auto1 and Auto2 results into a fresh auto_overlap_labels dict,
# after checking that no key was labelled by both passes.
auto_overlap_labels = dict()
assert not any(key in auto_overlap_labels_1 for key in auto_overlap_labels_0)
for partial_labels in (auto_overlap_labels_0, auto_overlap_labels_1):
    auto_overlap_labels.update(partial_labels)
print(len(auto_overlap_labels))

In [None]:
# Need to verify, but I think running Auto1, Auto2, then Auto3 is exactly the same as just running this cell.

# If you're stopping early from labelling (maybe you're tired or just plain sick of doing it),
# you can run this to pick up any unvisited overlaps that our cliques would have gotten.
#
# candidates_iter yields the plain (img1_id, img2_id, img1_overlap_tag) tuples stored in
# duplicate_candidates, so each item is used as the key directly. The loop uses its own
# variable names so it does not rebind the globals the labelling widget reads.
for overlap_key in candidates_iter:
    cand_img1_id, cand_img2_id, cand_overlap_tag = overlap_key

    # Skip anything that already has an automatic, manual or truth label,
    # mirroring the checks in get_next_img_pair().
    if (overlap_key in auto_overlap_labels
            or overlap_key in overlap_labels
            or overlap_key in dup_truth):
        continue

    # 1 = all tile pairs are known duplicates, 0 = some pair is known to
    # differ, -1 = at least one pair is unknown.
    is_dup = 1
    for idx1, idx2 in img_overlap_index_maps[cand_overlap_tag]:

        tile1_hash = sdcic.tile_md5hash_grids[cand_img1_id][idx1]
        tile2_hash = sdcic.tile_md5hash_grids[cand_img2_id][idx2]

        # has_edge() is False when either hash is not in the graph.
        if tile_hash_dif_cliques.has_edge(tile1_hash, tile2_hash):
            is_dup = 0
            break
        elif tile_hash_dup_cliques.has_edge(tile1_hash, tile2_hash):
            continue
        else:
            is_dup = -1

    # Undecided overlaps are left unlabelled.
    if is_dup == -1:
        continue

    auto_overlap_labels[overlap_key] = is_dup

print(len(auto_overlap_labels))

In [None]:
# Merge the automatically inferred labels into the truth (flagged with auto=True)
# and print the new number of truth entries.
dup_truth = update_duplicate_truth(auto_overlap_labels, auto=True)
print(len(dup_truth))