In [None]:
import os
import operator
from collections import Counter
from collections import namedtuple

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from IPython.display import display
from ipywidgets import Button, Layout, Box, VBox, Output

from sdcdup.utils import generate_overlap_tag_slices
from sdcdup.utils import generate_tag_pair_lookup
from sdcdup.utils import get_project_root
from sdcdup.utils import get_overlap_matches
from sdcdup.utils import load_duplicate_truth
from sdcdup.utils import update_duplicate_truth
from sdcdup.utils import update_tile_cliques
from sdcdup.utils import ImgMod
from sdcdup.features import SDCImageContainer
from sdcdup.visualization import get_ticks
from sdcdup.visualization import show_image_pair
from sdcdup.visualization import ChannelShift

# Load the dotenv IPython extension and run it so that variables such as
# RAW_DATA_DIR (read with os.getenv further down) are available in the environment.
%load_ext dotenv
%dotenv
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Reload edited modules automatically before each cell executes.
%reload_ext autoreload
%autoreload 2

# Bounding-box colours as RGB triples; the matching hex code is noted beside each.
RED = (244, 67, 54)  #F44336
GREEN = (76, 175, 80)  #4CAF50
BLUE = (3, 169, 244)  #03A9F4

# Font-size ladder used by the matplotlib defaults below.
SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 16
BIGGEST_SIZE = 20

# (rc group, overrides) pairs, applied in order.
_RC_OVERRIDES = (
    ('font', {'size': BIGGEST_SIZE}),         # default text size
    ('axes', {'titlesize': BIGGEST_SIZE}),    # axes title
    ('axes', {'labelsize': BIGGEST_SIZE}),    # x and y labels
    ('xtick', {'labelsize': BIGGER_SIZE}),    # x tick labels
    ('ytick', {'labelsize': BIGGER_SIZE}),    # y tick labels
    ('legend', {'fontsize': MEDIUM_SIZE}),    # legend text
    ('figure', {'titlesize': BIGGEST_SIZE}),  # figure title
)
for _rc_group, _rc_settings in _RC_OVERRIDES:
    plt.rc(_rc_group, **_rc_settings)

project_root = get_project_root()

# RAW_DATA_DIR comes from the environment (populated by %dotenv above).
# os.path.join() fails with an opaque TypeError when handed None, so check
# explicitly and say what is missing.
raw_data_dir = os.getenv('RAW_DATA_DIR')
if raw_data_dir is None:
    raise EnvironmentError(
        "RAW_DATA_DIR is not set; define it in the environment (or the .env file) "
        "before running this notebook."
    )
train_image_dir = os.path.join(project_root, raw_data_dir, 'train_768')

# Lookup tables used by the clique bookkeeping and the plotting helpers below.
overlap_tag_slices = generate_overlap_tag_slices()
tag_pair_lookup = generate_tag_pair_lookup()

ticks = get_ticks()

In [None]:
# Overlap-match files to load; the same list is reused when loading the scores.
matches_files = ['matches_bmh96_0.9.csv']

sdcic = SDCImageContainer()
sdcic.matches = get_overlap_matches(matches_files)

In [None]:
# Score families requested for every candidate overlap.
score_types = ['bmh96', 'pix', 'avg', 'dnn']
overlap_image_maps = sdcic.load_image_overlap_properties(
    matches_files,
    score_types=score_types,
)
# Number of (img1_id, img2_id) entries that came back.
print(len(overlap_image_maps))

In [None]:
# Labels already on record, looked up by (img1_id, img2_id, img1_overlap_tag).
dup_truth = load_duplicate_truth()
n_known_labels = len(dup_truth)
print(n_known_labels)

## Search for reasonable thresholds

In [None]:
# Pool every per-tile score of the overlaps that are still unlabeled,
# one flat list per score type.
score_arrays = {score_type: [] for score_type in score_types}

for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    for img1_overlap_tag, scores in overlap_maps.items():
        # Overlaps that already have a truth label are left out.
        if (img1_id, img2_id, img1_overlap_tag) in dup_truth:
            continue
        scores_dict = scores._asdict()
        for score_type in score_types:
            score_arrays[score_type].extend(scores_dict[score_type])

# Size of the pooled list for the last score type.
print(len(score_arrays[score_types[-1]]))

In [None]:
# One column per score type, then summary statistics with extra tail percentiles.
overlap_limits_df = pd.DataFrame()
for score_type, pooled_scores in score_arrays.items():
    overlap_limits_df[score_type] = pd.Series(pooled_scores)

tail_percentiles = [.01, .02, .05, .1, .25, .5, .75, .9, .95, .98, .99]
overlap_limits_df.describe(percentiles=tail_percentiles)

## Filter

In [None]:
# Each metric gets a [lower, upper] band:
#
#  |-----|--------------|-----|
# min  lower          upper  max
#
# An overlap scores one "hit" per metric whose smallest value falls below
# `lower` and one per metric whose largest value rises above `upper`.
# Only overlaps with more than two hits are kept as candidates.
#
# NOTE(review): the scores loaded earlier were requested with
# score_types = ['bmh96', 'pix', 'avg', 'dnn'], whereas this cell looks up
# 'cmh' and 'px0'. Confirm the score tuples really expose those two fields;
# otherwise the lookups below fail with KeyError.

overlap_metric_ids = ['bmh96', 'cmh', 'pix', 'px0']
Overlap_Scores_Limit = namedtuple('overlap_scores_limit', overlap_metric_ids)

# Band edges per metric, stored as plain mappings keyed by metric id.
osl_lower = Overlap_Scores_Limit(0.99, 0.99, 0, 0)._asdict()
osl_upper = Overlap_Scores_Limit(1.0, 1.0, 1e4, 1e4)._asdict()

# Record layout: idx, then every <metric>_min, then every <metric>_max.
scores_hilo_keys = (
    [metric_id + '_min' for metric_id in overlap_metric_ids]
    + [metric_id + '_max' for metric_id in overlap_metric_ids]
)
Overlap_Idx_Scores = namedtuple('overlap_idx_scores', ['idx', *scores_hilo_keys])

# Per-metric tallies of band violations, counted over every unlabeled overlap
# (including the ones that do not make it into overlap_candidates).
min_hits = dict.fromkeys(overlap_metric_ids, 0)
max_hits = dict.fromkeys(overlap_metric_ids, 0)

overlap_candidates = []
for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    for img1_overlap_tag, scores in overlap_maps.items():
        overlap_idx = (img1_id, img2_id, img1_overlap_tag)
        if overlap_idx in dup_truth:
            continue

        overlap_metrics_dict = scores._asdict()
        lows = {m_id: np.min(overlap_metrics_dict[m_id]) for m_id in overlap_metric_ids}
        highs = {m_id: np.max(overlap_metrics_dict[m_id]) for m_id in overlap_metric_ids}

        constraint_hits = 0
        for m_id in overlap_metric_ids:
            if lows[m_id] < osl_lower[m_id]:
                min_hits[m_id] += 1
                constraint_hits += 1
            if highs[m_id] > osl_upper[m_id]:
                max_hits[m_id] += 1
                constraint_hits += 1

        if constraint_hits > 2:
            overlap_candidates.append(
                Overlap_Idx_Scores(overlap_idx, *lows.values(), *highs.values()))

print(len(overlap_candidates))
print(*(min_hits[m_id] for m_id in overlap_metric_ids))
print(*(max_hits[m_id] for m_id in overlap_metric_ids))

In [None]:
# md5 hashes of tiles whose 'sol' entries are all >= 0.
solid_hashes = set()
for img_id, tile_issolid_grid in sdcic.img_metrics['sol'].items():
    # Only rows with at least one non-negative entry need the full check.
    for idx in np.unique(np.where(tile_issolid_grid >= 0)[0]):
        if np.all(tile_issolid_grid[idx] >= 0):
            solid_hashes.add(sdcic.img_metrics['md5'][img_id][idx])

print('solid hashes', solid_hashes)

# Two graphs over tile hashes, seeded from the existing truth:
# one for tile pairs labeled duplicate, one for tile pairs labeled different.
tile_hash_dup_cliques = nx.Graph()
tile_hash_dif_cliques = nx.Graph()

for (img1_id, img2_id, img1_overlap_tag), is_dup in dup_truth.items():
    for idx1, idx2 in tag_pair_lookup[img1_overlap_tag]:
        tile1_hash = sdcic.img_metrics['md5'][img1_id][idx1]
        tile2_hash = sdcic.img_metrics['md5'][img2_id][idx2]
        if is_dup:
            # Pairs involving a solid tile are kept out of the duplicate graph.
            if tile1_hash not in solid_hashes and tile2_hash not in solid_hashes:
                update_tile_cliques(tile_hash_dup_cliques, tile1_hash, tile2_hash)
        elif tile1_hash != tile2_hash:
            # Identical hashes are never recorded as "different".
            tile_hash_dif_cliques.add_edge(tile1_hash, tile2_hash)

print(tile_hash_dup_cliques.number_of_nodes(), tile_hash_dif_cliques.number_of_nodes())

# Histogram of connected-component sizes in the duplicate graph.
neighbor_counts = Counter(
    len(component) for component in nx.connected_components(tile_hash_dup_cliques)
)

sorted(neighbor_counts.items())

In [None]:
# Order the candidates by (bmh96_min, px0_max), highest first, and keep only
# their (img1_id, img2_id, img1_overlap_tag) keys.
ranked_candidates = sorted(
    overlap_candidates,
    key=operator.attrgetter('bmh96_min', 'px0_max'),
    reverse=True,
)
duplicate_candidates = [candidate.idx for candidate in tqdm_notebook(ranked_candidates)]
print(len(duplicate_candidates))

## Create an interactive widget for tagging duplicate overlaps

In [None]:
# Session state for the labeling widget.
n_candidates = len(duplicate_candidates)
candidates_iter = iter(duplicate_candidates)
candidates_idx = 0  # how many candidates have been pulled from the iterator

overlap_labels = {}       # labels chosen with the Same/Diff buttons
auto_overlap_labels = {}  # labels inferred from the tile-hash graphs

# Bounding-box rendering options handed to show_image_pair.
draw_bboxes = True
bbox_thickness = 4

# Lay the buttons out in a single full-width row.
box_layout = Layout(
    display='flex',
    flex_flow='row',
    align_items='stretch',
    width='100%',
)

def get_next_img_pair():
    """Return the next candidate overlap that needs a manual decision.

    Candidates are pulled from ``candidates_iter`` until one is found that has
    no label yet (neither in ``overlap_labels`` nor in ``dup_truth``) and that
    the tile-hash graphs cannot decide on their own:

    * if any tile pair of the overlap is an edge in ``tile_hash_dif_cliques``,
      the overlap is auto-labeled 0;
    * if every tile pair is an edge in ``tile_hash_dup_cliques``, the overlap
      is auto-labeled 1;
    * otherwise it is returned for manual labeling.

    Auto-labeled overlaps are stored in ``auto_overlap_labels`` and fed back
    into the corresponding graph. ``candidates_idx`` is incremented for every
    candidate pulled.

    The scan uses local variables only. The module-level "current pair"
    (``img1_id``, ``img2_id``, ``img1_overlap_tag``) is left untouched, so it
    keeps pointing at the pair that is on screen even when the iterator runs
    out in the middle of a scan; callers assign the return value themselves.

    Returns:
        tuple: ``(img1_id, img2_id, img1_overlap_tag)`` of the next undecided overlap.

    Raises:
        StopIteration: when ``candidates_iter`` is exhausted.
    """
    global candidates_idx

    # Manual knob: set n_skip > 0 to fast-forward past that many candidates.
    n_skip = 0
    i_skip = 0

    while True:
        cand_img1_id, cand_img2_id, cand_overlap_tag = next(candidates_iter)
        candidates_idx += 1

        if i_skip < n_skip:
            i_skip += 1
            continue

        assert cand_img1_id < cand_img2_id

        overlap_idx = (cand_img1_id, cand_img2_id, cand_overlap_tag)

        # Make sure we don't revisit anything we've already seen (and labeled) before
        if overlap_idx in overlap_labels:
            continue

        # If a candidate already has a label, skip it.
        if overlap_idx in dup_truth:
            continue

        # 1 -> all tile pairs are known duplicates, 0 -> a known-different
        # tile pair was found, -1 -> the graphs cannot decide.
        is_dup = 1
        for idx1, idx2 in tag_pair_lookup[cand_overlap_tag]:

            tile1_hash = sdcic.img_metrics['md5'][cand_img1_id][idx1]
            tile2_hash = sdcic.img_metrics['md5'][cand_img2_id][idx2]

            # has_edge() is False when either hash is not in the graph, so it
            # covers the membership test without building a neighbor set.
            if tile_hash_dif_cliques.has_edge(tile1_hash, tile2_hash):
                is_dup = 0
                break
            if not tile_hash_dup_cliques.has_edge(tile1_hash, tile2_hash):
                # Keep scanning: a later tile pair may still prove "different".
                is_dup = -1

        if is_dup == 1:
            auto_overlap_labels[overlap_idx] = 1
            update_tile_hash_dup_cliques(cand_img1_id, cand_img2_id, cand_overlap_tag)
            continue
        elif is_dup == 0:
            auto_overlap_labels[overlap_idx] = 0
            update_tile_hash_dif_cliques(cand_img1_id, cand_img2_id, cand_overlap_tag)
            continue

        return cand_img1_id, cand_img2_id, cand_overlap_tag


def draw_images(img1_id, img2_id, img1_overlap_tag):
    """Draw the two images of an overlap on a 2x2 grid and return the axes.

    The top row shows the pair titled with the image ids; the bottom row shows
    the same pair with ``ChannelShift('median', False)`` applied and titled
    with summary scores. Bounding boxes are GREEN or RED when the overlap has
    a truthy or falsy entry in ``dup_truth``, and BLUE when it has none.

    Args:
        img1_id: file name of the first image inside ``train_image_dir``.
        img2_id: file name of the second image inside ``train_image_dir``.
        img1_overlap_tag: overlap tag of the first image.

    Returns:
        The 2x2 array of axes created by ``plt.subplots``.
    """
    overlap_idx = (img1_id, img2_id, img1_overlap_tag)

    bbox_color = BLUE
    if overlap_idx in dup_truth:
        bbox_color = GREEN if dup_truth[overlap_idx] else RED

    fig, ax = plt.subplots(2, 2, figsize=(16, 16))

    imgmod1 = ImgMod(os.path.join(train_image_dir, img1_id))
    imgmod2 = ImgMod(os.path.join(train_image_dir, img2_id))

    # Positional arguments shared by both rows.
    pair_args = (imgmod1, imgmod2, img1_overlap_tag, draw_bboxes, bbox_thickness, bbox_color)

    show_image_pair(ax[0, 0], ax[0, 1], *pair_args, img1_id, img2_id, ticks)

    scores = overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag]
    title1 = f'bmh96: {np.min(scores.bmh96):7.5f} dnn: {np.min(scores.dnn):7.5f}'
    title2 = f'avg: {np.max(scores.avg)} pix: {np.max(scores.pix)}'

    show_image_pair(ax[1, 0], ax[1, 1], *pair_args, title1, title2, ticks,
                    shift=ChannelShift('median', False))

    return ax


def redraw(img1_id, img2_id, img1_overlap_tag):
    """Replace the contents of the ``out`` widget with a fresh drawing of the given overlap."""
    # wait=True: the old figure is only cleared once the new output is ready.
    out.clear_output(wait=True)
    with out:
        draw_images(img1_id, img2_id, img1_overlap_tag)
        plt.show()
    

def update_tile_hash_dup_cliques(img1_id, img2_id, img1_overlap_tag):
    """Record every tile pair of a duplicate overlap in ``tile_hash_dup_cliques``.

    Tile pairs in which either hash belongs to ``solid_hashes`` are left out.
    """
    tile_md5 = sdcic.img_metrics['md5']
    for idx1, idx2 in tag_pair_lookup[img1_overlap_tag]:
        tile1_hash = tile_md5[img1_id][idx1]
        tile2_hash = tile_md5[img2_id][idx2]
        if tile1_hash not in solid_hashes and tile2_hash not in solid_hashes:
            update_tile_cliques(tile_hash_dup_cliques, tile1_hash, tile2_hash)


def update_tile_hash_dif_cliques(img1_id, img2_id, img1_overlap_tag):
    """Record every tile pair of a non-duplicate overlap as an edge in ``tile_hash_dif_cliques``.

    Tile pairs whose two hashes are equal are left out.
    """
    tile_md5 = sdcic.img_metrics['md5']
    for idx1, idx2 in tag_pair_lookup[img1_overlap_tag]:
        tile1_hash = tile_md5[img1_id][idx1]
        tile2_hash = tile_md5[img2_id][idx2]
        if tile1_hash != tile2_hash:
            tile_hash_dif_cliques.add_edge(tile1_hash, tile2_hash)


def _show_next_pair():
    """Advance the current pair to the next undecided candidate and draw it.

    When the candidate list is exhausted, the three buttons are disabled and a
    short message replaces the figure, instead of letting StopIteration escape
    from the widget callback.
    """
    global img1_id, img2_id, img1_overlap_tag
    try:
        next_pair = get_next_img_pair()
    except StopIteration:
        # Nothing left to show: block further clicks so no stale pair gets labeled.
        for button in (same_button, diff_button, skip_button):
            button.disabled = True
        out.clear_output(wait=True)
        with out:
            print('No more candidates to label.')
        return
    img1_id, img2_id, img1_overlap_tag = next_pair
    redraw(img1_id, img2_id, img1_overlap_tag)


def on_same_button_clicked(b):
    """'Same' click: label the current overlap 1, update the duplicate graph, move on."""
    overlap_labels[(img1_id, img2_id, img1_overlap_tag)] = 1
    update_tile_hash_dup_cliques(img1_id, img2_id, img1_overlap_tag)
    _show_next_pair()


def on_diff_button_clicked(b):
    """'Diff' click: label the current overlap 0, update the different graph, move on."""
    overlap_labels[(img1_id, img2_id, img1_overlap_tag)] = 0
    update_tile_hash_dif_cliques(img1_id, img2_id, img1_overlap_tag)
    _show_next_pair()


def on_skip_button_clicked(b):
    """'Skip' click: move on without recording a label for the current overlap."""
    _show_next_pair()


def _make_button(description, button_style, tooltip, icon):
    """Build one enabled, flexible-width button for the labeling row."""
    return Button(
        description=description,
        disabled=False,
        layout=Layout(flex='1 1 auto', width='auto'),
        button_style=button_style,  # 'success', 'info', 'warning', 'danger' or ''
        tooltip=tooltip,
        icon=icon,
    )


same_button = _make_button('Same', 'success', 'Overlays are the same', 'check')
diff_button = _make_button('Diff', 'danger', 'Overlays are different', 'x')
skip_button = _make_button('Skip', 'info', 'Not sure.  Skip for now.', '?')

# Wire each button to its handler.
for _button, _handler in (
    (same_button, on_same_button_clicked),
    (diff_button, on_diff_button_clicked),
    (skip_button, on_skip_button_clicked),
):
    _button.on_click(_handler)

# Figure area on top, the three buttons in a row underneath.
out = Output()
buttons_3 = Box(children=[same_button, diff_button, skip_button], layout=box_layout)
display(VBox([out, buttons_3]))

# Show the first undecided candidate.
img1_id, img2_id, img1_overlap_tag = get_next_img_pair()
with out:
    ax = draw_images(img1_id, img2_id, img1_overlap_tag)
    plt.show()

In [None]:
# Show which overlap is currently on screen, followed by its raw scores.
print(img1_id, img2_id, img1_overlap_tag)
current_scores = overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag]
current_scores

In [None]:
# Progress report: manual labels so far, and how far through the candidate list we are.
n_verified = len(overlap_labels)
pct_complete = 100*candidates_idx/n_candidates
print(f'{n_verified} labels verified')
print(f'{candidates_idx:>6}/{n_candidates:>6} -> {pct_complete:>.5f}% complete')

In [None]:
# Undo the last click.
# (Whoops! Accidentally hit 'Diff' but meant to hit 'Same'? Don't worry, it happens more than I'd
# like to admit. Run this cell to purge the most recent click. Remember, the blue 'Skip' doesn't
# actually append anything to overlap_labels when you click it. So if you accidentally hit 'Skip' but
# didn't mean to, don't worry about it. You'll likely see the example again next time you run the dup_tool.)
# TODO: Make sure we also remove any automatically added labels since our last click.
# NOTE: only the entry in overlap_labels is removed; edges already added to the
# tile-hash graphs by that click stay in place.
if overlap_labels:
    # dicts keep insertion order, so popitem() removes the most recently added label.
    k, _undone_label = overlap_labels.popitem()
    print(k)
else:
    # Nothing has been labeled yet (or everything was already undone).
    print('Nothing to undo: overlap_labels is empty.')

In [None]:
# Merge overlap_labels into truth. Run this cell before shutting the notebook down;
# otherwise the labeling work from this session is lost. That is intentional:
# the tool does not write to file on every green or red button click, so truth
# is only saved once we are sure we are ready.
n_truth_before = len(dup_truth)
print(n_truth_before)
dup_truth = update_duplicate_truth(overlap_labels, verified=True)
n_truth_after = len(dup_truth)
print(n_truth_after)

In [None]:
# If you're stopping early from labeling (maybe you're tired, or just plain sick of doing it),
# consider running [auto_truth_from_cliques.ipynb](notebooks/labeling/auto_truth_from_cliques.ipynb)
# to pick up any unvisited overlaps that our cliques would have gotten.