In [None]:
import io
import os
import time
import hashlib
import operator
import h5py
from collections import Counter
from collections import namedtuple
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from skimage.util import montage
import cv2
from cv2 import img_hash
import torch

from sdcdup.utils import overlap_tag_pairs
from sdcdup.utils import overlap_tag_maps
from sdcdup.utils import generate_overlap_tag_slices
from sdcdup.utils import boundingbox_corners
from sdcdup.utils import generate_tag_pair_lookup
from sdcdup.utils import fuzzy_compare
from sdcdup.utils import get_tile
from sdcdup.utils import get_hamming_distance_score
from sdcdup.utils import channel_shift
from sdcdup.utils import read_duplicate_truth
from sdcdup.utils import update_duplicate_truth
from sdcdup.utils import read_image_duplicate_tiles
from sdcdup.utils import write_image_duplicate_tiles
from sdcdup.utils import read_image_image_duplicate_tiles
from sdcdup.utils import update_image_image_duplicate_tiles
from sdcdup.utils import generate_overlap_tag_nines_mask

from test_friend_circles import SDCImageContainer

from dupnet import load_checkpoint

%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Machine epsilon of float32.
EPS = np.finfo(np.float32).eps

# Font sizes applied to the matplotlib defaults below.
# NOTE(review): SMALL_SIZE is not referenced in the visible part of the notebook.
SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 16
BIGGEST_SIZE = 20
plt.rc('font', size=BIGGEST_SIZE)         # controls default text sizes
plt.rc('axes', titlesize=BIGGEST_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=BIGGEST_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)   # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)   # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGEST_SIZE)  # fontsize of the figure title

# Builds one montage per index of the 4th axis of x and stacks them on a new last axis
# (assumes x is laid out as (n_images, height, width, channels) -- TODO confirm).
montage_rgb = lambda x: np.stack([montage(x[:, :, :, i]) for i in range(x.shape[3])], -1)
# Same as skimage's montage but always called with padding_width=10.
montage_pad = lambda x, *args, **kwargs: montage(x, padding_width=10, *args, **kwargs)
# All-zero float32 array of shape (768, 768, 1), used as a stand-in mask further down.
zeros_mask = np.zeros((256*3, 256*3, 1), dtype=np.float32)

# Input data locations (relative to the notebook's working directory).
ship_dir = "data/input"
train_image_dir = os.path.join(ship_dir, "train_768")
train_mask_dir = os.path.join(ship_dir, 'train_masks_768')
train_seg_file = os.path.join(ship_dir, "fullmasks_768.h5")
# Files handed to SDCImageContainer.preprocess_image_properties in a later cell.
image_greycop_grids_file = os.path.join("data", "image_greycop_grids.pkl")
image_md5hash_grids_file = os.path.join("data", "image_md5hash_grids.pkl")
image_bm0hash_grids_file = os.path.join("data", "image_bm0hash_grids.pkl")
image_cm0hash_grids_file = os.path.join("data", "image_cm0hash_grids.pkl")
image_entropy_grids_file = os.path.join("data", "image_entropy_grids.pkl")
image_duplicate_tiles_file = os.path.join("data", "image_duplicate_tiles.txt")
# Files read by read_image_image_duplicate_tiles / read_duplicate_truth in a later cell.
image_image_duplicate_tiles_file = os.path.join("data", "image_image_duplicate_tiles.txt")
duplicate_truth_file = os.path.join('data', 'duplicate_truth.txt')

# Lookup indexed by overlap tag; used below to slice ship masks.
overlap_tag_slices = generate_overlap_tag_slices()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def get_channel_entropy(ctr, img_size=1769472):  # 768x768x3
    """Reduce a histogram to a single value-weighted entropy sum.

    Every count in ``ctr`` is divided by ``img_size`` to obtain a fraction
    ``p``. The term ``-p * log(p)`` is then multiplied by the histogram key
    itself, and the terms are summed with keys visited in ascending order.

    Args:
        ctr: Mapping from a numeric value to its number of occurrences.
        img_size: Divisor used to turn counts into fractions.

    Returns:
        The sum of ``key * (-p * log(p))`` over all keys (0.0 when empty).
    """
    weighted_terms = []
    for value, count in sorted(ctr.items()):
        fraction = count / img_size
        weighted_terms.append(value * (-fraction * np.log(fraction)))
    return np.sum(weighted_terms)

def get_entropy(img_id):
    """Return value-weighted gradient entropies for one training image.

    The image is read from ``train_image_dir`` and differentiated along its
    row and column axes with ``np.gradient``. For each of the two resulting
    gradient arrays, a histogram of absolute gradient values is built and
    reduced with ``get_channel_entropy`` (normalised by ``img.size``).

    Args:
        img_id: File name of the image inside ``train_image_dir``.

    Returns:
        np.ndarray holding one value per differentiated axis (two entries).

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    img_path = os.path.join(train_image_dir, img_id)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    # The builtin int replaces the np.int alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; both name the same type.
    img_grad = np.gradient(img.astype(int), axis=(0, 1))
    entropy_list = []
    for axis_grad in img_grad:
        ctr = Counter(np.abs(axis_grad).flatten())
        entropy_list.append(get_channel_entropy(ctr, img.size))
    return np.array(entropy_list)

def get_entropy1(img_id):
    """Return histograms of absolute image gradients, one per differentiated axis.

    The image is read from ``train_image_dir`` and differentiated along its
    row and column axes with ``np.gradient`` using a sample spacing of 0.5.
    The absolute gradients are cast to ``uint8`` before being counted.

    Args:
        img_id: File name of the image inside ``train_image_dir``.

    Returns:
        list of ``collections.Counter`` (two entries), mapping a uint8
        gradient magnitude to its number of occurrences.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    img_path = os.path.join(train_image_dir, img_id)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    # The builtin int replaces the np.int alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; both name the same type.
    img_grad = np.gradient(img.astype(int), 0.5, axis=(0, 1))
    entropy_list = []
    for axis_grad in img_grad:
        # NOTE(review): with spacing 0.5 the one-sided differences at the image
        # border can exceed 255, so the uint8 cast may wrap -- confirm intended.
        ctr = Counter(np.abs(axis_grad).astype(np.uint8).flatten())
        entropy_list.append(ctr)
    return entropy_list

def get_entropy2(img1_id, img2_id):
    """Return the weighted entropy of the histogram difference of two images.

    For each pair of gradient histograms produced by ``get_entropy1``, the
    two one-sided Counter differences are added (Counter subtraction keeps
    only positive counts), and the result is reduced with
    ``get_channel_entropy`` using its default ``img_size``.

    Args:
        img1_id: File name of the first image.
        img2_id: File name of the second image.

    Returns:
        np.ndarray with one value per histogram pair.
    """
    histograms1 = get_entropy1(img1_id)
    histograms2 = get_entropy1(img2_id)
    return np.array([
        get_channel_entropy((hist1 - hist2) + (hist2 - hist1))
        for hist1, hist2 in zip(histograms1, histograms2)
    ])

In [None]:
class ImgMod:
    """
    Reads a single image to be modified by hls.

    The image is loaded lazily with OpenCV on first access and cached, together
    with its HLS and RGB conversions. A shift of one HLS channel can be
    requested with ``channel_shift``; the shifted HLS/BGR/RGB images are cached
    as well and recomputed whenever a new shift is requested.
    """

    def __init__(self, filename):
        """Remember ``filename``; nothing is read from disk until a property is used."""
        self.filename = filename
        self.img_id = filename.split('/')[-1]

        # Parameters of the most recently requested channel shift (None = no shift).
        self._hls_chan = None
        self._hls_gain = None

        # Lazily filled caches: the unmodified image and the shifted image.
        self._parent_bgr = None
        self._parent_hls = None
        self._parent_rgb = None
        self._cv2_hls = None
        self._cv2_bgr = None
        self._cv2_rgb = None

    def channel_shift(self, chan, gain):
        """Apply ``channel_shift(parent_hls, chan, gain)`` and return the result as RGB.

        All three derived caches are invalidated. Resetting only the HLS cache
        would leave the previously computed BGR/RGB images in place, so a second
        call would return the result of the first one.
        """
        self._hls_chan = chan
        self._hls_gain = gain
        self._cv2_hls = None
        self._cv2_bgr = None
        self._cv2_rgb = None
        return self.cv2_rgb

    def scale(self, minval, maxval):
        """Linearly map the range [minval, maxval] of the parent BGR image onto [0, 255].

        Assumes ``minval`` and ``maxval`` are given in pixel units -- TODO confirm.
        Values outside the range are clipped before the cast back to uint8.

        Raises:
            ValueError: If ``maxval`` equals ``minval`` (the range would be empty).
        """
        if maxval == minval:
            raise ValueError("maxval must differ from minval")
        # Divide by the range (the previous code multiplied by it) and do the
        # arithmetic in float so the subtraction cannot wrap around in uint8.
        gain = 255.0 / (maxval - minval)
        res = gain * (self.parent_bgr.astype(np.float64) - minval)
        return np.around(np.clip(res, 0.0, 255.0)).astype(np.uint8)

    @property
    def shape(self):
        """Shape of the unmodified BGR image."""
        return self.parent_bgr.shape

    @property
    def parent_bgr(self):
        """Unmodified image as read by ``cv2.imread`` (loaded on first access)."""
        if self._parent_bgr is None:
            self._parent_bgr = cv2.imread(self.filename)
        return self._parent_bgr

    @property
    def parent_hls(self):
        """Unmodified image converted to HLS (computed on first access)."""
        if self._parent_hls is None:
            self._parent_hls = self.to_hls(self.parent_bgr)
        return self._parent_hls

    @property
    def parent_rgb(self):
        """Unmodified image converted to RGB (computed on first access)."""
        if self._parent_rgb is None:
            self._parent_rgb = self.to_rgb(self.parent_bgr)
        return self._parent_rgb

    @property
    def cv2_hls(self):
        """HLS image with the requested channel shift, or ``parent_hls`` if none was requested."""
        if self._cv2_hls is None:
            if self._hls_gain is None:
                self._cv2_hls = self.parent_hls
            else:
                self._cv2_hls = channel_shift(self.parent_hls, self._hls_chan, self._hls_gain)
        return self._cv2_hls

    @property
    def cv2_bgr(self):
        """``cv2_hls`` converted back to BGR (cached)."""
        if self._cv2_bgr is None:
            self._cv2_bgr = self.to_bgr(self.cv2_hls)
        return self._cv2_bgr

    @property
    def cv2_rgb(self):
        """``cv2_bgr`` converted to RGB (cached)."""
        if self._cv2_rgb is None:
            self._cv2_rgb = self.to_rgb(self.cv2_bgr)
        return self._cv2_rgb

    def to_hls(self, bgr):
        """Convert BGR to HLS with OpenCV's ``COLOR_BGR2HLS_FULL`` code."""
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2HLS_FULL)

    def to_bgr(self, hls):
        """Convert HLS to BGR with OpenCV's ``COLOR_HLS2BGR_FULL`` code."""
        return cv2.cvtColor(hls, cv2.COLOR_HLS2BGR_FULL)

    def to_rgb(self, bgr):
        """Convert BGR to RGB."""
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

In [None]:
# Build the image container over the training images and hand it the
# grid/tile files defined in the setup cell.
# NOTE(review): whether these files are read, written, or both is decided inside
# SDCImageContainer.preprocess_image_properties -- not visible from here.
sdcic = SDCImageContainer(train_image_dir)
sdcic.preprocess_image_properties(
    image_greycop_grids_file,
    image_md5hash_grids_file,
    image_bm0hash_grids_file,
    image_cm0hash_grids_file,
    image_entropy_grids_file,
    image_duplicate_tiles_file)

In [None]:
# Ground-truth labels; indexed below by (img1_id, img2_id, img1_overlap_tag).
dup_truth = read_duplicate_truth(duplicate_truth_file)
# Indexed below by (img1_id, img2_id); each value unpacks into a pair (img1_nine, img2_nine).
image_image_duplicate_tiles = read_image_image_duplicate_tiles(image_image_duplicate_tiles_file)
# Lookup indexed by overlap tag; used below to index the "nine" arrays.
overlap_tag_nines_mask = generate_overlap_tag_nines_mask()
# Display how many entries each table holds.
len(dup_truth), len(image_image_duplicate_tiles)

In [None]:
def _load_overlap_tile_scores(scores_file):
    """Read one pickled score table into ``{(img1_id, img2_id): {overlap_tag: np.ndarray}}``.

    Every row is expected to start with ``img1_id, img2_id, img1_overlap_tag``;
    all remaining columns of the row become the score array.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files this project produced.
    df = pd.read_pickle(scores_file)
    tile_scores = {}
    for img1_id, img2_id, img1_overlap_tag, *scores in tqdm_notebook(df.to_dict('split')['data']):
        tile_scores.setdefault((img1_id, img2_id), {})[img1_overlap_tag] = np.array(scores)
    return tile_scores


def load_image_image_properties(n_matching_tiles, overlap_image_maps):
    """Load all per-tile overlap scores for one overlap size into ``overlap_image_maps``.

    Five pickled tables (``bmh``, ``cmh``, ``gcm``, ``enp``, ``pix``) are read
    from ``data/overlap_<name>_tile_scores_<n_matching_tiles>.pkl``. For every
    (image pair, overlap tag) found in the ``bmh`` table, the matching entries of
    the other tables are combined into one ``overlap_scores`` namedtuple; columns
    0-4 of the ``gcm`` array become the ``con``, ``hom``, ``eng``, ``cor`` and
    ``epy`` fields.

    Args:
        n_matching_tiles: Overlap size selecting which set of files to read.
        overlap_image_maps: Dict updated in place as
            ``overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag] = scores``.

    Returns:
        None.

    Raises:
        KeyError: If an entry of the ``bmh`` table is missing from another table.
    """
    def scores_path(metric):
        return os.path.join("data", f"overlap_{metric}_tile_scores_{n_matching_tiles}.pkl")

    # Same read order as before: bmh, cmh, gcm, enp, pix.
    overlap_bmh_tile_scores = _load_overlap_tile_scores(scores_path("bmh"))
    overlap_cmh_tile_scores = _load_overlap_tile_scores(scores_path("cmh"))
    overlap_gcm_tile_scores = _load_overlap_tile_scores(scores_path("gcm"))
    overlap_enp_tile_scores = _load_overlap_tile_scores(scores_path("enp"))
    overlap_pix_tile_scores = _load_overlap_tile_scores(scores_path("pix"))

    Overlap_Scores = namedtuple('overlap_scores', ['bmh', 'cmh', 'con', 'hom', 'eng', 'cor', 'epy', 'enp', 'pix'])
    for (img1_id, img2_id), img1_overlap_tags in tqdm_notebook(sorted(overlap_bmh_tile_scores.items())):
        pair = (img1_id, img2_id)
        for img1_overlap_tag in img1_overlap_tags:
            gcm_scores = overlap_gcm_tile_scores[pair][img1_overlap_tag]

            overlap_scores = Overlap_Scores(
                overlap_bmh_tile_scores[pair][img1_overlap_tag],
                overlap_cmh_tile_scores[pair][img1_overlap_tag],
                gcm_scores[:, 0],
                gcm_scores[:, 1],
                gcm_scores[:, 2],
                gcm_scores[:, 3],
                gcm_scores[:, 4],
                overlap_enp_tile_scores[pair][img1_overlap_tag],
                overlap_pix_tile_scores[pair][img1_overlap_tag],
            )

            overlap_image_maps.setdefault(pair, {})[img1_overlap_tag] = overlap_scores

In [None]:
# Overlap sizes for which score files are loaded.
n_matching_tiles_list = [9, 6, 4, 3, 2]
# Filled in place by load_image_image_properties:
# overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag] -> overlap_scores namedtuple.
overlap_image_maps = {}
for n_matching_tiles in n_matching_tiles_list:
    load_image_image_properties(n_matching_tiles, overlap_image_maps)
    # Running total of image pairs after each overlap size has been merged in.
    print(len(overlap_image_maps))

In [None]:
from torch.utils import data

# Indexed by overlap tag; each entry is iterated below as a sequence of (idx1, idx2) tile-index pairs.
img_overlap_index_maps = generate_tag_pair_lookup()
# One record per tile pair to be scored: the two images, the overlap tag, the
# position of the pair within that overlap, and the tile index in each image.
TilePairs = namedtuple('TilePairs', 'img1_id img2_id img1_overlap_tag overlap_idx idx1 idx2')

def get_img(img_id):
    """Read ``img_id`` from ``train_image_dir`` with ``cv2.imread`` and return the result."""
    img_path = os.path.join(train_image_dir, img_id)
    return cv2.imread(img_path)
    

class Dataset(data.Dataset):
    """Characterizes a dataset for PyTorch"""

    def __init__(self, tile_pairs,
                 image_transform=None,
                 in_shape=(6, 256, 256),
                 out_shape=(1,)):
        """Store the tile pairs and the (currently only recorded) options."""
        self.sz = 256
        self.tile_pairs = tile_pairs
        self.image_transform = image_transform
        # (row, col) of each of the nine tiles, in row-major order.
        self.ij = tuple((row, col) for row in range(3) for col in range(3))

        self.in_shape = in_shape
        self.out_shape = out_shape

    def __len__(self):
        """Number of tile pairs in the dataset."""
        return len(self.tile_pairs)

    def __getitem__(self, index):
        """Return the two tiles of pair ``index`` stacked into one 6-channel tensor."""
        pair = self.tile_pairs[index]

        first_img = get_img(pair.img1_id)
        second_img = get_img(pair.img2_id)

        stacked = np.dstack([
            self._rgb_unit_tile(first_img, pair.idx1),
            self._rgb_unit_tile(second_img, pair.idx2),
        ])
        # Channels-last -> channels-first before handing the array to torch.
        return torch.from_numpy(stacked.transpose((2, 0, 1)))

    def _rgb_unit_tile(self, img, tile_idx):
        """Cut tile ``tile_idx`` out of a BGR image and return it as float32 RGB divided by 255."""
        row, col = self.ij[tile_idx]
        rgb_tile = cv2.cvtColor(self.get_tile(img, row, col), cv2.COLOR_BGR2RGB)
        return rgb_tile.astype(np.float32) / 255.

    def get_tile(self, img, i, j):
        """Return the ``sz`` x ``sz`` tile at grid row ``i`` and column ``j`` of ``img``."""
        rows = slice(i * self.sz, (i + 1) * self.sz)
        cols = slice(j * self.sz, (j + 1) * self.sz)
        return img[rows, cols, :]

In [None]:
def preprocess(x):
    """View the batch as (-1, 6, 256, 256) and move it to ``device``."""
    batch = x.view(-1, 6, 256, 256)
    return batch.to(device)

class WrappedDataLoader:
    """Iterable that applies ``func`` to every batch produced by ``dl``."""

    def __init__(self, dl, func):
        self.dl = dl
        self.func = func

    def __len__(self):
        """Same length as the wrapped loader."""
        return len(self.dl)

    def __iter__(self):
        """Lazily yield ``func(batch)`` for each batch of the wrapped loader."""
        yield from map(self.func, self.dl)

In [None]:
# Collect every tile pair of every overlap that has no ground-truth label yet.
tile_pairs = []
for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    for img1_overlap_tag in overlap_maps:
        # Overlaps already present in dup_truth are skipped.
        if (img1_id, img2_id, img1_overlap_tag) in dup_truth:
            continue
        # overlap_idx is the position of the pair inside this overlap; the
        # weak-prediction grouping cell relies on these being consecutive.
        for overlap_idx, (idx1, idx2) in enumerate(img_overlap_index_maps[img1_overlap_tag]):
            tile_pairs.append(TilePairs(img1_id, img2_id, img1_overlap_tag, overlap_idx, idx1, idx2))

print(len(tile_pairs))

In [None]:
# Dataset over the unlabelled tile pairs; no shuffle argument is passed, so the
# predictions can later be zipped back onto tile_pairs by position.
test_ds = Dataset(tile_pairs)
test_dl = data.DataLoader(test_ds, batch_size=256, num_workers=12)
# Each batch is reshaped and moved to `device` by preprocess().
test_dl = WrappedDataLoader(test_dl, preprocess)
# Number of batches.
print(len(test_dl))

In [None]:
# Restore the trained model from its checkpoint.
model = load_checkpoint('out/dup_model.last.pth')
# Move the model to the device chosen in the setup cell. An unconditional
# model.cuda() raises on machines without a GPU even though `device` already
# falls back to the CPU there.
model.to(device)

In [None]:
# Make sure the model lives on the same device as the batches produced by preprocess().
model.to(device)

# Inference only: switch to eval mode and disable gradient tracking.
model.eval()
with torch.no_grad():
    # One output tensor per batch.
    yprobs0 = [model(xb) for xb in tqdm_notebook(test_dl)]
    print(len(yprobs0))
    # Move every batch output to the CPU, stack them and flatten to 1-D
    # (presumably one probability per tile pair -- verify against the model's output shape).
    yprobs = np.vstack([l.cpu() for l in yprobs0]).reshape(-1)
    print(yprobs.shape)

In [None]:
# Indices of predictions that lie strictly between 0.03 and 0.97,
# i.e. |p - 0.5| < 0.47; the next cell treats these as weak predictions.
yprobs_c = np.where(np.abs(yprobs - 0.5) < 0.47)[0]
print(yprobs_c.shape)

In [None]:
# Group the per-tile CNN outputs by (image pair, overlap tag) and record every
# overlap that contains at least one weak prediction (an index listed in yprobs_c).
# A set gives O(1) membership tests; `ii in yprobs_c` scanned the whole numpy
# array for every tile pair, making the loop quadratic.
weak_pred_indices = set(yprobs_c.tolist())

is_weak_pred = False
weak_preds = []
overlap_cnn_tile_scores = {}
for ii, (tp, yprob) in enumerate(zip(tile_pairs, yprobs)):
    if ii in weak_pred_indices:
        is_weak_pred = True
    # Looked up on every iteration so the end-of-overlap test below never
    # depends on a value left over from a previous overlap.
    n_overlapping_tiles = len(img_overlap_index_maps[tp.img1_overlap_tag])
    if (tp.img1_id, tp.img2_id) not in overlap_cnn_tile_scores:
        overlap_cnn_tile_scores[(tp.img1_id, tp.img2_id)] = {}
    if tp.img1_overlap_tag not in overlap_cnn_tile_scores[(tp.img1_id, tp.img2_id)]:
        # One slot per tile pair of this overlap, filled in as predictions arrive.
        cnn_scores = [None] * n_overlapping_tiles
        overlap_cnn_tile_scores[(tp.img1_id, tp.img2_id)][tp.img1_overlap_tag] = cnn_scores
    overlap_cnn_tile_scores[(tp.img1_id, tp.img2_id)][tp.img1_overlap_tag][tp.overlap_idx] = yprob
    # Last tile pair of the overlap: flush the flag accumulated over its tiles.
    if tp.overlap_idx == n_overlapping_tiles - 1 and is_weak_pred:
        weak_preds.append((tp.img1_id, tp.img2_id, tp.img1_overlap_tag))
        is_weak_pred = False

In [None]:
# Number of overlaps that contain at least one weak prediction.
len(weak_preds)

## Find overlaps with ships

In [None]:
# Image pairs where both images have an entry in the mask file and at least
# one of their overlaps is not yet present in dup_truth.
untested_image_pairs_with_ship_masks = []
with h5py.File(train_seg_file, 'r') as full_mask:
    for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
        # TODO: Find out which remaining tile pairs have masks but aren't in dup_truth.
        
        has_mask1 = img1_id in full_mask
        has_mask2 = img2_id in full_mask
        # Pairs are kept only when BOTH images appear in the mask file.
        if not (has_mask1 and has_mask2):
            continue
            
        for img1_overlap_tag in overlap_maps:
            if (img1_id, img2_id, img1_overlap_tag) in dup_truth:
                continue
            # The first unlabelled overlap is enough; record the pair once and stop.
            untested_image_pairs_with_ship_masks.append((img1_id, img2_id))
            break

# Total image pairs vs. pairs that still need testing.
len(overlap_image_maps), len(untested_image_pairs_with_ship_masks)

In [None]:
# Overlaps that are not yet in dup_truth, whose two images both have an entry
# in the mask file, and whose overlapping mask regions are not both empty.
untested_overlaps_with_ship_masks = []
with h5py.File(train_seg_file, 'r') as full_mask:
    for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
        # TODO: Find out which remaining tile pairs have masks but aren't in dup_truth.

        has_mask1 = img1_id in full_mask
        has_mask2 = img2_id in full_mask
        # Pairs are kept only when BOTH images appear in the mask file.
        # NOTE(review): the previous `... if has_maskN else zeros_mask` fallbacks were
        # unreachable because of this test. If pairs with a single mask should be
        # considered as well, this condition (not the reads below) needs to change.
        if not (has_mask1 and has_mask2):
            continue

        # The masks depend only on the image pair, so they are read from the
        # HDF5 file at most once per pair (lazily, on the first unlabelled
        # overlap) instead of once per overlap tag.
        mask1 = None
        mask2 = None

        for img1_overlap_tag in overlap_maps:
            if (img1_id, img2_id, img1_overlap_tag) in dup_truth:
                continue

            if mask1 is None:
                mask1 = full_mask[img1_id][:]
                mask2 = full_mask[img2_id][:]

            # Sum the mask over the overlapping region of each image; image 2
            # uses the slice of the paired overlap tag.
            mask1_slice_total = np.sum(mask1[overlap_tag_slices[img1_overlap_tag]])
            mask2_slice_total = np.sum(mask2[overlap_tag_slices[overlap_tag_pairs[img1_overlap_tag]]])

            # Skip overlaps whose mask regions are empty in both images.
            if mask1_slice_total + mask2_slice_total < 1:
                continue

            untested_overlaps_with_ship_masks.append((img1_id, img2_id, img1_overlap_tag))

# Total image pairs vs. untested overlaps with a non-empty mask region.
len(overlap_image_maps), len(untested_overlaps_with_ship_masks)

## Find overlapping images using hashlib
Update: the pixel values of two supposedly identical 256x256 crops do not match exactly (see below).

In [None]:

# Map every tile md5 hash to the list of images in which it occurs
# (an image is listed once per tile that carries the hash).
md5hash_dict = defaultdict(list)
img_ids = os.listdir(train_image_dir)

for img_id in tqdm_notebook(img_ids):
    for h in sdcic.tile_md5hash_grids[img_id]:
        md5hash_dict[h].append(img_id)

# Histogram of bin sizes: how many hashes occur exactly n times.
dup_counts_dict = defaultdict(int)
for key, dups in md5hash_dict.items():
    dup_counts_dict[len(dups)] += 1

# (bin size, number of hashes) pairs in ascending bin size; displayed as the cell output.
sorted_bin_sizes = sorted(dup_counts_dict.items())
sorted_bin_sizes

In [None]:
# Show the images that share one md5 tile hash, for a hash bin of a chosen size.
batch_size = 9    # only hash bins with exactly this many entries are considered
skip = 365        # number of matching bins to pass over before stopping
ii = 0            # hashes visited so far (printed for reference)
jj = 0            # matching bins seen so far
batch_limit = 9   # at most this many images are shown in the montage
# Zero-initialised so unused slots render black. np.empty would display
# uninitialised memory whenever fewer than batch_limit unique images are found.
samples_images = np.zeros((batch_limit, 768, 768, 3), dtype=np.float32)

# Defined up front so the montage below never fails with a NameError or silently
# reuses a list left over from another cell when no bin has the requested size.
dups0 = []
for hash_id, dups in md5hash_dict.items():
    ii += 1
    if len(dups) == batch_size:
        # An image can contain the same hash several times; keep unique ids only.
        dups0 = list(set(dups))
        img_id = dups0[0]
        idx = sdcic.tile_md5hash_grids[img_id].index(hash_id)
        print(hash_id, len(dups), ii, sdcic.tile_entropy_grids[img_id][idx])
        # Stop at the requested bin, or keep the last one if there are fewer than `skip`.
        if jj == min(dup_counts_dict[len(dups)], skip):
            break
        jj += 1

for i, c_img_id in enumerate(dups0[:batch_limit]):
    c_img = cv2.cvtColor(sdcic.get_img(c_img_id), cv2.COLOR_BGR2RGB)
    samples_images[i] = c_img.astype(np.float32) / 255.0

batch_rgb = montage_rgb(samples_images)
print(samples_images.shape)
print(batch_rgb.shape, batch_rgb.dtype)

fig, ax = plt.subplots(1, 1, figsize = (16, 16))
ax.imshow(batch_rgb, vmin=0, vmax=1)
plt.axis('off')
# plt.savefig(os.path.join('out', BASE_MODEL, f"{train_meta_filebase}_{score_str}_batch_{BATCH_NUM}.jpg"))
plt.show()

## Find overlapping images using cv2.img_hash

In [None]:
# Map every tile bm0 hash to the list of images in which it occurs
# (an image is listed once per tile that carries the hash).
bm0hash_dict = defaultdict(list)
img_ids = os.listdir(train_image_dir)

for img_id in tqdm_notebook(img_ids):
    for h in sdcic.tile_bm0hash_grids[img_id]:
        # The hash is converted to a tuple so it can be used as a dict key.
        bm0hash_dict[tuple(h)].append(img_id)  # hex

# Histogram of bin sizes: how many hashes occur exactly n times.
dup_counts_dict = defaultdict(int)
for key, dups in bm0hash_dict.items():
    dup_counts_dict[len(dups)] += 1

# (bin size, number of hashes) pairs in ascending bin size; displayed as the cell output.
sorted_bin_sizes = sorted(dup_counts_dict.items())
sorted_bin_sizes

In [None]:
# Show the images that share one bm0 tile hash, for a hash bin of a chosen size.
batch_size = 18   # only hash bins with exactly this many entries are considered
skip = 5          # number of matching bins to pass over before stopping
ii = 0            # hashes visited so far (printed for reference)
jj = 0            # matching bins seen so far
batch_limit = 9   # at most this many images are shown in the montage
# Zero-initialised so unused slots render black. np.empty would display
# uninitialised memory whenever fewer than batch_limit unique images are found.
samples_images = np.zeros((batch_limit, 768, 768, 3), dtype=np.float32)

# Defined up front so the montage below never fails with a NameError or silently
# reuses a list left over from another cell when no bin has the requested size.
dups0 = []
for hash_id, dups in bm0hash_dict.items():
    ii += 1
    if len(dups) == batch_size:
        # An image can contain the same hash several times; keep unique ids only.
        dups0 = list(set(dups))
        img_id = dups0[0]
        # NOTE(review): hash_id is a tuple here (see how bm0hash_dict is keyed);
        # confirm that .index() on the stored grid matches it as intended.
        idx = sdcic.tile_bm0hash_grids[img_id].index(hash_id)
        print(hash_id, len(dups), ii, sdcic.tile_entropy_grids[img_id][idx])
        # Stop at the requested bin, or keep the last one if there are fewer than `skip`.
        if jj == min(dup_counts_dict[len(dups)], skip):
            break
        jj += 1

for i, c_img_id in enumerate(dups0[:batch_limit]):
    c_img = cv2.cvtColor(sdcic.get_img(c_img_id), cv2.COLOR_BGR2RGB)
    samples_images[i] = c_img.astype(np.float32) / 255.0

batch_rgb = montage_rgb(samples_images)
print(samples_images.shape)
print(batch_rgb.shape, batch_rgb.dtype)

fig, ax = plt.subplots(1, 1, figsize = (16, 16))
ax.imshow(batch_rgb, vmin=0, vmax=1)
plt.axis('off')
# plt.savefig(os.path.join('out', BASE_MODEL, f"{train_meta_filebase}_{score_str}_batch_{BATCH_NUM}.jpg"))
plt.show()

## Here we explore dup detection using image gradients and cross entropy 

In [None]:
# Scan the labelled overlaps and track two running records of max(scores.enp):
#   score_lim0 -- the largest value seen among non-duplicates (is_dup == 0)
#   score_lim1 -- the smallest value seen among duplicates (is_dup == 1)
# Diagnostics are printed every time one of the records is updated.
score_lim0 = 0
score_lim1 = 1
for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    if img1_id > img2_id:
        # sanity check
        raise ValueError(f'img1_id ({img1_id}) should be lexicographically smaller than img2_id ({img2_id})')
    for img1_overlap_tag, scores in overlap_maps.items():
        # Only overlaps with a ground-truth label are of interest here.
        if (img1_id, img2_id, img1_overlap_tag) not in dup_truth:
            continue
        
        is_dup = dup_truth[(img1_id, img2_id, img1_overlap_tag)]

        if is_dup == 0 and np.max(scores.enp) > score_lim0:
            score_lim0 = np.max(scores.enp)
            print_score = True
        elif is_dup == 1 and np.max(scores.enp) < score_lim1:
            score_lim1 = np.max(scores.enp)
            print_score = True
        else:
            print_score = False

        if print_score:
            # Whole-image entropy vectors, both divided by the larger of the two norms.
            img1_entropy_vec = get_entropy(img1_id)
            img2_entropy_vec = get_entropy(img2_id)
            img1_entropy_vec_norm = np.linalg.norm(img1_entropy_vec)
            img2_entropy_vec_norm = np.linalg.norm(img2_entropy_vec)
            n_vec = np.max([img1_entropy_vec_norm, img2_entropy_vec_norm])
            img1_scaled_vec = img1_entropy_vec / n_vec
            img2_scaled_vec = img2_entropy_vec / n_vec
            # 1 minus the distance between the scaled vectors; equals 1.0 when they are identical.
            grad_score = 1.0 - np.linalg.norm(img1_scaled_vec - img2_scaled_vec)

            entropy2 = get_entropy2(img1_id, img2_id)
            entropy2_norm = np.linalg.norm(entropy2)
            
            print('')
            print(f"{is_dup}, {min(scores.bmh):7.5f}, {min(scores.cmh):7.5f}, {grad_score:7.5f}, {entropy2_norm}")
            print(img1_id, img1_entropy_vec, f"{img1_entropy_vec_norm}")
            print(img2_id, img2_entropy_vec, f"{img2_entropy_vec_norm}")
            # NOTE(review): the next two lines recompute and re-print the vectors already shown above.
            print(get_entropy(img1_id))
            print(get_entropy(img2_id))
            print(entropy2)
            print(np.max(scores.enp))


In [None]:
# Image pair inspected by hand in the next cell.
img1_id = '691d5afc2.jpg'
img2_id = '56417e7af.jpg'

In [None]:
# Compare the whole-image entropy vectors of the pair selected above.
img1_entropy_vec = get_entropy(img1_id)
img2_entropy_vec = get_entropy(img2_id)
img1_entropy_vec_norm = np.linalg.norm(img1_entropy_vec)
img2_entropy_vec_norm = np.linalg.norm(img2_entropy_vec)
# Scale both vectors by the larger of the two norms. The second operand used to
# be img1's norm again, a copy-paste slip.
n_vec = np.max([img1_entropy_vec_norm, img2_entropy_vec_norm])
img1_scaled_vec = img1_entropy_vec / n_vec
img2_scaled_vec = img2_entropy_vec / n_vec
print('')
print(img1_id, img1_entropy_vec, f"{img1_entropy_vec_norm}")
# Report img2's own norm (this line used to print img1's).
print(img2_id, img2_entropy_vec, f"{img2_entropy_vec_norm}")
# Distance between the scaled vectors; 0 means identical.
print(f"{np.linalg.norm(img1_scaled_vec - img2_scaled_vec)}")

In [None]:
# Print the entropy vectors, their cosine similarity and their Euclidean
# distance for every image pair listed in the blacklist file.
# A separator longer than one character is treated as a regular expression,
# which only the python engine supports. Naming the engine explicitly keeps
# the parsing unchanged and avoids the ParserWarning raised on fallback.
df = pd.read_csv('dup_blacklist_6.csv', sep=', ', engine='python')
for idx, row in df.iterrows():
    print(idx)
    img1_entropy_vec = get_entropy(row['ImageId1'])
    # Unit-length copy, used for the cosine similarity below.
    img1_entropy_vec_u = img1_entropy_vec / np.linalg.norm(img1_entropy_vec)
    print(row['ImageId1'], img1_entropy_vec)
    img2_entropy_vec = get_entropy(row['ImageId2'])
    img2_entropy_vec_u = img2_entropy_vec / np.linalg.norm(img2_entropy_vec)
    print(row['ImageId2'], img2_entropy_vec)
    print(np.dot(img1_entropy_vec_u, img2_entropy_vec_u), np.linalg.norm(img1_entropy_vec - img2_entropy_vec))

## search for reasonable thresholds

In [None]:
# Per-tile bmh / cmh / pix scores of every overlap that has no ground-truth
# label, keyed by (img1_id, img2_id, img1_overlap_tag, tile position).
# NOTE(review): defaultdict() without a factory behaves like a plain dict;
# only pix_scores actually has a default (int).
bmh_scores = defaultdict()
cmh_scores = defaultdict()
pix_scores = defaultdict(int)

for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    if img1_id > img2_id:
        # sanity check
        raise ValueError(f'img1_id ({img1_id}) should be lexicographically smaller than img2_id ({img2_id})')
    for img1_overlap_tag, scores in overlap_maps.items():
        # Overlaps already present in dup_truth are skipped.
        if (img1_id, img2_id, img1_overlap_tag) in dup_truth:
            continue

        for i in range(len(scores.bmh)):
            idx = (img1_id, img2_id, img1_overlap_tag, i)
            bmh_scores[idx] = scores.bmh[i]
            cmh_scores[idx] = scores.cmh[i]
            pix_scores[idx] = scores.pix[i]

# One column per metric; the tuple keys become the index of each Series.
overlap_scores_df = pd.DataFrame()
overlap_scores_df['bmh'] = pd.Series(bmh_scores)
overlap_scores_df['cmh'] = pd.Series(cmh_scores)
overlap_scores_df['pix'] = pd.Series(pix_scores)

# Summary statistics with extra percentiles to help pick thresholds.
overlap_scores_df.describe(percentiles=[.01, .05, .1, .25, .5, .75, .90, .95, .99])

In [None]:
# Flat lists of per-tile scores, one list per metric, gathered over every
# overlap that has no ground-truth label.
bmh_arr = []
cmh_arr = []
con_arr = []
hom_arr = []
eng_arr = []
cor_arr = []
epy_arr = []
enp_arr = []
pix_arr = []

for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    for img1_overlap_tag, scores in overlap_maps.items():
        # Overlaps already present in dup_truth are skipped.
        if (img1_id, img2_id, img1_overlap_tag) in dup_truth:
            continue

        bmh_arr += list(scores.bmh)
        cmh_arr += list(scores.cmh)
        con_arr += list(scores.con)
        hom_arr += list(scores.hom)
        eng_arr += list(scores.eng)
        cor_arr += list(scores.cor)
        epy_arr += list(scores.epy)
        enp_arr += list(scores.enp)
        pix_arr += list(scores.pix)

In [None]:
# One column per metric, built from the flat score lists collected above.
overlap_limits_df = pd.DataFrame()
overlap_limits_df['bmh'] = pd.Series(bmh_arr)
overlap_limits_df['cmh'] = pd.Series(cmh_arr)
overlap_limits_df['con'] = pd.Series(con_arr)
overlap_limits_df['hom'] = pd.Series(hom_arr)
overlap_limits_df['eng'] = pd.Series(eng_arr)
overlap_limits_df['cor'] = pd.Series(cor_arr)
overlap_limits_df['epy'] = pd.Series(epy_arr)
overlap_limits_df['enp'] = pd.Series(enp_arr)
overlap_limits_df['pix'] = pd.Series(pix_arr)

In [None]:
# Summary statistics including the extreme tails (0.1% and 99.9%) of every metric.
overlap_limits_df.describe(percentiles=[.001, .01, .02, .05, .1, .5, .9, .95, .98, .99, 0.999])

In [None]:
# Lower and upper limit per metric, compared against each overlap's min/max in the filter cell:
#  |-----|--------------|-----|
# min  lower          upper  max

metric_tags = ['bmh', 'cmh', 'con', 'hom', 'eng', 'cor', 'epy', 'enp', 'pix']
Overlap_Scores_Lower_Limit = namedtuple('overlap_scores_lower_limit', metric_tags)
Overlap_Scores_Upper_Limit = namedtuple('overlap_scores_upper_limit', metric_tags)

# Values are positional and follow the order of metric_tags; only 'pix' uses
# limits on a different scale (50 and 4500000).
osl_lower = Overlap_Scores_Lower_Limit(0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 50)
osl_upper = Overlap_Scores_Upper_Limit(0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 4500000)

## Filter

In [None]:
# Record type with an index plus the minimum and maximum of every metric.
Overlap_Idx_Scores = namedtuple('overlap_idx_scores', [
    'idx', 
    'bmh_min', 'cmh_min', 'con_min', 'hom_min', 'eng_min', 'cor_min', 'epy_min', 'enp_min', 'pix_min', 
    'bmh_max', 'cmh_max', 'con_max', 'hom_max', 'eng_max', 'cor_max', 'epy_max', 'enp_max', 'pix_max'])

# Per-metric counters: the loop below increments a *_min_hits counter whenever
# an overlap's minimum falls below the matching osl_lower limit.
bmh_min_hits = 0
cmh_min_hits = 0
con_min_hits = 0
hom_min_hits = 0
eng_min_hits = 0
cor_min_hits = 0
epy_min_hits = 0
enp_min_hits = 0
pix_min_hits = 0

# Counterparts for the maxima (presumably compared against osl_upper further
# down in the loop -- that part is outside this excerpt).
bmh_max_hits = 0
cmh_max_hits = 0
con_max_hits = 0
hom_max_hits = 0
eng_max_hits = 0
cor_max_hits = 0
epy_max_hits = 0
enp_max_hits = 0
pix_max_hits = 0

# Bookkeeping for the experimental tst1..tst7 checks on labelled overlaps.
flat_score_good = 0
flat_score_bad = 0
print_first_good = True
print_first_bad = True
tst1_ct = 0
tst2_ct = 0
tst3_ct = 0
tst4_ct = 0
tst5_ct = 0
tst6_ct = 0
tst7_ct = 0
tst1 = False
tst2 = False
tst3 = False
tst4 = False
tst5 = False
tst6 = False
tst7 = False
n_not_dups = 0
overlap_candidates = []
for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    for img1_overlap_tag, scores in overlap_maps.items():

        if (img1_id, img2_id, img1_overlap_tag) in dup_truth:
            
#             if max(scores.pix) > 256*256*3*255 * 0.5:
#                 continue
            continue
            is_dup = dup_truth[(img1_id, img2_id, img1_overlap_tag)]
            
            img1_nine, img2_nine = image_image_duplicate_tiles[(img1_id, img2_id)]
            img1_mask = img1_nine[overlap_tag_nines_mask[img1_overlap_tag]]
            img2_mask = img2_nine[overlap_tag_nines_mask[overlap_tag_pairs[img1_overlap_tag]]]
            
            img1_match_indices = np.where(img1_mask != 9)[0]
            img2_match_indices = np.where(img2_mask != 9)[0]
            img12_match_indices = np.intersect1d(img1_match_indices, img2_match_indices)
            
#             if len(img12_match_indices) == 0:
#                 continue
                
            tst1 = np.any(img1_mask != img2_mask)
            tst2 = max(scores.pix) > 10000
            tst3 = np.all(img1_mask == img2_mask)
            tst4 = max(scores.pix) > 100000
#             tst5 = sum(scores.pyr[img12_match_indices]) == len(img12_match_indices)
            tst6 = len(img12_match_indices) > 0
            
            if not is_dup:
                n_not_dups += 1
                if tst1:
                    tst1_ct += 1
                if tst2:
                    tst2_ct += 1
                if tst3:
                    tst3_ct += 1
                if tst4:
                    tst4_ct += 1
                if tst5:
                    tst5_ct += 1
                if tst6:
                    tst6_ct += 1
                
            if tst1:
                if is_dup:
                    flat_score_bad += 1
                    if print_first_bad:
        
                        print('hit1: bad', is_dup)
                        print((img1_id, img2_id, img1_overlap_tag))
                        print(img1_mask)
                        print(img2_mask)
                        print(img12_match_indices)
                        print(img1_mask[img12_match_indices])
                        print(scores.pyr)
                        print(scores.pix)
                        print_first_bad = False
                else:
                    flat_score_good += 1
#                 continue
                
            if tst3 and tst4 and tst6:
                if is_dup:
                    flat_score_bad += 1
                    if print_first_bad:
        
                        print('hit2: bad', is_dup)
                        print((img1_id, img2_id, img1_overlap_tag))
                        print(img1_mask)
                        print(img2_mask)
                        print(img12_match_indices)
                        print(img1_mask[img12_match_indices])
                        print(scores.pyr)
                        print(scores.pix)
                        print_first_bad = False
                else:
                    flat_score_good += 1
#                 continue
                
            if tst5 and tst2 and tst6:
                if is_dup:
                    flat_score_bad += 1
                    if print_first_bad:
        
                        print('hit3: bad', is_dup)
                        print((img1_id, img2_id, img1_overlap_tag))
                        print(img1_mask)
                        print(img2_mask)
                        print(img12_match_indices)
                        print(img1_mask[img12_match_indices])
                        print(scores.pyr)
                        print(scores.pix)
                        print_first_bad = False
                else:
                    flat_score_good += 1
                    if print_first_good:
        
                        print('hit: good', is_dup)
                        print((img1_id, img2_id, img1_overlap_tag))
                        print(img1_mask)
                        print(img2_mask)
                        print(img12_match_indices)
                        print(img1_mask[img12_match_indices])
                        print(scores.pyr)
                        print(scores.pix)
                        print_first_good = False
                
#             continue

    # This is here so I don't forget to address small 2 tile or 1 tile overlaps later.
#     if len(img1_mask) <= 2:
#         print(img1_mask, img2_mask)
#         continue
    
    # (0, 3, 6) == (0, 3, 6) is exact duplicate
#     if len(set(img1_mask)) == len(img1_mask) and np.all(img1_mask == img2_mask) and 9 not in img1_mask:
#         continue
    
    # (0, 0, 0) == (0, 0, 0) skip probably is duplicate of white clouds, or blue border.
#     if len(set(img1_mask) | set(img2_mask)) == 1 and 9 not in img1_mask:
#         continue
    
    # (0, 0, 0) == (2, 2, 2) is NOT duplicate. probably white clouds overlap with blue border
#     if len(set(img1_mask)) == 1 and len(set(img2_mask)) == 1 and set(img1_mask) != set(img2_mask) and 9 not in img1_mask and 9 not in img2_mask:
#         continue
        
#     if len(set(img1_mask)) == 1 and len(set(img2_mask)) != 1 and 9 not in img1_mask:
#         if set(img1_mask) != set(img2_mask):
#         continue
    
#     if np.min(scores.enp) <= 0.995 or np.min(scores.enp) >= 0.999:
#         continue

        constraint_hits = 0
        
        bmh_min = np.min(scores.bmh)
        if bmh_min < osl_lower.bmh:
            bmh_min_hits += 1
            constraint_hits += 1
            
        cmh_min = np.min(scores.cmh)
        if cmh_min < osl_lower.cmh:
            cmh_min_hits += 1
            constraint_hits += 1
            
        con_min = np.min(scores.con)
        if con_min < osl_lower.con:
            con_min_hits += 1
            constraint_hits += 1
            
        hom_min = np.min(scores.hom)
        if hom_min < osl_lower.hom:
            hom_min_hits += 1
            constraint_hits += 1
            
        eng_min = np.min(scores.eng)
        if eng_min < osl_lower.eng:
            eng_min_hits += 1
            constraint_hits += 1
            
        cor_min = np.min(scores.cor)
        if cor_min < osl_lower.cor:
            cor_min_hits += 1
            constraint_hits += 1
            
        epy_min = np.min(scores.epy)
        if epy_min < osl_lower.epy:
            epy_min_hits += 1
            constraint_hits += 1
            
        enp_min = np.min(scores.enp)
        if enp_min < osl_lower.enp:
            enp_min_hits += 1
            constraint_hits += 1
            
        pix_min = np.min(scores.pix)
        if pix_min < osl_lower.pix:
            pix_min_hits += 1
            constraint_hits += 1

            
        bmh_max = np.max(scores.bmh)
        if bmh_max > osl_upper.bmh:
            bmh_max_hits += 1
            constraint_hits += 1
            
        cmh_max = np.max(scores.cmh)
        if cmh_max > osl_upper.cmh:
            cmh_max_hits += 1
            constraint_hits += 1
            
        con_max = np.max(scores.con)
        if con_max > osl_upper.con:
            con_max_hits += 1
            constraint_hits += 1

        hom_max = np.max(scores.hom)
        if hom_max > osl_upper.hom:
            hom_max_hits += 1
            constraint_hits += 1
            
        eng_max = np.max(scores.eng)
        if eng_max > osl_upper.eng:
            eng_max_hits += 1
            constraint_hits += 1

        cor_max = np.max(scores.cor)
        if cor_max > osl_upper.cor:
            cor_max_hits += 1
            constraint_hits += 1

        epy_max = np.max(scores.epy)
        if epy_max > osl_upper.epy:
            epy_max_hits += 1
            constraint_hits += 1

        enp_max = np.max(scores.enp)
        if enp_max > osl_upper.enp:
            enp_max_hits += 1
            constraint_hits += 1

        pix_max = np.max(scores.pix)
        if pix_max > osl_upper.pix:
            pix_max_hits += 1
            constraint_hits += 1

        if constraint_hits < 0:
            continue
            
        idx = (img1_id, img2_id, img1_overlap_tag)
        overlap_scores = Overlap_Idx_Scores(
            idx, 
            bmh_min, cmh_min, con_min, hom_min, eng_min, cor_min, epy_min, enp_min, pix_min, 
            bmh_max, cmh_max, con_max, hom_max, eng_max, cor_max, epy_max, enp_max, pix_max)
        overlap_candidates.append(overlap_scores)

In [None]:
# Scan summary: number of collected candidates, then per-metric counts of overlaps whose
# minimum fell below the lower bound (first row) or whose maximum exceeded the upper bound
# (second row). Column order: bmh, cmh, con, hom, eng, cor, epy, enp, pix.
print(len(overlap_candidates))
print(bmh_min_hits, cmh_min_hits, con_min_hits, hom_min_hits, eng_min_hits, cor_min_hits, epy_min_hits, enp_min_hits, pix_min_hits)
print(bmh_max_hits, cmh_max_hits, con_max_hits, hom_max_hits, eng_max_hits, cor_max_hits, epy_max_hits, enp_max_hits, pix_max_hits)

In [None]:
# Size of the truth table next to counters accumulated by the scan cell above
# (flat_score_good / flat_score_bad are incremented there; the tst*_ct counters are
# presumably maintained by the part of the scan not shown here -- TODO confirm).
print(len(dup_truth), n_not_dups, flat_score_good, flat_score_bad)
print(tst1_ct, tst2_ct, tst3_ct, tst4_ct, tst5_ct, tst6_ct)

## Sort

In [None]:
# Rank the candidates by (con_max, pix_max), largest first, and keep only their
# (img1_id, img2_id, img1_overlap_tag) keys.
ranked_candidates = sorted(
    overlap_candidates,
    key=lambda cand: (cand.con_max, cand.pix_max),
    reverse=True,
)
duplicate_candidates = [cand.idx for cand in tqdm_notebook(ranked_candidates)]
print(len(duplicate_candidates))

# Create an interactive widget for tagging duplicate overlaps.

In [None]:
from IPython.display import display
from ipywidgets import Button, Image, Layout, Box, HBox, VBox, Output

In [None]:
# One-shot iterator consumed by get_next_img_pair(); re-running this cell restarts the
# labelling pass from the first candidate.
candidates_iter = iter(duplicate_candidates)
n_candidates = len(duplicate_candidates)

In [None]:
# Labels collected in this session: (img1_id, img2_id, img1_overlap_tag) -> 1 (same) or 0 (different).
# Re-running this cell discards every label that has not been merged into the truth file yet.
overlap_labels = {}

In [None]:
# Key of the overlap currently shown in the widget. These are module-level names that
# get_next_img_pair() rebinds and the button callbacks read.
img1_id = None
img2_id = None
img1_overlap_tag = None
# Number of candidates pulled from candidates_iter so far.
candidates_idx = 0
# Layout of the row that holds the three labelling buttons.
box_layout = Layout(display='flex',
                    flex_flow='row',
                    align_items='stretch',
                    width='100%')

# Bounding-box colours used by draw_images(), given as 3-tuples with their hex codes.
RED = (244, 67, 54) #F44336 
GREEN = (76, 175, 80) #4CAF50 
LIGHT_BLUE = (3, 169, 244) #03A9F4


def get_next_img_pair():
    """Advance ``candidates_iter`` to the next overlap that still needs a manual label.

    Candidates already present in ``overlap_labels`` are skipped. Candidates whose
    largest ``scores.pix`` value exceeds 33% of ``256*256*3*255`` are labelled 0 in
    ``overlap_labels`` automatically and skipped as well.

    Side effects: rebinds the module-level ``img1_id``, ``img2_id``,
    ``img1_overlap_tag`` on every pulled candidate (including skipped ones) and
    increments ``candidates_idx``.

    Returns:
        The ``(img1_id, img2_id, img1_overlap_tag)`` tuple of the selected candidate.

    Raises:
        StopIteration: when ``candidates_iter`` is exhausted.
        AssertionError: if a candidate has ``img1_id >= img2_id``.
    """
    global img1_id, img2_id, img1_overlap_tag, candidates_idx
    # Manual knob: the first n_skip candidates pulled by this call are discarded (0 = none).
    n_skip = 0
    i_skip = 0
    while True:
        img1_id, img2_id, img1_overlap_tag = next(candidates_iter)
        candidates_idx += 1
        
        if i_skip < n_skip:
            i_skip += 1
            continue
            
        assert img1_id < img2_id
        
        # Already labelled in this session.
        if (img1_id, img2_id, img1_overlap_tag) in overlap_labels:
            continue
        
#         if (img1_id, img2_id, img1_overlap_tag) in weak_preds:
#             break
#         else:
#             continue
        
        scores = overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag]
        
        # Auto-label as "different" when the largest pix score is above a third of
        # 256*256*3*255.
        if max(scores.pix) > 256*256*3*255 * 0.33:
            overlap_labels[(img1_id, img2_id, img1_overlap_tag)] = 0
            continue
        
        # The masks and tst* flags below are only consumed by the commented-out filters
        # further down; they are kept so those filters can be re-enabled.
        img1_nine, img2_nine = image_image_duplicate_tiles[(img1_id, img2_id)]
        img1_mask = img1_nine[overlap_tag_nines_mask[img1_overlap_tag]]
        img2_mask = img2_nine[overlap_tag_nines_mask[overlap_tag_pairs[img1_overlap_tag]]]

        # Positions whose mask value is not 9, in each image and in both.
        img1_match_indices = np.where(img1_mask != 9)[0]
        img2_match_indices = np.where(img2_mask != 9)[0]
        img12_match_indices = np.intersect1d(img1_match_indices, img2_match_indices)
        
        tst1 = np.any(img1_mask != img2_mask)
        tst2 = max(scores.pix) > 10000
        tst3 = np.all(img1_mask == img2_mask)
        tst4 = max(scores.pix) > 100000
        tst6 = len(img12_match_indices) > 0
        tst7 = len(np.union1d(img1_mask[img1_match_indices], img2_mask[img2_match_indices])) == 1
        tst8 = max(scores.pix) == 0
        
#         if tst1:
#             overlap_labels[(img1_id, img2_id, img1_overlap_tag)] = 0
#             continue

#         if tst3 and tst4 and tst6:
#             overlap_labels[(img1_id, img2_id, img1_overlap_tag)] = 0
#             continue

#         if tst7 and tst8: # perfect white on white
#             continue

        break

    return img1_id, img2_id, img1_overlap_tag

def draw_images(img1_id, img2_id, img1_overlap_tag):
    """Draw a 2x2 comparison figure for one overlap and return its axes.

    Top row: both images with the overlap region replaced by median-subtracted pixels.
    Bottom row: the same images with the original pixels restored in the overlap region.
    Outside the overlap both rows show the result of ``channel_shift('L', ...)``.
    A bounding box marks the overlap: GREEN / RED when ``dup_truth`` holds a truthy /
    falsy label for this key, LIGHT_BLUE when the key is not in ``dup_truth``.

    Args:
        img1_id: file name of the first image, relative to ``train_image_dir``.
        img2_id: file name of the second image, relative to ``train_image_dir``.
        img1_overlap_tag: overlap tag as seen from the first image; the tag for the
            second image is looked up in ``overlap_tag_pairs``.

    Returns:
        The 2x2 array of matplotlib axes.
    """
    global candidates_idx
    
    scores = overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag]
    imgmod1 = ImgMod(os.path.join(train_image_dir, img1_id))
    imgmod2 = ImgMod(os.path.join(train_image_dir, img2_id))
    
    # Tick marks every 256 px (presumably the tile edge length -- TODO confirm).
    dtick = 256
    n_ticks = imgmod1.shape[1] // dtick + 1
    ticks = [i * dtick for i in range(n_ticks)]

    slice1 = overlap_tag_slices[img1_overlap_tag]
    slice2 = overlap_tag_slices[overlap_tag_pairs[img1_overlap_tag]]

    # Per-channel median over both overlap regions stacked together, cast to uint8.
    m12 = np.median(np.vstack([imgmod1.parent_rgb[slice1], imgmod2.parent_rgb[slice2]]), axis=(0, 1), keepdims=True).astype(np.uint8)
    # NOTE(review): if parent_rgb is uint8 this subtraction wraps around instead of
    # going negative -- confirm that is intended.
    img1_drop = imgmod1.parent_rgb - m12
    img2_drop = imgmod2.parent_rgb - m12
        
    # Shift by -100 when the median channels sum to at least 384, otherwise by +100.
    brightness_level = -100 if np.sum(m12) >= 384 else 100
    img1 = imgmod1.channel_shift('L', brightness_level)
    img2 = imgmod2.channel_shift('L', brightness_level)
    
    img1_nine, img2_nine = image_image_duplicate_tiles[(img1_id, img2_id)]
    img1_mask = img1_nine[overlap_tag_nines_mask[img1_overlap_tag]]
    img2_mask = img2_nine[overlap_tag_nines_mask[overlap_tag_pairs[img1_overlap_tag]]]
    img1_dups_str = ' '.join(list(map(str, img1_mask)))
    img2_dups_str = ' '.join(list(map(str, img2_mask)))
    
    if (img1_id, img2_id, img1_overlap_tag) in dup_truth:
        bbox_color = GREEN if dup_truth[(img1_id, img2_id, img1_overlap_tag)] else RED
    else:
        bbox_color = LIGHT_BLUE
        
    # Move the first corner by +offset and the second by -offset on both coordinates.
    bbox_thickness = 4
    offset = (bbox_thickness // 2) + 1
    offset_array = np.array([[offset], [-offset]])
    img1_bbox_pt1, img1_bbox_pt2 = boundingbox_corners[img1_overlap_tag] + offset_array
    img2_bbox_pt1, img2_bbox_pt2 = boundingbox_corners[overlap_tag_pairs[img1_overlap_tag]] + offset_array
    
    # First pass: median-subtracted overlap region (top row).
    img1[slice1] = img1_drop[slice1]
    img2[slice2] = img2_drop[slice2]
    cv2.rectangle(img1, tuple(img1_bbox_pt1), tuple(img1_bbox_pt2), bbox_color, bbox_thickness)
    cv2.rectangle(img2, tuple(img2_bbox_pt1), tuple(img2_bbox_pt2), bbox_color, bbox_thickness)

    fig, ax = plt.subplots(2, 2, figsize=(15, 15))
    ax[0][0].imshow(img1)
#     ax[0][0].set_title(f'{candidates_idx/n_candidates:6.3f} {img1_id} {min(scores.bmh):7.5f}')
    ax[0][0].set_title(f'{img1_id} {min(scores.bmh):7.5f} {min(scores.cmh):7.5f}')
    ax[0][0].set_xticks(ticks)
    ax[0][0].set_yticks(ticks)

    ax[0][1].imshow(img2)
    ax[0][1].set_title(f'{img2_id} con: {min(scores.con):7.5f} {max(scores.con):7.5f}')
    ax[0][1].set_xticks(ticks)
    ax[0][1].set_yticks(ticks)
    
    # Second pass: the same arrays are edited in place to restore the original overlap
    # pixels, and the box is drawn again (bottom row).
    img1[slice1] = imgmod1.parent_rgb[slice1]
    img2[slice2] = imgmod2.parent_rgb[slice2]
    cv2.rectangle(img1, tuple(img1_bbox_pt1), tuple(img1_bbox_pt2), bbox_color, bbox_thickness)
    cv2.rectangle(img2, tuple(img2_bbox_pt1), tuple(img2_bbox_pt2), bbox_color, bbox_thickness)

    ax[1][0].imshow(img1)
    ax[1][0].set_title(f'({img1_dups_str}) hom: {np.min(scores.hom):7.5f} {np.max(scores.hom):7.5f}')
    ax[1][0].set_xticks(ticks)
    ax[1][0].set_yticks(ticks)

    ax[1][1].imshow(img2)
    ax[1][1].set_title(f'({img2_dups_str}) eng: {np.min(scores.eng):7.5f} {np.max(scores.eng):7.5f}')# {max(scores.pix)}')
    ax[1][1].set_xticks(ticks)
    ax[1][1].set_yticks(ticks)
    
    return ax
    
def redraw(img1_id, img2_id, img1_overlap_tag):
    """Replace the contents of the ``out`` widget with the figure for the given overlap."""
    out.clear_output(wait=True)
    with out:
        draw_images(img1_id, img2_id, img1_overlap_tag)
        plt.show()
    

In [None]:
def _make_label_button(description, button_style, tooltip, icon):
    """Build one labelling button; every button gets its own flexible Layout."""
    return Button(
        description=description,
        disabled=False,
        layout=Layout(flex='1 1 auto', width='auto'),
        button_style=button_style,  # 'success', 'info', 'warning', 'danger' or ''
        tooltip=tooltip,
        icon=icon,
    )


same_button = _make_label_button('Same', 'success', 'Overlays are the same', 'check')
diff_button = _make_label_button('Diff', 'danger', 'Overlays are different', 'x')
skip_button = _make_label_button('Skip', 'info', 'Not sure.  Skip for now.', '?')

def _advance_to_next_pair():
    """Show the next candidate, or disable the buttons when none remain.

    get_next_img_pair() raises StopIteration once the candidate iterator is exhausted.
    It also rebinds the module-level key while it skips candidates, so a further click
    would label an overlap that was never displayed. The buttons are therefore disabled.
    """
    global img1_id, img2_id, img1_overlap_tag
    try:
        img1_id, img2_id, img1_overlap_tag = get_next_img_pair()
    except StopIteration:
        for button in (same_button, diff_button, skip_button):
            button.disabled = True
        out.clear_output(wait=True)
        with out:
            print('No more candidates to label.')
        return
    redraw(img1_id, img2_id, img1_overlap_tag)


def on_same_button_clicked(b):
    """Label the displayed overlap as a duplicate (1) and move to the next candidate."""
    overlap_labels[(img1_id, img2_id, img1_overlap_tag)] = 1
    _advance_to_next_pair()


def on_diff_button_clicked(b):
    """Label the displayed overlap as different (0) and move to the next candidate."""
    overlap_labels[(img1_id, img2_id, img1_overlap_tag)] = 0
    _advance_to_next_pair()


def on_skip_button_clicked(b):
    """Leave the displayed overlap unlabelled and move to the next candidate."""
    _advance_to_next_pair()


same_button.on_click(on_same_button_clicked)
diff_button.on_click(on_diff_button_clicked)
skip_button.on_click(on_skip_button_clicked)

In [None]:
# Assemble the widget: an output area for the figure above a row with the three buttons.
out = Output()
buttons_3 = Box(children=[same_button, diff_button, skip_button], layout=box_layout)
display(VBox([out, buttons_3]))

# Show the first candidate; later candidates are drawn by the button callbacks.
img1_id, img2_id, img1_overlap_tag = get_next_img_pair()
with out:
    ax = draw_images(img1_id, img2_id, img1_overlap_tag)
    plt.show()

In [None]:
# print details of current iteration
# Show all scores stored for the overlap whose key is currently held in
# img1_id / img2_id / img1_overlap_tag.
scores = overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag]
scores

In [None]:
# print our progress
# Labelling progress: number of labelled overlaps, and that number as a percentage of
# all candidates.
len(overlap_labels), 100*len(overlap_labels)/len(duplicate_candidates)

In [None]:
# undo last
# Undo the most recent entry of overlap_labels.
# dict.popitem() removes the last inserted item, so the dict no longer has to be scanned
# for its final key. An empty dict is reported instead of raising NameError or deleting
# a stale `k` left over from an earlier run of this cell.
# NOTE: get_next_img_pair() also inserts automatic 0 labels, so the newest entry is not
# necessarily the one created by the last button click.
if overlap_labels:
    k, undone_label = overlap_labels.popitem()
    print(k)
else:
    print('overlap_labels is empty; nothing to undo')

### Merge overlap_labels into truth

In [None]:
# Hand the labels collected in this session to update_duplicate_truth (sdcdup.utils)
# together with the truth file path; the exact merge semantics are not visible here.
update_duplicate_truth(duplicate_truth_file, overlap_labels)

## After we create image_duplicate_tiles.txt, let's check to see how many duplicate tiles we actually have.

In [None]:
# Load the per-image duplicate-tile table from disk.
# NOTE(review): the next cell iterates sdcic.image_duplicate_tiles rather than this
# variable -- confirm both refer to the same data.
image_duplicate_tiles = read_image_duplicate_tiles(image_duplicate_tiles_file)

In [None]:
# Census of tiles that repeat inside a single image.
#   dup_hashes: tile hash -> {img_id: [tile indices]}
#   dup_counts: tile hash -> number of occurrences seen
#   dup_files / dup_tiles: image id and pixels of the first occurrence of each hash
dup_tiles = []
dup_hashes = {}
dup_files = []
dup_counts = {}
for img_id, img_dup9 in sdcic.image_duplicate_tiles.items():
    img = None
    # Only values that occur more than once within this image are of interest.
    repeated_values = [value for value, count in Counter(img_dup9).items() if count != 1]
    for value in repeated_values:
        for ii in np.nonzero(img_dup9 == value)[0]:
            tile_hash = sdcic.tile_md5hash_grids[img_id][ii]
            if tile_hash not in dup_hashes:
                # First sighting of this hash: record it and keep the tile pixels.
                dup_counts[tile_hash] = 1
                dup_hashes[tile_hash] = {img_id: [ii]}
                dup_files.append(img_id)
                img = sdcic.get_img(img_id)
                dup_tiles.append(sdcic.get_tile(img, ii))
                print(len(dup_files)-1, tile_hash, img_id, ii)
            else:
                dup_counts[tile_hash] += 1
                dup_hashes[tile_hash].setdefault(img_id, []).append(ii)
dup_counts

In [None]:
# Inspect the mapping tile hash -> {img_id: [tile indices]} built in the previous cell.
dup_hashes

In [None]:
# For every collected tile, print the pixel values two pixels in from each of its four
# corners (top-left, top-right, then bottom-left, bottom-right).
for ii, dup_tile in enumerate(dup_tiles):
    print(ii)
    print(dup_tile[2, 2], dup_tile[2, -2])
    print(dup_tile[-2, 2], dup_tile[-2, -2])

In [None]:
# Side-by-side view of the images behind dup_files[0] and dup_files[5].
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
img1, img2 = (
    cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, dup_files[k])), cv2.COLOR_BGR2RGB)
    for k in (0, 5)
)
ax1.imshow(img1)
ax2.imshow(img2)

In [None]:
# Side-by-side view of the images behind dup_files[1] and dup_files[2].
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
img1, img2 = (
    cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, dup_files[k])), cv2.COLOR_BGR2RGB)
    for k in (1, 2)
)
ax1.imshow(img1)
ax2.imshow(img2)

In [None]:
# Side-by-side view of the images behind dup_files[3] and dup_files[4].
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
img1, img2 = (
    cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, dup_files[k])), cv2.COLOR_BGR2RGB)
    for k in (3, 4)
)
ax1.imshow(img1)
ax2.imshow(img2)

In [None]:
# Reference hashes of solid-colour 256x256 tiles.
# NOTE(review): "blue" sets channel 0 and "red" sets channel 2, which matches OpenCV's
# BGR order; other cells in this notebook convert images to RGB -- confirm the naming.
tile_shape = (256, 256, 3)
black_tile = np.zeros(tile_shape, dtype=np.uint8)
white_tile = np.full(tile_shape, 255, dtype=np.uint8)
blue_tile = black_tile.copy()
blue_tile[..., 0] = 255
red_tile = black_tile.copy()
red_tile[..., 2] = 255
color_tiles = [black_tile, white_tile, blue_tile, red_tile]
for color_tile in color_tiles:
    block_mean_hash = img_hash.blockMeanHash(color_tile, mode=0)
    print(block_mean_hash[0])
    md5_digest = hashlib.md5(color_tile.tobytes())
    print(md5_digest.hexdigest())


In [None]:
# Hand-picked image ids, shown on a 5x5 grid with the id as each panel title.
black_images = ['03ffa7680.jpg', '8d5521663.jpg', '5a70ef013.jpg', '9a2f9d347.jpg', '37a912dca.jpg', '4add7aa1d.jpg', '3db3ef7cc.jpg', '73fec0637.jpg', '7df214d98.jpg', 'c2955cd21.jpg', 'de018b2a8.jpg', '8ce769141.jpg', 'fc0e22a0a.jpg', '770c46cd4.jpg', 'd6e432b79.jpg', 'd5d1b6fb8.jpg', '0e4d7dd93.jpg', '9ddeed533.jpg', 'addc11de0.jpg', '65418dfe4.jpg', '119d6a3d6.jpg', '1b287c905.jpg', 'b264b0f96.jpg', '996f92939.jpg', 'e5c3b1f59.jpg']
fig, ax = plt.subplots(5, 5, figsize=(15, 15))
for i, img_id in enumerate(black_images):
    row, col = divmod(i, 5)
    img = cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, img_id)), cv2.COLOR_BGR2RGB)
    panel = ax[row, col]
    panel.imshow(img)
    panel.set_title(img_id)
plt.tight_layout()

## Check the performance of DupNet

In [None]:
# Expand every labelled overlap into one TilePairs record per tile pair; the overlap's
# label is repeated for each of its tile pairs so ytrue lines up with tile_pairs.
ytrue, tile_pairs = [], []
for (img1_id, img2_id, img1_overlap_tag), is_dup in tqdm_notebook(dup_truth.items()):
    index_pairs = img_overlap_index_maps[img1_overlap_tag]
    for overlap_idx, (idx1, idx2) in enumerate(index_pairs):
        pair = TilePairs(img1_id, img2_id, img1_overlap_tag, overlap_idx, idx1, idx2)
        tile_pairs.append(pair)
        ytrue.append(is_dup)
print(len(tile_pairs))

In [None]:
# Wrap the tile pairs in a dataset and a batched loader (256 pairs per batch, 12 worker
# processes), then pass every batch through `preprocess` via WrappedDataLoader.
# Dataset, data, WrappedDataLoader and preprocess are defined outside this section.
test_ds = Dataset(tile_pairs)
test_dl = data.DataLoader(test_ds, batch_size=256, num_workers=12)
test_dl = WrappedDataLoader(test_dl, preprocess)
# Number of batches.
print(len(test_dl))

In [None]:
# Load the trained DupNet checkpoint and place it on the configured device.
# `.to(device)` alone decides the placement. The unconditional `.cuda()` call that used
# to precede it was redundant on GPU hosts and raised on CPU-only ones.
model = load_checkpoint('out/dup_model.last.pth')
model.to(device)

In [None]:
# Score every batch with the model in eval mode and without gradient tracking, then
# flatten the per-batch outputs into one 1-D array of probabilities.
model.eval()
with torch.no_grad():
    yprobs0 = []
    for xb in tqdm_notebook(test_dl):
        yprobs0.append(model(xb))
    yprobs = np.vstack([batch_probs.cpu() for batch_probs in yprobs0]).reshape(-1)
print(len(yprobs0), yprobs.shape)

In [None]:
# Regroup the flat per-tile probabilities as
# {(img1_id, img2_id): {overlap_tag: array with one probability per tile pair}}.
overlap_cnn_tile_scores = {}
for tp, yprob in zip(tile_pairs, yprobs):
    pair_scores = overlap_cnn_tile_scores.setdefault((tp.img1_id, tp.img2_id), {})
    if tp.img1_overlap_tag not in pair_scores:
        n_tiles = len(img_overlap_index_maps[tp.img1_overlap_tag])
        pair_scores[tp.img1_overlap_tag] = np.zeros(n_tiles)
    pair_scores[tp.img1_overlap_tag][tp.overlap_idx] = yprob

In [None]:
DNN_Stats = namedtuple('dnn_stats', ['yprob', 'ypred', 'ytrue', 'loss'])

# Per-overlap DupNet statistics, keyed by (img1_id, img2_id, img1_overlap_tag):
#   yprob = largest tile probability of the overlap
#   ypred = 1 if yprob > 0.5 else 0
#   ytrue = label from dup_truth
#   loss  = binary cross-entropy of yprob against ytrue
# The loop variable is named is_dup so that it no longer overwrites the `ytrue` list
# built together with tile_pairs.
dup_dict = {}
for (img1_id, img2_id, img1_overlap_tag), is_dup in tqdm_notebook(dup_truth.items()):
    assert img1_id < img2_id

    # Overlaps without stored scores, or with fewer than two pix scores, are skipped.
    if (img1_id, img2_id) not in overlap_image_maps:
        continue
    if img1_overlap_tag not in overlap_image_maps[(img1_id, img2_id)]:
        continue
    scores = overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag]
    if len(scores.pix) < 2:
        continue

    dcnn_scores_raw = overlap_cnn_tile_scores[(img1_id, img2_id)][img1_overlap_tag]
    yprob = np.max(dcnn_scores_raw)
    ypred = (yprob > 0.5) * 1

    # Clip before taking logs: a saturated probability (exactly 0 or 1) would otherwise
    # give an infinite loss, which the decision tree fit further down cannot accept.
    prob = np.clip(yprob, EPS, 1 - EPS)
    bce = -(is_dup * np.log(prob) + (1 - is_dup) * np.log(1 - prob))

    dup_dict[(img1_id, img2_id, img1_overlap_tag)] = DNN_Stats(yprob, ypred, is_dup, bce)

In [None]:
# Flatten dup_dict into records that carry their own key, so they can be sorted.
DNN_Stats2 = namedtuple('dnn_stats', ['key', 'yprob', 'ypred', 'ytrue', 'loss'])
dup_dict_flat = [
    DNN_Stats2(key, stats.yprob, stats.ypred, stats.ytrue, stats.loss)
    for key, stats in tqdm_notebook(dup_dict.items())
]

In [None]:
# Keys of the overlaps the network got wrong, ordered from highest to lowest loss.
records_by_loss = sorted(dup_dict_flat, key=lambda record: record.loss, reverse=True)
id_tags = []
for dnns in tqdm_notebook(records_by_loss):
    if dnns.ytrue != dnns.ypred:
        id_tags.append(dnns.key)
len(id_tags)

In [None]:
# Plot one page of misclassified overlaps: page `aa` holds n_samples rows, one row per
# overlap, with image 1 on the left and image 2 on the right.
n_samples = 8
aa = 0
# When True the overlap region is shown with the shared median subtracted; otherwise the
# original pixels are shown.
use_median_shift = True

test_files = id_tags[aa * n_samples: (aa + 1) * n_samples]#[::-1]
for f in test_files:
    print(f, dup_dict[f])

# Tick marks every 256 px for an image edge of 768 px.
dtick = 256
n_ticks = 768 // dtick + 1
ticks = [i * dtick for i in range(n_ticks)]

# NOTE(review): the grid always has n_samples rows; a shorter last page leaves empty axes.
fig, m_axs = plt.subplots(n_samples, 2, figsize = (12, 6 * n_samples))
for ii, (img1_id, img2_id, img1_overlap_tag) in enumerate(test_files):
    (ax1, ax2) = m_axs[ii]
    yprob, ypred, is_dup, loss = dup_dict[(img1_id, img2_id, img1_overlap_tag)]
    
    imgmod1 = ImgMod(os.path.join(train_image_dir, img1_id))
    imgmod2 = ImgMod(os.path.join(train_image_dir, img2_id))

    slice1 = overlap_tag_slices[img1_overlap_tag]
    slice2 = overlap_tag_slices[overlap_tag_pairs[img1_overlap_tag]]

    # Per-channel median over both overlap regions stacked together, cast to uint8.
    m12 = np.median(np.vstack([imgmod1.parent_rgb[slice1], imgmod2.parent_rgb[slice2]]), axis=(0, 1), keepdims=True).astype(np.uint8)
    
    # Shift by -100 when the median channels sum to at least 384, otherwise by +100.
    brightness_level = -100 if np.sum(m12) >= 384 else 100
    img1 = imgmod1.channel_shift('L', brightness_level)
    img2 = imgmod2.channel_shift('L', brightness_level)
    
    if use_median_shift:
        img1_drop = imgmod1.parent_rgb - m12
        img2_drop = imgmod2.parent_rgb - m12
    else:        
        img1_drop = imgmod1.parent_rgb
        img2_drop = imgmod2.parent_rgb
    
    # Only the overlap region is replaced; the rest keeps the shifted pixels.
    img1[slice1] = img1_drop[slice1]
    img2[slice2] = img2_drop[slice2]

    # Left title: image id, predicted probability and true label; right title: image id and loss.
    ax1.imshow(img1)
    ax1.set_title(f'{img1_id} {yprob:6.4} ({is_dup})')
    ax1.set_xticks(ticks)
    ax1.set_yticks(ticks)

    ax2.imshow(img2)
    ax2.set_title(f'{img2_id} {loss:4.2f}')
    ax2.set_xticks(ticks)
    ax2.set_yticks(ticks)

plt.tight_layout()
# fig.savefig(os.path.join('temp', BASE_MODEL, f"{train_meta_filebase}_{score_str}_batch_{BATCH_NUM}_row_{aa+1}.jpg"))

## Try out a decision tree classifier for dup_truth

In [None]:
from sklearn import tree
import graphviz

In [None]:
# Build the decision-tree data set from the labelled overlaps.
#   L: overlap keys, X: feature rows, Y: labels (one single-element row per sample).
# missing_maps / missing_tags count the truth entries that have no stored scores.
missing_maps = 0
missing_tags = 0
L = []
X = []
Y = []
for (img1_id, img2_id, img1_overlap_tag), is_dup in dup_truth.items():
    
    if (img1_id, img2_id) not in overlap_image_maps:
        missing_maps += 1
        continue
    overlap_maps = overlap_image_maps[(img1_id, img2_id)]
    if img1_overlap_tag not in overlap_maps:
        missing_tags += 1
        continue
    scores = overlap_maps[img1_overlap_tag]
    # Same filter as used when dup_dict is built, so the dup_dict lookups below succeed.
    if len(scores.pix) < 2:
        continue
    
    L.append((img1_id, img2_id, img1_overlap_tag))
    # NOTE(review): `loss` is computed from the true label when dup_dict is built, so
    # using it as a feature leaks the target into X.
    X.append([
        dup_dict[(img1_id, img2_id, img1_overlap_tag)].ypred,
        dup_dict[(img1_id, img2_id, img1_overlap_tag)].loss,
#         min(scores.bmh),
#         min(scores.cmh),
#         max(scores.pix), 
#         min(scores.pyr),
#         max(scores.enp),
    ])
    Y.append([is_dup])

L = np.array(L)
X = np.array(X)
Y = np.array(Y)
# X = [[0, 0], [1, 1]]
# Y = [0, 1]

print(missing_maps)
print(missing_tags)
print(len(X))
# Number of samples and the sum of the labels.
print(len(Y), sum(Y))

In [None]:
# Fit a decision tree with default settings on (X, Y) and render it with graphviz.
# feature_names must stay in the same order as the columns appended to X.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)
dot_data = tree.export_graphviz(
    clf, 
    out_file=None, 
    feature_names=[
        'ypred',
        'loss',
#         'min(bmh)', 
#         'min(cmh)', 
#         'max(pix)', 
#         'min(pyr)', 
#         'max(enp)',
    ], 
    filled=True, 
    rounded=True, 
    special_characters=True, 
    leaves_parallel=True) 

graph = graphviz.Source(dot_data) 
graph

In [None]:
# Tree node index that each sample of X is assigned to by clf.apply.
all_nodes = clf.apply(X)

In [None]:
# Samples assigned to node 3 (hand-picked node id).
nodes = np.where(all_nodes == 3)
# NOTE(review): argmin/argmax/min/max run over the flattened selection, so the positions
# and values mix all feature columns of X rather than describing a single feature.
np.argmin(X[nodes]), np.min(X[nodes]), np.argmax(X[nodes]), np.max(X[nodes])

In [None]:
# Inspect one hand-picked sample (position 36) among those selected by `nodes`:
# its key, label and features, then all stored scores of its image pair.
idx = 36
print(L[nodes][idx], Y[nodes][idx], X[nodes][idx])
print(overlap_image_maps[(L[nodes][idx][0], L[nodes][idx][1])])

In [None]:
# Hand-collected hard cases. Each row is [img1_id, img2_id, img1_overlap_tag, is_dup],
# the argument order of plot_image_pair(). The number above each list is presumably
# the tile count of the overlaps in it -- TODO confirm. Bracketed trailing comments are
# values noted down by the author.
# 9
tricky_examples_9 = [
    ['e28669903.jpg', 'ed2998ef7.jpg', '0022', 1],
    ['66482462b.jpg', 'e2497099c.jpg', '0022', 1],
    ['73fec0637.jpg', '8b0219c19.jpg', '0022', 0]]
# 6
tricky_examples_6 = [
    ['d164aea52.jpg', 'fded6e12d.jpg', '0011', 1],
    ['00ce2c1c0.jpg', '68ef625ba.jpg', '0122', 1],
    ['01178499a.jpg', '7a7a0034a.jpg', '0012', 1],
    ['012d8cca1.jpg', 'bc45cee87.jpg', '0021', 1],
    ['089858a56.jpg', '903a8b121.jpg', '1022', 1],
    ['1ebdf2f08.jpg', 'b1bfb768c.jpg', '0012', 1],  # [91.          0.99781223]
    ['2323bf875.jpg', 'b5da61fce.jpg', '0021', 1],  # [2.05663500e+06 9.98277186e-01]
    ['468bf9178.jpg', '6090b3a8b.jpg', '1022', 1],  # [1.30900000e+03 9.97640283e-01]
    ['d843fc5ca.jpg', 'e805070df.jpg', '1022', 1],
    ['0ef6cd331.jpg', 'e6a6f80cd.jpg', '1022', 0],  # [1.72270000e+04 9.98394555e-01]
    ['d4f0aaa70.jpg', 'd84d4a78a.jpg', '0012', 0],  # [5.95230000e+04 9.98578088e-01] 
    ['7f2be2b0a.jpg', '84dcdc7af.jpg', '0021', 0]]

# 4
tricky_examples_4 = [
    ['0a33ce967.jpg', '3964f0cee.jpg', '0011', 1],
    ['0318fc519.jpg', 'b7feb225a.jpg', '1021', 1],
    ['7234a3a53.jpg', 'dc6534704.jpg', '1021', 1],
    ['de6fb187d.jpg', 'ea6dc23b7.jpg', '1021', 1],  # [223.           0.99544613]
    ['000194a2d.jpg', '384765ab2.jpg', '1022', 1],
    ['c3193fb05.jpg', 'cc68e7818.jpg', '0112', 0],  # [2.16300000e+04 9.98311792e-01]
    ['331987f64.jpg', '4869b48b6.jpg', '0112', 0],
    ['42f02a4a4.jpg', '7d31648ff.jpg', '1122', 0],
    ['cd3c59923.jpg', 'efdd03319.jpg', '1021', 0],  # [6.70246000e+05 9.99894307e-01] 
    ['0c279107f.jpg', '3b1314d5d.jpg', '1021', 0]]

# 3
tricky_examples_3 = [
    ['2f6c0deaa.jpg', 'e44a4f5b0.jpg', '0222', 1],  # [24.          0.99509307]
    ['204906e27.jpg', '892a69b4b.jpg', '0002', 1],  # [6.31644000e+05 9.97614902e-01]
    ['4c56d2f00.jpg', 'dcd94e973.jpg', '2022', 1],  # [6.31635000e+05 9.97534103e-01]
    ['b645cd49b.jpg', 'f2e554691.jpg', '2022', 1],  # [3.76847000e+05 9.96659721e-01]
    ['b998c7415.jpg', 'd4d26f700.jpg', '2022', 1],  # [3.76847000e+05 9.96680501e-01]
    ['0ef6cd331.jpg', '3a9e579aa.jpg', '2022', 0],  # [1.62810000e+04 9.98394555e-01]
    ['0ef6cd331.jpg', '813c8ec35.jpg', '0222', 0],  # [1.79442000e+05 9.98195859e-01]
    ['813c8ec35.jpg', 'caa94ffc3.jpg', '0020', 0],  # [1.76759000e+05 9.99834742e-01]
    ['0256ef90d.jpg', '46da51931.jpg', '0020', 0],  # [3.70260000e+05 9.99319673e-01]
    ['a61b3e245.jpg', 'd84d4a78a.jpg', '2022', 0],  # [2.59134100e+06 9.99175738e-01]
    ['0ee790381.jpg', 'ac87bcee5.jpg', '0020', 0],
    ['2095da0cb.jpg', '45b1a4561.jpg', '2022', 0]]

# 2


In [None]:
def _tile_pair_scores(tile1, tile2):
    """Score one pair of tiles.

    Returns a 4-tuple: the ``fuzzy_compare`` score, the normalized block-mean-hash
    Hamming score, the L2 distance between the two colour-moment hashes, and
    ``exp(-distance)``.
    """
    fuzzy_score = fuzzy_compare(tile1, tile2)

    bmh1 = img_hash.blockMeanHash(tile1)
    bmh2 = img_hash.blockMeanHash(tile2)
    hamming_score = get_hamming_distance_score(bmh1, bmh2, normalize=True)

    cmh1 = img_hash.colorMomentHash(tile1)
    cmh2 = img_hash.colorMomentHash(tile2)
    cmh_distance = np.linalg.norm(cmh1 - cmh2)

    return fuzzy_score, hamming_score, cmh_distance, np.exp(-cmh_distance)


def plot_image_pair(img1_id, img2_id, img1_overlap_tag, is_dup):
    """Print diagnostics for one overlap and plot the two images side by side.

    Printed, in order: the network scores from ``gen_dcnn_scores``; the label with all
    stored scores of the image pair; colour-moment statistics of the whole overlap
    region; both duplicate-tile masks; and, for every tile pair of the overlap, three
    score lines (raw tiles, tiles with the overlap-wide median subtracted, tiles with
    their own shared median subtracted).

    Args:
        img1_id: file name of the first image, relative to ``train_image_dir``.
        img2_id: file name of the second image, relative to ``train_image_dir``.
        img1_overlap_tag: overlap tag as seen from the first image.
        is_dup: label of the overlap; only printed.
    """
    imgmod1 = ImgMod(os.path.join(train_image_dir, img1_id))
    imgmod2 = ImgMod(os.path.join(train_image_dir, img2_id))
    rgb1 = imgmod1.parent_rgb
    rgb2 = imgmod2.parent_rgb

    dcnn_scores_raw = gen_dcnn_scores(rgb1, rgb2, img1_overlap_tag, model)
    print(dcnn_scores_raw)
    print(is_dup, overlap_image_maps[(img1_id, img2_id)])

    dtick = 256
    n_ticks = imgmod1.shape[1] // dtick + 1
    ticks = [tick_idx * dtick for tick_idx in range(n_ticks)]

    img2_overlap_tag = overlap_tag_pairs[img1_overlap_tag]
    slice1 = overlap_tag_slices[img1_overlap_tag]
    slice2 = overlap_tag_slices[img2_overlap_tag]

    # Colour-moment comparison of the two overlap regions as a whole.
    region1_cmh = img_hash.colorMomentHash(rgb1[slice1])
    region2_cmh = img_hash.colorMomentHash(rgb2[slice2])
    region_norm = np.linalg.norm(region2_cmh - region1_cmh)
    region_expnorm = np.exp(-region_norm)
    print(len(rgb1[slice1]), len(region2_cmh[0]))
    print(region_expnorm, region_norm)

    # Per-channel median over both overlap regions stacked together.
    m12 = np.median(np.vstack([rgb1[slice1], rgb2[slice2]]), axis=(0, 1), keepdims=True).astype(np.uint8)

    brightness_level = -100 if np.sum(m12) >= 384 else 100
    img1 = imgmod1.channel_shift('L', brightness_level)
    img2 = imgmod2.channel_shift('L', brightness_level)

    img1_drop = rgb1 - m12
    img2_drop = rgb2 - m12

    img1[slice1] = img1_drop[slice1]
    img2[slice2] = img2_drop[slice2]

    img1_overlap_map = overlap_tag_maps[img1_overlap_tag]
    img2_overlap_map = overlap_tag_maps[img2_overlap_tag]

    img1_nine, img2_nine = image_image_duplicate_tiles[(img1_id, img2_id)]
    img1_mask = img1_nine[overlap_tag_nines_mask[img1_overlap_tag]]
    img2_mask = img2_nine[overlap_tag_nines_mask[img2_overlap_tag]]

    print(img1_mask)
    print(img2_mask)

    for idx1, idx2 in zip(img1_overlap_map, img2_overlap_map):

        print(f'tile {idx1} / tile {idx2}')

        # 0: raw tiles.
        tile1 = get_tile(rgb1, idx1)
        tile2 = get_tile(rgb2, idx2)
        score0, score0_hamm, score0_norm, score0_expnorm = _tile_pair_scores(tile1, tile2)

        # 1: tiles cut from the images with the overlap-wide median subtracted.
        tile1_drop = get_tile(img1_drop, idx1)
        tile2_drop = get_tile(img2_drop, idx2)
        score1, score1_hamm, score1_norm, score1_expnorm = _tile_pair_scores(tile1_drop, tile2_drop)

        # 2: raw tiles with the median of this tile pair subtracted.
        m12_tile = np.median(np.vstack([tile1, tile2]), axis=(0, 1), keepdims=True).astype(np.uint8)
        tile1_drop = tile1 - m12_tile
        tile2_drop = tile2 - m12_tile
        score2, score2_hamm, score2_norm, score2_expnorm = _tile_pair_scores(tile1_drop, tile2_drop)

        print(f'{score0:10.8f}, {score0_hamm:10.8f}, {score0_norm:10.8f}, {score0_expnorm:10.8f}')
        print(f'{score1:10.8f}, {score1_hamm:10.8f}, {score1_norm:10.8f}, {score1_expnorm:10.8f}', m12)
        print(f'{score2:10.8f}, {score2_hamm:10.8f}, {score2_norm:10.8f}, {score2_expnorm:10.8f}', m12_tile)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))

    for axis, shown_img, shown_id in ((ax1, img1, img1_id), (ax2, img2, img2_id)):
        axis.imshow(shown_img)
        axis.set_title(f'{shown_id}')
        axis.set_xticks(ticks)
        axis.set_yticks(ticks)

In [None]:
# Each example row unpacks into (img1_id, img2_id, img1_overlap_tag, is_dup).
plot_image_pair(*tricky_examples_6[0])

In [None]:
# Save a side-by-side figure for every overlap in dup_dict, titled with the smallest bmh
# (left) and cmh (right) score of the overlap.
# Changes from the earlier version of this cell:
#   * dup_dict values are 4-field DNN_Stats tuples, so unpacking them into two names
#     raised ValueError; the value is not needed here and is no longer unpacked.
#   * each figure is closed after saving, because fig.clear() keeps it alive and memory
#     grows with every iteration.
#   * the output directory is created when missing.
# The file name holds only the two image ids, so an image pair with several overlap tags
# is written once (the first tag encountered); files that already exist are skipped.
os.makedirs('temp', exist_ok=True)
for (img1_id, img2_id, img1_overlap_tag) in dup_dict:

    filename = os.path.join('temp', f"{img1_id}_{img2_id}")
    if os.path.exists(filename):
        continue

    imgmod1 = ImgMod(os.path.join(train_image_dir, img1_id))
    imgmod2 = ImgMod(os.path.join(train_image_dir, img2_id))

    img1 = imgmod1.parent_rgb
    img2 = imgmod2.parent_rgb

    scores = overlap_image_maps[(img1_id, img2_id)][img1_overlap_tag]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))

    ax1.imshow(img1)
    ax1.set_title(f'{img1_id} ({min(scores.bmh):7.5f})')

    ax2.imshow(img2)
    ax2.set_title(f'{img2_id} ({min(scores.cmh):7.5f})')

    fig.savefig(filename)
    plt.close(fig)